Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,19 @@ repo.getIssues(false).ifPresent(issueData -> issueData.forEach(issue -> {
System.out.println(comment.user.username + ": " + comment.body));
}));
```

### Further data processing

The data extracted by this tool can be processed further, for example with the `run-issues.py` script from the tool [`codeface-extraction`](https://github.com/se-sic/codeface-extraction). This script organizes and unifies the issue data into a single CSV-like `.list` file. It also allows for synchronization with data from other data extraction tools, such as `codeface`.

### `referenced` events

`referenced` events are generated in an issue when a commit references that issue in its commit message. The intended behavior is that the event is present in the issue's event data and that the commit is also listed among the related commits of the issue. This does not work if the commit cannot be fetched. In this case, the event still exists, but it contains a link to a commit that the API cannot resolve, meaning that no data about the commit can be accessed.
Known causes of this include:

- a commit was rebased and changed/removed
- an external repository was deleted
- the commit's branch was deleted

Note that the commit might still be reachable until the automatic garbage collection has removed it from the remote repository.
In itself, this is not problematic. However, when further processing the data using `codeface-extraction`, this may lead to these `referenced` events being present in the final data, even though they should be filtered out as part of the issue processing.
9 changes: 7 additions & 2 deletions src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/**
* Copyright (C) 2016-2018 Florian Heck
* Copyright (C) 2019 Thomas Bock
* Copyright (C) 2025 Leo Sendelbach
*
* This file is part of GitHubWrapper.
*
Expand Down Expand Up @@ -86,8 +87,12 @@ public void postDeserialize(EventData.ReferencedEventData result, JsonElement sr
}

result.commit = repo.getGithubCommit(hash.getAsString()).orElseGet(() -> {
LOG.warning("Found commit unknown to GitHub and local git repo: " + hash);
return null;
LOG.warning("Found commit unknown to GitHub and local git repo: " + hash + " Retry using url...");
JsonElement url = src.getAsJsonObject().get("commit_url");
return repo.getGithubCommitUrl(hash.getAsString(), url.getAsString()).orElseGet(() -> {
LOG.warning("Could not find commit: " + hash);
return null;
});
});
}

Expand Down
21 changes: 21 additions & 0 deletions src/de/uni_passau/fim/gitwrapper/GitHubCommit.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* Copyright (C) 2019 Thomas Bock
* Copyright (C) 2025 Leo Sendelbach
*
* This file is part of GitHubWrapper.
*
Expand All @@ -26,6 +27,7 @@ public class GitHubCommit extends Commit {
private String authorUsername;
private String committerUsername;
private boolean addedToPullRequest = false;
private boolean external = false;

/**
* Constructs a new {@link GitHubCommit} with the given <code>id</code> made in the <code>repo</code>.
Expand Down Expand Up @@ -119,4 +121,23 @@ public boolean isAddedToPullRequest() {
void setAddedToPullRequest(boolean added) {
this.addedToPullRequest = added;
}

/**
 * Reports the value of the external flag of this commit.
 *
 * @return <code>true</code> if this commit has been flagged as external, <code>false</code> otherwise
 */
boolean isExternal() {
    return external;
}

/**
 * Sets whether this commit is an external commit.
 *
 * @param external
 *         <code>true</code> to flag this commit as external, <code>false</code> to clear the flag
 */
void setExternal(boolean external) {
this.external = external;
}

}
37 changes: 36 additions & 1 deletion src/de/uni_passau/fim/gitwrapper/GitHubRepository.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* Copyright (C) 2016-2020 Florian Heck
* Copyright (C) 2018 Claus Hunsen
* Copyright (C) 2019-2021 Thomas Bock
* Copyright (C) 2025 Leo Sendelbach
*
* This file is part of GitHubWrapper.
*
Expand Down Expand Up @@ -352,6 +353,8 @@ public Optional<List<IssueData>> getIssues(boolean includePullRequests, OffsetDa
}
else timeLimit = "";
Type finalType = type;
// For debugging, you may add additional parameters to the string. For example, '/issues?creator=sleo&state=all'
// will fetch issues created by user 'sleo' and all related issues and commits.
getJSONStringFromPath("/issues?state=all" + timeLimit).map(json -> {
List<IssueData> data;
try {
Expand All @@ -367,7 +370,7 @@ public Optional<List<IssueData>> getIssues(boolean includePullRequests, OffsetDa
threadPool.submit(() -> data.parallelStream().forEach(IssueData::freeze));

} catch (JsonSyntaxException e) {
LOG.warning("Encountered invalid JSON: " + json);
LOG.warning("Encountered invalid JSON: " + json + "\n\n" + e.getMessage() + "\n\n" + e);
return null;
}
return data;
Expand Down Expand Up @@ -1028,6 +1031,38 @@ Optional<GitHubCommit> getGithubCommit(String hash) {
});
}

/**
 * Gets the commit with the given <code>hash</code> by querying the given <code>url</code> directly and flags the
 * resulting commit as external.
 * <p>
 * In offline mode, no request is made and the dummy commit is returned instead. Otherwise, the result of the
 * lookup (present or empty) is stored in the cache of checked hashes under <code>hash</code>.
 * <p>
 * If the downloaded JSON String cannot be parsed, it is downloaded a second time and parsed with the content of
 * its <code>files</code> element removed. A {@link JsonSyntaxException} thrown by this second parsing attempt
 * is not caught here and propagates to the caller.
 *
 * @param hash
 *         the hash of the commit, used as key for caching the result
 * @param url
 *         the URL from which the JSON data of the commit is retrieved
 * @return an Optional containing the commit flagged as external, or an empty Optional if no data could be
 *         retrieved from <code>url</code>
 */
Optional<GitHubCommit> getGithubCommitUrl(String hash, String url) {
    if (offline.get()) {
        return Optional.of(getGHCommitUnchecked(DummyCommit.DUMMY_COMMIT_ID));
    }

    Optional<GitHubCommit> res;
    try {
        res = getJSONStringFromURL(url).map(commitInfo ->
                gson.fromJson(commitInfo, new TypeToken<GitHubCommit>() {}.getType()));
    } catch (JsonSyntaxException e) {
        /* For whatever reason, the JSON String is malformed, perhaps due to ill-encoded characters
         * in patches within the files element of the JSON String.
         * Due to that, get the JSON String again and remove the content of the files element of the
         * JSON String, as it is not needed for further processing.
         */
        LOG.info("Malformed JSON String when querying data for commit " + url + ". Neglect files element.");

        // The second download may fail as well; mapping over the Optional (instead of calling get())
        // yields an empty result in that case rather than a NoSuchElementException. Likewise, a null
        // result from Gson becomes an empty Optional rather than a NullPointerException.
        res = getJSONStringFromURL(url).map(commitInfo -> {
            String withoutFiles = StringUtils.substringBefore(commitInfo, "\"files\":[") + "\"files\":[]}";
            return gson.fromJson(withoutFiles, new TypeToken<GitHubCommit>() {}.getType());
        });
    }

    // Cache the outcome (even if empty) and flag a found commit as external.
    checkedHashes.put(hash, res);
    res.ifPresent(commit -> commit.setExternal(true));
    return res;
}

/**
* Creates a new Commit with the given data, and tries to fill in the missing data from the local Repository
*
Expand Down
20 changes: 19 additions & 1 deletion src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/**
* Copyright (C) 2016-2018 Florian Heck
* Copyright (C) 2019-2020 Thomas Bock
* Copyright (C) 2025 Leo Sendelbach
*
* This file is part of GitHubWrapper.
*
Expand Down Expand Up @@ -76,7 +77,13 @@ private List<ReferencedLink<GitHubCommit>> parseCommits(IssueData issue) {
.filter(eventData -> eventData instanceof EventData.ReferencedEventData)
// filter out errors from referencing commits
.filter(eventData -> ((EventData.ReferencedEventData) eventData).commit != null)
.map(eventData -> new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssue"));
.map(eventData -> {
if (((GitHubCommit) ((EventData.ReferencedEventData) eventData).commit).isExternal()) {
return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssueExternal");
} else {
return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssue");
}
});

// Parse commits from reviews and reviews' comments
if (issue.isPullRequest()) {
Expand Down Expand Up @@ -262,6 +269,16 @@ private List<String> extractHashtags(String text, boolean onlyInSameRepo) {
}
Pattern hashtagPattern;

// filter out everything in code block
String[] texts = text.split("```");
StringBuilder sb = new StringBuilder();
for (int i = 0; i < texts.length; i++) {
if (i % 2 == 0) {
sb.append(texts[i]);
}
}
text = sb.toString();

if (onlyInSameRepo) {
String repoName = repo.getRepoName();
String repoUser = repo.getRepoUser();
Expand Down Expand Up @@ -379,6 +396,7 @@ public void postDeserialize(IssueData result, JsonElement src, Gson gson) {
Optional<List<ReferencedLink<String>>> comments = repo.getComments(lookup);
result.setComments(comments.orElse(Collections.emptyList()));
}

if (result.getEventsList() == null) {
Optional<List<EventData>> events = repo.getEvents(lookup);
result.setEvents(events.orElse(Collections.emptyList()));
Expand Down