10 changes: 8 additions & 2 deletions author_postprocessing/author_postprocessing.py
@@ -14,6 +14,7 @@
#
# Copyright 2015-2017 by Claus Hunsen <hunsen@fim.uni-passau.de>
# Copyright 2020-2022 by Thomas Bock <bockthom@cs.uni-saarland.de>
# Copyright 2025 by Leo Sendelbach <s8lesend@stud.uni-saarland.de>
# All Rights Reserved.
"""
This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually
@@ -178,20 +179,25 @@ def is_github_noreply_author(name, email):
commit_data_file = path.join(data_path, commits_list)
commit_data = csv_writer.read_from_csv(commit_data_file)
commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data}

author_name_to_data = {author[1]: author[1:3] for author in author_data_new}
issue_data_new = []

for event in issue_data:
# replace author if necessary
if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event:
# extract commit hash from event info 1
commit_hash = event[12]

# extract author name from event info 2 while cutting excess '"'
name = event[13][1:-1]
# extract commit author from commit data, if available
if commit_hash in commit_hash_to_author:
event[9] = commit_hash_to_author[commit_hash][0]
event[10] = commit_hash_to_author[commit_hash][1]
issue_data_new.append(event)
elif name in author_name_to_data:
event[9] = author_name_to_data[name][0]
event[10] = author_name_to_data[name][1]
issue_data_new.append(event)
else:
# the added commit is not part of the commit data. In most cases, this is due to merge commits
# appearing in another pull request, as Codeface does not keep track of merge commits. As we
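Note on the branch above: the replacement now tries the commit hash from event info 1 first and falls back to the author name stored (quoted) in event info 2. A minimal illustrative sketch that packages this lookup order into a helper; the row layouts (commit[7] as hash, commit[2:4] and author[1:3] as name/e-mail pairs, event indices 9/10/12/13) follow the script, the helper itself is hypothetical:

def resolve_noreply_author(event, commit_hash_to_author, author_name_to_data):
    commit_hash = event[12]          # event info 1: hash of the added commit
    name = event[13][1:-1]           # event info 2: commit author name, without the excess '"'
    if commit_hash in commit_hash_to_author:
        event[9], event[10] = commit_hash_to_author[commit_hash]
        return True
    if name in author_name_to_data:
        event[9], event[10] = author_name_to_data[name]
        return True
    return False                     # e.g. a merge commit that Codeface does not track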
156 changes: 137 additions & 19 deletions issue_processing/issue_processing.py
@@ -18,6 +18,7 @@
# Copyright 2018-2019 by Anselm Fehnker <fehnker@fim.uni-passau.de>
# Copyright 2019 by Thomas Bock <bockthom@fim.uni-passau.de>
# Copyright 2020-2021 by Thomas Bock <bockthom@cs.uni-saarland.de>
# Copyright 2025 by Leo Sendelbach <s8lesend@stud.uni-saarland.de>
# All Rights Reserved.
"""
This file is able to extract Github issue data from json files.
@@ -30,6 +31,7 @@
import sys
import urllib
from datetime import datetime, timedelta
import math

import operator
from codeface.cli import log
@@ -41,7 +43,7 @@
from csv_writer import csv_writer

# known types from JIRA and GitHub default labels
known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"}
known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"}

# known resolutions from JIRA and GitHub default labels
known_resolutions = {"unresolved", "fixed", "wontfix", "duplicate", "invalid", "incomplete", "cannot reproduce",
@@ -74,13 +76,19 @@ def run():
# 1) load the list of issues
issues = load(__srcdir)
# 2) re-format the issues
issues = reformat_issues(issues)
reformat_issues(issues)
# create an empty dict for external connected events, meaning connected
# events that connect to an issue in another repository
external_connected_events = dict()
# 3) merges all issue events into one list
issues = merge_issue_events(issues)
# this step returns a dict containing all connected events that can be matched to the correct issues later
filtered_connected_events = merge_issue_events(issues, external_connected_events)
# 4) re-format the eventsList of the issues
issues = reformat_events(issues)
# this step also reconstructs the connections previously stored
# in 'external_connected_events' and 'filtered_connected_events'
reformat_events(issues, filtered_connected_events, external_connected_events)
# 5) update user data with Codeface database and dump username-to-name/e-mail list
issues = insert_user_data(issues, __conf, __resdir)
insert_user_data(issues, __conf, __resdir)
# 6) dump result to disk
print_to_disk(issues, __resdir)
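For orientation, the two dicts handed between steps 3 and 4 (external_connected_events and the returned filtered_connected_events) share the same shape: keyed by the connected event's created_at timestamp, each entry holds the connecting user and the issue numbers seen at that instant. An illustrative entry (timestamp, user, and issue number are made up):

some_user = {"name": "Jane Doe", "email": "jane@example.org"}   # illustrative user object
external_connected_events = {
    "2016-07-12 15:59:25": {         # keyed by the event's created_at timestamp
        "user": some_user,           # the connecting user
        "issues": [42]               # issue numbers connected at this instant; for the
                                     # multi-issue case, filter_connected_events also adds
                                     # a "multi_issues_copy" working list
    }
}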

@@ -238,7 +246,10 @@ def reformat_issues(issue_data):
for issue in issue_data:

# empty container for issue types
issue["type"] = []
if issue["type"] is None:
issue["type"] = []
else:
issue["type"] = [issue["type"]["name"].lower()]

# empty container for issue resolutions
issue["resolution"] = []
@@ -279,10 +290,10 @@ def reformat_issues(issue_data):
else:
issue["type"].append("issue")

return issue_data
return


def merge_issue_events(issue_data):
def merge_issue_events(issue_data, external_connected_events):
"""
All issue events are merged together in the eventsList. This simplifies processing in later steps.

@@ -293,6 +304,7 @@ def merge_issue_events(issue_data):
log.info("Merge issue events ...")

issue_data_to_update = dict()
connected_events = dict()

for issue in issue_data:

@@ -361,6 +373,7 @@
# it is a commit which was added to the pull request
if rel_commit["type"] == "commitAddedToPullRequest":
rel_commit["event"] = "commit_added"
rel_commit["event_info_2"] = rel_commit["commit"]["author"]

# if the related commit was mentioned in an issue comment:
elif rel_commit["type"] == "commitMentionedInIssue":
@@ -488,6 +501,32 @@
event["ref_target"] = event["user"]
event["user"] = event["assigner"]

# if event is merged event, save the hash of the merge commit in event_info_1
if event["event"] == "merged":
event["event_info_1"] = event["commit"]["hash"]

# if the event is a connected event, create a matching dict entry or add to an existing one by timestamp, for later reconstruction
if event["event"] == "connected":
if event["created_at"] in connected_events.keys() and connected_events[event["created_at"]]["user"] == event["user"]:
# if there is already a connected event at this time by this user, add this event to the list
connected_events[event["created_at"]]["issues"].append(issue["number"])
elif subtract_seconds_from_time(event["created_at"], 1) in connected_events.keys() \
and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]:
# same as above, but accounting for a possible difference in timestamps of 1 second between matching events
connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"])
event["created_at"] = subtract_seconds_from_time(event["created_at"], 1)
elif subtract_seconds_from_time(event["created_at"], -1) in connected_events.keys() \
and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]:
# same as above, with offset calculated in the other direction
connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"])
event["created_at"] = subtract_seconds_from_time(event["created_at"], -1)
else:
# if there is no connected event yet at this timestamp, create a new entry for this event
connected_info = dict()
connected_info["issues"] = [issue["number"]]
connected_info["user"] = issue["user"]
connected_events[event["created_at"]] = connected_info

# merge events, relatedCommits, relatedIssues and comment lists
issue["eventsList"] = issue["commentsList"] + issue["eventsList"] + issue["relatedIssues"] + issue[
"relatedCommits"] + issue["reviewsList"]
@@ -499,16 +538,53 @@
# sorts eventsList by time
issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"])

# filter out connected events which cannot be perfectly matched
filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items()))

# updates all the issues by the temporarily stored referenced_by events
for key, value in issue_data_to_update.iteritems():
for issue in issue_data:
if issue["number"] == value["number"]:
issue["eventsList"] = issue["eventsList"] + value["eventsList"]

return issue_data


def reformat_events(issue_data):
return filtered_connected_events


def filter_connected_events(key, value, external_connected_events):
num_issues = len(value["issues"])
# if only a single connected event exists at this time, it must connect to an external issue
if num_issues == 1:
external_connected_events[key] = value
return False
# if 2 connected events exist, matching them is trivial
if num_issues == 2:
return True
occurences = {x: value["issues"].count(x) for x in set(value["issues"])}
# otherwise, if it is an even number, check if it can be easily matched,
# meaning that exactly half the events occur in the same issue
if num_issues % 2 == 0 and num_issues/2 in occurences.values():
# duplicate issue list for matching the issues later
value["multi_issues_copy"] = list(value["issues"])
return True
# if it is an odd number, check if it can be easily matched
# meaning that exactly half (rounded up) the events occur in the same issue
if num_issues % 2 == 1 and (num_issues + 1)/2 in occurences.values():
for sub_key, sub_value in occurences.iteritems():
# then, assign one of them as an external connected event and proceed as in previous case
if sub_value == (num_issues + 1)/2:
new_entry = dict()
new_entry["user"] = value["user"]
new_entry["issues"] = [sub_key]
external_connected_events[key] = new_entry
value["issues"].remove(sub_key)
# duplicate issue list for matching the issues later
value["multi_issues_copy"] = list(value["issues"])
return True
# no other variants can be easily matched
return False
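A small worked example of the matching rules above (users, timestamps, and issue numbers are illustrative):

external = dict()
connected = {
    "t1": {"user": "alice", "issues": [12]},          # single event -> exported as external
    "t2": {"user": "bob", "issues": [12, 34]},        # pair -> trivially matched
    "t3": {"user": "carol", "issues": [7, 7, 7, 9]},  # even, but no issue holds exactly half -> dropped
    "t4": {"user": "dave", "issues": [5, 5, 8]}       # odd: one event of issue 5 becomes external
}
matched = dict(filter(lambda item: filter_connected_events(item[0], item[1], external),
                      connected.items()))
# matched now contains "t2" and "t4"; external contains "t1" (issue 12) and "t4" (issue 5);
# "t3" cannot be matched unambiguously and is discarded.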


def reformat_events(issue_data, filtered_connected_events, external_connected_events):
"""
Re-format event information dependent on the event type.

@@ -538,6 +614,35 @@ def reformat_events(issue_data):
if not event["ref_target"] is None and not event["ref_target"] == "":
users = update_user_dict(users, event["ref_target"])

# reconstruction of connections
if event["event"] == "connected":
if event["created_at"] in external_connected_events \
and issue["number"] in external_connected_events[event["created_at"]]["issues"]:
# if the event is an external connected event, mark it as such and remove this issue from the list
event["event_info_1"] = "external"
external_connected_events[event["created_at"]]["issues"].remove(issue["number"])
elif event["created_at"] in filtered_connected_events \
and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]:
# if it is instead an internal connected event
value = filtered_connected_events[event["created_at"]]
if len(value["issues"]) == 2:
# and we only have 2 issues in the list, connect to the other issue
event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1]
else:
# and we have more than two issues, count each issue's occurrences
occurences = {x: value["issues"].count(x) for x in set(value["issues"])}
if occurences[issue["number"]] == max(occurences.values()):
# if our issue is the most common one, that means it is the common denominator
# for all connected events at this time
# so this event connects to any other issue
# which is then removed from a copied list to avoid duplications
number = next(x for x in value["multi_issues_copy"] if x != issue["number"])
value["multi_issues_copy"].remove(number)
event["event_info_1"] = number
else:
# otherwise, connect this event to the common denominator
event["event_info_1"] = max(occurences, key=occurences.get)

# as the user dictionary is created, start re-formating the event information of all issues
for issue in issue_data:

@@ -555,21 +660,24 @@
if event["event"] == "closed":
event["event"] = "state_updated"
event["event_info_1"] = "closed" # new state
event["event_info_2"] = "open" # old state
if event["commit"] is not None:
event["event_info_2"] = event["commit"]["hash"]
else:
event["event_info_2"] = event["state_reason"]
issue["state_new"] = "closed"

elif event["event"] == "reopened":
event["event"] = "state_updated"
event["event_info_1"] = "open" # new state
event["event_info_2"] = "closed" # old state
event["event_info_2"] = event["state_reason"]
issue["state_new"] = "reopened"

elif event["event"] == "labeled":
label = event["label"]["name"].lower()
event["event_info_1"] = label

# if the label is in this list, it also is a type of the issue
if label in known_types:
if label in known_types and label not in issue["type"]:
issue["type"].append(str(label))

# creates an event for type updates and adds it to the eventsList
@@ -634,7 +742,10 @@ def reformat_events(issue_data):
# "state_new" and "resolution" of the issue give the information about the state and the resolution of
# the issue when the comment was written, because the eventsList is sorted by time
event["event_info_1"] = issue["state_new"]
event["event_info_2"] = issue["resolution"]
if "contains_suggestion" in event:
event["event_info_2"] = event["contains_suggestion"]
else:
event["event_info_2"] = False

elif event["event"] == "referenced" and not event["commit"] is None:
# remove "referenced" events originating from commits
@@ -648,7 +759,7 @@
for event_to_remove in events_to_remove:
issue["eventsList"].remove(event_to_remove)

return issue_data
return


def insert_user_data(issues, conf, resdir):
@@ -745,6 +856,9 @@ def get_user_from_id(idx, buffer_db=user_buffer):
for event in issue["eventsList"]:
event["user"] = get_id_and_update_user(event["user"])

if event["event"] == "commit_added":
event["event_info_2"] = get_id_and_update_user(event["event_info_2"])

# check database for the reference-target user if needed
if event["ref_target"] != "":
event["ref_target"] = get_id_and_update_user(event["ref_target"])
@@ -758,6 +872,10 @@ def get_user_from_id(idx, buffer_db=user_buffer):
for event in issue["eventsList"]:
event["user"] = get_user_from_id(event["user"])

# for commit_added events, save the commit's author's name in event_info_2
if event["event"] == "commit_added":
event["event_info_2"] = get_user_from_id(event["event_info_2"])["name"]

# get the reference-target user if needed
if event["ref_target"] != "":
event["ref_target"] = get_user_from_id(event["ref_target"])
@@ -778,7 +896,7 @@ def get_user_from_id(idx, buffer_db=user_buffer):
username_dump = os.path.join(resdir, "usernames.list")
csv_writer.write_to_csv(username_dump, sorted(set(lines), key=lambda line: line[0]))

return issues
return


def print_to_disk(issues, results_folder):
Expand All @@ -805,7 +923,7 @@ def print_to_disk(issues, results_folder):
json.dumps(issue["resolution"]),
issue["created_at"],
issue["closed_at"],
json.dumps([]), # components
json.dumps([issue["subIssues"]]), # components
event["event"],
event["user"]["name"],
event["user"]["email"],
14 changes: 9 additions & 5 deletions issue_processing/jira_issue_processing.py
@@ -18,6 +18,7 @@
# Copyright 2018-2019 by Anselm Fehnker <fehnker@fim.uni-passau.de>
# Copyright 2020-2021 by Thomas Bock <bockthom@cs.uni-saarland.de>
# Copyright 2023 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
# Copyright 2025 by Leo Sendelbach <s8lesend@stud.uni-saarland.de>
# All Rights Reserved.
"""
This file is able to extract Jira issue data from xml files.
@@ -125,7 +126,7 @@ def run():
referenced_issue["history"].append(referenced_by)

# 5) update user data with Codeface database
processed_issues = insert_user_data(processed_issues, __conf)
insert_user_data(processed_issues, __conf)
# 6) dump result to disk
print_to_disk(processed_issues, __resdir)
# # 7) export for Gephi
@@ -300,9 +301,12 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys):
link = issue_x.getElementsByTagName("link")[0]
issue["url"] = link.firstChild.data

type = issue_x.getElementsByTagName("type")[0]
issue["type"] = type.firstChild.data
issue["type_list"] = ["issue", str(type.firstChild.data.lower())]
type = issue_x.getElementsByTagName("type")[0].firstChild.data
# rename the 'New Feature' type to 'Feature' to be in line with the GitHub issue type labels
if type == "New Feature":
type = "Feature"
issue["type"] = type
issue["type_list"] = ["issue", str(type.lower())]

status = issue_x.getElementsByTagName("status")[0]
issue["state"] = status.firstChild.data
@@ -686,7 +690,7 @@ def get_user_from_id(idx, buffer_db=user_buffer):
event["event_info_2"] = assigned_user["email"]

log.debug("number of issues after insert_user_data: '{}'".format(len(issues)))
return issues
return


def print_to_disk(issues, results_folder):