Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions anonymization/anonymization.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# Copyright 2015-2017 by Claus Hunsen <hunsen@fim.uni-passau.de>
# Copyright 2021 by Thomas Bock <bockthom@cs.uni-saarland.de>
# Copyright 2022 by Christian Hechtl <hechtl@cs.uni-saarland.de>
# Copyright 2025 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
# All Rights Reserved.
"""
This file is able to anonymize authors and issue titles after the extraction from the Codeface database was performed.
Expand All @@ -29,15 +30,14 @@
import sys
from os import path, walk, makedirs
from os.path import abspath
from shutil import copy

from codeface.cli import log
from codeface.configuration import Configuration
from codeface.dbmanager import DBManager
from logging import getLogger

from codeface_utils.configuration import Configuration
from csv_writer import csv_writer


log = getLogger(__name__)

##
# RUN POSTPROCESSING
##
Expand Down Expand Up @@ -104,13 +104,13 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F

# Don't anonymize the deleted user as this one might be needed for filtering (but add it to the dictionary)
if orig_author == "Deleted user" and orig_email == "ghost@github.com":
if not (orig_author, orig_email) in author_to_anonymized_author:
if (orig_author, orig_email) not in author_to_anonymized_author:
author_to_anonymized_author[(orig_author, orig_email)] = (orig_author, orig_email)
else:
# check whether (name, e-mail) pair isn't already present in the dictionary
if not (orig_author, orig_email) in author_to_anonymized_author:
if (orig_author, orig_email) not in author_to_anonymized_author:
# check if just the name (without e-mail address) isn't already present in the dictionary
if not orig_author in author_to_anonymized_author:
if orig_author not in author_to_anonymized_author:
# if the author has an empty name, only anonymize their e-mail address
if not author[1] == "":
author[1] = ("developer" + str(i))
Expand Down Expand Up @@ -141,7 +141,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F


# Check for all files in the result directory of the project whether they need to be anonymized
for filepath, dirnames, filenames in walk(data_path):
for filepath, _, filenames in walk(data_path):

# (1) Anonymize authors lists
if authors_list in filenames:
Expand Down Expand Up @@ -170,7 +170,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F
# anonymize authors
author_data, i, author_to_anonymized_author = \
anonymize_authors(author_data, i, author_to_anonymized_author)

author_data_gender, i_gender, author_to_anonymized_author_gender = \
anonymize_authors(author_data_gender, i_gender, author_to_anonymized_author_gender, name_only = True)

Expand Down Expand Up @@ -343,7 +343,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F
gender_data_new = []

for author in gender_data:
if author[0] in author_to_anonymized_author_gender.keys():
if author[0] in list(author_to_anonymized_author_gender.keys()):
new_person = author_to_anonymized_author_gender[author[0]]
author[0] = new_person[0]
gender_data_new.append(author)
Expand Down Expand Up @@ -395,7 +395,7 @@ def run():
# process arguments
# - First make all the args absolute
__resdir = abspath(args.resdir)
__codeface_conf, __project_conf = map(abspath, (args.config, args.project))
__codeface_conf, __project_conf = list(map(abspath, (args.config, args.project)))

# load configuration
__conf = Configuration.load(__codeface_conf, __project_conf)
Expand Down
59 changes: 38 additions & 21 deletions author_postprocessing/author_postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#
# Copyright 2015-2017 by Claus Hunsen <hunsen@fim.uni-passau.de>
# Copyright 2020-2022 by Thomas Bock <bockthom@cs.uni-saarland.de>
# Copyright 2025 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
# Copyright 2025-2026 by Leo Sendelbach <s8lesend@stud.uni-saarland.de>
# All Rights Reserved.
"""
This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually
Expand Down Expand Up @@ -42,14 +44,24 @@
from os import path, walk, makedirs
from os.path import abspath
from shutil import copy
from logging import getLogger

from codeface.cli import log
from codeface.configuration import Configuration
from codeface.dbmanager import DBManager

from codeface_utils.configuration import Configuration
from csv_writer import csv_writer


log = getLogger(__name__)

##
# GLOBAL VARIABLES
##

# global variables containing all known copilot user names as well as the unified name and e-mail address that is assigned to copilot users
known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"}
copilot_unified_name = "Copilot"
copilot_unified_email = "copilot@example.com"


##
# RUN POSTPROCESSING
##
Expand All @@ -67,7 +79,7 @@ def perform_data_backup(results_path, results_path_backup):
log.info("Backup folder already exists. No backup is to be performed.")
return

for filepath, dirnames, filenames in walk(results_path):
for filepath, _, filenames in walk(results_path):
for filename in filenames:
if filename.endswith(".list"):
current_file = path.join(filepath, filename)
Expand All @@ -78,7 +90,7 @@ def perform_data_backup(results_path, results_path_backup):
copy(current_file, backup_file)


def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list):
def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list, unify_copilot_users=True):
"""
Replace the author "GitHub <noreply@github.com>" in both commit and GitHub issue data by the correct author.
The author "GitHub <noreply@github.com>" is automatically inserted as the committer of a commit that is made when
Expand All @@ -89,14 +101,15 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth
"GitHub <noreply@github.com>" are removed. Also "mentioned" or "subscribed" events in the GitHub issue data which
reference the author "GitHub <noreply@github.com>" are removed from the GitHub issue data. In addition, remove the
author "GitHub <noreply@github.com>" also from the author data and bot data and remove e-mails that have been sent
by this author.
by this author. This method also unifies all known copilot users into a single user if desired.

:param data_path: the path to the project data that is to be fixed
:param issues_github_list: file name of the github issue data
:param commits_list: file name of the corresponding commit data
:param authors_list: file name of the corresponding author data
:param emails_list: file name of the corresponding email data
:param bots_list: file name of the corresponding bot data
:param unify_copilot_users: whether to unify known copilot users into a single user
"""
github_user = "GitHub"
github_email = "noreply@github.com"
Expand All @@ -119,7 +132,7 @@ def is_github_noreply_author(name, email):


# Check for all files in the result directory of the project whether they need to be adjusted
for filepath, dirnames, filenames in walk(data_path):
for filepath, _, filenames in walk(data_path):

# (1) Remove author 'GitHub <noreply@github.com>' from authors list
if authors_list in filenames:
Expand Down Expand Up @@ -148,7 +161,7 @@ def is_github_noreply_author(name, email):
if not is_github_noreply_author(email[0], email[1]):
email_data_new.append(email)
else:
log.warn("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1])
log.warning("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1])
csv_writer.write_to_csv(f, email_data_new)


Expand Down Expand Up @@ -178,39 +191,43 @@ def is_github_noreply_author(name, email):
commit_data_file = path.join(data_path, commits_list)
commit_data = csv_writer.read_from_csv(commit_data_file)
commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data}

author_name_to_data = {author[1]: author[1:3] for author in author_data_new}
issue_data_new = []

for event in issue_data:
# replace author if necessary
if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event:
# extract commit hash from event info 1
commit_hash = event[12]

name = event[13][1:-1]
# extract commit author from commit data, if available
if commit_hash in commit_hash_to_author:
event[9] = commit_hash_to_author[commit_hash][0]
event[10] = commit_hash_to_author[commit_hash][1]
issue_data_new.append(event)
elif name in author_name_to_data:
event[9] = author_name_to_data[name][0]
event[10] = author_name_to_data[name][1]
issue_data_new.append(event)
else:
# the added commit is not part of the commit data. In most cases, this is due to merge commits
# appearing in another pull request, as Codeface does not keep track of merge commits. As we
# ignore merge commits in the commit data, we consistently ignore them also if they are added
# to a pull request. Hence, the corresponding "commit_added" event will be removed now (i.e.,
# not added to the new issue data any more).
log.warn("Commit %s is added in the GitHub issue data, but not part of the commit data. " +
"Remove the corresponding 'commit_added' event from the issue data...", commit_hash)
log.warning("Commit %s is added in the GitHub issue data, but not part of the commit data. " +
"Remove the corresponding 'commit_added' event from the issue data...", commit_hash)
elif is_github_noreply_author(event[9], event[10]):
# the event is authored by 'GitHub <noreply@github.com>', but is not a "commit_added" event, so we
# neglect this event and remove it now (i.e., not add it to the new issue data any more).
log.warn("Event %s is authored by %s <%s>. Remove this event form the issue data...",
event[8], event[9], event[10])
log.warning("Event %s is authored by %s <%s>. Remove this event form the issue data...",
event[8], event[9], event[10])
elif (is_github_noreply_author(event[12], event[13][1:-1])
and (event[8] == mentioned_event or event[8] == subscribed_event)):
# the event references 'GitHub <noreply@github.com>', so we neglect this event and remove it now
# (i.e., not add it to the new issue data any more).
log.warn("Event %s by %s <%s> references %s <%s>. Remove this event from the issue data...",
event[8], event[9], event[10], event[12], event[13])
log.warning("Event %s by %s <%s> references %s <%s>. Remove this event from the issue data...",
event[8], event[9], event[10], event[12], event[13])
else:
issue_data_new.append(event)

Expand All @@ -229,7 +246,7 @@ def is_github_noreply_author(name, email):
if not is_github_noreply_author(entry[0], entry[1]):
bot_data_new.append(entry)
else:
log.warn("Remove entry %s <%s> from bots list.", entry[0], entry[1])
log.warning("Remove entry %s <%s> from bots list.", entry[0], entry[1])

csv_writer.write_to_csv(f, bot_data_new)

Expand Down Expand Up @@ -285,7 +302,7 @@ def run_postprocessing(conf, resdir, backup_data):
return

# Check for all files in the result directory of the project whether they need to be adjusted
for filepath, dirnames, filenames in walk(data_path):
for filepath, _, filenames in walk(data_path):

# (1) Adjust authors lists
if authors_list in filenames:
Expand All @@ -302,7 +319,7 @@ def run_postprocessing(conf, resdir, backup_data):

for author in author_data:
# keep author entry only if it should not be removed
if not author in author_data_to_remove:
if author not in author_data_to_remove:
author_data_new.append(author)
csv_writer.write_to_csv(f, author_data_new)

Expand Down Expand Up @@ -469,7 +486,7 @@ def run():
# process arguments
# - First make all the args absolute
__resdir = abspath(args.resdir)
__codeface_conf, __project_conf = map(abspath, (args.config, args.project))
__codeface_conf, __project_conf = list(map(abspath, (args.config, args.project)))
__backup_data = args.backup

# load configuration
Expand Down
24 changes: 12 additions & 12 deletions bot_processing/bot_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,23 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Copyright 2021-2022 by Thomas Bock <bockthom@cs.uni-saarland.de>
# Copyright 2025 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
# All Rights Reserved.
"""
This file is able to extract information on bot/human users from csv files.
"""

import argparse
import httplib
import os
import sys
import urllib

import operator
from codeface.cli import log
from codeface.configuration import Configuration
from logging import getLogger

from codeface_utils.configuration import Configuration
from csv_writer import csv_writer


log = getLogger(__name__)

def run():
# get all needed paths and arguments for the method call.
parser = argparse.ArgumentParser(prog='codeface-extraction-bots-github', description='Codeface extraction')
Expand All @@ -39,7 +39,7 @@ def run():

# parse arguments
args = parser.parse_args(sys.argv[1:])
__codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project))
__codeface_conf, __project_conf = list(map(os.path.abspath, (args.config, args.project)))

# create configuration
__conf = Configuration.load(__codeface_conf, __project_conf)
Expand Down Expand Up @@ -75,7 +75,7 @@ def load_bot_data(bot_file, header = True):
:return: the read bot data
"""

log.devinfo("Read bot data from file '{}'...".format(bot_file))
log.info("Read bot data from file '{}'...".format(bot_file))

# check if file exists and exit early if not
if not os.path.exists(bot_file):
Expand All @@ -99,7 +99,7 @@ def load_user_data(user_data_file):
:return: the read user data
"""

log.devinfo("Read user data from file '{}'...".format(user_data_file))
log.info("Read user data from file '{}'...".format(user_data_file))

# check if file exists and exit early if not
if not os.path.exists(user_data_file):
Expand Down Expand Up @@ -192,12 +192,12 @@ def add_user_data(bot_data, user_data, known_bots_file):
continue

# get user information if available
if user[0] in user_buffer.keys():
if user[0] in list(user_buffer.keys()):
bot_reduced["user"] = user_buffer[user[0]]
bot_reduced["prediction"] = user[-1]
bot_data_reduced.append(bot_reduced)
else:
log.warn("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0]))
log.warning("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0]))

# check whether known GitHub bots occur in the GitHub issue data and, if so, update the bot data accordingly
bot_data_reduced = check_with_known_bot_list(known_bots_file, bot_data, user_buffer, bot_data_reduced)
Expand All @@ -224,7 +224,7 @@ def print_to_disk(bot_data, results_folder):
user["user"]["email"],
user["prediction"]
)
if not entry in lines:
if entry not in lines:
lines.append(entry)

# write to output file
Expand Down
14 changes: 8 additions & 6 deletions codeface_extraction/codeface_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,24 @@
# Copyright 2015-2017 by Claus Hunsen <hunsen@fim.uni-passau.de>
# Copyright 2016, 2018-2019 by Thomas Bock <bockthom@fim.uni-passau.de>
# Copyright 2018 by Barbara Eckl <ecklbarb@fim.uni-passau.de>
# Copyright 2025 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
# All Rights Reserved.
"""
This file is able to extract developer--artifact relations from the Codeface database.
"""

import argparse
import sys
from logging import getLogger
from os.path import abspath

from codeface.cli import log
from codeface.configuration import Configuration
from codeface.dbmanager import DBManager

import extractions
from . import extractions
from csv_writer import csv_writer
from codeface_utils.dbmanager import DBManager
from codeface_utils.configuration import Configuration

# create logger
log = getLogger(__name__)

##
# RUN FOR ALL PROJECTS
Expand Down Expand Up @@ -119,7 +121,7 @@ def run():
# process arguments
# - First make all the args absolute
__resdir = abspath(args.resdir)
__codeface_conf, __project_conf = map(abspath, (args.config, args.project))
__codeface_conf, __project_conf = list(map(abspath, (args.config, args.project)))
__extract_commit_messages = args.commit_messages
__extract_impl = args.implementation
__extract_on_range_level = args.range
Expand Down
Loading