21 changes: 15 additions & 6 deletions .github/workflows/article-check-claude.yml
@@ -24,22 +24,31 @@ jobs:
 
   fact-check:
     runs-on: ubuntu-latest
-    name: "Checks a new article from a PR, conducting fact-checking, spell-checking, and ensuring compliance with the submission guidelines"
+    name: "Checks articles from a PR: fact-checking, metadata validation, style compliance"
     needs: [ permission-check-job ]
     if: needs.permission-check-job.outputs.permission == 'true'
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
-      - name: Cache Python
-        uses: actions/cache@v3
+      - name: Set up Python
+        uses: actions/setup-python@v5
         with:
-          path: ~/.cache/pypoetry
+          python-version: '3.11'
 
+      - name: Cache Poetry dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/pypoetry
+            ~/.cache/pip
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-poetry-
+
       - name: Install package
         run: pipx install poetry && poetry install --no-interaction
 
-      - name: Run script
+      - name: Run article checker
         run: |
           poetry run article-check-claude \
             --pull-url "${{ github.event.issue.pull_request.url }}" \
208 changes: 192 additions & 16 deletions tools/article_checker/article_checker_claude.py
@@ -1,13 +1,22 @@
#!/usr/bin/env python

"""
-The bot checks if a new article complies with all requirements
+The bot checks if a new article complies with all requirements.

Improvements over the original implementation:
- Multi-file PR support: processes all article files in a PR, not just the first one
- Article path filtering: only checks files matching the cyberattacks incidents directory
- Structured validation pipeline: separates metadata, content, and fact-checking stages
- Enhanced error handling with graceful degradation
- Configurable search parameters
- Detailed per-file and aggregate reporting
"""

import argparse
import os
import sys
import json
import re
from github import Github
from tools.python_modules.git import get_pull_request, get_diff_by_url, parse_diff
from tools.python_modules.utils import logging_decorator
@@ -36,16 +45,106 @@ def parse_cli_args():
return parser.parse_args()


-def api_call(query, client, model, max_tokens, temperature):
+def is_article_file(file_header: str, article_path_pattern: str) -> bool:
"""
Determine if a diff file entry is an article in the cyberattacks incidents directory.
Checks the file path in the diff header against the configured article path pattern.
"""
# Extract file path from diff header (format: "a/path b/path\n...")
lines = file_header.strip().split('\n')
for line in lines:
if line.startswith('---') or line.startswith('+++'):
            path = line.split(' ', 1)[-1]
            # Strip the "a/" or "b/" prefix explicitly; lstrip('ab/') would
            # eat leading path characters, not just the two-character prefix.
            path = path[2:] if path.startswith(('a/', 'b/')) else path
if article_path_pattern in path and path.endswith('.md'):
return True
    # Fallback: look for the pattern anywhere in the header (e.g. the "diff --git" line)
if article_path_pattern in file_header and '.md' in file_header:
return True
return False


def extract_filename(file_header: str) -> str:
"""
Extract the filename from a diff file header.
"""
lines = file_header.strip().split('\n')
for line in lines:
if line.startswith('+++'):
            path = line.split(' ', 1)[-1]
            # Drop the "b/" prefix explicitly; lstrip('b/') strips characters, not a prefix
            path = path[2:] if path.startswith('b/') else path
return os.path.basename(path)
    # Fallback: grab any .md filename appearing anywhere in the header
match = re.search(r'([^\s/]+\.md)', file_header)
return match.group(1) if match else "unknown.md"
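
As a quick sanity check, the two helpers behave like this on a hypothetical header in the shape parse_diff produces (the path and filename below are invented for illustration, not taken from this PR):

# Illustrative only: a made-up diff header, not a file from this PR.
header = (
    "--- a/content/research/cyberattacks/incidents/2024-01-15-ExampleCorp.md\n"
    "+++ b/content/research/cyberattacks/incidents/2024-01-15-ExampleCorp.md\n"
)
assert is_article_file(header, "content/research/cyberattacks/incidents/")
assert extract_filename(header) == "2024-01-15-ExampleCorp.md"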


def pre_validate_metadata(text: str, config: dict) -> list[str]:
"""
Perform quick structural validation on article metadata before sending to LLM.
Returns a list of issues found. This saves API calls for obviously malformed submissions.
"""
issues = []

    # Check for frontmatter delimiters (MULTILINE so the match is not anchored
    # to the very start of the reconstructed diff text)
    frontmatter_match = re.search(r'^---\s*\n(.*?)\n---', text, re.DOTALL | re.MULTILINE)
if not frontmatter_match:
issues.append("Missing YAML frontmatter (content between `---` delimiters)")
return issues

frontmatter = frontmatter_match.group(1)
valid_headers = config.get("VALID_METADATA_HEADERS", [])

# Check each required metadata header
for header in valid_headers:
pattern = rf'^{re.escape(header)}\s*:'
if not re.search(pattern, frontmatter, re.MULTILINE):
issues.append(f"Missing required metadata header: `{header}`")

# Check for required section headers
valid_sections = config.get("VALID_SECTION_HEADERS", [])
for section in valid_sections:
if section not in text:
issues.append(f"Missing required section: `{section}`")

# Validate filename pattern (YYYY-MM-DD-Entity.md)
# This is checked later by the LLM, but we can flag obvious issues
date_match = re.search(r'^date:\s*(\d{4}-\d{2}-\d{2})', frontmatter, re.MULTILINE)
if date_match:
date_val = date_match.group(1)
# Validate date format
try:
from datetime import datetime
datetime.strptime(date_val, '%Y-%m-%d')
except ValueError:
issues.append(f"Invalid date format in metadata: `{date_val}` (expected YYYY-MM-DD)")

# Check loss value is numeric
loss_match = re.search(r'^loss:\s*(.+)$', frontmatter, re.MULTILINE)
if loss_match:
loss_val = loss_match.group(1).strip()
# Remove quotes if present
loss_val = loss_val.strip('"').strip("'")
if not re.match(r'^[\d.]+$', loss_val):
issues.append(f"Loss value `{loss_val}` should be a numeric value (no currency symbols)")

    # Validate attack-types against known types (assumes config may provide a
    # VALID_ATTACK_TYPES list; the check is skipped when it is absent)
    attack_type_match = re.search(r'^attack-types:\s*\n((?:\s+-\s+.+\n?)*)', frontmatter, re.MULTILINE)
    if not attack_type_match:
        # Try single-line format
        attack_type_match = re.search(r'^attack-types:\s*(.+)$', frontmatter, re.MULTILINE)
    known_types = config.get("VALID_ATTACK_TYPES", [])
    if attack_type_match and known_types:
        listed = [item.strip().lstrip('-').strip() for item in attack_type_match.group(1).splitlines()]
        for attack_type in listed:
            if attack_type and attack_type not in known_types:
                issues.append(f"Unknown attack type: `{attack_type}`")

    return issues
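
For reference, a frontmatter block that passes these structural checks could look like the following; the exact required headers come from VALID_METADATA_HEADERS in config.json, and the values here are invented:

---
date: 2024-01-15
loss: 3500000
attack-types:
  - Phishing
---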


def api_call(query, client, model, max_tokens, temperature, n_search_results, max_searches):
"""
Make an API call and return the response.
"""
try:
return client.completion_with_retrieval(
query=query,
model=model,
-            n_search_results_to_use=1,
-            max_searches_to_try=5,
+            n_search_results_to_use=n_search_results,
+            max_searches_to_try=max_searches,
max_tokens=max_tokens,
temperature=temperature
)
@@ -54,13 +153,34 @@ def api_call(query, client, model, max_tokens, temperature):
return None


def format_comment(filename: str, pre_validation_issues: list, llm_answer: str) -> str:
"""
Format a structured comment for a single article file.
"""
comment = f"### πŸ“„ `{filename}`\n\n"

if pre_validation_issues:
comment += "#### ⚠️ Pre-validation Issues\n\n"
comment += "The following structural issues were detected before fact-checking:\n\n"
for issue in pre_validation_issues:
comment += f"- {issue}\n"
comment += "\n"

if llm_answer:
comment += llm_answer + "\n\n"
elif pre_validation_issues:
comment += "> ℹ️ Fact-checking was still performed despite pre-validation issues.\n\n"

return comment
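
Called with one structural issue and a stubbed LLM verdict, the helper would be used like this (all strings invented for illustration):

print(format_comment(
    "2024-01-15-ExampleCorp.md",
    ["Missing required metadata header: `loss`"],
    "All factual claims were verified against search results.",
))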


@logging_decorator("Comment on PR")
def create_comment_on_pr(pull_request, answer):
"""
Create and post a comment on a Github pull request.
"""
try:
        comment = answer
print(comment)
# only post comment if running on Github Actions
if os.environ.get("GITHUB_ACTIONS") == "true":
@@ -74,28 +194,84 @@ def main():
with open('tools/article_checker/config.json', 'r') as config_file:
config = json.load(config_file)

-    search_tool = BraveSearchTool(brave_api_key=args.SEARCH_API_KEY, summarize_with_claude=True,
-                                  anthropic_api_key=args.API_key)
+    search_tool = BraveSearchTool(
+        brave_api_key=args.SEARCH_API_KEY,
+        summarize_with_claude=True,
+        anthropic_api_key=args.API_key
+    )

model = config['ANTHROPIC_SEARCH_MODEL']
max_tokens = config['ANTHROPIC_SEARCH_MAX_TOKENS']
temperature = config['ANTHROPIC_SEARCH_TEMPERATURE']
n_search_results = config.get('N_SEARCH_RESULTS_TO_USE', 3)
max_searches = config.get('MAX_SEARCHES_TO_TRY', 8)
article_path_pattern = config.get('ARTICLE_PATH_PATTERN', 'content/research/cyberattacks/incidents/')
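
A config.json consistent with the keys read above might look like this sketch; every value, including the model name, is an illustrative assumption rather than the repository's actual config:

{
    "ANTHROPIC_SEARCH_MODEL": "claude-2.1",
    "ANTHROPIC_SEARCH_MAX_TOKENS": 1500,
    "ANTHROPIC_SEARCH_TEMPERATURE": 0.0,
    "N_SEARCH_RESULTS_TO_USE": 3,
    "MAX_SEARCHES_TO_TRY": 8,
    "ARTICLE_PATH_PATTERN": "content/research/cyberattacks/incidents/",
    "VALID_METADATA_HEADERS": ["date", "loss", "attack-types"],
    "VALID_SECTION_HEADERS": ["## Summary"],
    "VALID_ATTACK_TYPES": ["Phishing", "Ransomware"]
}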

-    client = tools.article_checker.claude_retriever.ClientWithRetrieval(api_key=args.API_key, search_tool=search_tool)
+    client = tools.article_checker.claude_retriever.ClientWithRetrieval(
+        api_key=args.API_key,
+        search_tool=search_tool
+    )

github = Github(args.github_token)
pr = get_pull_request(github, args.pull_url)
_diff = get_diff_by_url(pr)
diff = parse_diff(_diff)

     print('-' * 50)
-    print(diff)
+    print(f"Found {len(diff)} file(s) in PR diff")
     print('-' * 50)
 
-    text = remove_plus(diff[0]['header'] + diff[0]['body'][0]['body'])
-    answer = api_call(text, client, model, max_tokens, temperature)
-    print('-' * 50)
-    print('This is an answer', answer)
-    print('-' * 50)
# Process each article file in the PR
article_comments = []
non_article_files = []

for file_diff in diff:
file_header = file_diff['header']
filename = extract_filename(file_header)

# Filter: only process article markdown files
if not is_article_file(file_header, article_path_pattern):
non_article_files.append(filename)
print(f"Skipping non-article file: {filename}")
continue

print(f"\n{'='*50}")
print(f"Processing article: {filename}")
print(f"{'='*50}")

# Reconstruct the article text from diff
body_parts = []
for segment in file_diff['body']:
body_parts.append(segment.get('body', ''))
text = remove_plus(file_header + ''.join(body_parts))

# Pre-validate metadata structure
pre_issues = pre_validate_metadata(text, config)
if pre_issues:
print(f"Pre-validation issues for {filename}: {pre_issues}")

# Run LLM-based fact-checking and quality analysis
answer = api_call(text, client, model, max_tokens, temperature, n_search_results, max_searches)

print('-' * 50)
print(f'Answer for {filename}:', answer)
print('-' * 50)

article_comments.append(format_comment(filename, pre_issues, answer))

# Build the final PR comment
if not article_comments:
final_comment = "## πŸ€– Article QA Bot\n\n"
final_comment += "No article files found in this PR matching the expected path "
final_comment += f"(`{article_path_pattern}*.md`).\n\n"
if non_article_files:
final_comment += f"Non-article files detected: {', '.join(f'`{f}`' for f in non_article_files)}\n"
else:
final_comment = "## πŸ€– Article QA Bot Report\n\n"
final_comment += f"Checked **{len(article_comments)}** article(s) "
if non_article_files:
final_comment += f"({len(non_article_files)} non-article file(s) skipped)"
final_comment += "\n\n---\n\n"
final_comment += "\n---\n\n".join(article_comments)

-    create_comment_on_pr(pr, answer)
+    create_comment_on_pr(pr, final_comment)
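
End to end, the posted comment then has this shape (one section per checked article; filenames and counts invented):

## πŸ€– Article QA Bot Report

Checked **2** article(s) (1 non-article file(s) skipped)

---

### πŸ“„ `2024-01-15-ExampleCorp.md`
...

---

### πŸ“„ `2024-02-02-OtherOrg.md`
...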