From 2b8162e43ce368b43bc752d01d9809f934272827 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 30 Sep 2025 21:24:51 +0000 Subject: [PATCH 1/7] Initial plan From a4c49754fd1412314d81e2922980c63bd22e31e8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 30 Sep 2025 21:28:45 +0000 Subject: [PATCH 2/7] Update validation logic in __test__/runTest.py with new key rules and remote field validation Co-authored-by: lalalaurentiu <67306273+lalalaurentiu@users.noreply.github.com> --- __test__/runTest.py | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/__test__/runTest.py b/__test__/runTest.py index 3af98a2..e8b5588 100644 --- a/__test__/runTest.py +++ b/__test__/runTest.py @@ -19,15 +19,45 @@ scraper_obj = json.loads(matches[0]) - keys = ['job_title', 'job_link', 'city', 'county','country', 'company'] + # Define required and optional keys + required_keys = ['company', 'job_title', 'job_link'] + optional_keys = ['city', 'county', 'remote'] + allowed_keys = required_keys + optional_keys + + # Define allowed remote values + allowed_remote_values = ['remote', 'on-site', 'hybrid'] for job in scraper_obj: + # Check that all required keys are present + for req_key in required_keys: + if req_key not in job: + raise Exception(f"Required key '{req_key}' is missing! \n {job}") + + # Check each key in the job for key, value in job.items(): - if key not in keys: - raise Exception(f"Key {key} is not allowed!") + # Reject keys that are not in the allowed list + if key not in allowed_keys: + raise Exception(f"Key '{key}' is not allowed! Allowed keys are: {', '.join(allowed_keys)}") - if value == None: - raise Exception(f"Key {key} has no value! \n {job}") + # Check that required keys have non-None values + if key in required_keys and value == None: + raise Exception(f"Required key '{key}' has no value! \n {job}") + + # Validate remote field format + if key == 'remote': + # Remote must be a list + if not isinstance(value, list): + raise Exception(f"Key 'remote' must be a list, got {type(value).__name__}! \n {job}") + + # Check each value in the remote list + for remote_val in value: + # Must be lowercase + if remote_val != remote_val.lower(): + raise Exception(f"Remote value '{remote_val}' must be lowercase! \n {job}") + + # Must be in allowed values + if remote_val not in allowed_remote_values: + raise Exception(f"Remote value '{remote_val}' is not allowed! Allowed values are: {', '.join(allowed_remote_values)} \n {job}") print(f'✅ {file}') else: From e5eda94ded445774b31c8668e63db5437479a1e0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 30 Sep 2025 21:29:13 +0000 Subject: [PATCH 3/7] Add comprehensive tests for scraper validation logic Co-authored-by: lalalaurentiu <67306273+lalalaurentiu@users.noreply.github.com> --- __test__/test_validation.py | 318 ++++++++++++++++++++++++++++++++++++ 1 file changed, 318 insertions(+) create mode 100755 __test__/test_validation.py diff --git a/__test__/test_validation.py b/__test__/test_validation.py new file mode 100755 index 0000000..7ad8096 --- /dev/null +++ b/__test__/test_validation.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +""" +Test script to validate the scraper validation logic. +This tests various valid and invalid scraper outputs. +""" + +import json +import sys +import os + +# Add the parent directory to the path to import from __test__ +sys.path.insert(0, '/home/runner/work/based_scraper_py/based_scraper_py') + +# Import the validation logic (we'll test it directly) +required_keys = ['company', 'job_title', 'job_link'] +optional_keys = ['city', 'county', 'remote'] +allowed_keys = required_keys + optional_keys +allowed_remote_values = ['remote', 'on-site', 'hybrid'] + +def validate_job(job): + """Validate a single job object according to the new rules""" + # Check that all required keys are present + for req_key in required_keys: + if req_key not in job: + raise Exception(f"Required key '{req_key}' is missing! \n {job}") + + # Check each key in the job + for key, value in job.items(): + # Reject keys that are not in the allowed list + if key not in allowed_keys: + raise Exception(f"Key '{key}' is not allowed! Allowed keys are: {', '.join(allowed_keys)}") + + # Check that required keys have non-None values + if key in required_keys and value == None: + raise Exception(f"Required key '{key}' has no value! \n {job}") + + # Validate remote field format + if key == 'remote': + # Remote must be a list + if not isinstance(value, list): + raise Exception(f"Key 'remote' must be a list, got {type(value).__name__}! \n {job}") + + # Check each value in the remote list + for remote_val in value: + # Must be lowercase + if remote_val != remote_val.lower(): + raise Exception(f"Remote value '{remote_val}' must be lowercase! \n {job}") + + # Must be in allowed values + if remote_val not in allowed_remote_values: + raise Exception(f"Remote value '{remote_val}' is not allowed! Allowed values are: {', '.join(allowed_remote_values)} \n {job}") + +def test_valid_cases(): + """Test valid scraper outputs""" + print("Testing valid cases...") + + # Test 1: Minimal valid job (only required fields) + job1 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job1" + } + try: + validate_job(job1) + print("✅ Test 1 passed: Minimal valid job") + except Exception as e: + print(f"❌ Test 1 failed: {e}") + return False + + # Test 2: Job with city and county + job2 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job2", + "city": "București", + "county": "București" + } + try: + validate_job(job2) + print("✅ Test 2 passed: Job with city and county") + except Exception as e: + print(f"❌ Test 2 failed: {e}") + return False + + # Test 3: Job with remote field (single value) + job3 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job3", + "remote": ["remote"] + } + try: + validate_job(job3) + print("✅ Test 3 passed: Job with remote=['remote']") + except Exception as e: + print(f"❌ Test 3 failed: {e}") + return False + + # Test 4: Job with remote field (multiple values) + job4 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job4", + "remote": ["on-site", "remote"] + } + try: + validate_job(job4) + print("✅ Test 4 passed: Job with remote=['on-site', 'remote']") + except Exception as e: + print(f"❌ Test 4 failed: {e}") + return False + + # Test 5: Job with hybrid remote + job5 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job5", + "remote": ["hybrid"] + } + try: + validate_job(job5) + print("✅ Test 5 passed: Job with remote=['hybrid']") + except Exception as e: + print(f"❌ Test 5 failed: {e}") + return False + + # Test 6: Job with all allowed keys + job6 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job6", + "city": "Cluj-Napoca", + "county": "Cluj", + "remote": ["remote", "on-site", "hybrid"] + } + try: + validate_job(job6) + print("✅ Test 6 passed: Job with all allowed keys") + except Exception as e: + print(f"❌ Test 6 failed: {e}") + return False + + # Test 7: Job with empty remote list + job7 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job7", + "remote": [] + } + try: + validate_job(job7) + print("✅ Test 7 passed: Job with empty remote list") + except Exception as e: + print(f"❌ Test 7 failed: {e}") + return False + + return True + +def test_invalid_cases(): + """Test invalid scraper outputs that should fail""" + print("\nTesting invalid cases...") + + # Test 1: Missing required key (company) + job1 = { + "job_title": "Software Engineer", + "job_link": "https://example.com/job1" + } + try: + validate_job(job1) + print("❌ Test 1 failed: Should reject missing 'company' key") + return False + except Exception as e: + if "Required key 'company' is missing" in str(e): + print("✅ Test 1 passed: Correctly rejects missing required key") + else: + print(f"❌ Test 1 failed with wrong error: {e}") + return False + + # Test 2: Missing required key (job_title) + job2 = { + "company": "TestCompany", + "job_link": "https://example.com/job2" + } + try: + validate_job(job2) + print("❌ Test 2 failed: Should reject missing 'job_title' key") + return False + except Exception as e: + if "Required key 'job_title' is missing" in str(e): + print("✅ Test 2 passed: Correctly rejects missing 'job_title'") + else: + print(f"❌ Test 2 failed with wrong error: {e}") + return False + + # Test 3: Missing required key (job_link) + job3 = { + "company": "TestCompany", + "job_title": "Software Engineer" + } + try: + validate_job(job3) + print("❌ Test 3 failed: Should reject missing 'job_link' key") + return False + except Exception as e: + if "Required key 'job_link' is missing" in str(e): + print("✅ Test 3 passed: Correctly rejects missing 'job_link'") + else: + print(f"❌ Test 3 failed with wrong error: {e}") + return False + + # Test 4: Extra key not allowed (country) + job4 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job4", + "country": "Romania" + } + try: + validate_job(job4) + print("❌ Test 4 failed: Should reject 'country' key") + return False + except Exception as e: + if "Key 'country' is not allowed" in str(e): + print("✅ Test 4 passed: Correctly rejects 'country' key") + else: + print(f"❌ Test 4 failed with wrong error: {e}") + return False + + # Test 5: Remote as string instead of list + job5 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job5", + "remote": "remote" + } + try: + validate_job(job5) + print("❌ Test 5 failed: Should reject remote as string") + return False + except Exception as e: + if "must be a list" in str(e): + print("✅ Test 5 passed: Correctly rejects remote as string") + else: + print(f"❌ Test 5 failed with wrong error: {e}") + return False + + # Test 6: Remote with uppercase value + job6 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job6", + "remote": ["Remote"] + } + try: + validate_job(job6) + print("❌ Test 6 failed: Should reject uppercase remote value") + return False + except Exception as e: + if "must be lowercase" in str(e): + print("✅ Test 6 passed: Correctly rejects uppercase remote value") + else: + print(f"❌ Test 6 failed with wrong error: {e}") + return False + + # Test 7: Remote with invalid value + job7 = { + "company": "TestCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/job7", + "remote": ["invalid"] + } + try: + validate_job(job7) + print("❌ Test 7 failed: Should reject invalid remote value") + return False + except Exception as e: + if "is not allowed" in str(e) and "invalid" in str(e): + print("✅ Test 7 passed: Correctly rejects invalid remote value") + else: + print(f"❌ Test 7 failed with wrong error: {e}") + return False + + # Test 8: Required key with None value + job8 = { + "company": None, + "job_title": "Software Engineer", + "job_link": "https://example.com/job8" + } + try: + validate_job(job8) + print("❌ Test 8 failed: Should reject None value for required key") + return False + except Exception as e: + if "has no value" in str(e): + print("✅ Test 8 passed: Correctly rejects None value for required key") + else: + print(f"❌ Test 8 failed with wrong error: {e}") + return False + + return True + +if __name__ == "__main__": + print("=" * 60) + print("Testing Scraper Validation Logic") + print("=" * 60) + + valid_passed = test_valid_cases() + invalid_passed = test_invalid_cases() + + print("\n" + "=" * 60) + if valid_passed and invalid_passed: + print("✅ All tests passed!") + print("=" * 60) + sys.exit(0) + else: + print("❌ Some tests failed!") + print("=" * 60) + sys.exit(1) From 0f47bc8cf74fced52b837fd9b6c6889894c6bf21 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 30 Sep 2025 21:32:35 +0000 Subject: [PATCH 4/7] Fix regex pattern to support nested arrays in JSON output Co-authored-by: lalalaurentiu <67306273+lalalaurentiu@users.noreply.github.com> --- __test__/runTest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/__test__/runTest.py b/__test__/runTest.py index e8b5588..c5a8f51 100644 --- a/__test__/runTest.py +++ b/__test__/runTest.py @@ -14,7 +14,7 @@ if file.startswith('sites/'): run_file = subprocess.run(["python3", os.getcwd() + "/" + file], capture_output=True).stdout.decode('utf-8') - pattern = re.compile(r"(\[.*?\])", re.DOTALL) + pattern = re.compile(r"(\[.*\])", re.DOTALL) matches = pattern.findall(run_file) scraper_obj = json.loads(matches[0]) From b53a479738ad2fa10502b82ad0f6fc0e2c22dcf1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 30 Sep 2025 21:33:15 +0000 Subject: [PATCH 5/7] Add comprehensive documentation to validation test script Co-authored-by: lalalaurentiu <67306273+lalalaurentiu@users.noreply.github.com> --- __test__/runTest.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/__test__/runTest.py b/__test__/runTest.py index c5a8f51..907e6d6 100644 --- a/__test__/runTest.py +++ b/__test__/runTest.py @@ -1,3 +1,35 @@ +""" +Scraper Validation Test Script + +This script validates scrapers by checking their output against required rules. + +Validation Rules: +----------------- +1. Required keys (must be present with non-None values): + - company: The company name + - job_title: The job title + - job_link: URL to the job posting + +2. Optional keys (may be present): + - city: City location(s) + - county: County location(s) + - remote: Work arrangement options (must be a list) + +3. Remote field validation (when present): + - Must be a list (even if empty) + - All values must be lowercase + - Only allowed values: 'remote', 'on-site', 'hybrid' + - Examples: + * {"remote": ["remote"]} + * {"remote": ["on-site", "hybrid"]} + * {"remote": []} + +4. All other keys are rejected. + +The script runs all changed scraper files (in sites/ directory) and validates +their JSON output against these rules. +""" + import subprocess import json import re From 99d56f11e006fc1c0f7dd65507c076c0dc29d7cf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 30 Sep 2025 21:33:49 +0000 Subject: [PATCH 6/7] Add comprehensive README for test directory with validation rules and examples Co-authored-by: lalalaurentiu <67306273+lalalaurentiu@users.noreply.github.com> --- __test__/README.md | 135 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 __test__/README.md diff --git a/__test__/README.md b/__test__/README.md new file mode 100644 index 0000000..bee5d11 --- /dev/null +++ b/__test__/README.md @@ -0,0 +1,135 @@ +# Scraper Validation Tests + +This directory contains validation tests for the scraper system. + +## Files + +### runTest.py +Main validation script that checks scrapers against the required output format. +It runs all changed scraper files and validates their JSON output. + +**Validation Rules:** + +1. **Required keys** (must be present with non-None values): + - `company`: The company name + - `job_title`: The job title + - `job_link`: URL to the job posting + +2. **Optional keys** (may be present): + - `city`: City location(s) + - `county`: County location(s) + - `remote`: Work arrangement options (must be a list) + +3. **Remote field validation** (when present): + - Must be a list (even if empty) + - All values must be lowercase + - Only allowed values: `remote`, `on-site`, `hybrid` + - Valid examples: + ```json + {"remote": ["remote"]} + {"remote": ["on-site", "hybrid"]} + {"remote": ["remote", "on-site", "hybrid"]} + {"remote": []} + ``` + +4. **All other keys are rejected** - no additional fields are allowed. + +### test_validation.py +Unit tests for the validation logic. Tests both valid and invalid scraper outputs +to ensure the validation rules work correctly. + +Run with: +```bash +python3 __test__/test_validation.py +``` + +### publish.py +Script for publishing validated jobs to the API. + +## Running Tests + +To run the validation tests: + +```bash +# Run unit tests +python3 __test__/test_validation.py + +# Run validation on changed scrapers +python3 __test__/runTest.py +``` + +## Example Valid Scraper Output + +```json +[ + { + "company": "ExampleCompany", + "job_title": "Software Engineer", + "job_link": "https://example.com/jobs/123", + "city": "București", + "county": "București", + "remote": ["remote", "on-site"] + }, + { + "company": "ExampleCompany", + "job_title": "Data Scientist", + "job_link": "https://example.com/jobs/124" + } +] +``` + +## Common Validation Errors + +### ❌ Using 'country' field +```json +{ + "company": "Test", + "job_title": "Engineer", + "job_link": "https://example.com/job", + "country": "Romania" // NOT ALLOWED +} +``` +**Error:** `Key 'country' is not allowed! Allowed keys are: company, job_title, job_link, city, county, remote` + +### ❌ Remote as string +```json +{ + "company": "Test", + "job_title": "Engineer", + "job_link": "https://example.com/job", + "remote": "remote" // Should be ["remote"] +} +``` +**Error:** `Key 'remote' must be a list, got str!` + +### ❌ Uppercase remote value +```json +{ + "company": "Test", + "job_title": "Engineer", + "job_link": "https://example.com/job", + "remote": ["Remote"] // Should be lowercase +} +``` +**Error:** `Remote value 'Remote' must be lowercase!` + +### ❌ Invalid remote value +```json +{ + "company": "Test", + "job_title": "Engineer", + "job_link": "https://example.com/job", + "remote": ["work-from-home"] // Not in allowed list +} +``` +**Error:** `Remote value 'work-from-home' is not allowed! Allowed values are: remote, on-site, hybrid` + +### ❌ Missing required field +```json +{ + "job_title": "Engineer", + "job_link": "https://example.com/job" + // Missing 'company' +} +``` +**Error:** `Required key 'company' is missing!` From 25c1a7d10e5d108255c8f85c6a3f944d5253591f Mon Sep 17 00:00:00 2001 From: lalalaurentiu Date: Thu, 2 Oct 2025 07:11:23 +0300 Subject: [PATCH 7/7] Refactor file path handling in scraper validation test script --- __test__/runTest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/__test__/runTest.py b/__test__/runTest.py index 907e6d6..c8d8833 100644 --- a/__test__/runTest.py +++ b/__test__/runTest.py @@ -44,7 +44,10 @@ for file in files: print(f'Running {file} ...') if file.startswith('sites/'): - run_file = subprocess.run(["python3", os.getcwd() + "/" + file], capture_output=True).stdout.decode('utf-8') + directory = os.path.abspath(file).rsplit('/', 1)[0].replace('__test__/', '') + file_name = file.rsplit('/', 1)[1] + + run_file = subprocess.run(["python3", directory + "/" + file_name], capture_output=True).stdout.decode('utf-8') pattern = re.compile(r"(\[.*\])", re.DOTALL) matches = pattern.findall(run_file)