From be84378d0560df8fc6b5a429f11a7637e3215747 Mon Sep 17 00:00:00 2001 From: lalalaurentiu Date: Fri, 14 Nov 2025 07:10:23 +0200 Subject: [PATCH 1/4] Update Vodafone scraper to use new API endpoint and correct data extraction logic --- sites/vodafone.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sites/vodafone.py b/sites/vodafone.py index 726eef0..88475b8 100644 --- a/sites/vodafone.py +++ b/sites/vodafone.py @@ -5,7 +5,7 @@ _counties = GetCounty() -url = "https://jobs.vodafone.com/api/apply/v2/jobs?domain=vodafone.com&start=10&num=10&exclude_pid=563018680721259&location=Romania&pid=563018680721259&domain=vodafone.com&sort_by=relevance" +url = "https://jobs.vodafone.com/api/pcsx/search?domain=vodafone.com&query=&location=Romania&start=0&" company = "Vodafone" jobs = [] @@ -13,13 +13,13 @@ scraper = Scraper(url) scraper.get_from_url(url, "JSON") -total_jobs = scraper.markup["count"] +total_jobs = scraper.markup["data"]["count"] step = 10 pages = ceil(total_jobs / step) for page in range(0, pages): - for job in scraper.markup["positions"]: + for job in scraper.markup["data"]["positions"]: locations = job["location"].split(",") country = locations[-1].strip() consol = locations[0].strip() @@ -32,7 +32,7 @@ jobs.append( create_job( job_title=job["name"], - job_link=job["canonicalPositionUrl"], + job_link="https://jobs.vodafone.com" + job["positionUrl"], city=city, country="Romania", company=company, @@ -40,7 +40,7 @@ ) ) - url = f"https://jobs.vodafone.com/api/apply/v2/jobs?domain=vodafone.com&start={page * step}&num={step}&exclude_pid=563018680721259&location=Romania&pid=563018680721259&domain=vodafone.com&sort_by=relevance" + url = f"https://jobs.vodafone.com/api/pcsx/search?domain=vodafone.com&query=&location=Romania&start={ (page + 1) * step }&" scraper.get_from_url(url, "JSON") publish_or_update(jobs) From 94fffc094c6c99fedfdbc5828643a052d89002c7 Mon Sep 17 00:00:00 2001 From: lalalaurentiu Date: Mon, 17 Nov 2025 20:25:44 +0200 
Subject: [PATCH 2/4] Add verify=False to URL requests in SiemensEnergy scraper to skip SSL certificate verification --- sites/siemensenergy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sites/siemensenergy.py b/sites/siemensenergy.py index a41f475..be1d0b3 100644 --- a/sites/siemensenergy.py +++ b/sites/siemensenergy.py @@ -10,7 +10,7 @@ finalJobs = list() scraper = Scraper() -scraper.get_from_url(url) +scraper.get_from_url(url, verify=False) totalJobs = int( scraper.find("div", {"class": "list-controls__text__legend"}) @@ -26,7 +26,7 @@ "https://jobs.siemens-energy.com/en_US/jobs/Jobs/?29454=964547&29454_format=11381&listFilterMode=1&folderRecordsPerPage=20&folderOffset=" + str(page * 20) ) -scraper.get_from_url(url) +scraper.get_from_url(url, verify=False) jobs = scraper.find_all("details", {"class": "article--result--container"}) From f760cadbcb7d0ec967a3e7426380575bb64fea44 Mon Sep 17 00:00:00 2001 From: lalalaurentiu Date: Tue, 18 Nov 2025 20:11:41 +0200 Subject: [PATCH 3/4] Refactor Arcadis scraper to improve job data extraction and update API endpoint --- sites/arcadis.py | 64 ++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 43 deletions(-) diff --git a/sites/arcadis.py b/sites/arcadis.py index 745e0ca..7e5de7c 100644 --- a/sites/arcadis.py +++ b/sites/arcadis.py @@ -4,69 +4,47 @@ publish_logo, create_job, show_jobs, - translate_city, - acurate_city_and_county, get_jobtype ) from getCounty import GetCounty from math import ceil -_counties = GetCounty() -url = "https://jobs.arcadis.com/api/apply/v2/jobs?domain=arcadis.com&location=Romania&domain=arcadis.com&sort_by=relevance" +start = 0 +url = f"https://jobs.arcadis.com/api/pcsx/search?domain=arcadis.com&query=&location=Romania&start={start}&sort_by=distance&filter_include_remote=1" + company = "Arcadis" scraper = Scraper() -headers = { - "Content-Type": "application/json", -} -scraper.set_headers(headers) - -exclude_city = acurate_city_and_county( - 
Iasi={"city": "Iasi", "county": "Iasi"}, Moldavia={"city": "Iasi", "county": "Iasi"} -) scraper.get_from_url(url, type="JSON") -pages = ceil(scraper.markup.get("count") / 10) +pages = ceil(scraper.markup.get("data").get("count") / 10) jobs = list() for page in range(1, pages + 1): - jobs_objects = scraper.markup.get("positions") + jobs_objects = scraper.markup.get("data").get("positions") for job in jobs_objects: job_title = job.get("name") - job_link = job.get("canonicalPositionUrl") + job_link = "https://jobs.arcadis.com" + job.get("positionUrl") country = "Romania" - remote = get_jobtype(job.get("work_location_option")) - - cities = [] - cities.extend( - [ - translate_city(location.split(",")[0].strip()) - for location in job.get("locations") - if location.split(",")[-1].strip() == "Romania" - ] + remote = get_jobtype(job.get("workLocationOption", "")) + + job = create_job( + job_title=job_title, + job_link=job_link, + company=company, + country=country, + remote=remote, ) - counties = [] - for city in cities: - if exclude_city.get(city): - counties.append(exclude_city.get(city).get("county")) - else: - counties.extend(_counties.get_county(city) if _counties.get_county(city) else []) - jobs.append( - create_job( - job_title=job_title, - job_link=job_link, - company=company, - country=country, - city=cities, - county=counties, - remote=remote, - ) - ) + jobs.append(job) + start = page * 10 - scrper = Scraper() - scraper.get_from_url(url + f"&start={start}&num=10", type="JSON") + scraper = Scraper() + scraper.get_from_url( + f"https://jobs.arcadis.com/api/pcsx/search?domain=arcadis.com&query=&location=Romania&start={start}&sort_by=distance&filter_include_remote=1", + type="JSON", + ) publish_or_update(jobs) publish_logo( From 38eabcc6f760f9d263bca2b6e0a7f0e68a738272 Mon Sep 17 00:00:00 2001 From: lalalaurentiu Date: Wed, 19 Nov 2025 17:57:42 +0200 Subject: [PATCH 4/4] Refactor Danone scraper to enhance job data extraction and update URL handling --- 
sites/danone.py | 56 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/sites/danone.py b/sites/danone.py index c14481e..aea6609 100644 --- a/sites/danone.py +++ b/sites/danone.py @@ -1,30 +1,50 @@ from scraper.Scraper import Scraper -from utils import publish_or_update, publish_logo, show_jobs, translate_city +from utils import publish_or_update, publish_logo, show_jobs, translate_city, get_jobtype from getCounty import GetCounty _counties = GetCounty() -url = "https://careers.danone.com/bin/jobs.json?countries=Romania&locale=en&limit=100" +url = "https://careers.danone.com/content/corp/global/career-rebrand/global/en/jobs.results.html?10000_group.propertyvalues.property=jcr%3Acontent%2Fdata%2Fmaster%2Fcountry&10000_group.propertyvalues.operation=equals&10000_group.propertyvalues.66_values=Romania&10004_group.propertyvalues.property=jcr%3Acontent%2Fdata%2Fmaster%2Fcountry&10004_group.propertyvalues.operation=equals&10004_group.propertyvalues.66_values=Romania&layout=teaserList&p.offset=0&p.limit=12" company = {"company": "Danone"} scraper = Scraper() -scraper.get_from_url(url, "JSON") - -jobs = scraper.markup.get("results") - -finalJobs = [ - { - "job_title": job.get("title"), - "job_link": "https://careers.danone.com/en-global/jobs/" + job.get("url"), - "company": company.get("company"), - "country": "Romania", - "city": translate_city(job.get("city").title()), - "county": _counties.get_county(translate_city(job.get("city").title())), - "remote": job.get("workFromHome").replace("Field", "") - } - for job in jobs -] + +scraper.get_from_url(url) + + +jobs = scraper.find_all("div", {"class": "dn-jobdetails__job-card"}) + + +finalJobs = [] + +while jobs: + for job in jobs: + job_title = job.find("h3", {"class": "job-card__title"}).text.strip() + job_url = "https://careers.danone.com" + job.find("a")["href"].strip() + location = job.find("h4", {"class": "job-card__city"}).text.strip() + city = 
translate_city(location.split(",")[0].strip()) + county = _counties.get_county(city) + remote = get_jobtype( + job.find("h4", {"class": "job-card__workFromHome"}).text.strip() + ) + + finalJobs.append( + { + "job_title": job_title, + "job_link": job_url, + "city": city, + "country": "Romania", + "county": county, + "company": company.get("company"), + "remote": remote, + } + ) + scraper.get_from_url( + f"https://careers.danone.com/content/corp/global/career-rebrand/global/en/jobs.results.html?10000_group.propertyvalues.property=jcr%3Acontent%2Fdata%2Fmaster%2Fcountry&10000_group.propertyvalues.operation=equals&10000_group.propertyvalues.66_values=Romania&10004_group.propertyvalues.property=jcr%3Acontent%2Fdata%2Fmaster%2Fcountry&10004_group.propertyvalues.operation=equals&10004_group.propertyvalues.66_values=Romania&layout=teaserList&p.offset={len(finalJobs)}&p.limit=12" + ) + jobs = scraper.find_all("div", {"class": "dn-jobdetails__job-card"}) + publish_or_update(finalJobs)