Commit 47c90eb

enh: scrape connections
1 parent 0148cfd commit 47c90eb

File tree: 5 files changed, +136 -15 lines changed

README.md

Lines changed: 18 additions & 0 deletions
@@ -9,6 +9,7 @@ _why pay $100/mo for LSN when you could do it for free and get a nice csv to go
 - Scrapes staff from a company on **LinkedIn**
 - Obtains skills, experiences, certifications & more
 - Or fetch individual users / comments on posts
+- Scrape your own LinkedIn connections with details
 - Aggregates the employees in a Pandas DataFrame
 
 [Video Guide for StaffSpy](https://youtu.be/DNFmjvpZBTs) - _updated for release v0.2.18_
@@ -61,10 +62,17 @@ companies = account.scrape_companies(
     company_names=['openai', 'microsoft']
 )
 
+# fetch connections
+connections = account.scrape_connections(
+    extra_profile_data=True,
+    max_results=50
+)
+
 staff.to_csv("staff.csv", index=False)
 users.to_csv("users.csv", index=False)
 comments.to_csv("comments.csv", index=False)
 companies.to_csv("companies.csv", index=False)
+connections.to_csv("connections.csv", index=False)
 ```
 
 #### Browser login
@@ -169,6 +177,16 @@ Optional
 ```
 
 
+### Parameters for `scrape_connections()`
+
+```plaintext
+├── max_results (int):
+|    maximum number of connections to fetch (default is all)
+|
+├── extra_profile_data (bool):
+|    gets all profile info
+```
+
 ### LinkedIn notes
 
 - only 1000 max results per search
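Taken together with the README snippet above, a minimal end-to-end sketch of the new API. The `session_file` argument is illustrative (the login setup is not part of this diff), and the `connections` column exists only when LinkedIn returns that field, which is why the code guards with `in columns`:

```python
from staffspy import LinkedInAccount

# session_file path is illustrative; any path for the cached login works
account = LinkedInAccount(session_file="session.pkl")

# cap the crawl at 50 people and pull full profiles for each
connections = account.scrape_connections(
    extra_profile_data=True,
    max_results=50,
)

# "connections" is cast to nullable Int64 by clean_df in this commit, so the
# comparison tolerates missing values; guard in case the column is absent
if "connections" in connections.columns:
    connections[connections["connections"] >= 500].to_csv(
        "well_connected.csv", index=False
    )
```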

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "staffspy"
-version = "0.2.22"
+version = "0.2.23"
 description = "Staff scraper library for LinkedIn"
 authors = ["Cullen Watson <cullen@bunsly.com>"]
 readme = "README.md"

staffspy/__init__.py

Lines changed: 26 additions & 9 deletions
@@ -13,6 +13,7 @@
     Login,
     parse_company_data,
     extract_emails_from_text,
+    clean_df,
 )
 from staffspy.utils.driver_type import DriverType
 
@@ -84,18 +85,10 @@ def scrape_staff(
             self.on_block = True
         staff_dicts = [staff.to_dict() for staff in staff]
         staff_df = pd.DataFrame(staff_dicts)
-        if "estimated_age" in staff_df.columns:
-            staff_df["estimated_age"] = staff_df["estimated_age"].astype("Int64")
-        if "followers" in staff_df.columns:
-            staff_df["followers"] = staff_df["followers"].astype("Int64")
-        if "connections" in staff_df.columns:
-            staff_df["connections"] = staff_df["connections"].astype("Int64")
-        if "mutuals" in staff_df.columns:
-            staff_df["mutuals"] = staff_df["mutuals"].astype("Int64")
-
         if staff_df.empty:
             return staff_df
 
+        staff_df = clean_df(staff_df)
         linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
         non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
         staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
@@ -203,3 +196,27 @@ def scrape_companies(
             return pd.DataFrame()
 
         return pd.concat(company_dfs, ignore_index=True)
+
+    def scrape_connections(
+        self,
+        max_results: int = 10**8,
+        extra_profile_data: bool = False,
+    ) -> pd.DataFrame:
+        """Scrape connections from LinkedIn"""
+        if self.on_block:
+            return logger.error(
+                "Account is on cooldown as a safety precaution after receiving a 429 (TooManyRequests) from LinkedIn. Please recreate a new LinkedInAccount to proceed."
+            )
+        li_scraper = LinkedInScraper(self.session)
+
+        connections = li_scraper.scrape_connections(
+            max_results=max_results,
+            extra_profile_data=extra_profile_data,
+        )
+        connections_df = pd.DataFrame()
+        if connections:
+            staff_dicts = [staff.to_dict() for staff in connections]
+            connections_df = pd.DataFrame(staff_dicts)
+            connections_df = clean_df(connections_df)
+
+        return connections_df
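One caveat in the method above: on cooldown it exits through `return logger.error(...)`, and `logger.error` returns `None`, so callers receive `None` rather than an empty DataFrame. A hedged sketch of a defensive call (assuming `account` is an existing `LinkedInAccount`):

```python
# scrape_connections returns None (not an empty DataFrame) when the account
# is on cooldown after a 429, because `return logger.error(...)` is None
connections = account.scrape_connections(max_results=50)
if connections is None:
    raise RuntimeError(
        "account is cooling down after a 429; recreate the LinkedInAccount"
    )
```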

staffspy/linkedin/linkedin.py

Lines changed: 79 additions & 5 deletions
@@ -33,6 +33,7 @@ class LinkedInScraper:
     public_user_id_ep = (
         "https://www.linkedin.com/voyager/api/identity/profiles/{user_id}/profileView"
     )
+    connections_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSearchDashClusters.dfcd3603c2779eddd541f572936f4324&queryName=SearchClusterCollection&variables=(query:(queryParameters:(resultType:List(FOLLOWERS)),flagshipSearchIntent:MYNETWORK_CURATION_HUB,includeFiltersInResponse:true),count:50,origin:CurationHub,start:{offset})"
     block_user_ep = "https://www.linkedin.com/voyager/api/voyagerTrustDashContentReportingForm?action=entityBlock"
 
     def __init__(self, session: requests.Session):
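The endpoint hard-codes `count:50` and leaves only `start={offset}` to fill in, so paging is pure offset arithmetic. A small sketch of the offsets that `scrape_connections` (added further down) ends up requesting, with illustrative totals:

```python
# page 0 is fetched first; later pages step by the fixed page size of 50
# until min(total results, max_results) is covered
total_results, max_results = 180, 10**8
offsets = [0] + list(range(50, min(total_results, max_results), 50))
print(offsets)  # [0, 50, 100, 150]
```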
@@ -164,9 +165,11 @@ def parse_staff(self, elements: list[dict]):
             person = card.get("item", {}).get("entityResult", {})
             if not person:
                 continue
-            pattern = r"urn:li:fsd_profile:([^,]+),SEARCH_SRP"
+            pattern = (
+                r"urn:li:fsd_profile:([^,]+),(?:SEARCH_SRP|MYNETWORK_CURATION_HUB)"
+            )
             match = re.search(pattern, person["entityUrn"])
-            linkedin_id = match.group(1)
+            linkedin_id = match.group(1) if match else None
             person_urn = person["trackingUrn"].split(":")[-1]
 
             name = person["title"]["text"].strip()
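The widened pattern accepts URNs from both the staff-search and connections surfaces, and the new `if match else None` guard keeps unmatched URNs from raising `AttributeError`. A standalone check (the URNs below are made up for illustration, not real LinkedIn data):

```python
import re

pattern = r"urn:li:fsd_profile:([^,]+),(?:SEARCH_SRP|MYNETWORK_CURATION_HUB)"

# illustrative URNs in the two shapes parse_staff now handles, plus a miss
for urn in [
    "urn:li:fsd_profile:ACoAAB1111,SEARCH_SRP",              # staff search
    "urn:li:fsd_profile:ACoAAB2222,MYNETWORK_CURATION_HUB",  # connections page
    "urn:li:fsd_profile:ACoAAB3333,SOME_OTHER_ORIGIN",       # no match -> None
]:
    match = re.search(pattern, urn)
    print(match.group(1) if match else None)
# prints: ACoAAB1111, ACoAAB2222, None
```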
@@ -224,7 +227,7 @@ def fetch_staff(self, offset: int):
         try:
             res_json = res.json()
         except json.decoder.JSONDecodeError:
-            logger.debug(res.text[:200])
+            logger.debug(res.text)
             return None, 0
 
         try:
@@ -239,6 +242,78 @@ def fetch_staff(self, offset: int):
         new_staff = self.parse_staff(elements) if elements else []
         return new_staff, total_count
 
+    def fetch_connections_page(self, offset: int):
+        self.session.headers["x-li-graphql-pegasus-client"] = "true"
+        res = self.session.get(self.connections_ep.format(offset=offset))
+        self.session.headers.pop("x-li-graphql-pegasus-client", "")
+        if not res.ok:
+            logger.debug(f"connections, status code - {res.status_code}")
+        if res.status_code == 400:
+            raise BadCookies("Outdated login, delete the session file to log in again")
+        elif res.status_code == 429:
+            raise TooManyRequests("429 Too Many Requests")
+        if not res.ok:
+            return None, 0
+        try:
+            res_json = res.json()
+        except json.decoder.JSONDecodeError:
+            logger.debug(res.text)
+            return None, 0
+
+        try:
+            elements = res_json["data"]["searchDashClustersByAll"]["elements"]
+            total_count = res_json["data"]["searchDashClustersByAll"]["metadata"][
+                "totalResultCount"
+            ]
+
+        except (KeyError, IndexError, TypeError):
+            logger.debug(res_json)
+            return None, 0
+
+        new_staff = self.parse_staff(elements) if elements else []
+        return new_staff, total_count
+
+    def scrape_connections(
+        self,
+        max_results: int = 10**8,
+        extra_profile_data: bool = False,
+    ):
+        self.search_term = "connections"
+        staff_list: list[Staff] = []
+
+        try:
+            initial_staff, total_search_result_count = self.fetch_connections_page(0)
+            if initial_staff:
+                staff_list.extend(initial_staff)
+
+            self.num_staff = min(total_search_result_count, max_results)
+            for offset in range(50, self.num_staff, 50):
+                staff, _ = self.fetch_connections_page(offset)
+                if not staff:
+                    break
+                logger.debug(
+                    f"Connections from search: {len(staff)} new, {len(staff_list) + len(staff)} total"
+                )
+                staff_list.extend(staff)
+        except (BadCookies, TooManyRequests) as e:
+            self.on_block = True
+            logger.error(f"Exiting early due to fatal error: {str(e)}")
+            return staff_list[:max_results]
+
+        reduced_staff_list = staff_list[:max_results]
+
+        non_restricted = list(
+            filter(lambda x: x.name != "LinkedIn Member", reduced_staff_list)
+        )
+
+        if extra_profile_data:
+            try:
+                for i, employee in enumerate(non_restricted, start=1):
+                    self.fetch_all_info_for_employee(employee, i)
+            except TooManyRequests as e:
+                logger.error(str(e))
+        return reduced_staff_list
+
     def fetch_location_id(self):
         """Fetch the location id for the location to be used in LinkedIn search"""
         ep = self.location_id_ep.format(location=quote(self.raw_location))
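The set-then-pop dance around the `x-li-graphql-pegasus-client` header works, but it would leave the header attached if the GET raised midway. A possible alternative, not part of this commit, is a small context manager that restores the session even on exceptions:

```python
from contextlib import contextmanager

import requests


@contextmanager
def temporary_header(session: requests.Session, key: str, value: str):
    """Attach a header for the duration of the block, then remove it."""
    session.headers[key] = value
    try:
        yield session
    finally:
        session.headers.pop(key, None)

# hypothetical usage mirroring fetch_connections_page:
#   with temporary_header(self.session, "x-li-graphql-pegasus-client", "true"):
#       res = self.session.get(self.connections_ep.format(offset=offset))
```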
@@ -333,7 +408,6 @@ def scrape_staff(
             return staff_list[:max_results]
 
         reduced_staff_list = staff_list[:max_results]
-
         non_restricted = list(
             filter(lambda x: x.name != "LinkedIn Member", reduced_staff_list)
        )
@@ -352,7 +426,7 @@
     def fetch_all_info_for_employee(self, employee: Staff, index: int):
         """Simultaneously fetch all the data for an employee"""
         logger.info(
-            f"Fetching employee data for {employee.id} {index:>4} / {self.num_staff} - {employee.profile_link}"
+            f"Fetching data for account {employee.id} {index:>4} / {self.num_staff} - {employee.profile_link}"
         )
 
         with ThreadPoolExecutor(max_workers=7) as executor:

staffspy/utils/utils.py

Lines changed: 12 additions & 0 deletions
@@ -435,5 +435,17 @@ def parse_company_data(json_data, search_term=None):
     return company_df
 
 
+def clean_df(staff_df):
+    if "estimated_age" in staff_df.columns:
+        staff_df["estimated_age"] = staff_df["estimated_age"].astype("Int64")
+    if "followers" in staff_df.columns:
+        staff_df["followers"] = staff_df["followers"].astype("Int64")
+    if "connections" in staff_df.columns:
+        staff_df["connections"] = staff_df["connections"].astype("Int64")
+    if "mutuals" in staff_df.columns:
+        staff_df["mutuals"] = staff_df["mutuals"].astype("Int64")
+    return staff_df
+
+
 if __name__ == "__main__":
     p = parse_dates("May 2018 - Jun 2024")
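`clean_df` uses pandas' nullable `Int64` (capital I) rather than plain `int64` because these count columns can be missing for some profiles, and `astype("int64")` raises on NaN. A quick illustration:

```python
import pandas as pd

df = pd.DataFrame({"followers": [120.0, None, 87.0]})

# plain "int64" would raise here: cannot convert non-finite values to integer
df["followers"] = df["followers"].astype("Int64")
print(df["followers"].tolist())  # [120, <NA>, 87]
```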
