@@ -33,6 +33,7 @@ class LinkedInScraper:
3333 public_user_id_ep = (
3434 "https://www.linkedin.com/voyager/api/identity/profiles/{user_id}/profileView"
3535 )
36+ connections_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSearchDashClusters.dfcd3603c2779eddd541f572936f4324&queryName=SearchClusterCollection&variables=(query:(queryParameters:(resultType:List(FOLLOWERS)),flagshipSearchIntent:MYNETWORK_CURATION_HUB,includeFiltersInResponse:true),count:50,origin:CurationHub,start:{offset})"
3637 block_user_ep = "https://www.linkedin.com/voyager/api/voyagerTrustDashContentReportingForm?action=entityBlock"
3738
3839 def __init__ (self , session : requests .Session ):
@@ -164,9 +165,11 @@ def parse_staff(self, elements: list[dict]):
164165 person = card .get ("item" , {}).get ("entityResult" , {})
165166 if not person :
166167 continue
167- pattern = r"urn:li:fsd_profile:([^,]+),SEARCH_SRP"
168+ pattern = (
169+ r"urn:li:fsd_profile:([^,]+),(?:SEARCH_SRP|MYNETWORK_CURATION_HUB)"
170+ )
168171 match = re .search (pattern , person ["entityUrn" ])
169- linkedin_id = match .group (1 )
172+ linkedin_id = match .group (1 ) if match else None
170173 person_urn = person ["trackingUrn" ].split (":" )[- 1 ]
171174
172175 name = person ["title" ]["text" ].strip ()
@@ -224,7 +227,7 @@ def fetch_staff(self, offset: int):
224227 try :
225228 res_json = res .json ()
226229 except json .decoder .JSONDecodeError :
227- logger .debug (res .text [: 200 ] )
230+ logger .debug (res .text )
228231 return None , 0
229232
230233 try :
@@ -239,6 +242,78 @@ def fetch_staff(self, offset: int):
239242 new_staff = self .parse_staff (elements ) if elements else []
240243 return new_staff , total_count
241244
def fetch_connections_page(self, offset: int):
    """Fetch and parse one page from the ``connections_ep`` search endpoint.

    Args:
        offset: Value substituted into the ``start`` field of
            ``connections_ep`` (pages are requested 50 results at a time).

    Returns:
        A ``(staff, total_count)`` tuple. ``staff`` is whatever
        ``parse_staff`` returns for the page's elements, or ``[]`` when the
        page has no elements. On any failure that does not raise (non-OK
        status other than 400/429, undecodable JSON, unexpected payload
        shape) the result is ``(None, 0)``, matching ``fetch_staff``, so
        callers can always unpack two values.

    Raises:
        BadCookies: The server answered with HTTP 400.
        TooManyRequests: The server answered with HTTP 429.
    """
    pegasus_header = "x-li-graphql-pegasus-client"
    self.session.headers[pegasus_header] = "true"
    try:
        res = self.session.get(self.connections_ep.format(offset=offset))
    finally:
        # The header is only wanted for this request; remove it even when
        # the request itself raises so it never leaks into later calls on
        # the shared session.
        self.session.headers.pop(pegasus_header, None)

    if not res.ok:
        logger.debug(f"connections, status code - {res.status_code}")
        if res.status_code == 400:
            raise BadCookies("Outdated login, delete the session file to log in again")
        if res.status_code == 429:
            raise TooManyRequests("429 Too Many Requests")
        return None, 0

    try:
        res_json = res.json()
    except json.decoder.JSONDecodeError:
        logger.debug(res.text)
        return None, 0

    try:
        clusters = res_json["data"]["searchDashClustersByAll"]
        elements = clusters["elements"]
        total_count = clusters["metadata"]["totalResultCount"]
    except (KeyError, IndexError, TypeError):
        # Payload did not have the expected shape; log it for diagnosis.
        logger.debug(res_json)
        return None, 0

    new_staff = self.parse_staff(elements) if elements else []
    return new_staff, total_count
275+
def scrape_connections(
    self,
    max_results: int = 10**8,
    extra_profile_data: bool = False,
):
    """Collect results from the connections search, page by page.

    Pages are requested through ``fetch_connections_page`` in steps of 50
    until ``max_results`` or the reported total is reached, or a page comes
    back empty or failed.

    Args:
        max_results: Upper bound on the number of entries returned.
        extra_profile_data: When true, ``fetch_all_info_for_employee`` is
            called for every returned entry whose name is not
            "LinkedIn Member".

    Returns:
        The collected entries, truncated to ``max_results``. If
        ``BadCookies`` or ``TooManyRequests`` is raised while paging,
        ``self.on_block`` is set and whatever was collected so far is
        returned.
    """
    self.search_term = "connections"
    staff_list: list[Staff] = []

    try:
        # A failed fetch may yield None (or a (None, 0) pair); treat both
        # as "no results" instead of crashing on tuple unpacking.
        first_page = self.fetch_connections_page(0)
        initial_staff, total_search_result_count = (
            first_page if first_page else (None, 0)
        )
        if initial_staff:
            staff_list.extend(initial_staff)

        self.num_staff = min(total_search_result_count, max_results)
        for offset in range(50, self.num_staff, 50):
            page = self.fetch_connections_page(offset)
            staff = page[0] if page else None
            # Check before logging: len() on a failed (None) page would raise.
            if not staff:
                break
            logger.debug(
                f"Connections from search: {len(staff)} new, {len(staff_list) + len(staff)} total"
            )
            staff_list.extend(staff)
    except (BadCookies, TooManyRequests) as e:
        self.on_block = True
        logger.error(f"Exiting early due to fatal error: {str(e)}")
        return staff_list[:max_results]

    reduced_staff_list = staff_list[:max_results]

    if extra_profile_data:
        # Restricted profiles show up as "LinkedIn Member"; skip them.
        non_restricted = [
            person for person in reduced_staff_list if person.name != "LinkedIn Member"
        ]
        try:
            for i, employee in enumerate(non_restricted, start=1):
                self.fetch_all_info_for_employee(employee, i)
        except TooManyRequests as e:
            logger.error(str(e))

    return reduced_staff_list
316+
242317 def fetch_location_id (self ):
243318 """Fetch the location id for the location to be used in LinkedIn search"""
244319 ep = self .location_id_ep .format (location = quote (self .raw_location ))
@@ -333,7 +408,6 @@ def scrape_staff(
333408 return staff_list [:max_results ]
334409
335410 reduced_staff_list = staff_list [:max_results ]
336-
337411 non_restricted = list (
338412 filter (lambda x : x .name != "LinkedIn Member" , reduced_staff_list )
339413 )
@@ -352,7 +426,7 @@ def scrape_staff(
352426 def fetch_all_info_for_employee (self , employee : Staff , index : int ):
353427 """Simultaneously fetch all the data for an employee"""
354428 logger .info (
355- f"Fetching employee data for { employee .id } { index :>4} / { self .num_staff } - { employee .profile_link } "
429+ f"Fetching data for account { employee .id } { index :>4} / { self .num_staff } - { employee .profile_link } "
356430 )
357431
358432 with ThreadPoolExecutor (max_workers = 7 ) as executor :
0 commit comments