Commit 47c90eb

enh: scrape connections
1 parent 0148cfd commit 47c90eb

File tree: 5 files changed, +136 -15 lines changed

README.md

Lines changed: 18 additions & 0 deletions
@@ -9,6 +9,7 @@ _why pay $100/mo for LSN when you could do it for free and get a nice csv to go
 - Scrapes staff from a company on **LinkedIn**
 - Obtains skills, experiences, certifications & more
 - Or fetch individual users / comments on posts
+- Scrape your own LinkedIn connections with details
 - Aggregates the employees in a Pandas DataFrame
 
 [Video Guide for StaffSpy](https://youtu.be/DNFmjvpZBTs) - _updated for release v0.2.18_
@@ -61,10 +62,17 @@ companies = account.scrape_companies(
     company_names=['openai', 'microsoft']
 )
 
+# fetch connections
+connections = account.scrape_connections(
+    extra_profile_data=True,
+    max_results=50
+)
+
 staff.to_csv("staff.csv", index=False)
 users.to_csv("users.csv", index=False)
 comments.to_csv("comments.csv", index=False)
 companies.to_csv("companies.csv", index=False)
+connections.to_csv("connections.csv", index=False)
 ```
 
 #### Browser login
@@ -169,6 +177,16 @@ Optional
 ```
 
 
+### Parameters for `scrape_connections()`
+
+```plaintext
+├── max_results (int):
+|    maximum number of connections to fetch (default is all)
+|
+├── extra_profile_data (bool):
+|    gets all profile info
+```
+
 ### LinkedIn notes
 
 - only 1000 max results per search
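Taken together with the README snippet above, a minimal end-to-end sketch of the new API. The `session_file` argument is illustrative (the login setup is not part of this diff), and the `connections` column exists only when LinkedIn returns that field, which is why the code guards with `in columns`:

```python
from staffspy import LinkedInAccount

# session_file path is illustrative; any path for the cached login works
account = LinkedInAccount(session_file="session.pkl")

# cap the crawl at 50 people and pull full profiles for each
connections = account.scrape_connections(
    extra_profile_data=True,
    max_results=50,
)

# "connections" is cast to nullable Int64 by clean_df in this commit, so the
# comparison tolerates missing values; guard in case the column is absent
if "connections" in connections.columns:
    connections[connections["connections"] >= 500].to_csv(
        "well_connected.csv", index=False
    )
```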

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "staffspy"
-version = "0.2.22"
+version = "0.2.23"
 description = "Staff scraper library for LinkedIn"
 authors = ["Cullen Watson <cullen@bunsly.com>"]
 readme = "README.md"

staffspy/__init__.py

Lines changed: 26 additions & 9 deletions
@@ -13,6 +13,7 @@
     Login,
     parse_company_data,
     extract_emails_from_text,
+    clean_df,
 )
 from staffspy.utils.driver_type import DriverType
 
@@ -84,18 +85,10 @@ def scrape_staff(
             self.on_block = True
         staff_dicts = [staff.to_dict() for staff in staff]
         staff_df = pd.DataFrame(staff_dicts)
-        if "estimated_age" in staff_df.columns:
-            staff_df["estimated_age"] = staff_df["estimated_age"].astype("Int64")
-        if "followers" in staff_df.columns:
-            staff_df["followers"] = staff_df["followers"].astype("Int64")
-        if "connections" in staff_df.columns:
-            staff_df["connections"] = staff_df["connections"].astype("Int64")
-        if "mutuals" in staff_df.columns:
-            staff_df["mutuals"] = staff_df["mutuals"].astype("Int64")
-
         if staff_df.empty:
             return staff_df
 
+        staff_df = clean_df(staff_df)
         linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
         non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
         staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
@@ -203,3 +196,27 @@ def scrape_companies(
             return pd.DataFrame()
 
         return pd.concat(company_dfs, ignore_index=True)
+
+    def scrape_connections(
+        self,
+        max_results: int = 10**8,
+        extra_profile_data: bool = False,
+    ) -> pd.DataFrame:
+        """Scrape connections from LinkedIn"""
+        if self.on_block:
+            return logger.error(
+                "Account is on cooldown as a safety precaution after receiving a 429 (TooManyRequests) from LinkedIn. Please recreate a new LinkedInAccount to proceed."
+            )
+        li_scraper = LinkedInScraper(self.session)
+
+        connections = li_scraper.scrape_connections(
+            max_results=max_results,
+            extra_profile_data=extra_profile_data,
+        )
+        connections_df = pd.DataFrame()
+        if connections:
+            staff_dicts = [staff.to_dict() for staff in connections]
+            connections_df = pd.DataFrame(staff_dicts)
+            connections_df = clean_df(connections_df)
+
+        return connections_df
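One caveat in the method above: on cooldown it exits through `return logger.error(...)`, and `logger.error` returns `None`, so callers receive `None` rather than an empty DataFrame. A hedged sketch of a defensive call (assuming `account` is an existing `LinkedInAccount`):

```python
# scrape_connections returns None (not an empty DataFrame) when the account
# is on cooldown after a 429, because `return logger.error(...)` is None
connections = account.scrape_connections(max_results=50)
if connections is None:
    raise RuntimeError(
        "account is cooling down after a 429; recreate the LinkedInAccount"
    )
```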

staffspy/linkedin/linkedin.py

Lines changed: 79 additions & 5 deletions
@@ -33,6 +33,7 @@ class LinkedInScraper:
     public_user_id_ep = (
         "https://www.linkedin.com/voyager/api/identity/profiles/{user_id}/profileView"
     )
+    connections_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSearchDashClusters.dfcd3603c2779eddd541f572936f4324&queryName=SearchClusterCollection&variables=(query:(queryParameters:(resultType:List(FOLLOWERS)),flagshipSearchIntent:MYNETWORK_CURATION_HUB,includeFiltersInResponse:true),count:50,origin:CurationHub,start:{offset})"
     block_user_ep = "https://www.linkedin.com/voyager/api/voyagerTrustDashContentReportingForm?action=entityBlock"
 
     def __init__(self, session: requests.Session):
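The endpoint hard-codes `count:50` and leaves only `start={offset}` to fill in, so paging is pure offset arithmetic. A small sketch of the offsets that `scrape_connections` (added further down) ends up requesting, with illustrative totals:

```python
# page 0 is fetched first; later pages step by the fixed page size of 50
# until min(total results, max_results) is covered
total_results, max_results = 180, 10**8
offsets = [0] + list(range(50, min(total_results, max_results), 50))
print(offsets)  # [0, 50, 100, 150]
```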
@@ -164,9 +165,11 @@ def parse_staff(self, elements: list[dict]):
             person = card.get("item", {}).get("entityResult", {})
             if not person:
                 continue
-            pattern = r"urn:li:fsd_profile:([^,]+),SEARCH_SRP"
+            pattern = (
+                r"urn:li:fsd_profile:([^,]+),(?:SEARCH_SRP|MYNETWORK_CURATION_HUB)"
+            )
             match = re.search(pattern, person["entityUrn"])
-            linkedin_id = match.group(1)
+            linkedin_id = match.group(1) if match else None
             person_urn = person["trackingUrn"].split(":")[-1]
 
             name = person["title"]["text"].strip()
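The widened pattern accepts URNs from both the staff-search and connections surfaces, and the new `if match else None` guard keeps unmatched URNs from raising `AttributeError`. A standalone check (the URNs below are made up for illustration, not real LinkedIn data):

```python
import re

pattern = r"urn:li:fsd_profile:([^,]+),(?:SEARCH_SRP|MYNETWORK_CURATION_HUB)"

# illustrative URNs in the two shapes parse_staff now handles, plus a miss
for urn in [
    "urn:li:fsd_profile:ACoAAB1111,SEARCH_SRP",              # staff search
    "urn:li:fsd_profile:ACoAAB2222,MYNETWORK_CURATION_HUB",  # connections page
    "urn:li:fsd_profile:ACoAAB3333,SOME_OTHER_ORIGIN",       # no match -> None
]:
    match = re.search(pattern, urn)
    print(match.group(1) if match else None)
# prints: ACoAAB1111, ACoAAB2222, None
```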
@@ -224,7 +227,7 @@ def fetch_staff(self, offset: int):
         try:
             res_json = res.json()
         except json.decoder.JSONDecodeError:
-            logger.debug(res.text[:200])
+            logger.debug(res.text)
             return None, 0
 
         try:
@@ -239,6 +242,78 @@ def fetch_staff(self, offset: int):
         new_staff = self.parse_staff(elements) if elements else []
         return new_staff, total_count
 
+    def fetch_connections_page(self, offset: int):
+        self.session.headers["x-li-graphql-pegasus-client"] = "true"
+        res = self.session.get(self.connections_ep.format(offset=offset))
+        self.session.headers.pop("x-li-graphql-pegasus-client", "")
+        if not res.ok:
+            logger.debug(f"connections, status code - {res.status_code}")
+        if res.status_code == 400:
+            raise BadCookies("Outdated login, delete the session file to log in again")
+        elif res.status_code == 429:
+            raise TooManyRequests("429 Too Many Requests")
+        if not res.ok:
+            return None, 0
+        try:
+            res_json = res.json()
+        except json.decoder.JSONDecodeError:
+            logger.debug(res.text)
+            return None, 0
+
+        try:
+            elements = res_json["data"]["searchDashClustersByAll"]["elements"]
+            total_count = res_json["data"]["searchDashClustersByAll"]["metadata"][
+                "totalResultCount"
+            ]
+
+        except (KeyError, IndexError, TypeError):
+            logger.debug(res_json)
+            return None, 0
+
+        new_staff = self.parse_staff(elements) if elements else []
+        return new_staff, total_count
+
+    def scrape_connections(
+        self,
+        max_results: int = 10**8,
+        extra_profile_data: bool = False,
+    ):
+        self.search_term = "connections"
+        staff_list: list[Staff] = []
+
+        try:
+            initial_staff, total_search_result_count = self.fetch_connections_page(0)
+            if initial_staff:
+                staff_list.extend(initial_staff)
+
+            self.num_staff = min(total_search_result_count, max_results)
+            for offset in range(50, self.num_staff, 50):
+                staff, _ = self.fetch_connections_page(offset)
+                if not staff:
+                    break
+                logger.debug(
+                    f"Connections from search: {len(staff)} new, {len(staff_list) + len(staff)} total"
+                )
+                staff_list.extend(staff)
+        except (BadCookies, TooManyRequests) as e:
+            self.on_block = True
+            logger.error(f"Exiting early due to fatal error: {str(e)}")
+            return staff_list[:max_results]
+
+        reduced_staff_list = staff_list[:max_results]
+
+        non_restricted = list(
+            filter(lambda x: x.name != "LinkedIn Member", reduced_staff_list)
+        )
+
+        if extra_profile_data:
+            try:
+                for i, employee in enumerate(non_restricted, start=1):
+                    self.fetch_all_info_for_employee(employee, i)
+            except TooManyRequests as e:
+                logger.error(str(e))
+        return reduced_staff_list
+
     def fetch_location_id(self):
         """Fetch the location id for the location to be used in LinkedIn search"""
         ep = self.location_id_ep.format(location=quote(self.raw_location))
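The set-then-pop dance around the `x-li-graphql-pegasus-client` header works, but it would leave the header attached if the GET raised midway. A possible alternative, not part of this commit, is a small context manager that restores the session even on exceptions:

```python
from contextlib import contextmanager

import requests


@contextmanager
def temporary_header(session: requests.Session, key: str, value: str):
    """Attach a header for the duration of the block, then remove it."""
    session.headers[key] = value
    try:
        yield session
    finally:
        session.headers.pop(key, None)

# hypothetical usage mirroring fetch_connections_page:
#   with temporary_header(self.session, "x-li-graphql-pegasus-client", "true"):
#       res = self.session.get(self.connections_ep.format(offset=offset))
```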
@@ -333,7 +408,6 @@ def scrape_staff(
             return staff_list[:max_results]
 
         reduced_staff_list = staff_list[:max_results]
-
         non_restricted = list(
             filter(lambda x: x.name != "LinkedIn Member", reduced_staff_list)
        )
@@ -352,7 +426,7 @@
     def fetch_all_info_for_employee(self, employee: Staff, index: int):
         """Simultaneously fetch all the data for an employee"""
         logger.info(
-            f"Fetching employee data for {employee.id} {index:>4} / {self.num_staff} - {employee.profile_link}"
+            f"Fetching data for account {employee.id} {index:>4} / {self.num_staff} - {employee.profile_link}"
         )
 
         with ThreadPoolExecutor(max_workers=7) as executor:

staffspy/utils/utils.py

Lines changed: 12 additions & 0 deletions
@@ -435,5 +435,17 @@ def parse_company_data(json_data, search_term=None):
     return company_df
 
 
+def clean_df(staff_df):
+    if "estimated_age" in staff_df.columns:
+        staff_df["estimated_age"] = staff_df["estimated_age"].astype("Int64")
+    if "followers" in staff_df.columns:
+        staff_df["followers"] = staff_df["followers"].astype("Int64")
+    if "connections" in staff_df.columns:
+        staff_df["connections"] = staff_df["connections"].astype("Int64")
+    if "mutuals" in staff_df.columns:
+        staff_df["mutuals"] = staff_df["mutuals"].astype("Int64")
+    return staff_df
+
+
 if __name__ == "__main__":
     p = parse_dates("May 2018 - Jun 2024")
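`clean_df` uses pandas' nullable `Int64` (capital I) rather than plain `int64` because these count columns can be missing for some profiles, and `astype("int64")` raises on NaN. A quick illustration:

```python
import pandas as pd

df = pd.DataFrame({"followers": [120.0, None, 87.0]})

# plain "int64" would raise here: cannot convert non-finite values to integer
df["followers"] = df["followers"].astype("Int64")
print(df["followers"].tolist())  # [120, <NA>, 87]
```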
