# LinkedInScraper.py

import copy
import json
import logging
import os
import re
import time
from typing import Any, Callable, Dict, List, Optional

from bs4 import BeautifulSoup as bs
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class LinkedInScraper:
    """
    Scrape structured data from a single LinkedIn profile.

    The scraper drives an already-authenticated Selenium ``driver``, parses the
    rendered HTML with BeautifulSoup and returns the result as a JSON string.
    Individual fields and list items are parsed on a best-effort basis: a
    missing element yields ``None`` (or is skipped) instead of aborting the
    whole scrape.
    """

    # Selectors shared by all section parsers. They mirror LinkedIn's current
    # markup and are the first thing to revisit when parsing starts failing.
    _HEADLINE_SELECTOR = "div.display-flex.flex-row.justify-space-between span[aria-hidden='true']"
    _META_SELECTOR = "span.t-14.t-normal"
    _LIST_CONTAINER_CLASS = "scaffold-finite-scroll__content"

    # Date ranges are rendered with a hyphen, en dash or em dash.
    _DATE_SEPARATOR = re.compile(r"\s*[-\u2013\u2014]\s*")
    # Anything outside this set is replaced by "_" when building a file name.
    _UNSAFE_FILENAME_CHARS = re.compile(r"[^\w.%-]+")

    def __init__(self, driver: "WebDriver", save_to_file: bool = False, timeout: float = 15):
        """
        Args:
            driver: Selenium driver used for all navigation.
            save_to_file: When true, every scraped profile is also written to
                ``./data/<profile-slug>.json``.
            timeout: Seconds to wait for page elements before giving up.
        """
        self.driver = driver
        self.save = save_to_file
        self.wait = WebDriverWait(self.driver, timeout)

    def scrape(self, url: str) -> str:
        """
        Scrape one profile and return it as a JSON string.

        Args:
            url: Profile URL to open.

        Returns:
            JSON with the keys ``url``, ``name``, ``location``, ``experience``,
            ``education``, ``volunteering`` and ``skills``. If anything goes
            wrong, JSON with the keys ``error`` and ``url`` instead; this
            method never raises.
        """
        try:
            self._navigate_to_profile(url)
            # Remember the URL now: section scraping navigates away and back.
            profile_url = self.driver.current_url
            profile_soup = bs(self.driver.page_source, "lxml")

            # --- Basic Information ---
            name = self._get_name(profile_soup)
            location = self._get_location(profile_soup)
            logging.info("Scraping data for: %s", name)

            # --- Section Scraping ---
            experience = self._scrape_section('experience', 'experiences', self._parse_experience_item)
            education = self._scrape_section('education', 'education', self._parse_education_item)
            volunteering = self._scrape_section('volunteering_experience', 'volunteer-experiences', self._parse_volunteer_item)
            skills = self._scrape_section('skills', 'skills', self._parse_skill_item)

            # --- Compile and Save/Return Output ---
            output = {
                "url": profile_url,
                "name": name,
                "location": location,
                "experience": experience,
                "education": education,
                "volunteering": volunteering,
                "skills": skills,
            }
            json_output = json.dumps(output, indent=4)

            if self.save:
                self._save_output_to_file(json_output)

            return json_output

        except Exception as e:
            # Top-level boundary: keep the traceback in the log, report the
            # failure to the caller as data.
            logging.exception("An error occurred while scraping %s: %s", url, e)
            return json.dumps({"error": str(e), "url": url})

    def _navigate_to_profile(self, url: str) -> None:
        """
        Open ``url`` and wait for the main profile container to be present.

        Raises:
            TimeoutException: If the container does not appear in time.
        """
        logging.info("Navigating to %s", url)
        self.driver.get(url)
        try:
            # Wait for the main profile card to be present
            self.wait.until(EC.presence_of_element_located((By.ID, 'profile-content')))
        except TimeoutException:
            logging.warning("Could not load main profile element. The page may be private or invalid.")
            raise

    def _get_name(self, soup: "bs") -> Optional[str]:
        """Return the profile name, or ``None`` if it is missing or empty."""
        name_tag = soup.select_one('section div span a h1')
        if name_tag is None:
            logging.warning("Could not find name element.")
            return None
        return name_tag.get_text(strip=True) or None

    def _get_location(self, soup: "bs") -> Optional[str]:
        """Return the profile location text, or ``None`` if it is missing."""
        location_tag = soup.find("span", class_="text-body-small inline t-black--light break-words")
        if location_tag is None:
            logging.warning("Could not find location element.")
            return None
        return location_tag.get_text(strip=True)

    def _scrape_section(self, section_id: str, details_suffix: str, parser: Callable[[Any], Optional[Dict]]) -> List[Dict]:
        """
        Scrape one profile section.

        The section's "Show all" details page is preferred because it lists
        every entry; if it is unavailable the entries shown on the main
        profile are used instead.

        Args:
            section_id: ``id`` of the section anchor on the main profile.
            details_suffix: Identifier of the section's details page.
            parser: Converts one ``<li>`` tag into a dict, or ``None`` to skip it.

        Returns:
            The parsed entries; empty if the section is absent.
        """
        items = self._collect_from_details_page(section_id, details_suffix)
        if items is None:
            items = self._collect_from_main_profile(section_id)

        section_list = []
        for item in items:
            parsed_item = parser(item)
            if parsed_item:
                section_list.append(parsed_item)
        return section_list

    def _collect_from_details_page(self, section_id: str, details_suffix: str) -> Optional[list]:
        """
        Collect a section's ``<li>`` tags from its "Show all" details page.

        Returns ``None`` when there is no "Show all" link or the details page
        does not load. Whenever the link was followed, the browser is back on
        the main profile when this method returns.
        """
        if details_suffix == "skills":
            locator = (
                By.XPATH,
                "//div[contains(@class, 'pv-action')]//a[contains(@href, '/details/skills') and contains(@class, 'artdeco-button') and .//span[starts-with(normalize-space(.), 'Show all')]]"
            )
        else:
            # Static ID case
            locator = (By.ID, f"navigation-index-see-all-{details_suffix}")

        try:
            show_all_button = self.wait.until(EC.presence_of_element_located(locator))
        except TimeoutException:
            logging.info("No 'Show all' button for '%s'. Scraping from main profile.", section_id)
            return None

        show_all_button.click()
        logging.info("Found 'Show all' button for '%s'. Navigating to details page.", section_id)

        try:
            # Wait for the list on the details page to load
            self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, self._LIST_CONTAINER_CLASS)))

            # Scroll to the bottom so lazily loaded items are rendered
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)  # Allow time for lazy loading

            page_soup = bs(self.driver.page_source, "lxml")
        except TimeoutException:
            logging.warning("Details page for '%s' did not load. Scraping from main profile.", section_id)
            return None
        finally:
            # Always leave the details page, otherwise the fallback and every
            # later section would run against the wrong page.
            self._return_to_profile()

        container = page_soup.find("div", class_=self._LIST_CONTAINER_CLASS)
        items = container.find_all("li", class_="pvs-list__paged-list-item", recursive=True) if container else []
        logging.info("Found %d items in '%s' section.", len(items), section_id)
        return items

    def _return_to_profile(self) -> None:
        """Go back one history entry and wait for the main profile container."""
        self.driver.back()
        try:
            self.wait.until(EC.presence_of_element_located((By.ID, 'profile-content')))
        except TimeoutException:
            logging.warning("Main profile did not reappear after leaving the details page.")

    def _collect_from_main_profile(self, section_id: str) -> list:
        """Collect a section's top-level ``<li>`` tags from the current page."""
        page_soup = bs(self.driver.page_source, "lxml")
        section_anchor = page_soup.find("div", id=section_id)
        if not section_anchor:
            logging.warning("Section '%s' not found on the profile.", section_id)
            return []
        item_list = section_anchor.find_next("ul")
        if item_list is None:
            logging.warning("Section '%s' has no item list on the profile.", section_id)
            return []
        return item_list.find_all("li", recursive=False)

    # --- PARSING HELPERS ---

    @staticmethod
    def _visible_text(element: Any) -> str:
        """
        Return the text of ``element`` without its ``visually-hidden`` spans.

        The hidden spans are stripped from a copy, so the parsed tree itself
        is left untouched.
        """
        clone = copy.copy(element)
        for hidden in clone.find_all("span", class_="visually-hidden"):
            hidden.decompose()
        return clone.get_text(strip=True)

    @classmethod
    def _split_date_range(cls, text: str) -> tuple:
        """
        Split ``"Jan 2022 - Present"`` into ``("Jan 2022", "Present")``.

        Returns a ``(start, end)`` pair; a side that is missing or empty is
        ``None``.
        """
        parts = cls._DATE_SEPARATOR.split(text.strip(), maxsplit=1)
        start = parts[0].strip() or None
        end = (parts[1].strip() or None) if len(parts) > 1 else None
        return start, end

    @classmethod
    def _parse_period(cls, text: str) -> tuple:
        """
        Parse ``"Jan 2022 - Present · 1 yr 5 mos"``.

        Returns ``(start, end, duration)``. ``end`` defaults to ``"Present"``
        when no end date is given; ``duration`` is ``None`` when absent.
        """
        period_parts = text.split('·')
        start, end = cls._split_date_range(period_parts[0])
        duration = period_parts[1].strip() if len(period_parts) > 1 else None
        return start, (end if end is not None else "Present"), duration

    @classmethod
    def _filename_from_url(cls, url: str) -> str:
        """
        Derive a file-system-safe name from a profile URL.

        The part after ``/in/`` is used, without query string or fragment and
        with every unsafe character replaced by ``_``.

        Raises:
            ValueError: If the URL has no usable ``/in/<slug>`` part.
        """
        _, marker, tail = url.partition('/in/')
        if not marker:
            raise ValueError(f"not a profile URL: {url!r}")
        slug = re.split(r"[?#]", tail, maxsplit=1)[0].strip('/')
        slug = cls._UNSAFE_FILENAME_CHARS.sub('_', slug)
        if not slug:
            raise ValueError(f"profile URL has no slug: {url!r}")
        return slug

    # --- INDIVIDUAL PARSERS ---
    # These contain the most brittle logic and are easiest to fix when isolated.
    # NOTE: Relies on `visually-hidden` spans for accessibility which is subject to change.
    def _parse_experience_item(self, item: Any) -> Optional[Dict]:
        """Parse one experience ``<li>``; returns ``None`` if parsing fails."""
        try:
            data = {"title": None, "company": None, "employmentType": None, "startDate": None, "endDate": None, "duration": None, "location": None}
            title_el = item.select_one(self._HEADLINE_SELECTOR)
            data["title"] = title_el.get_text(strip=True) if title_el else None

            # Metadata lines, in order: company, period, location.
            meta_texts = [self._visible_text(el) for el in item.select(self._META_SELECTOR)[:3]]
            if len(meta_texts) > 0:
                # Company & Employment Type (e.g., "Google · Full-time")
                company_parts = meta_texts[0].split('·')
                data["company"] = company_parts[0].strip()
                if len(company_parts) > 1:
                    data["employmentType"] = company_parts[1].strip()

            if len(meta_texts) > 1:
                # Duration & Dates (e.g., "Jan 2022 - Present · 1 yr 5 mos")
                data["startDate"], data["endDate"], data["duration"] = self._parse_period(meta_texts[1])

            if len(meta_texts) > 2:
                # Location (e.g., "Mountain View, California, United States")
                data["location"] = meta_texts[2].replace('·', '-').strip()

            return data
        except Exception as e:
            logging.debug("Could not parse experience item: %s", e)
            return None

    def _parse_education_item(self, item: Any) -> Optional[Dict]:
        """Parse one education ``<li>``; returns ``None`` if parsing fails."""
        try:
            data = {"school": None, "degree": None, "fieldOfStudy": None, "startDate": None, "endDate": None}
            # School name
            school_el = item.select_one(self._HEADLINE_SELECTOR)
            data["school"] = school_el.get_text(strip=True) if school_el else None

            # Metadata lines, in order: degree, years.
            meta_texts = [self._visible_text(el) for el in item.select(self._META_SELECTOR)[:2]]
            if len(meta_texts) > 0:
                # Split once only, so a field of study that itself contains
                # commas is kept whole.
                degree_parts = meta_texts[0].split(',', 1)
                data["degree"] = degree_parts[0].strip()
                if len(degree_parts) > 1:
                    data["fieldOfStudy"] = degree_parts[1].strip()

            if len(meta_texts) > 1:
                data["startDate"], data["endDate"] = self._split_date_range(meta_texts[1])

            return data
        except Exception as e:
            logging.debug("Could not parse education item: %s", e)
            return None

    def _parse_volunteer_item(self, item: Any) -> Optional[Dict]:
        """Parse one volunteering ``<li>``; returns ``None`` if parsing fails."""
        try:
            data = {"organization": None, "role": None, "startDate": None, "endDate": None, "duration": None}
            # Role
            role_el = item.select_one(self._HEADLINE_SELECTOR)
            data["role"] = role_el.get_text(strip=True) if role_el else None

            # Metadata lines, in order: organization, period.
            meta_texts = [self._visible_text(el) for el in item.select(self._META_SELECTOR)[:2]]
            if len(meta_texts) > 0:
                data["organization"] = meta_texts[0]

            if len(meta_texts) > 1:
                data["startDate"], data["endDate"], data["duration"] = self._parse_period(meta_texts[1])

            return data
        except Exception as e:
            logging.debug("Could not parse volunteer item: %s", e)
            return None

    def _parse_skill_item(self, item: Any) -> Optional[Dict]:
        """Parse one skill ``<li>``; returns ``None`` if it has no skill name."""
        try:
            skill_el = item.select_one(self._HEADLINE_SELECTOR)
            return {"skill": skill_el.get_text(strip=True)} if skill_el else None
        except Exception as e:
            logging.debug("Could not parse skill item: %s", e)
            return None

    def _save_output_to_file(self, json_output: str) -> None:
        """
        Write ``json_output`` to ``./data/<profile-slug>.json``.

        The slug is taken from the driver's current URL. Failures are logged,
        never raised, so a save problem cannot discard a finished scrape.
        """
        try:
            filename = self._filename_from_url(self.driver.current_url)
            os.makedirs("./data", exist_ok=True)
            filepath = f"./data/{filename}.json"
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(json_output)
            logging.info("File saved to %s", filepath)
        except ValueError:
            logging.error("Could not generate a filename from the URL.")
        except Exception as e:
            logging.error("Error saving file: %s", e)