"""
LinkedIn comment scraper using the LinkedIn Marketing API.
"""
import logging
from typing import Any, Dict, List, Optional
from urllib.parse import quote

import requests

from .base import BaseScraper


class LinkedInScraper(BaseScraper):
"""
Scraper for LinkedIn comments using LinkedIn Marketing API.
Extracts comments from organization posts.
"""
    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the LinkedIn scraper.

        Args:
            config: Dictionary with 'access_token' and 'organization_id'.
        """
        super().__init__(config)
        self.access_token = config.get('access_token')
        if not self.access_token:
            raise ValueError(
                "LinkedIn access token is required. "
                "Set LINKEDIN_ACCESS_TOKEN in your .env file."
            )
        self.org_id = config.get('organization_id')
        if not self.org_id:
            raise ValueError(
                "LinkedIn organization ID is required. "
                "Set LINKEDIN_ORGANIZATION_ID in your .env file."
            )
        # Versioned REST API: LinkedIn-Version is a YYYYMM string.
        self.api_version = config.get('api_version', '202401')
        self.headers = {
            'Authorization': f'Bearer {self.access_token}',
            'LinkedIn-Version': self.api_version,
            'X-Restli-Protocol-Version': '2.0.0',
            'Content-Type': 'application/json',
        }
        self.base_url = "https://api.linkedin.com/rest"
        self.logger = logging.getLogger(self.__class__.__name__)

    def scrape_comments(
        self,
        organization_id: Optional[str] = None,
        max_posts: int = 50,
        max_comments_per_post: int = 100,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """
        Scrape comments from LinkedIn organization posts.

        Args:
            organization_id: LinkedIn organization URN
                (e.g. 'urn:li:organization:1234567'). Defaults to the
                configured organization.
            max_posts: Maximum number of posts to scrape.
            max_comments_per_post: Maximum comments to fetch per post.

        Returns:
            List of standardized comment dictionaries.
        """
        organization_id = organization_id or self.org_id
        if not organization_id:
            raise ValueError("Organization ID is required")
        all_comments = []
        self.logger.info(f"Starting LinkedIn comment extraction for {organization_id}")
        try:
            # Get all posts for the organization
            posts = self._get_all_page_posts(organization_id)
            self.logger.info(f"Found {len(posts)} posts")
            # Limit posts if needed
            if max_posts and len(posts) > max_posts:
                posts = posts[:max_posts]
                self.logger.info(f"Limited to {max_posts} posts")
            # Extract comments from each post
            for i, post_urn in enumerate(posts, 1):
                self.logger.info(f"Processing post {i}/{len(posts)}: {post_urn}")
                try:
                    comments = self._get_comments_for_post(
                        post_urn,
                        max_comments=max_comments_per_post
                    )
                    for comment in comments:
                        standardized = self._extract_comment(post_urn, comment)
                        if standardized:
                            all_comments.append(standardized)
                    self.logger.info(f" - Found {len(comments)} comments")
                except Exception as e:
                    self.logger.warning(f"Error processing post {post_urn}: {e}")
                    continue
            self.logger.info(f"Completed LinkedIn scraping. Total comments: {len(all_comments)}")
            return all_comments
        except Exception as e:
            self.logger.error(f"Error scraping LinkedIn: {e}")
            raise

    def _get_all_page_posts(self, org_urn: str, count: int = 50) -> List[str]:
        """
        Retrieve all post URNs for the organization.

        Args:
            org_urn: Organization URN.
            count: Number of posts per request.

        Returns:
            List of post URNs.
        """
        posts = []
        start = 0
        while True:
            # Finder query for posts by author. Passing the URN via `params`
            # lets requests percent-encode it, which the Rest.li endpoints
            # expect for URNs in query strings.
            url = f"{self.base_url}/posts"
            params = {'q': 'author', 'author': org_urn, 'count': count, 'start': start}
            try:
                response = requests.get(url, headers=self.headers, params=params, timeout=30)
                response.raise_for_status()
                data = response.json()
                if not data.get('elements'):
                    break
                posts.extend(item['id'] for item in data['elements'])
                start += count
                self.logger.debug(f"Retrieved {len(data['elements'])} posts (total: {len(posts)})")
            except requests.exceptions.RequestException as e:
                self.logger.error(f"Error fetching posts: {e}")
                break
        return posts
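
    # A sketch of the Rest.li collection response the loop above assumes; the
    # shape is inferred from the fields this method reads (a 'paging' block is
    # typically present as well but is not used here):
    #
    #   {"elements": [{"id": "urn:li:share:7123456789"}, ...],
    #    "paging": {"start": 0, "count": 50}}
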
    def _get_comments_for_post(self, post_urn: str, max_comments: int = 100) -> List[Dict[str, Any]]:
        """
        Retrieve comments for a specific post URN.

        Args:
            post_urn: Post URN.
            max_comments: Maximum number of comments to fetch.

        Returns:
            List of raw comment objects.
        """
        comments: List[Dict[str, Any]] = []
        start = 0
        count = 100
        while len(comments) < max_comments:
            # Social Actions API for comments; the URN is percent-encoded
            # because Rest.li expects encoded URNs in the path.
            url = f"{self.base_url}/socialActions/{quote(post_urn, safe='')}/comments"
            try:
                response = requests.get(
                    url,
                    headers=self.headers,
                    params={'count': count, 'start': start},
                    timeout=30,
                )
                response.raise_for_status()
                data = response.json()
                if not data.get('elements'):
                    break
                comments.extend(data['elements'])
                start += count
            except requests.exceptions.RequestException as e:
                self.logger.warning(f"Error fetching comments for post {post_urn}: {e}")
                break
        return comments[:max_comments]
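
    # The comment objects returned above are assumed (from the fields read in
    # _extract_comment below) to carry at least:
    #
    #   {"id": "...", "actor": "urn:li:person:...",
    #    "message": {"text": "..."}, "created": {"time": 1705312345000},
    #    "socialActions": [...]}
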
    def _extract_comment(self, post_urn: str, comment: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Extract and standardize a comment from a LinkedIn API response.

        Args:
            post_urn: Post URN.
            comment: Comment object from the LinkedIn API.

        Returns:
            Standardized comment dictionary, or None if extraction fails.
        """
        try:
            # Extract comment data
            comment_id = comment.get('id', '')
            message = comment.get('message', {})
            comment_text = message.get('text', '')
            actor = comment.get('actor', '')
            # Extract author information. The actor is usually a URN string;
            # fall back to a dict in case a resolved profile is embedded.
            author_id = ''
            author_name = ''
            if isinstance(actor, str):
                author_id = actor
            elif isinstance(actor, dict):
                author_id = actor.get('id', '')
                # Join only the non-empty parts so a missing name yields ''
                # (not ' ', which would shadow author_id below).
                author_name = ' '.join(
                    part for part in (actor.get('firstName', ''), actor.get('lastName', ''))
                    if part
                )
            # Extract created time (epoch milliseconds)
            created_time = comment.get('created', {}).get('time', '')
            # Extract social actions (likes)
            social_actions = comment.get('socialActions', [])
            like_count = 0
            for action in social_actions:
                if action.get('actionType') == 'LIKE':
                    like_count = action.get('actorCount', 0)
                    break
            # Build a LinkedIn URL. Only activity URNs map to a public feed
            # URL this way; other URN types pass through unchanged.
            linkedin_url = post_urn.replace('urn:li:activity:', 'https://www.linkedin.com/feed/update/')
            comment_data = {
                'comment_id': comment_id,
                'comments': comment_text,
                'author': author_name or author_id,
                'published_at': self._parse_timestamp(created_time) if created_time else None,
                'like_count': like_count,
                'reply_count': 0,  # The LinkedIn API does not expose reply counts directly
                'post_id': post_urn,
                'media_url': linkedin_url,
                'raw_data': {
                    'post_urn': post_urn,
                    'comment_id': comment_id,
                    'comment_text': comment_text,
                    'author_id': author_id,
                    'author_name': author_name,
                    'created_time': created_time,
                    'like_count': like_count,
                    'full_comment': comment
                }
            }
            return self._standardize_comment(comment_data)
        except Exception as e:
            self.logger.error(f"Error extracting LinkedIn comment: {e}")
            return None
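

if __name__ == '__main__':
    # Minimal usage sketch; run as a module from the project root so the
    # relative import resolves. Assumes LINKEDIN_ACCESS_TOKEN holds a valid
    # token, LINKEDIN_ORGANIZATION_ID holds the full organization URN
    # (e.g. 'urn:li:organization:1234567'), and BaseScraper accepts this
    # config dict unchanged.
    import os

    logging.basicConfig(level=logging.INFO)
    scraper = LinkedInScraper({
        'access_token': os.environ['LINKEDIN_ACCESS_TOKEN'],
        'organization_id': os.environ['LINKEDIN_ORGANIZATION_ID'],
    })
    results = scraper.scrape_comments(max_posts=10, max_comments_per_post=50)
    print(f"Fetched {len(results)} comments")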