"""
Google Reviews scraper using Google My Business API.
"""

import os
import json
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path

try:
    from google.oauth2.credentials import Credentials
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request
    from googleapiclient.discovery import build
except ImportError as exc:
    # Chain the original error so the missing module name stays visible.
    raise ImportError(
        "Google API client libraries not installed. "
        "Install with: pip install google-api-python-client google-auth-oauthlib"
    ) from exc

from .base import BaseScraper


class GoogleReviewsScraper(BaseScraper):
    """
    Scraper for Google Reviews using Google My Business API.
    Extracts reviews from one or multiple locations.
    """

    # OAuth scope for managing Business Profile data
    SCOPES = ['https://www.googleapis.com/auth/business.manage']

    # The API reports `starRating` as an enum string, not a number.
    # 'STAR_RATING_UNSPECIFIED' is deliberately absent: it maps to "no rating".
    _STAR_RATINGS = {'ONE': 1, 'TWO': 2, 'THREE': 3, 'FOUR': 4, 'FIVE': 5}

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize Google Reviews scraper.

        Building the instance authenticates immediately, which may read/write
        the token file and open a local OAuth flow.

        Args:
            config: Dictionary with:
                - 'credentials_file': Path to client_secret.json (or None)
                - 'token_file': Path to token.json (default: 'token.json')
                - 'locations': List of location names to scrape (optional)
                - 'account_name': Google account name (optional, will be
                  fetched if not provided)
                - 'discovery_url': Discovery document URL handed to
                  ``build()`` as ``discoveryServiceUrl`` (optional; when
                  omitted the client library's default discovery is used)

        Raises:
            FileNotFoundError: If no usable token exists and the credentials
                file is missing.
        """
        super().__init__(config)

        self.credentials_file = config.get('credentials_file', 'client_secret.json')
        self.token_file = config.get('token_file', 'token.json')
        self.locations = config.get('locations', None)  # Specific locations to scrape
        self.account_name = config.get('account_name', None)
        self.discovery_url = config.get('discovery_url', None)

        self.logger = logging.getLogger(self.__class__.__name__)

        # Location details keyed by resource name, so each location is looked
        # up at most once per scraper instead of once per review.
        self._location_cache: Dict[str, Dict[str, Any]] = {}

        # Authenticate and build service
        self.service = self._get_authenticated_service()

    def _get_authenticated_service(self):
        """
        Get authenticated Google My Business API service.

        Loads cached credentials from ``token_file`` when present, refreshes
        them if expired, and otherwise runs the installed-app OAuth flow using
        ``credentials_file``. New or refreshed credentials are written back to
        ``token_file``.

        Returns:
            Authenticated service object

        Raises:
            FileNotFoundError: If the OAuth flow is needed but the credentials
                file is not configured or does not exist.
        """
        creds = None

        # Load existing credentials from token file
        if os.path.exists(self.token_file):
            creds = Credentials.from_authorized_user_file(self.token_file, self.SCOPES)

        # If there are no (valid) credentials available, let the user log in
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                self.logger.info("Refreshing expired credentials...")
                creds.refresh(Request())
            else:
                # `credentials_file` may legitimately be None; guard before
                # os.path.exists so the caller gets the intended error type.
                if not self.credentials_file or not os.path.exists(self.credentials_file):
                    raise FileNotFoundError(
                        f"Google Reviews requires '{self.credentials_file}' credentials file. "
                        "This scraper will be disabled. See GOOGLE_REVIEWS_INTEGRATION_GUIDE.md for setup instructions."
                    )

                self.logger.info("Starting OAuth flow...")
                flow = InstalledAppFlow.from_client_secrets_file(
                    self.credentials_file, self.SCOPES
                )
                creds = flow.run_local_server(port=0)

            # Save the credentials for the next run
            with open(self.token_file, 'w', encoding='utf-8') as token:
                token.write(creds.to_json())
            self.logger.info("Credentials saved to %s", self.token_file)

        # Build the service for the My Business v4 API. A custom discovery
        # document is only passed when the caller configured one.
        build_kwargs: Dict[str, Any] = {'credentials': creds}
        if self.discovery_url:
            build_kwargs['discoveryServiceUrl'] = self.discovery_url
        service = build('mybusiness', 'v4', **build_kwargs)

        self.logger.info("Successfully authenticated with Google My Business API")
        return service

    def _get_account_name(self) -> str:
        """
        Get the account ID from Google My Business.

        Uses the configured account name when there is one; otherwise lists
        the accounts and picks the first, remembering it on the instance.

        Returns:
            Account name (e.g., 'accounts/123456789')

        Raises:
            ValueError: If the account listing is empty.
        """
        if self.account_name:
            return self.account_name

        self.logger.info("Fetching account list...")
        accounts_resp = self.service.accounts().list().execute()

        if not accounts_resp.get('accounts'):
            raise ValueError("No Google My Business accounts found. Please ensure you have admin access.")

        account_name = accounts_resp['accounts'][0]['name']
        self.logger.info("Using account: %s", account_name)

        self.account_name = account_name
        return account_name

    @staticmethod
    def _filter_locations(
        locations: List[Dict[str, Any]],
        requested: List[str]
    ) -> List[Dict[str, Any]]:
        """
        Keep the locations whose resource name contains a requested string.

        Matching is a substring test against each location's 'name' field
        (the resource name), not its display name.

        Args:
            locations: Location dictionaries as returned by the API
            requested: Strings to look for in each location's 'name'

        Returns:
            The matching subset, in the original order
        """
        return [
            loc for loc in locations
            if any(req_loc in loc['name'] for req_loc in requested)
        ]

    def _get_locations(self, account_name: str) -> List[Dict[str, Any]]:
        """
        Get all locations for the account.

        Follows ``nextPageToken`` so accounts with more than one page of
        locations are listed completely, then applies the ``locations``
        filter from the config when one was given.

        Args:
            account_name: Google account name

        Returns:
            List of location dictionaries

        Raises:
            ValueError: If the account has no locations at all.
        """
        self.logger.info("Fetching location list...")

        locations: List[Dict[str, Any]] = []
        page_token = None
        while True:
            request_kwargs = {'parent': account_name}
            if page_token:
                request_kwargs['pageToken'] = page_token
            locations_resp = self.service.accounts().locations().list(**request_kwargs).execute()
            locations.extend(locations_resp.get('locations', []))

            page_token = locations_resp.get('nextPageToken')
            if not page_token:
                break

        if not locations:
            raise ValueError(f"No locations found under account {account_name}")

        self.logger.info("Found %s locations", len(locations))

        # Filter locations if specific locations are requested
        if self.locations:
            filtered_locations = self._filter_locations(locations, self.locations)
            self.logger.info("Filtered to %s locations", len(filtered_locations))
            return filtered_locations

        return locations

    def scrape_comments(
        self,
        location_names: Optional[List[str]] = None,
        max_reviews_per_location: int = 100,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """
        Scrape Google reviews from specified locations.

        Reviews are fetched with ``batchGetReviews`` page by page. At most
        ``max_reviews_per_location`` reviews are kept for any one location,
        and paging stops early once every requested location has reached
        that limit.

        Args:
            location_names: Optional list of location names to scrape
                (scrapes all if None); matched as substrings of the location
                resource name
            max_reviews_per_location: Maximum reviews to fetch per location;
                also sent as the request page size
            **kwargs: Accepted for interface compatibility; unused here

        Returns:
            List of standardized review dictionaries

        Raises:
            Exception: Any error from account/location lookup or the review
                requests is logged and re-raised.
        """
        all_reviews = []

        try:
            # Get account and locations
            account_name = self._get_account_name()
            locations = self._get_locations(account_name)

            # Apply location filter if provided
            if location_names:
                locations = self._filter_locations(locations, location_names)

            if not locations:
                self.logger.warning("No matching locations found for: %s", location_names)
                return []

            # The listing already carries each location's details; keep them
            # so review extraction does not have to request them again.
            for loc in locations:
                self._location_cache.setdefault(loc['name'], loc)

            # Get location resource names for batch fetching
            location_resource_names = [loc['name'] for loc in locations]

            self.logger.info("Extracting reviews for %s locations...", len(location_resource_names))

            # Kept reviews per location, used to enforce the per-location cap.
            kept_per_location = {name: 0 for name in location_resource_names}

            # Batch fetch reviews for all locations
            next_page_token = None
            page_num = 0

            while True:
                page_num += 1
                self.logger.info("Fetching page %s of reviews...", page_num)

                batch_body = {
                    "locationNames": location_resource_names,
                    "pageSize": max_reviews_per_location,
                    "ignoreRatingOnlyReviews": False
                }
                # Only send a page token once we actually have one.
                if next_page_token:
                    batch_body["pageToken"] = next_page_token

                # Official batchGetReviews call
                results = self.service.accounts().locations().batchGetReviews(
                    name=account_name,
                    body=batch_body
                ).execute()

                location_reviews = results.get('locationReviews', [])

                if not location_reviews:
                    self.logger.info("No more reviews found on page %s", page_num)
                    break

                # Process reviews
                for loc_review in location_reviews:
                    location_name = loc_review.get('name')
                    if kept_per_location.get(location_name, 0) >= max_reviews_per_location:
                        continue  # this location already has its quota

                    review_data = loc_review.get('review', {})
                    standardized = self._extract_review(location_name, review_data)
                    if standardized:
                        all_reviews.append(standardized)
                        kept_per_location[location_name] = kept_per_location.get(location_name, 0) + 1

                self.logger.info(
                    "  - Page %s: %s reviews (total: %s)",
                    page_num, len(location_reviews), len(all_reviews)
                )

                next_page_token = results.get('nextPageToken')
                if not next_page_token:
                    self.logger.info("All reviews fetched")
                    break

                if all(count >= max_reviews_per_location for count in kept_per_location.values()):
                    self.logger.info("Reached max_reviews_per_location for every location")
                    break

            self.logger.info("Completed Google Reviews scraping. Total reviews: %s", len(all_reviews))

            # Log location distribution
            location_stats: Dict[str, int] = {}
            for review in all_reviews:
                location_id = review.get('raw_data', {}).get('location_name', 'unknown')
                location_stats[location_id] = location_stats.get(location_id, 0) + 1

            self.logger.info("Reviews by location:")
            for location, count in location_stats.items():
                self.logger.info("  - %s: %s reviews", location, count)

            return all_reviews

        except Exception as e:
            self.logger.error("Error scraping Google Reviews: %s", e)
            raise

    @classmethod
    def _parse_star_rating(cls, star_rating: Any) -> Optional[int]:
        """
        Convert a ``starRating`` value into an integer.

        Accepts the API's enum strings ('ONE' .. 'FIVE') as well as plain
        numeric values, so callers that already pass numbers keep working.

        Args:
            star_rating: Raw rating value from the review object

        Returns:
            The rating as an int, or None when it is missing, unspecified or
            not recognisable.
        """
        if not star_rating:
            return None

        if isinstance(star_rating, str):
            key = star_rating.strip().upper()
            if key in cls._STAR_RATINGS:
                return cls._STAR_RATINGS[key]
            star_rating = key  # may still be a numeric string such as "4"

        try:
            return int(star_rating)
        except (TypeError, ValueError):
            return None

    def _get_location_info(self, location_name: str) -> Dict[str, Any]:
        """
        Return the details of a location, fetching them at most once.

        The lookup is best-effort: if the request fails, a warning is logged
        and an empty dict is cached so the same failing call is not repeated
        for every review of that location.

        Args:
            location_name: Location resource name

        Returns:
            Location dictionary, or an empty dict if it could not be fetched
        """
        if location_name in self._location_cache:
            return self._location_cache[location_name]

        try:
            location_info = self.service.accounts().locations().get(
                name=location_name
            ).execute()
        except Exception as exc:
            # Deliberately broad: location details only enrich the review, so
            # any client or transport failure should not lose the review.
            self.logger.warning(
                "Could not fetch details for location %s: %s", location_name, exc
            )
            location_info = {}

        self._location_cache[location_name] = location_info
        return location_info

    def _extract_review(
        self,
        location_name: str,
        review_data: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """
        Extract and standardize a review from Google My Business API response.

        Args:
            location_name: Location resource name
            review_data: Review object from Google API

        Returns:
            Standardized review dictionary, or None if the review has no
            location name or could not be processed (the error is logged).
        """
        try:
            if location_name is None:
                self.logger.error("Error extracting Google review: missing location name")
                return None

            # Extract review data
            review_id = review_data.get('name', '')
            reviewer_info = review_data.get('reviewer', {})
            comment = review_data.get('comment', '')
            star_rating = review_data.get('starRating')
            create_time = review_data.get('createTime')
            update_time = review_data.get('updateTime')

            # Extract reviewer information
            reviewer_name = reviewer_info.get('displayName', 'Anonymous')
            reviewer_id = reviewer_info.get('name', '')

            # Extract review reply
            reply_data = review_data.get('reviewReply', {})
            reply_comment = reply_data.get('comment', '')
            reply_time = reply_data.get('updateTime', '')

            # Extract location details if available (empty dict on failure,
            # which makes every derived field fall back to '').
            location_info = self._get_location_info(location_name)
            location_address = location_info.get('address', {})
            location_name_display = location_info.get('locationName', '')
            location_city = location_address.get('locality', '')
            location_country = location_address.get('countryCode', '')

            # Build Google Maps URL for the review
            # Extract location ID from resource name (e.g., 'accounts/123/locations/456')
            # NOTE(review): this ID is passed as `placeid`; confirm that it is
            # a valid Places ID for this URL.
            location_id = location_name.split('/')[-1]
            google_maps_url = f"https://search.google.com/local/writereview?placeid={location_id}"

            review_dict = {
                'comment_id': review_id,
                'comments': comment,
                'author': reviewer_name,
                'published_at': self._parse_timestamp(create_time) if create_time else None,
                'like_count': 0,  # Google reviews don't have like counts
                'reply_count': 1 if reply_comment else 0,
                'post_id': location_name,  # Store location name as post_id
                'media_url': google_maps_url,
                'raw_data': {
                    'location_name': location_name,
                    'location_id': location_id,
                    'location_display_name': location_name_display,
                    'location_city': location_city,
                    'location_country': location_country,
                    'location_info': location_info,
                    'review_id': review_id,
                    'reviewer_id': reviewer_id,
                    'reviewer_name': reviewer_name,
                    'star_rating': star_rating,
                    'comment': comment,
                    'create_time': create_time,
                    'update_time': update_time,
                    'reply_comment': reply_comment,
                    'reply_time': reply_time,
                    'full_review': review_data
                }
            }

            # Add rating field for Google Reviews (1-5 stars)
            rating = self._parse_star_rating(star_rating)
            if rating is not None:
                review_dict['rating'] = rating

            return self._standardize_comment(review_dict)

        except Exception as e:
            self.logger.exception("Error extracting Google review: %s", e)
            return None