# HH/apps/social/scrapers/google_reviews.py
# Last modified: 2026-01-12 12:27:29 +03:00 (346 lines, 14 KiB, Python)
"""
Google Reviews scraper using Google My Business API.
"""
import os
import json
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path
try:
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
except ImportError:
raise ImportError(
"Google API client libraries not installed. "
"Install with: pip install google-api-python-client google-auth-oauthlib"
)
from .base import BaseScraper
class GoogleReviewsScraper(BaseScraper):
    """
    Scraper for Google Reviews using the Google My Business API (v4).

    Authenticates with OAuth2 (token cached on disk), resolves the Business
    Profile account and its locations, batch-fetches reviews for the selected
    locations, and converts each review into the project's standardized
    comment dictionary.
    """

    # OAuth scope required to manage Business Profile data.
    SCOPES = ['https://www.googleapis.com/auth/business.manage']

    # The v4 API reports starRating as an enum string ("ONE".."FIVE"), not a
    # number; map it to the 1-5 integer our standardized schema expects.
    # (int("FIVE") raises, which previously caused every rated review to be
    # silently dropped by the broad except in _extract_review.)
    STAR_RATING_MAP = {'ONE': 1, 'TWO': 2, 'THREE': 3, 'FOUR': 4, 'FIVE': 5}

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize Google Reviews scraper.

        Args:
            config: Dictionary with:
                - 'credentials_file': Path to client_secret.json (default 'client_secret.json')
                - 'token_file': Path to token.json (default: 'token.json')
                - 'locations': List of location names to scrape (optional)
                - 'account_name': Google account name (optional, will be fetched if not provided)

        Raises:
            FileNotFoundError: If no cached token exists and the credentials
                file is missing (raised by _get_authenticated_service).
        """
        super().__init__(config)
        self.credentials_file = config.get('credentials_file', 'client_secret.json')
        self.token_file = config.get('token_file', 'token.json')
        self.locations = config.get('locations', None)  # Specific locations to scrape
        self.account_name = config.get('account_name', None)
        self.logger = logging.getLogger(self.__class__.__name__)
        # Cache of location resource name -> location details, so that
        # _extract_review does not issue one locations().get() call per
        # review for data that only varies per location.
        self._location_info_cache: Dict[str, Dict[str, Any]] = {}
        # Authenticate and build service up front so configuration problems
        # surface at construction time rather than mid-scrape.
        self.service = self._get_authenticated_service()

    def _get_authenticated_service(self):
        """
        Get an authenticated Google My Business API service.

        Loads cached credentials from ``token_file`` when present, refreshes
        them if expired, and otherwise runs the installed-app OAuth flow.

        Returns:
            Authenticated googleapiclient service object.

        Raises:
            FileNotFoundError: If no token exists and ``credentials_file``
                is missing, so the OAuth flow cannot be started.
        """
        creds = None
        # Load existing credentials from the token file, if we have one.
        if os.path.exists(self.token_file):
            creds = Credentials.from_authorized_user_file(self.token_file, self.SCOPES)
        # If there are no (valid) credentials available, refresh or log in.
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                self.logger.info("Refreshing expired credentials...")
                creds.refresh(Request())
            else:
                # Without a client-secrets file we cannot run the OAuth flow.
                if not os.path.exists(self.credentials_file):
                    raise FileNotFoundError(
                        f"Google Reviews requires '{self.credentials_file}' credentials file. "
                        "This scraper will be disabled. See GOOGLE_REVIEWS_INTEGRATION_GUIDE.md for setup instructions."
                    )
                self.logger.info("Starting OAuth flow...")
                flow = InstalledAppFlow.from_client_secrets_file(
                    self.credentials_file,
                    self.SCOPES
                )
                creds = flow.run_local_server(port=0)
            # Persist the (new or refreshed) credentials for the next run.
            with open(self.token_file, 'w') as token:
                token.write(creds.to_json())
            self.logger.info(f"Credentials saved to {self.token_file}")
        # 'mybusiness' v4 is the discovery document that exposes reviews.
        service = build('mybusiness', 'v4', credentials=creds)
        self.logger.info("Successfully authenticated with Google My Business API")
        return service

    def _get_account_name(self) -> str:
        """
        Get the account resource name from Google My Business.

        Uses the configured 'account_name' when provided; otherwise fetches
        the account list, uses the first account, and caches it.

        Returns:
            Account name (e.g., 'accounts/123456789').

        Raises:
            ValueError: If the authenticated user has no accounts.
        """
        if self.account_name:
            return self.account_name
        self.logger.info("Fetching account list...")
        accounts_resp = self.service.accounts().list().execute()
        if not accounts_resp.get('accounts'):
            raise ValueError("No Google My Business accounts found. Please ensure you have admin access.")
        account_name = accounts_resp['accounts'][0]['name']
        self.logger.info(f"Using account: {account_name}")
        self.account_name = account_name  # cache for subsequent calls
        return account_name

    def _get_locations(self, account_name: str) -> List[Dict[str, Any]]:
        """
        Get all locations for the account, filtered by config if requested.

        Args:
            account_name: Google account resource name ('accounts/...').

        Returns:
            List of location dictionaries from the API.

        Raises:
            ValueError: If the account has no locations at all.
        """
        self.logger.info("Fetching location list...")
        locations_resp = self.service.accounts().locations().list(parent=account_name).execute()
        locations = locations_resp.get('locations', [])
        if not locations:
            raise ValueError(f"No locations found under account {account_name}")
        self.logger.info(f"Found {len(locations)} locations")
        # Filter locations if specific locations are requested. Substring
        # match so callers can pass either full resource names or fragments.
        if self.locations:
            filtered_locations = [
                loc for loc in locations
                if any(req_loc in loc['name'] for req_loc in self.locations)
            ]
            self.logger.info(f"Filtered to {len(filtered_locations)} locations")
            return filtered_locations
        return locations

    def _get_location_info(self, location_name: str) -> Dict[str, Any]:
        """
        Fetch location details, memoized per location resource name.

        Args:
            location_name: Location resource name ('accounts/.../locations/...').

        Returns:
            Location dictionary from the API, or {} if the lookup failed
            (review extraction degrades gracefully without it).
        """
        if location_name not in self._location_info_cache:
            try:
                info = self.service.accounts().locations().get(
                    name=location_name
                ).execute()
            except Exception as e:
                # Best-effort enrichment only: log and continue with no info.
                self.logger.warning(f"Could not fetch location info for {location_name}: {e}")
                info = {}
            self._location_info_cache[location_name] = info
        return self._location_info_cache[location_name]

    def scrape_comments(
        self,
        location_names: Optional[List[str]] = None,
        max_reviews_per_location: int = 100,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """
        Scrape Google reviews from specified locations.

        Args:
            location_names: Optional list of location names to scrape
                (scrapes all if None); matched by substring.
            max_reviews_per_location: Page size for batchGetReviews requests.

        Returns:
            List of standardized review dictionaries.

        Raises:
            Exception: Re-raises any API error after logging it.
        """
        all_reviews: List[Dict[str, Any]] = []
        try:
            # Resolve account and candidate locations.
            account_name = self._get_account_name()
            locations = self._get_locations(account_name)
            # Apply the per-call location filter if provided.
            if location_names:
                locations = [
                    loc for loc in locations
                    if any(req_loc in loc['name'] for req_loc in location_names)
                ]
                if not locations:
                    self.logger.warning(f"No matching locations found for: {location_names}")
                    return []
            # Resource names for the batch fetch.
            location_resource_names = [loc['name'] for loc in locations]
            self.logger.info(f"Extracting reviews for {len(location_resource_names)} locations...")
            # Batch fetch reviews for all locations, page by page.
            next_page_token = None
            page_num = 0
            while True:
                page_num += 1
                self.logger.info(f"Fetching page {page_num} of reviews...")
                batch_body = {
                    "locationNames": location_resource_names,
                    "pageSize": max_reviews_per_location,
                    "pageToken": next_page_token,
                    "ignoreRatingOnlyReviews": False
                }
                # Official batchGetReviews call.
                results = self.service.accounts().locations().batchGetReviews(
                    name=account_name,
                    body=batch_body
                ).execute()
                location_reviews = results.get('locationReviews', [])
                if not location_reviews:
                    self.logger.info(f"No more reviews found on page {page_num}")
                    break
                # Standardize each review; failures yield None and are skipped.
                for loc_review in location_reviews:
                    review_data = loc_review.get('review', {})
                    location_name = loc_review.get('name')
                    standardized = self._extract_review(location_name, review_data)
                    if standardized:
                        all_reviews.append(standardized)
                self.logger.info(f" - Page {page_num}: {len(location_reviews)} reviews (total: {len(all_reviews)})")
                next_page_token = results.get('nextPageToken')
                if not next_page_token:
                    self.logger.info("All reviews fetched")
                    break
            self.logger.info(f"Completed Google Reviews scraping. Total reviews: {len(all_reviews)}")
            # Log how many reviews each location contributed.
            location_stats: Dict[str, int] = {}
            for review in all_reviews:
                location_id = review.get('raw_data', {}).get('location_name', 'unknown')
                location_stats[location_id] = location_stats.get(location_id, 0) + 1
            self.logger.info("Reviews by location:")
            for location, count in location_stats.items():
                self.logger.info(f" - {location}: {count} reviews")
            return all_reviews
        except Exception as e:
            self.logger.error(f"Error scraping Google Reviews: {e}")
            raise

    def _extract_review(
        self,
        location_name: str,
        review_data: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """
        Extract and standardize a review from a Google My Business response.

        Args:
            location_name: Location resource name the review belongs to.
            review_data: Review object from the batchGetReviews response.

        Returns:
            Standardized review dictionary, or None if extraction failed.
        """
        try:
            # Core review fields.
            review_id = review_data.get('name', '')
            reviewer_info = review_data.get('reviewer', {})
            comment = review_data.get('comment', '')
            star_rating = review_data.get('starRating')
            create_time = review_data.get('createTime')
            update_time = review_data.get('updateTime')
            # Reviewer information.
            reviewer_name = reviewer_info.get('displayName', 'Anonymous')
            reviewer_id = reviewer_info.get('name', '')
            # Owner reply, if any.
            reply_data = review_data.get('reviewReply', {})
            reply_comment = reply_data.get('comment', '')
            reply_time = reply_data.get('updateTime', '')
            # Location details, cached per location (previously fetched once
            # per review, an N+1 call pattern).
            location_info = self._get_location_info(location_name)
            location_address = location_info.get('address', {})
            location_name_display = location_info.get('locationName', '')
            location_city = location_address.get('locality', '')
            location_country = location_address.get('countryCode', '')
            # Build a Google Maps review URL. The trailing segment of the
            # resource name ('accounts/123/locations/456') is a location ID,
            # NOT a Place ID, so prefer locationKey.placeId from the location
            # details when available; the old location-ID URL is kept only as
            # a backward-compatible fallback — TODO confirm against a live
            # location.
            location_id = location_name.split('/')[-1]
            place_id = location_info.get('locationKey', {}).get('placeId') or location_id
            google_maps_url = f"https://search.google.com/local/writereview?placeid={place_id}"
            review_dict = {
                'comment_id': review_id,
                'comments': comment,
                'author': reviewer_name,
                'published_at': self._parse_timestamp(create_time) if create_time else None,
                'like_count': 0,  # Google reviews don't have like counts
                'reply_count': 1 if reply_comment else 0,
                'post_id': location_name,  # Store location name as post_id
                'media_url': google_maps_url,
                'raw_data': {
                    'location_name': location_name,
                    'location_id': location_id,
                    'location_display_name': location_name_display,
                    'location_city': location_city,
                    'location_country': location_country,
                    'location_info': location_info,
                    'review_id': review_id,
                    'reviewer_id': reviewer_id,
                    'reviewer_name': reviewer_name,
                    'star_rating': star_rating,
                    'comment': comment,
                    'create_time': create_time,
                    'update_time': update_time,
                    'reply_comment': reply_comment,
                    'reply_time': reply_time,
                    'full_review': review_data
                }
            }
            # Add the 1-5 rating. starRating is an enum string ("ONE".."FIVE")
            # in the v4 API; int() on it raised ValueError and silently
            # dropped every rated review. Numeric ratings are tolerated too.
            if star_rating:
                rating = self.STAR_RATING_MAP.get(star_rating)
                if rating is None:
                    try:
                        rating = int(star_rating)
                    except (TypeError, ValueError):
                        rating = None
                if rating is not None:
                    review_dict['rating'] = rating
            return self._standardize_comment(review_dict)
        except Exception as e:
            self.logger.error(f"Error extracting Google review: {e}")
            return None