346 lines
14 KiB
Python
346 lines
14 KiB
Python
"""
|
|
Google Reviews scraper using Google My Business API.
|
|
"""
|
|
import os
|
|
import json
|
|
import logging
|
|
from typing import List, Dict, Any, Optional
|
|
from pathlib import Path
|
|
|
|
try:
|
|
from google.oauth2.credentials import Credentials
|
|
from google_auth_oauthlib.flow import InstalledAppFlow
|
|
from google.auth.transport.requests import Request
|
|
from googleapiclient.discovery import build
|
|
except ImportError:
|
|
raise ImportError(
|
|
"Google API client libraries not installed. "
|
|
"Install with: pip install google-api-python-client google-auth-oauthlib"
|
|
)
|
|
|
|
from .base import BaseScraper
|
|
|
|
|
|
class GoogleReviewsScraper(BaseScraper):
    """
    Scraper for Google Reviews using Google My Business API.

    Authenticates with OAuth (installed-app flow), resolves the account and
    its locations, then pulls reviews for those locations through
    ``accounts.locations.batchGetReviews`` and converts each one into the
    standardized comment dictionary produced by ``_standardize_comment``.
    """

    # OAuth scope for managing Business Profile data
    SCOPES = ['https://www.googleapis.com/auth/business.manage']

    # The v4 API documents ``starRating`` as an enum name rather than a number,
    # so the names must be translated before they can be used as 1-5 ratings.
    _STAR_RATING_VALUES = {'ONE': 1, 'TWO': 2, 'THREE': 3, 'FOUR': 4, 'FIVE': 5}

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize Google Reviews scraper.

        Args:
            config: Dictionary with:
                - 'credentials_file': Path to client_secret.json (or None)
                - 'token_file': Path to token.json (default: 'token.json')
                - 'locations': List of location names to scrape (optional)
                - 'account_name': Google account name (optional, will be
                  fetched if not provided)

        Raises:
            FileNotFoundError: If an OAuth login is required and the
                credentials file is missing or not configured.
        """
        super().__init__(config)

        self.credentials_file = config.get('credentials_file', 'client_secret.json')
        self.token_file = config.get('token_file', 'token.json')
        self.locations = config.get('locations', None)  # Specific locations to scrape
        self.account_name = config.get('account_name', None)

        self.logger = logging.getLogger(self.__class__.__name__)

        # Location details keyed by resource name, so each location is looked
        # up at most once instead of once per review.
        self._location_info_cache: Dict[str, Dict[str, Any]] = {}

        # Authenticate and build service
        self.service = self._get_authenticated_service()

    def _get_authenticated_service(self):
        """
        Get authenticated Google My Business API service.

        Reuses the token file when it holds valid credentials, refreshes
        expired credentials when a refresh token is available, and otherwise
        runs the local-server OAuth flow. Newly obtained or refreshed
        credentials are written back to the token file.

        Returns:
            Authenticated service object

        Raises:
            FileNotFoundError: If the OAuth flow is needed but the credentials
                file is unset or does not exist.
        """
        creds = None

        # Load existing credentials from token file
        if os.path.exists(self.token_file):
            creds = Credentials.from_authorized_user_file(self.token_file, self.SCOPES)

        # If there are no (valid) credentials available, let the user log in
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                self.logger.info("Refreshing expired credentials...")
                creds.refresh(Request())
            else:
                # A None/empty path is reported the same way as a missing file
                # instead of crashing inside os.path.exists().
                if not self.credentials_file or not os.path.exists(self.credentials_file):
                    raise FileNotFoundError(
                        f"Google Reviews requires '{self.credentials_file}' credentials file. "
                        "This scraper will be disabled. See GOOGLE_REVIEWS_INTEGRATION_GUIDE.md for setup instructions."
                    )

                self.logger.info("Starting OAuth flow...")
                flow = InstalledAppFlow.from_client_secrets_file(
                    self.credentials_file,
                    self.SCOPES
                )
                creds = flow.run_local_server(port=0)

            # Save the credentials for the next run
            with open(self.token_file, 'w', encoding='utf-8') as token:
                token.write(creds.to_json())

            self.logger.info(f"Credentials saved to {self.token_file}")

        # Build the service using the My Business v4 discovery document
        # NOTE(review): depending on the installed google-api-python-client,
        # 'mybusiness' v4 may need an explicit discovery document - confirm.
        service = build('mybusiness', 'v4', credentials=creds)
        self.logger.info("Successfully authenticated with Google My Business API")

        return service

    def _get_account_name(self) -> str:
        """
        Get the account ID from Google My Business.

        Uses the configured/cached account name when present; otherwise lists
        the accounts and selects the first one, caching it on the instance.

        Returns:
            Account name (e.g., 'accounts/123456789')

        Raises:
            ValueError: If the accounts listing contains no accounts.
        """
        if self.account_name:
            return self.account_name

        self.logger.info("Fetching account list...")
        accounts_resp = self.service.accounts().list().execute()

        accounts = accounts_resp.get('accounts')
        if not accounts:
            raise ValueError("No Google My Business accounts found. Please ensure you have admin access.")

        account_name = accounts[0]['name']
        self.logger.info(f"Using account: {account_name}")
        self.account_name = account_name

        return account_name

    @staticmethod
    def _filter_locations(
        locations: List[Dict[str, Any]],
        requested: List[str]
    ) -> List[Dict[str, Any]]:
        """Keep locations whose resource name contains any requested string."""
        return [
            loc for loc in locations
            if any(req_loc in loc.get('name', '') for req_loc in requested)
        ]

    def _get_locations(self, account_name: str) -> List[Dict[str, Any]]:
        """
        Get all locations for the account.

        Follows ``nextPageToken`` so that accounts with more locations than
        fit on one page are fully listed, then applies the configured
        ``locations`` filter (substring match on the resource name).

        Args:
            account_name: Google account name

        Returns:
            List of location dictionaries (possibly empty after filtering)

        Raises:
            ValueError: If the account has no locations at all.
        """
        self.logger.info("Fetching location list...")

        locations: List[Dict[str, Any]] = []
        page_token = None
        while True:
            request_kwargs = {'parent': account_name}
            if page_token:
                request_kwargs['pageToken'] = page_token

            locations_resp = self.service.accounts().locations().list(**request_kwargs).execute()
            locations.extend(locations_resp.get('locations', []))

            page_token = locations_resp.get('nextPageToken')
            if not page_token:
                break

        if not locations:
            raise ValueError(f"No locations found under account {account_name}")

        self.logger.info(f"Found {len(locations)} locations")

        # Filter locations if specific locations are requested
        if self.locations:
            filtered_locations = self._filter_locations(locations, self.locations)
            self.logger.info(f"Filtered to {len(filtered_locations)} locations")
            return filtered_locations

        return locations

    def scrape_comments(
        self,
        location_names: Optional[List[str]] = None,
        max_reviews_per_location: int = 100,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """
        Scrape Google reviews from specified locations.

        Args:
            location_names: Optional list of location names to scrape (scrapes
                all if None). Matched as substrings of the location resource
                name, on top of any filter given in the config.
            max_reviews_per_location: Sent to the API as ``pageSize``.
                NOTE(review): pagination continues until the API reports no
                further page, so this does not cap the total number of reviews
                returned - confirm the intended semantics.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            List of standardized review dictionaries (empty when no location
            matches the requested filters).

        Raises:
            Exception: Any error from the API calls is logged and re-raised.
        """
        all_reviews = []

        try:
            # Get account and locations
            account_name = self._get_account_name()
            locations = self._get_locations(account_name)

            # Apply location filter if provided
            if location_names:
                locations = self._filter_locations(locations, location_names)
                if not locations:
                    self.logger.warning(f"No matching locations found for: {location_names}")
                    return []

            # The config-level filter can also leave nothing to fetch; do not
            # send a batch request with an empty location list.
            if not locations:
                self.logger.warning(f"No matching locations found for: {self.locations}")
                return []

            # Get location resource names for batch fetching
            location_resource_names = [loc['name'] for loc in locations]

            self.logger.info(f"Extracting reviews for {len(location_resource_names)} locations...")

            # Batch fetch reviews for all locations
            next_page_token = None
            page_num = 0

            while True:
                page_num += 1
                self.logger.info(f"Fetching page {page_num} of reviews...")

                batch_body = {
                    "locationNames": location_resource_names,
                    "pageSize": max_reviews_per_location,
                    "ignoreRatingOnlyReviews": False
                }
                # Only send a page token once the API has handed one out.
                if next_page_token:
                    batch_body["pageToken"] = next_page_token

                # Official batchGetReviews call
                results = self.service.accounts().locations().batchGetReviews(
                    name=account_name,
                    body=batch_body
                ).execute()

                location_reviews = results.get('locationReviews', [])

                if not location_reviews:
                    self.logger.info(f"No more reviews found on page {page_num}")
                    break

                # Process reviews
                for loc_review in location_reviews:
                    review_data = loc_review.get('review') or {}
                    location_name = loc_review.get('name')

                    standardized = self._extract_review(location_name, review_data)
                    if standardized:
                        all_reviews.append(standardized)

                self.logger.info(f"  - Page {page_num}: {len(location_reviews)} reviews (total: {len(all_reviews)})")

                next_page_token = results.get('nextPageToken')
                if not next_page_token:
                    self.logger.info("All reviews fetched")
                    break

            self.logger.info(f"Completed Google Reviews scraping. Total reviews: {len(all_reviews)}")

            # Log location distribution
            location_stats: Dict[Any, int] = {}
            for review in all_reviews:
                location_id = review.get('raw_data', {}).get('location_name', 'unknown')
                location_stats[location_id] = location_stats.get(location_id, 0) + 1

            self.logger.info("Reviews by location:")
            for location, count in location_stats.items():
                self.logger.info(f"  - {location}: {count} reviews")

            return all_reviews

        except Exception as e:
            self.logger.error(f"Error scraping Google Reviews: {e}")
            raise

    def _get_location_info(self, location_name: Optional[str]) -> Dict[str, Any]:
        """
        Fetch location details for a resource name, caching per location.

        Lookup failures are logged and cached as an empty dict so that a
        failing location is not requested again for every one of its reviews.
        """
        if not location_name:
            return {}

        if location_name not in self._location_info_cache:
            try:
                location_info = self.service.accounts().locations().get(
                    name=location_name
                ).execute() or {}
            except Exception as e:
                # Best-effort enrichment: the review is still usable without it.
                self.logger.warning(f"Could not fetch location details for {location_name}: {e}")
                location_info = {}
            self._location_info_cache[location_name] = location_info

        return self._location_info_cache[location_name]

    @classmethod
    def _parse_star_rating(cls, star_rating: Any) -> Optional[int]:
        """
        Convert a star rating to an integer, or None when unavailable.

        Accepts the API's enum names ('ONE'..'FIVE') as well as plain numeric
        values; anything else (e.g. 'STAR_RATING_UNSPECIFIED') yields None.
        """
        if star_rating is None:
            return None

        if isinstance(star_rating, str):
            enum_value = cls._STAR_RATING_VALUES.get(star_rating.strip().upper())
            if enum_value is not None:
                return enum_value

        try:
            numeric = int(star_rating)
        except (TypeError, ValueError):
            return None

        # A zero rating is treated as "no rating", as a falsy value was before.
        return numeric or None

    def _extract_review(
        self,
        location_name: str,
        review_data: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """
        Extract and standardize a review from Google My Business API response.

        Args:
            location_name: Location resource name
            review_data: Review object from Google API

        Returns:
            Standardized review dictionary, or None if extraction failed (the
            error is logged rather than raised so one bad review does not
            abort the whole scrape).
        """
        try:
            # Extract review data
            review_id = review_data.get('name', '')
            reviewer_info = review_data.get('reviewer') or {}
            comment = review_data.get('comment', '')
            star_rating = review_data.get('starRating')
            create_time = review_data.get('createTime')
            update_time = review_data.get('updateTime')

            # Extract reviewer information
            reviewer_name = reviewer_info.get('displayName', 'Anonymous')
            reviewer_id = reviewer_info.get('name', '')

            # Extract review reply
            reply_data = review_data.get('reviewReply') or {}
            reply_comment = reply_data.get('comment', '')
            reply_time = reply_data.get('updateTime', '')

            # Extract location details if available (cached per location)
            location_info = self._get_location_info(location_name)
            location_address = location_info.get('address') or {}
            location_name_display = location_info.get('locationName', '')
            location_city = location_address.get('locality', '')
            # PostalAddress carries the country in 'regionCode'; 'countryCode'
            # is still honoured first for backward compatibility.
            location_country = (
                location_address.get('countryCode')
                or location_address.get('regionCode', '')
            )

            # Build Google Maps URL for the review
            # Extract location ID from resource name (e.g., 'accounts/123/locations/456')
            location_id = location_name.split('/')[-1] if location_name else ''
            # NOTE(review): this passes the location ID as 'placeid'; verify
            # that it is the identifier this URL expects.
            google_maps_url = f"https://search.google.com/local/writereview?placeid={location_id}"

            review_dict = {
                'comment_id': review_id,
                'comments': comment,
                'author': reviewer_name,
                'published_at': self._parse_timestamp(create_time) if create_time else None,
                'like_count': 0,  # Google reviews don't have like counts
                'reply_count': 1 if reply_comment else 0,
                'post_id': location_name,  # Store location name as post_id
                'media_url': google_maps_url,
                'raw_data': {
                    'location_name': location_name,
                    'location_id': location_id,
                    'location_display_name': location_name_display,
                    'location_city': location_city,
                    'location_country': location_country,
                    'location_info': location_info,
                    'review_id': review_id,
                    'reviewer_id': reviewer_id,
                    'reviewer_name': reviewer_name,
                    'star_rating': star_rating,
                    'comment': comment,
                    'create_time': create_time,
                    'update_time': update_time,
                    'reply_comment': reply_comment,
                    'reply_time': reply_time,
                    'full_review': review_data
                }
            }

            # Add rating field for Google Reviews (1-5 stars)
            rating = self._parse_star_rating(star_rating)
            if rating is not None:
                review_dict['rating'] = rating

            return self._standardize_comment(review_dict)

        except Exception as e:
            self.logger.error(f"Error extracting Google review: {e}")
            return None