HH/analyze_complaint_sources.py
2026-04-19 10:53:12 +03:00

80 lines
2.6 KiB
Python

#!/usr/bin/env python3
"""
Analyze complaint source values from the 'جهة الشكوى' column across all years.
"""
import pandas as pd
import os
from collections import defaultdict
years = [2022, 2023, 2024, 2025]

# Name of the column whose values are tallied in every sheet.
SOURCE_COLUMN = "جهة الشكوى"

# Per-year tallies: year -> {source label -> row count}.
year_data = {year: defaultdict(int) for year in years}
# Cross-year tallies: source label -> running total, plus the years and the
# "year/sheet" locations in which the label was seen.
all_data = defaultdict(lambda: {"total_count": 0, "years": set(), "sheets": set()})


def count_complaint_sources(df):
    """Count the rows of *df* per complaint-source label.

    Args:
        df: A DataFrame read from one worksheet.

    Returns:
        A dict mapping each label (the cell value converted with ``str``) to
        its row count, in ``value_counts`` order. Missing cells are grouped
        under ``"(NULL/Empty)"``. Returns ``None`` when the sheet has no
        ``SOURCE_COLUMN`` column at all.
    """
    if SOURCE_COLUMN not in df.columns:
        return None

    labelled = defaultdict(int)
    for value, count in df[SOURCE_COLUMN].value_counts(dropna=False).items():
        label = str(value) if pd.notna(value) else "(NULL/Empty)"
        # Accumulate rather than assign: distinct raw values can share a
        # label once stringified (e.g. 1 and "1").
        labelled[label] += int(count)
    return dict(labelled)


for year in years:
    file_path = f"data/Complaints Report - {year}.xlsx"
    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        continue

    print(f"📊 Processing {year}: {file_path}")
    try:
        # Open the workbook once and close it deterministically; every sheet
        # is then read from the same handle instead of re-opening the file.
        with pd.ExcelFile(file_path) as xls:
            print(f" Sheets: {xls.sheet_names}")
            for sheet_name in xls.sheet_names:
                # Best effort per sheet: one unreadable sheet must not stop
                # the remaining sheets from being processed.
                try:
                    df = pd.read_excel(xls, sheet_name=sheet_name)
                    print(f" Sheet '{sheet_name}': {len(df)} rows, columns: {list(df.columns)}")
                    sheet_counts = count_complaint_sources(df)
                    if sheet_counts is None:
                        print(f" ⚠️ No '{SOURCE_COLUMN}' column")
                        continue
                    for label, count in sheet_counts.items():
                        year_data[year][label] += count
                        entry = all_data[label]
                        entry["total_count"] += count
                        entry["years"].add(year)
                        entry["sheets"].add(f"{year}/{sheet_name}")
                except Exception as e:
                    print(f" ❌ Error: {e}")
    except Exception as e:
        # Best effort per workbook: report the failure and move on to the
        # next year.
        print(f"❌ Error: {e}")
    print()
# ---- Report section 1: one ranked list of complaint sources per year ----
section_rule = "=" * 100
print(section_rule)
print("PER-YEAR BREAKDOWN")
print(section_rule)

for report_year in years:
    source_counts = year_data[report_year]
    unique_total = len(source_counts)
    complaint_total = sum(source_counts.values())
    print(
        f"\n--- Year {report_year} ({unique_total} unique sources, "
        f"{complaint_total} total complaints) ---"
    )
    # Largest count first; equal counts keep their insertion order.
    ranked = sorted(source_counts.items(), key=lambda pair: pair[1], reverse=True)
    for source_label, n_rows in ranked:
        print(f" {n_rows:>6} | {source_label}")

# Blank spacing before the next report section.
print("\n")
# ---- Report section 2: every distinct value, ranked by its overall total ----
heavy_rule = "=" * 100
row_layout = "{:<8} {:<50} {:<30}"  # total | value | years it appears in

print(heavy_rule)
print("CONSOLIDATED - ALL UNIQUE VALUES ACROSS ALL YEARS")
print(heavy_rule)
print(row_layout.format("Total", "Value", "Appears In"))
print("-" * 100)

# Highest total first; equal totals keep their insertion order.
sorted_all = sorted(
    all_data.items(),
    key=lambda entry: entry[1]["total_count"],
    reverse=True,
)
for label, stats in sorted_all:
    seen_in = ", ".join(sorted(map(str, stats["years"])))
    print(row_layout.format(stats["total_count"], label, seen_in))

print("\n" + heavy_rule)
print(f"SUMMARY: {len(sorted_all)} unique values across all years")
print(heavy_rule)