# HH/apps/complaints/tasks_enhanced.py
#
# 371 lines
# 14 KiB
# Python

"""
Enhanced staff matching with fuzzy matching and improved accuracy.
This module provides improved staff matching functions with:
- Fuzzy string matching (difflib SequenceMatcher similarity ratio)
- Better handling of name variations
- Matching against original full name field
- Improved confidence scoring
"""
import logging
from typing import Optional, Dict, Any, Tuple, List
from django.db.models import Q
logger = logging.getLogger(__name__)
def fuzzy_match_ratio(str1: str, str2: str) -> float:
    """
    Calculate a case-insensitive similarity ratio between two strings.

    Both inputs are lowercased and compared with
    ``difflib.SequenceMatcher``.

    Args:
        str1: First string
        str2: Second string

    Returns:
        Float from 0.0 to 1.0 representing similarity. Returns 0.0 when
        either argument has no ``lower()`` method (for example ``None``).
    """
    from difflib import SequenceMatcher

    # Only the lowercasing step is expected to fail (non-string input), so
    # catch exactly that instead of hiding every error behind a blanket
    # ``except Exception``.
    try:
        left = str1.lower()
        right = str2.lower()
    except AttributeError:
        return 0.0

    return SequenceMatcher(None, left, right).ratio()
def normalize_name(name: str) -> str:
    """
    Normalize a name for better matching.

    - Convert to lowercase
    - Remove leading titles (English and Arabic)
    - Remove hyphens (Al-Shammari -> alshammari)
    - Collapse runs of whitespace into single spaces

    Args:
        name: Raw name; may be empty or ``None``.

    Returns:
        The normalized name, or "" when nothing remains.
    """
    if not name:
        return ""

    # Titles written with a trailing period are unambiguous, so they are
    # stripped even when glued to the name ("dr.ahmed").
    dotted_titles = ('mrs.', 'dr.', 'mr.', 'ms.', 'د.')
    # Titles without a period are only removed when they form a whole word;
    # a plain prefix test would turn "Drake" into "ake" and "Mrs" into "s".
    word_titles = {'dr', 'mr', 'mrs', 'ms', 'السيد', 'السيدة', 'الدكتور'}

    words = name.lower().split()
    while words:
        first = words[0]
        if first in word_titles:
            del words[0]
            continue
        prefix = next((t for t in dotted_titles if first.startswith(t)), None)
        if prefix is None:
            break
        remainder = first[len(prefix):]
        if remainder:
            # Each pass shortens the first word, so the loop terminates.
            words[0] = remainder
        else:
            del words[0]

    # Remove hyphens for better matching (Al-Shammari -> alshammari)
    without_hyphens = ' '.join(words).replace('-', '')

    # A standalone hyphen leaves a gap behind; split/join collapses it.
    return ' '.join(without_hyphens.split())
def _staff_in_department(staff, dept_id: Optional[str]) -> bool:
    """Return True when *staff* belongs to the department with id *dept_id*."""
    department = staff.department
    return bool(dept_id and department and str(department.id) == dept_id)


def match_staff_from_name_enhanced(
    staff_name: str,
    hospital_id: str,
    department_name: Optional[str] = None,
    return_all: bool = False,
    fuzzy_threshold: float = 0.65
) -> Tuple[Any, float, str]:
    """
    Enhanced staff matching with fuzzy matching and better accuracy.

    Matching runs in layers and stops at the first layer that yields a
    result: exact matches, then fuzzy matches, then partial/word matches.

    Args:
        staff_name: Name extracted from complaint (without titles)
        hospital_id: Hospital ID to search within
        department_name: Optional department name to prioritize matching
        return_all: If True, return all matching staff. If False, return single best match.
        fuzzy_threshold: Minimum similarity ratio for fuzzy matches (0.0 to 1.0)

    Returns:
        If return_all=True: Tuple of (matches_list, confidence_score, matching_method)
        If return_all=False: Tuple of (staff_id, confidence_score, matching_method)
        When the name is empty or nothing matches, the first element is an
        empty list and the confidence is 0.0, regardless of return_all.
    """
    # Validate first so that bad input never reaches the ORM.
    if not staff_name or not staff_name.strip():
        return [], 0.0, "No staff name provided"

    from apps.organizations.models import Staff, Department

    staff_name = staff_name.strip()
    matches = []
    seen_ids = set()

    def add_match(staff, confidence: float, method: str) -> bool:
        """Record *staff* once; return False if it was already recorded."""
        staff_id = str(staff.id)
        if staff_id in seen_ids:
            return False
        seen_ids.add(staff_id)
        matches.append(create_match_dict(staff, confidence, method, staff_name))
        return True

    # Get department if specified. The id is stored as a string because every
    # comparison below is made against str(staff.department.id); keeping the
    # raw primary key would make those comparisons fail for non-string keys.
    dept_id = None
    if department_name:
        department = Department.objects.filter(
            hospital_id=hospital_id,
            name__iexact=department_name,
            status='active'
        ).first()
        if department:
            dept_id = str(department.id)

    # Fetch all active staff of this hospital to perform matching in Python.
    # NOTE(review): staff.department is read per staff member below; if it is
    # a ForeignKey, select_related('department') would avoid extra queries.
    all_staff = list(Staff.objects.filter(
        hospital_id=hospital_id,
        status='active'
    ))

    # ========================================
    # LAYER 1: EXACT MATCHES
    # ========================================
    lowered_input = staff_name.lower()

    # 1a. Exact match on first_name + last_name (English)
    words = staff_name.split()
    if len(words) >= 2:
        first_name = words[0]
        last_name = ' '.join(words[1:])
        first_lower = first_name.lower()
        last_lower = last_name.lower()
        for staff in all_staff:
            if (staff.first_name or "").lower() == first_lower and \
                    (staff.last_name or "").lower() == last_lower:
                in_dept = _staff_in_department(staff, dept_id)
                confidence = 0.95 if in_dept else 0.90
                method = f"Exact English match in {'correct' if in_dept else 'any'} department"
                if add_match(staff, confidence, method):
                    logger.info(f"EXACT MATCH (EN): {staff.first_name} {staff.last_name} == {first_name} {last_name}")

    # 1b. Exact match on full Arabic name
    for staff in all_staff:
        full_arabic = f"{staff.first_name_ar or ''} {staff.last_name_ar or ''}".strip()
        if full_arabic and full_arabic == staff_name:
            in_dept = _staff_in_department(staff, dept_id)
            confidence = 0.95 if in_dept else 0.90
            method = f"Exact Arabic match in {'correct' if in_dept else 'any'} department"
            if add_match(staff, confidence, method):
                logger.info(f"EXACT MATCH (AR): {full_arabic} == {staff_name}")

    # 1c. Exact match on 'name' field (original full name)
    for staff in all_staff:
        if staff.name and staff.name.lower() == lowered_input:
            if add_match(staff, 0.93, "Exact match on original name field"):
                logger.info(f"EXACT MATCH (name field): {staff.name} == {staff_name}")

    # ========================================
    # LAYER 2: FUZZY MATCHES (if no exact)
    # ========================================
    if not matches:
        logger.info(f"No exact matches found, trying fuzzy matching for: {staff_name}")
        for staff in all_staff:
            first = staff.first_name or ""
            last = staff.last_name or ""
            first_ar = staff.first_name_ar or ""
            last_ar = staff.last_name_ar or ""

            # Try different name combinations. Missing parts become "" so a
            # None field can never produce a literal "None" candidate.
            name_combinations = [
                f"{first} {last}".strip(),
                f"{first_ar} {last_ar}".strip(),
                staff.name or "",
                first,
                last,
                first_ar,
                last_ar,
            ]

            # Keep the best-scoring combination for this staff member
            best_ratio = 0.0
            best_match_name = ""
            for combo in name_combinations:
                if not combo:
                    continue
                ratio = fuzzy_match_ratio(staff_name, combo)
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_match_name = combo

            # If good fuzzy match found
            if best_ratio >= fuzzy_threshold:
                # Adjust confidence based on match quality and department
                dept_bonus = 0.05 if _staff_in_department(staff, dept_id) else 0.0
                confidence = best_ratio * 0.85 + dept_bonus  # Scale down slightly for fuzzy
                method = f"Fuzzy match ({best_ratio:.2f}) on '{best_match_name}'"
                if add_match(staff, confidence, method):
                    logger.info(f"FUZZY MATCH ({best_ratio:.2f}): {best_match_name} ~ {staff_name}")

    # ========================================
    # LAYER 3: PARTIAL/WORD MATCHES
    # ========================================
    if not matches:
        logger.info(f"No fuzzy matches found, trying partial/word matching for: {staff_name}")

        # Split input name into normalized, non-empty words
        input_words = [w for w in map(normalize_name, staff_name.split()) if w]
        total_words = len(input_words)

        for staff in all_staff:
            # Normalize every name field once per staff member rather than
            # once per input word.
            raw_fields = (
                staff.first_name,
                staff.last_name,
                staff.first_name_ar,
                staff.last_name_ar,
                staff.name or "",
            )
            staff_fields = [f for f in map(normalize_name, raw_fields) if f]

            # Count input words contained in at least one name field
            match_count = sum(
                1 for word in input_words
                if any(word in field for field in staff_fields)
            )

            # At least 2 words must match (this also guarantees total_words >= 2)
            if match_count >= 2:
                confidence = 0.60 + (match_count / total_words) * 0.15
                if _staff_in_department(staff, dept_id):
                    confidence += 0.05
                method = f"Partial match ({match_count}/{total_words} words)"
                if add_match(staff, confidence, method):
                    logger.info(f"PARTIAL MATCH ({match_count}/{total_words}): {staff.first_name} {staff.last_name}")

    # ========================================
    # FINAL: SORT AND RETURN
    # ========================================
    if not matches:
        logger.warning(f"No staff match found for name: '{staff_name}'")
        return [], 0.0, "No match found"

    # Sort by confidence (descending); the sort is stable, so equally
    # confident matches keep their discovery order.
    matches.sort(key=lambda x: x['confidence'], reverse=True)
    best_confidence = matches[0]['confidence']
    best_method = matches[0]['matching_method']
    logger.info(
        f"Returning {len(matches)} match(es) for '{staff_name}'. "
        f"Best: {matches[0]['name_en']} (confidence: {best_confidence:.2f}, method: {best_method})"
    )
    if not return_all:
        return str(matches[0]['id']), best_confidence, best_method
    return matches, best_confidence, best_method
def create_match_dict(staff, confidence: float, method: str, source_name: str) -> Dict[str, Any]:
    """
    Build the dictionary describing one matched staff member.

    Args:
        staff: Staff model instance
        confidence: Confidence score (0.0 to 1.0)
        method: Description of matching method
        source_name: Original input name that was matched

    Returns:
        Dictionary with the staff identity, names, job details, department
        and the match metadata passed in.
    """
    # Department details are optional; both stay None without a department.
    department = staff.department
    if department:
        department_name = department.name
        department_id = str(department.id)
    else:
        department_name = None
        department_id = None

    # The Arabic name is only reported when both parts are present.
    if staff.first_name_ar and staff.last_name_ar:
        arabic_name = f"{staff.first_name_ar} {staff.last_name_ar}"
    else:
        arabic_name = ""

    english_name = f"{staff.first_name} {staff.last_name}"

    match = {}
    match['id'] = str(staff.id)
    match['name_en'] = english_name
    match['name_ar'] = arabic_name
    match['original_name'] = staff.name or ""
    match['job_title'] = staff.job_title
    match['specialization'] = staff.specialization
    match['department'] = department_name
    match['department_id'] = department_id
    match['confidence'] = confidence
    match['matching_method'] = method
    match['source_name'] = source_name
    return match
def test_enhanced_matching():
    """
    Manually exercise match_staff_from_name_enhanced and print the results.

    Uses the first Hospital in the database and runs a fixed list of sample
    names through the matcher with return_all=True, printing up to three
    matches per name. Prints a notice and returns when no hospital exists.
    """
    from apps.organizations.models import Staff, Hospital

    separator = "=" * 80
    print("\n" + separator)
    print("🧪 TESTING ENHANCED STAFF MATCHING")
    print(separator)

    hospital = Hospital.objects.first()
    if not hospital:
        print("❌ No hospitals found")
        return

    # Each entry is (input name, human-readable expectation)
    test_cases = [
        # Exact matches (existing staff)
        ("Omar Al-Harbi", "Should match exact"),
        ("Ahmed Al-Farsi", "Should match exact"),
        ("محمد الرشيد", "Should match Arabic exact"),
        # Fuzzy matches (variations)
        ("Omar Al Harbi", "Should match without hyphen"),
        ("Omar Alharbi", "Should match fuzzy"),
        ("احمد الفارسي", "Should match Arabic fuzzy"),
        # Partial matches
        ("Omar", "Should match first name"),
        ("Al-Harbi", "Should match last name"),
        # Non-existent (for testing suggestions)
        ("Ibrahim Abdulaziz Al-Shammari", "Non-existent staff"),
    ]

    for sample_name, expectation in test_cases:
        print(f"\n🔍 Testing: '{sample_name}'")
        print(f" Expected: {expectation}")

        found, score, how = match_staff_from_name_enhanced(
            staff_name=sample_name,
            hospital_id=str(hospital.id),
            return_all=True,
            fuzzy_threshold=0.65
        )

        if not found:
            print(" ❌ No matches found")
            print(f" Confidence: {score:.2f}")
            print(f" Method: {how}")
            continue

        print(f" ✅ Found {len(found)} match(es)")
        print(f" Best confidence: {score:.2f}")
        print(f" Method: {how}")
        # Show at most the three highest-ranked matches
        for rank, candidate in enumerate(found[:3], start=1):
            print(f" {rank}. {candidate['name_en']} ({candidate['name_ar']}) - {candidate['confidence']:.2f}")
            if candidate['original_name']:
                print(f" Original: {candidate['original_name']}")
if __name__ == "__main__":
    # Script entry point: configure Django, then run the manual matching test
    # (which imports project models and therefore needs Django set up first).
    import django
    import os

    # Only applies when DJANGO_SETTINGS_MODULE is not already set.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.dev")
    django.setup()

    test_enhanced_matching()