HH/apps/presentations/template_parser.py
ismail c5f76b3855
Some checks are pending
Build and Push Docker Image / build (push) Waiting to run
updates
2026-05-11 14:45:30 +03:00

249 lines
9.8 KiB
Python

import json
import logging
import os
import tempfile
from apps.core.ai_service import AIService
from .models import ReportTemplate, ReportTemplateSlide, SlideLayout
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# First element of every ``SlideLayout.choices`` entry; layouts suggested by
# the AI analysis are checked against this list before slides are created.
LAYOUT_CHOICES = [entry[0] for entry in SlideLayout.choices]
class ReportTemplateParser:
    """Build slide definitions for a report template from its reference PDF.

    The reference PDF is rendered to PNG pages, the pages are described to
    ``AIService`` in batches, and the returned per-page analysis is stored on
    the template and turned into ``ReportTemplateSlide`` rows plus a colour
    ``style_config``.
    """

    # Number of pages described to the AI service per request.
    PAGE_BATCH_SIZE = 5
    # Resolution (dots per inch) used when rendering PDF pages to images.
    RENDER_DPI = 150
    # Row cap stored on the repeated ``by_department`` table slide.
    MAX_TABLE_ROWS = 18

    def __init__(self, template):
        """Remember the template object whose reference PDF will be parsed."""
        self.template = template

    def parse(self):
        """Analyse the reference PDF and persist the derived structure.

        Returns:
            The template passed to the constructor, after ``parsed_structure``
            (and possibly ``style_config``) were saved and its slides rebuilt.

        Raises:
            ValueError: if no reference PDF is attached, the file is missing
                on disk, or the PDF could not be converted to images.
        """
        reference_pdf = self.template.reference_pdf
        # Check the field itself before touching ``.path``: an empty file
        # field would otherwise fail on attribute access and the intended
        # error message below would never be reached.
        if not reference_pdf:
            raise ValueError("No reference PDF uploaded for this template")
        pdf_path = reference_pdf.path
        if not pdf_path or not os.path.exists(pdf_path):
            raise ValueError("No reference PDF uploaded for this template")

        # The rendered pages are only needed while the analysis runs, so keep
        # them in a directory that is removed afterwards instead of leaking
        # one temp directory per parse.
        with tempfile.TemporaryDirectory(prefix="report_parser_") as work_dir:
            pages = self._convert_pdf_to_images(pdf_path, output_dir=work_dir)
            if not pages:
                raise ValueError("Could not convert PDF to images")
            all_analyses = self._analyze_pages(pages)

        self.template.parsed_structure = {
            "page_count": len(pages),
            "pages": all_analyses,
        }
        self.template.save(update_fields=["parsed_structure"])
        self._create_template_slides(all_analyses)
        self._extract_style_config(all_analyses)
        return self.template

    def _convert_pdf_to_images(self, pdf_path, output_dir=None):
        """Render every page of ``pdf_path`` to ``page_NNN.png`` files.

        Args:
            pdf_path: Filesystem path of the PDF to render.
            output_dir: Directory that receives the images. When omitted a
                fresh temp directory is created and the caller is responsible
                for removing it.

        Returns:
            List of image paths in page order, or ``[]`` when ``pdf2image``
            is not installed.
        """
        try:
            from pdf2image import convert_from_path
        except ImportError:
            logger.error("pdf2image not installed. Install with: pip install pdf2image")
            return []

        if output_dir is None:
            output_dir = tempfile.mkdtemp(prefix="report_parser_")

        images = convert_from_path(
            pdf_path,
            dpi=self.RENDER_DPI,
            output_folder=output_dir,
            fmt="png",
        )
        paths = []
        for page_number, img in enumerate(images, start=1):
            path = os.path.join(output_dir, f"page_{page_number:03d}.png")
            try:
                img.save(path)
            finally:
                # Release the image's file handle/memory as soon as it is
                # written so the directory can be cleaned up afterwards.
                img.close()
            paths.append(path)
        return paths

    def _analyze_pages(self, page_paths):
        """Analyse all pages in batches and return the concatenated results."""
        analyses = []
        batch_size = self.PAGE_BATCH_SIZE
        for offset in range(0, len(page_paths), batch_size):
            batch = page_paths[offset:offset + batch_size]
            # Page numbers are 1-based, list offsets are 0-based.
            analyses.extend(self._analyze_page_batch(batch, start_page=offset + 1))
        return analyses

    @staticmethod
    def _normalize_analysis(result):
        """Coerce a decoded AI response into a list of per-page dicts.

        Accepts a list of entries, an object wrapping the list under
        ``"pages"``, or a single page object. Non-dict entries are dropped
        because every consumer reads pages with ``.get``.

        Raises:
            ValueError: if ``result`` is neither a list nor a dict.
        """
        if isinstance(result, dict) and "pages" in result:
            result = result["pages"]
        if isinstance(result, dict):
            entries = [result]
        elif isinstance(result, list):
            entries = result
        else:
            raise ValueError(
                f"Unexpected AI analysis payload type: {type(result).__name__}"
            )
        return [entry for entry in entries if isinstance(entry, dict)]

    def _analyze_page_batch(self, page_paths, start_page=1):
        """Ask the AI service to classify one batch of pages.

        Args:
            page_paths: Image paths of the pages in this batch.
            start_page: 1-based page number of the first path in the batch.

        Returns:
            List of per-page analysis dicts. If the AI call or the decoding
            of its response fails, one placeholder ``data_table`` entry per
            page is returned instead.
        """
        pages_desc = "\n".join(
            f"Page {start_page + i}: {path}"
            for i, path in enumerate(page_paths)
        )
        # NOTE(review): only the image *paths* are embedded in the prompt as
        # text; this call does not attach the image content itself. Confirm
        # that AIService can actually see the pages.
        prompt = (
            "You are analyzing pages from a healthcare report PDF to create a reusable slide template. "
            "For each page, determine the slide layout type and data requirements.\n\n"
            "Available slide layouts:\n"
            " - cover: Title page with report name, date, organization\n"
            " - section_divider: Section separator with number/label and title\n"
            " - kpi_dashboard: Grid of metric cards (numbers, percentages)\n"
            " - full_chart: Full-width chart (bar, line, donut, pie)\n"
            " - chart_metrics: Chart on left + sidebar metrics on right\n"
            " - data_table: Table with rows of data (doctors, complaints, etc.)\n"
            " - two_column: Two columns of content (text + bullets)\n"
            " - quote: Large quote/callout text\n"
            " - timeline: Horizontal timeline with dates/events\n"
            " - comparison: Side-by-side comparison panels\n"
            " - team_grid: Grid of cards (people/departments with metrics)\n"
            " - closing: Thank you page with contact info\n\n"
            "Also look for patterns:\n"
            " - Does a table layout repeat for different entities (departments, doctors, etc.)?\n"
            " - What color coding is used for rows/cells?\n"
            " - What are the section headers/titles?\n\n"
            "Return a JSON array with one object per page:\n"
            '[\n {\n "page_number": 1,\n "layout": "cover",\n "title": "extracted title",\n "subtitle": "extracted subtitle",\n "section_label": "",\n "is_repeated_pattern": false,\n "table_columns": [],\n "chart_type": "",\n "data_description": "what data this page shows",\n "colors": {"primary": "", "secondary": "", "accent": ""}\n }\n]\n\n'
            f"The PDF has pages at these image paths:\n{pages_desc}\n\n"
            "IMPORTANT: Group repeated table pages into a single entry with is_repeated_pattern=true. "
            "For example, if pages 5-15 all show the same table structure for different departments, "
            "return ONE entry for page 5 with is_repeated_pattern=true and describe the repeat pattern."
        )
        try:
            response = AIService.chat_completion(
                prompt=prompt,
                system_prompt="You are a document structure analyst. Analyze the described report pages and return valid JSON only.",
                response_format="json_object",
                temperature=0.2,
                max_tokens=3000,
            )
            return self._normalize_analysis(json.loads(response))
        except Exception as e:
            # Deliberate best-effort boundary: any service or decoding error
            # degrades to placeholder pages so parsing can continue.
            logger.exception("AI page analysis failed: %s", e)
            return [
                {
                    "page_number": page_number,
                    "layout": "data_table",
                    "title": f"Page {page_number}",
                    "subtitle": "",
                    "is_repeated_pattern": False,
                    "data_description": "Could not analyze this page",
                }
                for page_number in range(start_page, start_page + len(page_paths))
            ]

    @staticmethod
    def _text_field(page, key):
        """Return ``page[key]`` as text, mapping a missing or null value to ``""``."""
        value = page.get(key)
        return "" if value is None else str(value)

    def _create_template_slides(self, analyses):
        """Replace the template's slides with ones derived from ``analyses``.

        Existing slides are deleted first. The first ``data_table`` page
        flagged ``is_repeated_pattern`` becomes a single ``by_department``
        repeating slide; every later ``data_table`` page is skipped. All
        other pages become one slide each, numbered consecutively.
        """
        self.template.template_slides.all().delete()
        order = 0
        seen_table_pattern = False
        for page in analyses:
            if not isinstance(page, dict):
                # Malformed entry; nothing to read from it.
                continue
            layout = page.get("layout", "data_table")
            if layout not in LAYOUT_CHOICES:
                layout = "data_table"
            is_table = layout == "data_table"

            # NOTE(review): once the repeating table exists, *all* further
            # table pages are dropped, repeated or not - confirm intended.
            if is_table and seen_table_pattern:
                continue

            if is_table and page.get("is_repeated_pattern", False):
                seen_table_pattern = True
                ReportTemplateSlide.objects.create(
                    template=self.template,
                    order=order,
                    layout=layout,
                    title_template="{{ department_name }}",
                    subtitle_template="{{ manager }} | {{ item_count }} physicians",
                    content_mapping=self._build_table_mapping(
                        page.get("table_columns", [])
                    ),
                    repeat_source="by_department",
                    repeat_title_key="department_name",
                    max_rows=self.MAX_TABLE_ROWS,
                    style_overrides=self._get_default_row_colors(),
                )
            else:
                ReportTemplateSlide.objects.create(
                    template=self.template,
                    order=order,
                    layout=layout,
                    # The AI may return null for any text field; store "".
                    section_label=self._text_field(page, "section_label"),
                    title_template=self._text_field(page, "title"),
                    subtitle_template=self._text_field(page, "subtitle"),
                    content_mapping=self._build_content_mapping(page),
                )
            order += 1

    def _build_table_mapping(self, columns):
        """Return the content mapping for the repeating table slide.

        Args:
            columns: Header labels taken from the analysis; a missing, empty
                or non-sequence value falls back to default headers.
        """
        if not columns or not isinstance(columns, (list, tuple)):
            columns = ["ID", "Name", "Total", "Average"]
        # NOTE(review): the row template always has four fields regardless
        # of how many headers were detected - confirm against the renderer.
        return {
            "headers": list(columns),
            "row_template": [
                {"field": "employee_id", "type": "text"},
                {"field": "name", "type": "text", "font_weight": "600"},
                {"field": "total_surveys", "type": "text"},
                {"field": "avg_rating", "type": "rating_bar"},
            ],
            # Same rules as the slide's style overrides; defined in one place.
            "row_color": self._get_default_row_colors()["row_color"],
        }

    def _get_default_row_colors(self):
        """Return a fresh copy of the default ``avg_rating`` row-colour rules."""
        return {
            "row_color": {
                "field": "avg_rating",
                "rules": [
                    {"op": "gte", "value": 4.5, "class": "high"},
                    {"op": "gte", "value": 3.0, "class": "medium"},
                    {"op": "lt", "value": 3.0, "class": "low"},
                ],
                "highlight_top": True,
                "highlight_class": "top",
            },
        }

    def _build_content_mapping(self, page):
        """Return the empty content-mapping skeleton for the page's layout.

        Layouts without a dedicated skeleton (and unknown layouts) map to an
        empty dict.
        """
        layout = page.get("layout", "")
        if not isinstance(layout, str):
            return {}
        # Built per call so callers never share the nested mutable values.
        skeletons = {
            "kpi_dashboard": {"metrics": []},
            "full_chart": {"chart_config": {}},
            "chart_metrics": {"chart_config": {}},
            "team_grid": {"source_path": "", "sort_by": "", "limit": 10},
            "section_divider": {"section_label": ""},
        }
        return skeletons.get(layout, {})

    def _extract_style_config(self, analyses):
        """Collect the first non-empty value per colour key and save it.

        ``style_config`` is only written (and saved) when at least one colour
        was found. Pages whose ``colors`` entry is missing, null or not a
        dict are ignored.
        """
        colors = {}
        for page in analyses:
            page_colors = page.get("colors") if isinstance(page, dict) else None
            if not isinstance(page_colors, dict):
                continue
            for key, val in page_colors.items():
                # First page to supply a non-empty value for a key wins.
                if val and key not in colors:
                    colors[key] = val
        if colors:
            self.template.style_config = {"colors": colors}
            self.template.save(update_fields=["style_config"])