# File-viewer metadata (extraction residue): 249 lines, 9.8 KiB, Python
import json
import logging
import os
import shutil
import tempfile

from apps.core.ai_service import AIService

from .models import ReportTemplate, ReportTemplateSlide, SlideLayout

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Layout identifiers accepted for a slide: the first element of every entry
# in SlideLayout.choices. Anything else reported by the AI analysis is
# replaced with "data_table" when slides are created.
LAYOUT_CHOICES = [choice[0] for choice in SlideLayout.choices]


class ReportTemplateParser:
    """Derive slide definitions and style settings from a template's reference PDF.

    ``parse()`` renders the PDF pages to PNG files, asks ``AIService`` to
    classify the pages in batches, stores the analyses on
    ``template.parsed_structure``, recreates the template's
    ``ReportTemplateSlide`` rows and records any colors that were reported in
    ``template.style_config``.
    """

    # Number of pages described to the AI service in a single request.
    PAGE_BATCH_SIZE = 5

    def __init__(self, template):
        """Remember the template whose reference PDF will be parsed."""
        self.template = template
        # Temporary directory holding the rendered page images, if any.
        self._page_image_dir = None

    def parse(self):
        """Run the full pipeline and return the updated template.

        Raises:
            ValueError: if the template has no reference PDF on disk, or if
                no page images could be produced from it.
        """
        reference_pdf = self.template.reference_pdf
        # NOTE(review): assumes reference_pdf is a Django FieldFile, which is
        # falsy when no file is attached and raises from `.path` in that
        # case. Checking truthiness first lets the intended message below be
        # raised instead.
        pdf_path = reference_pdf.path if reference_pdf else None
        if not pdf_path or not os.path.exists(pdf_path):
            raise ValueError("No reference PDF uploaded for this template")

        try:
            pages = self._convert_pdf_to_images(pdf_path)
            if not pages:
                raise ValueError("Could not convert PDF to images")
            all_analyses = self._analyze_pages(pages)
        finally:
            # The images are only needed while the pages are being analyzed;
            # remove them on success and on failure alike.
            self._cleanup_page_images()

        self.template.parsed_structure = {
            "page_count": len(pages),
            "pages": all_analyses,
        }
        self.template.save(update_fields=["parsed_structure"])

        self._create_template_slides(all_analyses)
        self._extract_style_config(all_analyses)

        return self.template

    def _convert_pdf_to_images(self, pdf_path):
        """Render every page of ``pdf_path`` to a PNG in a fresh temp directory.

        Returns the list of ``page_NNN.png`` paths in page order, or an empty
        list when pdf2image is not installed. The directory is remembered on
        the instance so that ``_cleanup_page_images`` can remove it.
        """
        try:
            from pdf2image import convert_from_path
        except ImportError:
            logger.error("pdf2image not installed. Install with: pip install pdf2image")
            return []

        # Drop any directory left over from an earlier conversion.
        self._cleanup_page_images()
        output_dir = tempfile.mkdtemp(prefix="report_parser_")
        self._page_image_dir = output_dir

        images = convert_from_path(
            pdf_path,
            dpi=150,
            output_folder=output_dir,
            fmt="png",
        )
        paths = []
        for page_number, image in enumerate(images, start=1):
            path = os.path.join(output_dir, f"page_{page_number:03d}.png")
            image.save(path)
            paths.append(path)
        return paths

    def _cleanup_page_images(self):
        """Delete the temporary page-image directory, if one exists."""
        image_dir = getattr(self, "_page_image_dir", None)
        if image_dir:
            shutil.rmtree(image_dir, ignore_errors=True)
        self._page_image_dir = None

    def _analyze_pages(self, page_paths):
        """Analyze all pages in batches and return the concatenated analyses."""
        analyses = []
        step = self.PAGE_BATCH_SIZE
        for offset in range(0, len(page_paths), step):
            batch = page_paths[offset:offset + step]
            # Page numbers are 1-based, list offsets are 0-based.
            analyses.extend(self._analyze_page_batch(batch, start_page=offset + 1))
        return analyses

    @staticmethod
    def _build_batch_prompt(page_paths, start_page=1):
        """Return the analysis prompt for one batch of page image paths."""
        pages_desc = "\n".join(
            f"Page {start_page + i}: {path}"
            for i, path in enumerate(page_paths)
        )
        return (
            "You are analyzing pages from a healthcare report PDF to create a reusable slide template. "
            "For each page, determine the slide layout type and data requirements.\n\n"
            "Available slide layouts:\n"
            " - cover: Title page with report name, date, organization\n"
            " - section_divider: Section separator with number/label and title\n"
            " - kpi_dashboard: Grid of metric cards (numbers, percentages)\n"
            " - full_chart: Full-width chart (bar, line, donut, pie)\n"
            " - chart_metrics: Chart on left + sidebar metrics on right\n"
            " - data_table: Table with rows of data (doctors, complaints, etc.)\n"
            " - two_column: Two columns of content (text + bullets)\n"
            " - quote: Large quote/callout text\n"
            " - timeline: Horizontal timeline with dates/events\n"
            " - comparison: Side-by-side comparison panels\n"
            " - team_grid: Grid of cards (people/departments with metrics)\n"
            " - closing: Thank you page with contact info\n\n"
            "Also look for patterns:\n"
            " - Does a table layout repeat for different entities (departments, doctors, etc.)?\n"
            " - What color coding is used for rows/cells?\n"
            " - What are the section headers/titles?\n\n"
            "Return a JSON array with one object per page:\n"
            '[\n {\n "page_number": 1,\n "layout": "cover",\n "title": "extracted title",\n "subtitle": "extracted subtitle",\n "section_label": "",\n "is_repeated_pattern": false,\n "table_columns": [],\n "chart_type": "",\n "data_description": "what data this page shows",\n "colors": {"primary": "", "secondary": "", "accent": ""}\n }\n]\n\n'
            f"The PDF has pages at these image paths:\n{pages_desc}\n\n"
            "IMPORTANT: Group repeated table pages into a single entry with is_repeated_pattern=true. "
            "For example, if pages 5-15 all show the same table structure for different departments, "
            "return ONE entry for page 5 with is_repeated_pattern=true and describe the repeat pattern."
        )

    @staticmethod
    def _normalize_analysis_result(result):
        """Turn a decoded AI response into a list of per-page dicts.

        Accepts ``{"pages": [...]}``, a bare list, or a single page object.
        Entries that are not dicts are dropped.

        Raises:
            ValueError: if the payload has none of those shapes, or if a
                non-empty list contains no usable page objects.
        """
        if isinstance(result, dict) and "pages" in result:
            candidates = result["pages"]
        elif isinstance(result, dict):
            # A lone page object rather than a collection.
            candidates = [result]
        else:
            candidates = result

        if not isinstance(candidates, list):
            raise ValueError("AI response does not contain a list of pages")

        pages = [entry for entry in candidates if isinstance(entry, dict)]
        if candidates and not pages:
            raise ValueError("AI response contains no page objects")
        return pages

    @staticmethod
    def _fallback_analyses(page_count, start_page=1):
        """Return placeholder analyses used when a batch cannot be analyzed."""
        return [
            {
                "page_number": start_page + i,
                "layout": "data_table",
                "title": f"Page {start_page + i}",
                "subtitle": "",
                "is_repeated_pattern": False,
                "data_description": "Could not analyze this page",
            }
            for i in range(page_count)
        ]

    def _analyze_page_batch(self, page_paths, start_page=1):
        """Ask the AI service to classify one batch of pages.

        Returns a list of page-analysis dicts. Any failure (service error,
        invalid JSON, unexpected payload shape) is logged and answered with
        one placeholder ``data_table`` entry per page, so parsing continues.
        """
        # NOTE(review): only the image *paths* are placed in the prompt; the
        # image contents are not passed to AIService here. Confirm the
        # service can actually see the pages.
        # NOTE(review): the prompt asks for a JSON array while the request
        # uses response_format="json_object"; the normalizer therefore
        # accepts both an object with a "pages" key and a bare array.
        prompt = self._build_batch_prompt(page_paths, start_page=start_page)

        try:
            response = AIService.chat_completion(
                prompt=prompt,
                system_prompt="You are a document structure analyst. Analyze the described report pages and return valid JSON only.",
                response_format="json_object",
                temperature=0.2,
                max_tokens=3000,
            )
            return self._normalize_analysis_result(json.loads(response))
        except Exception as exc:
            # Deliberate best-effort boundary: one bad batch must not abort
            # the whole parse.
            logger.exception("AI page analysis failed: %s", exc)
            return self._fallback_analyses(len(page_paths), start_page=start_page)

    def _create_template_slides(self, analyses):
        """Replace the template's slides with ones built from ``analyses``.

        The first ``data_table`` page flagged as a repeated pattern becomes a
        single per-department repeating slide. Every later ``data_table``
        page is skipped. All other pages become one slide each, in order.
        """
        self.template.template_slides.all().delete()

        order = 0
        seen_table_pattern = False

        for page in analyses:
            layout = page.get("layout", "data_table")
            if layout not in LAYOUT_CHOICES:
                layout = "data_table"

            is_repeated = page.get("is_repeated_pattern", False)

            if layout == "data_table" and is_repeated and not seen_table_pattern:
                seen_table_pattern = True
                ReportTemplateSlide.objects.create(
                    template=self.template,
                    order=order,
                    layout=layout,
                    title_template="{{ department_name }}",
                    subtitle_template="{{ manager }} | {{ item_count }} physicians",
                    content_mapping=self._build_table_mapping(page.get("table_columns")),
                    repeat_source="by_department",
                    repeat_title_key="department_name",
                    max_rows=18,
                    style_overrides=self._get_default_row_colors(),
                )
                order += 1
            elif layout == "data_table" and seen_table_pattern:
                # NOTE(review): this also drops non-repeated tables that
                # follow the repeated pattern; confirm that is intended.
                continue
            else:
                # The model may send null for text fields; store "" instead.
                ReportTemplateSlide.objects.create(
                    template=self.template,
                    order=order,
                    layout=layout,
                    section_label=page.get("section_label") or "",
                    title_template=page.get("title") or "",
                    subtitle_template=page.get("subtitle") or "",
                    content_mapping=self._build_content_mapping(page),
                )
                order += 1

    def _build_table_mapping(self, columns):
        """Return the content mapping for the repeating table slide.

        ``columns`` supplies the header labels; when it is empty or not a
        list/tuple the default four headers are used. The row template and
        row-color rules are fixed.
        """
        if not columns or not isinstance(columns, (list, tuple)):
            columns = ["ID", "Name", "Total", "Average"]
        # NOTE(review): the row template always has four cells, whatever the
        # number of headers; confirm the renderer copes with a mismatch.
        return {
            "headers": list(columns),
            "row_template": [
                {"field": "employee_id", "type": "text"},
                {"field": "name", "type": "text", "font_weight": "600"},
                {"field": "total_surveys", "type": "text"},
                {"field": "avg_rating", "type": "rating_bar"},
            ],
            # Same rules as the slide's style overrides, from one source.
            "row_color": self._get_default_row_colors()["row_color"],
        }

    def _get_default_row_colors(self):
        """Return a fresh copy of the default ``avg_rating`` row-color rules."""
        return {
            "row_color": {
                "field": "avg_rating",
                "rules": [
                    {"op": "gte", "value": 4.5, "class": "high"},
                    {"op": "gte", "value": 3.0, "class": "medium"},
                    {"op": "lt", "value": 3.0, "class": "low"},
                ],
                "highlight_top": True,
                "highlight_class": "top",
            },
        }

    def _build_content_mapping(self, page):
        """Return the empty content-mapping skeleton for the page's layout.

        Layouts without a dedicated skeleton (including unknown or missing
        ones) get an empty dict. A new object is returned on every call.
        """
        layout = page.get("layout", "")
        if not isinstance(layout, str):
            return {}

        # Built per call so callers never share the nested containers.
        skeletons = {
            "kpi_dashboard": {"metrics": []},
            "full_chart": {"chart_config": {}},
            "chart_metrics": {"chart_config": {}},
            "team_grid": {"source_path": "", "sort_by": "", "limit": 10},
            "section_divider": {"section_label": ""},
        }
        return skeletons.get(layout, {})

    def _extract_style_config(self, analyses):
        """Collect the first non-empty value per color key and save it.

        Pages whose ``colors`` entry is missing or not a dict are ignored.
        The template is only saved when at least one color was found.
        """
        colors = {}
        for page in analyses:
            page_colors = page.get("colors")
            if not isinstance(page_colors, dict):
                continue
            for key, value in page_colors.items():
                if value and key not in colors:
                    colors[key] = value

        if colors:
            self.template.style_config = {"colors": colors}
            self.template.save(update_fields=["style_config"])