import json
import logging
import os
import shutil
import tempfile

from apps.core.ai_service import AIService

from .models import ReportTemplate, ReportTemplateSlide, SlideLayout

logger = logging.getLogger(__name__)

LAYOUT_CHOICES = [choice[0] for choice in SlideLayout.choices]


class ReportTemplateParser:
    """Build slide definitions and a style config from a template's reference PDF.

    The PDF is rendered to one PNG per page, the page list is described to
    ``AIService.chat_completion`` in batches, and the returned per-page
    analyses are stored on the template and turned into
    ``ReportTemplateSlide`` rows.
    """

    # Prefix of the temporary directory that holds the rendered page images.
    _TEMP_DIR_PREFIX = "report_parser_"

    def __init__(self, template):
        """Remember the template object whose reference PDF will be parsed."""
        self.template = template

    def parse(self):
        """Run the full pipeline and return the updated template.

        Raises:
            ValueError: if the template has no usable reference PDF, or if
                the PDF could not be converted to page images.
        """
        reference_pdf = self.template.reference_pdf
        # NOTE(review): presumably a Django FileField; an empty one raises its
        # own ValueError on ``.path``, so test the field before touching it to
        # keep the error message below reachable.
        pdf_path = reference_pdf.path if reference_pdf else None
        if not pdf_path or not os.path.exists(pdf_path):
            raise ValueError("No reference PDF uploaded for this template")

        pages = self._convert_pdf_to_images(pdf_path)
        if not pages:
            raise ValueError("Could not convert PDF to images")

        try:
            all_analyses = self._analyze_pages(pages)
        finally:
            # The page images are only referenced while building the prompts;
            # drop them so repeated parses do not accumulate temp directories.
            self._cleanup_page_images(pages)

        self.template.parsed_structure = {
            "page_count": len(pages),
            "pages": all_analyses,
        }
        self.template.save(update_fields=["parsed_structure"])

        self._create_template_slides(all_analyses)
        self._extract_style_config(all_analyses)
        return self.template

    def _convert_pdf_to_images(self, pdf_path):
        """Render every PDF page to a PNG in a fresh temp directory.

        Returns the list of ``page_NNN.png`` paths in page order, or an empty
        list when ``pdf2image`` is not installed. Any conversion error is
        re-raised after the temp directory has been removed.
        """
        try:
            from pdf2image import convert_from_path
        except ImportError:
            logger.error("pdf2image not installed. Install with: pip install pdf2image")
            return []

        output_dir = tempfile.mkdtemp(prefix=self._TEMP_DIR_PREFIX)
        try:
            images = convert_from_path(
                pdf_path,
                dpi=150,
                output_folder=output_dir,
                fmt="png",
            )
            paths = []
            for index, image in enumerate(images):
                path = os.path.join(output_dir, f"page_{index + 1:03d}.png")
                image.save(path)
                paths.append(path)
        except Exception:
            # Do not leave a half-filled directory behind on failure.
            shutil.rmtree(output_dir, ignore_errors=True)
            raise
        return paths

    @classmethod
    def _cleanup_page_images(cls, page_paths):
        """Delete the temp directories that contain the given page images.

        Only directories whose name starts with the parser's own temp prefix
        are removed, so an unexpected path can never wipe unrelated data.
        """
        for directory in {os.path.dirname(path) for path in page_paths}:
            if os.path.basename(directory).startswith(cls._TEMP_DIR_PREFIX):
                shutil.rmtree(directory, ignore_errors=True)

    def _analyze_pages(self, page_paths):
        """Analyze the pages in batches of five and return all analyses."""
        analyses = []
        batch_size = 5
        for offset in range(0, len(page_paths), batch_size):
            batch = page_paths[offset:offset + batch_size]
            analyses.extend(self._analyze_page_batch(batch, start_page=offset + 1))
        return analyses

    def _analyze_page_batch(self, page_paths, start_page=1):
        """Ask the AI service to classify one batch of pages.

        Args:
            page_paths: image paths of the pages in this batch.
            start_page: 1-based page number of the first path.

        Returns:
            A list of per-page analysis dicts. When the service call fails or
            its response cannot be used, one placeholder ``data_table`` entry
            per page is returned instead of raising.
        """
        pages_desc = "\n".join(
            f"Page {start_page + i}: {path}" for i, path in enumerate(page_paths)
        )
        prompt = (
            "You are analyzing pages from a healthcare report PDF to create a reusable slide template. "
            "For each page, determine the slide layout type and data requirements.\n\n"
            "Available slide layouts:\n"
            " - cover: Title page with report name, date, organization\n"
            " - section_divider: Section separator with number/label and title\n"
            " - kpi_dashboard: Grid of metric cards (numbers, percentages)\n"
            " - full_chart: Full-width chart (bar, line, donut, pie)\n"
            " - chart_metrics: Chart on left + sidebar metrics on right\n"
            " - data_table: Table with rows of data (doctors, complaints, etc.)\n"
            " - two_column: Two columns of content (text + bullets)\n"
            " - quote: Large quote/callout text\n"
            " - timeline: Horizontal timeline with dates/events\n"
            " - comparison: Side-by-side comparison panels\n"
            " - team_grid: Grid of cards (people/departments with metrics)\n"
            " - closing: Thank you page with contact info\n\n"
            "Also look for patterns:\n"
            " - Does a table layout repeat for different entities (departments, doctors, etc.)?\n"
            " - What color coding is used for rows/cells?\n"
            " - What are the section headers/titles?\n\n"
            "Return a JSON array with one object per page:\n"
            '[\n'
            ' {\n'
            ' "page_number": 1,\n'
            ' "layout": "cover",\n'
            ' "title": "extracted title",\n'
            ' "subtitle": "extracted subtitle",\n'
            ' "section_label": "",\n'
            ' "is_repeated_pattern": false,\n'
            ' "table_columns": [],\n'
            ' "chart_type": "",\n'
            ' "data_description": "what data this page shows",\n'
            ' "colors": {"primary": "", "secondary": "", "accent": ""}\n'
            ' }\n'
            ']\n\n'
            f"The PDF has pages at these image paths:\n{pages_desc}\n\n"
            "IMPORTANT: Group repeated table pages into a single entry with is_repeated_pattern=true. "
            "For example, if pages 5-15 all show the same table structure for different departments, "
            "return ONE entry for page 5 with is_repeated_pattern=true and describe the repeat pattern."
        )

        try:
            response = AIService.chat_completion(
                prompt=prompt,
                system_prompt="You are a document structure analyst. Analyze the described report pages and return valid JSON only.",
                response_format="json_object",
                temperature=0.2,
                max_tokens=3000,
            )
            return self._normalize_analysis(json.loads(response))
        except Exception as e:
            # Deliberate best-effort: a failed batch must not abort the whole
            # parse, so log it and fall back to generic placeholders.
            logger.exception("AI page analysis failed: %s", e)
            return [
                {
                    "page_number": page_number,
                    "layout": "data_table",
                    "title": f"Page {page_number}",
                    "subtitle": "",
                    "is_repeated_pattern": False,
                    "data_description": "Could not analyze this page",
                }
                for page_number in range(start_page, start_page + len(page_paths))
            ]

    @staticmethod
    def _normalize_analysis(result):
        """Coerce a decoded AI response into a list of page dicts.

        Accepts a ``{"pages": ...}`` wrapper, a bare list, or a single page
        object. Non-dict list entries are dropped because later steps call
        ``.get`` on every page.

        Raises:
            ValueError: if the payload is neither a list nor an object.
        """
        if isinstance(result, dict) and "pages" in result:
            result = result["pages"]
        if isinstance(result, dict):
            return [result]
        if isinstance(result, list):
            return [entry for entry in result if isinstance(entry, dict)]
        raise ValueError(f"Unexpected AI analysis payload: {type(result).__name__}")

    def _create_template_slides(self, analyses):
        """Replace the template's slides with ones derived from ``analyses``.

        All existing slides of the template are deleted first. The first
        ``data_table`` page flagged as a repeated pattern becomes a single
        per-department repeating slide; every other page becomes a plain
        slide with its extracted title, subtitle and section label.
        """
        self.template.template_slides.all().delete()

        order = 0
        seen_table_pattern = False
        for page in analyses:
            layout = page.get("layout", "data_table")
            if layout not in LAYOUT_CHOICES:
                layout = "data_table"
            is_repeated = page.get("is_repeated_pattern", False)

            if layout == "data_table" and is_repeated and not seen_table_pattern:
                seen_table_pattern = True
                # ``or []`` also covers an explicit JSON null.
                columns = page.get("table_columns") or []
                ReportTemplateSlide.objects.create(
                    template=self.template,
                    order=order,
                    layout=layout,
                    title_template="{{ department_name }}",
                    subtitle_template="{{ manager }} | {{ item_count }} physicians",
                    content_mapping=self._build_table_mapping(columns),
                    repeat_source="by_department",
                    repeat_title_key="department_name",
                    max_rows=18,
                    style_overrides=self._get_default_row_colors(),
                )
                order += 1
            elif layout == "data_table" and seen_table_pattern:
                # NOTE(review): this also drops non-repeated tables that come
                # after the pattern slide; confirm that is intended.
                continue
            else:
                # The AI may return JSON null for text fields; store "" rather
                # than passing None through to the model.
                ReportTemplateSlide.objects.create(
                    template=self.template,
                    order=order,
                    layout=layout,
                    section_label=page.get("section_label") or "",
                    title_template=page.get("title") or "",
                    subtitle_template=page.get("subtitle") or "",
                    content_mapping=self._build_content_mapping(page),
                )
                order += 1

    def _build_table_mapping(self, columns):
        """Return the content mapping for the repeating table slide.

        ``columns`` supplies the header labels; a default set of four headers
        is used when it is empty. The row template and colour rules are fixed.
        """
        if not columns:
            columns = ["ID", "Name", "Total", "Average"]
        return {
            "headers": columns,
            "row_template": [
                {"field": "employee_id", "type": "text"},
                {"field": "name", "type": "text", "font_weight": "600"},
                {"field": "total_surveys", "type": "text"},
                {"field": "avg_rating", "type": "rating_bar"},
            ],
            # Single source of truth for the colour rules (adds "row_color").
            **self._get_default_row_colors(),
        }

    def _get_default_row_colors(self):
        """Return a fresh dict with the default ``avg_rating`` row-colour rules."""
        return {
            "row_color": {
                "field": "avg_rating",
                "rules": [
                    {"op": "gte", "value": 4.5, "class": "high"},
                    {"op": "gte", "value": 3.0, "class": "medium"},
                    {"op": "lt", "value": 3.0, "class": "low"},
                ],
                "highlight_top": True,
                "highlight_class": "top",
            },
        }

    def _build_content_mapping(self, page):
        """Return the empty content-mapping skeleton for the page's layout.

        Layouts without a dedicated skeleton get an empty dict. A new dict is
        built on every call so callers may mutate the result.
        """
        layout = page.get("layout", "")
        if layout == "kpi_dashboard":
            return {"metrics": []}
        if layout in ("full_chart", "chart_metrics"):
            return {"chart_config": {}}
        if layout == "team_grid":
            return {"source_path": "", "sort_by": "", "limit": 10}
        if layout == "section_divider":
            return {"section_label": ""}
        return {}

    def _extract_style_config(self, analyses):
        """Store the first non-empty value seen for each colour key.

        Pages are scanned in order. If at least one colour was found, it is
        saved as ``style_config = {"colors": ...}`` on the template.
        """
        colors = {}
        for page in analyses:
            page_colors = page.get("colors")
            if not isinstance(page_colors, dict):
                # Missing, null or malformed colour info: nothing to collect.
                continue
            for key, value in page_colors.items():
                if value and key not in colors:
                    colors[key] = value
        if colors:
            self.template.style_config = {"colors": colors}
            self.template.save(update_fields=["style_config"])