# HH/apps/presentations/template_parser.py

import json
import logging
import os
import tempfile

from apps.core.ai_service import AIService

from .models import ReportTemplate, ReportTemplateSlide, SlideLayout

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# First element of every entry in ``SlideLayout.choices``; used below to
# validate the layout names that come back from the AI analysis.
LAYOUT_CHOICES = [choice[0] for choice in SlideLayout.choices]


class ReportTemplateParser:
    def __init__(self, template):
        """Bind the parser to the template it will populate.

        Args:
            template: Object whose ``reference_pdf``, ``parsed_structure``,
                ``style_config`` and ``template_slides`` attributes are read
                or written by the other methods of this class (presumably a
                ``ReportTemplate`` instance -- confirm against callers).
        """
        self.template = template

    def parse(self):
        pdf_path = self.template.reference_pdf.path
        if not pdf_path or not os.path.exists(pdf_path):
            raise ValueError("No reference PDF uploaded for this template")

        pages = self._convert_pdf_to_images(pdf_path)
        if not pages:
            raise ValueError("Could not convert PDF to images")

        all_analyses = self._analyze_pages(pages)
        self.template.parsed_structure = {
            "page_count": len(pages),
            "pages": all_analyses,
        }
        self.template.save(update_fields=["parsed_structure"])

        self._create_template_slides(all_analyses)
        self._extract_style_config(all_analyses)

        return self.template

    def _convert_pdf_to_images(self, pdf_path):
        """Render every page of a PDF to a PNG file in a new temp directory.

        Args:
            pdf_path: Filesystem path of the PDF to render.

        Returns:
            A list of PNG paths named ``page_001.png``, ``page_002.png``, ...
            in page order, all inside one directory created with the
            ``report_parser_`` prefix. An empty list is returned when
            ``pdf2image`` is not installed or the PDF yields no pages.

        Raises:
            Exception: Whatever ``pdf2image`` or the image save raises; the
                temp directory is removed before the error is re-raised.
        """
        import shutil

        # Keep the try body to the import alone so that only a genuinely
        # missing dependency is reported as "not installed".
        try:
            from pdf2image import convert_from_path
        except ImportError:
            logger.error("pdf2image not installed. Install with: pip install pdf2image")
            return []

        output_dir = tempfile.mkdtemp(prefix="report_parser_")
        try:
            images = convert_from_path(
                pdf_path,
                dpi=150,
                output_folder=output_dir,
                fmt="png",
            )
            # NOTE(review): with ``output_folder`` set, pdf2image appears to
            # write its own copies into the directory as well, so each page
            # may exist twice on disk -- confirm and drop one copy if so.
            paths = []
            for page_number, img in enumerate(images, start=1):
                path = os.path.join(output_dir, f"page_{page_number:03d}.png")
                img.save(path)
                paths.append(path)
        except Exception:
            # Do not leave a half-filled temp directory behind on failure.
            shutil.rmtree(output_dir, ignore_errors=True)
            raise

        if not paths:
            # Nothing was rendered; the caller only sees the empty list, so
            # the directory would otherwise be orphaned.
            shutil.rmtree(output_dir, ignore_errors=True)
        return paths

    def _analyze_pages(self, page_paths):
        """Run the batch analysis over all pages, five pages at a time.

        Args:
            page_paths: Sequence of page image paths in page order.

        Returns:
            One flat list with the results of every ``_analyze_page_batch``
            call, in batch order. Each batch is told the 1-based number of
            its first page via ``start_page``.
        """
        chunk_len = 5
        total = len(page_paths)
        collected = []
        offset = 0
        while offset < total:
            chunk = page_paths[offset:offset + chunk_len]
            collected += self._analyze_page_batch(chunk, start_page=offset + 1)
            offset += chunk_len
        return collected

    def _analyze_page_batch(self, page_paths, start_page=1):
        """Ask the AI service to classify one batch of pages.

        Args:
            page_paths: Image paths of the pages in this batch, in order.
            start_page: 1-based page number of the first path in the batch.

        Returns:
            A list of dicts, one per page entry reported by the AI service.
            If the call fails, the response is not valid JSON, or it has an
            unusable shape, a placeholder ``data_table`` entry is returned
            for every page of the batch instead.
        """
        pages_desc = "\n".join(
            f"Page {start_page + i}: {path}"
            for i, path in enumerate(page_paths)
        )

        # NOTE(review): only the image *paths* are placed in the prompt; that
        # the AI service can read the files at those paths is not visible
        # from this module -- confirm against AIService.
        prompt = (
            "You are analyzing pages from a healthcare report PDF to create a reusable slide template. "
            "For each page, determine the slide layout type and data requirements.\n\n"
            "Available slide layouts:\n"
            "  - cover: Title page with report name, date, organization\n"
            "  - section_divider: Section separator with number/label and title\n"
            "  - kpi_dashboard: Grid of metric cards (numbers, percentages)\n"
            "  - full_chart: Full-width chart (bar, line, donut, pie)\n"
            "  - chart_metrics: Chart on left + sidebar metrics on right\n"
            "  - data_table: Table with rows of data (doctors, complaints, etc.)\n"
            "  - two_column: Two columns of content (text + bullets)\n"
            "  - quote: Large quote/callout text\n"
            "  - timeline: Horizontal timeline with dates/events\n"
            "  - comparison: Side-by-side comparison panels\n"
            "  - team_grid: Grid of cards (people/departments with metrics)\n"
            "  - closing: Thank you page with contact info\n\n"
            "Also look for patterns:\n"
            "  - Does a table layout repeat for different entities (departments, doctors, etc.)?\n"
            "  - What color coding is used for rows/cells?\n"
            "  - What are the section headers/titles?\n\n"
            # A JSON *object* is requested because the call below asks for
            # response_format="json_object" and the parsing expects "pages".
            'Return a JSON object with a single "pages" key whose value is an array '
            "with one object per page:\n"
            "{\n"
            '  "pages": [\n'
            "    {\n"
            '      "page_number": 1,\n'
            '      "layout": "cover",\n'
            '      "title": "extracted title",\n'
            '      "subtitle": "extracted subtitle",\n'
            '      "section_label": "",\n'
            '      "is_repeated_pattern": false,\n'
            '      "table_columns": [],\n'
            '      "chart_type": "",\n'
            '      "data_description": "what data this page shows",\n'
            '      "colors": {"primary": "", "secondary": "", "accent": ""}\n'
            "    }\n"
            "  ]\n"
            "}\n\n"
            f"The PDF has pages at these image paths:\n{pages_desc}\n\n"
            "IMPORTANT: Group repeated table pages into a single entry with is_repeated_pattern=true. "
            "For example, if pages 5-15 all show the same table structure for different departments, "
            "return ONE entry for page 5 with is_repeated_pattern=true and describe the repeat pattern."
        )

        try:
            response = AIService.chat_completion(
                prompt=prompt,
                system_prompt="You are a document structure analyst. Analyze the described report pages and return valid JSON only.",
                response_format="json_object",
                temperature=0.2,
                max_tokens=3000,
            )
            entries = self._coerce_page_entries(json.loads(response))
        except Exception:
            # Best effort: any failure of the AI call or of the JSON decoding
            # degrades to placeholder entries rather than aborting the parse.
            logger.exception("AI page analysis failed")
            entries = None

        if entries is None:
            return self._placeholder_analyses(len(page_paths), start_page)
        return entries

    def _coerce_page_entries(self, result):
        """Turn a decoded AI response into a list of page dicts.

        Accepts ``{"pages": [...]}``, a bare list, or a single page object.
        Non-dict items inside the list are dropped. Returns ``None`` when the
        response has none of these shapes.
        """
        if isinstance(result, dict):
            result = result["pages"] if "pages" in result else [result]
        if not isinstance(result, list):
            logger.error(
                "AI page analysis returned an unexpected %s payload",
                type(result).__name__,
            )
            return None
        return [entry for entry in result if isinstance(entry, dict)]

    def _placeholder_analyses(self, page_count, start_page):
        """Build one generic ``data_table`` entry per page of a failed batch."""
        return [
            {
                "page_number": page_number,
                "layout": "data_table",
                "title": f"Page {page_number}",
                "subtitle": "",
                "is_repeated_pattern": False,
                "data_description": "Could not analyze this page",
            }
            for page_number in range(start_page, start_page + page_count)
        ]

    def _create_template_slides(self, analyses):
        """Replace the template's slides with slides built from the analysis.

        All existing slides of the template are deleted first. Then, in
        analysis order:

        * the first ``data_table`` page flagged ``is_repeated_pattern``
          becomes one repeating per-department table slide;
        * every later ``data_table`` page is skipped;
        * every other page becomes a plain slide using its own title,
          subtitle and section label.

        Unknown or missing layouts are treated as ``data_table``. Entries
        that are not dicts are skipped, and missing or ``null`` text values
        are stored as empty strings.

        Args:
            analyses: Iterable of page dicts as produced by the page analysis.
        """
        self.template.template_slides.all().delete()

        order = 0
        seen_table_pattern = False

        for page in analyses:
            if not isinstance(page, dict):
                logger.warning("Skipping non-object page analysis entry: %r", page)
                continue

            def text(key, _page=page):
                # AI-produced JSON may carry null (or a number) where text is
                # expected; never hand None to the slide's text fields.
                value = _page.get(key)
                return "" if value is None else str(value)

            layout = page.get("layout", "data_table")
            if layout not in LAYOUT_CHOICES:
                layout = "data_table"

            is_repeated = page.get("is_repeated_pattern", False)

            if layout == "data_table" and is_repeated and not seen_table_pattern:
                seen_table_pattern = True
                columns = page.get("table_columns", [])
                content_mapping = self._build_table_mapping(columns)

                ReportTemplateSlide.objects.create(
                    template=self.template,
                    order=order,
                    layout=layout,
                    title_template="{{ department_name }}",
                    subtitle_template="{{ manager }} | {{ item_count }} physicians",
                    content_mapping=content_mapping,
                    repeat_source="by_department",
                    repeat_title_key="department_name",
                    max_rows=18,
                    style_overrides=self._get_default_row_colors(),
                )
                order += 1
            elif layout == "data_table" and seen_table_pattern:
                # Once the repeating table slide exists, further table pages
                # are not turned into slides of their own.
                continue
            else:
                content_mapping = self._build_content_mapping(page)

                ReportTemplateSlide.objects.create(
                    template=self.template,
                    order=order,
                    layout=layout,
                    section_label=text("section_label"),
                    title_template=text("title"),
                    subtitle_template=text("subtitle"),
                    content_mapping=content_mapping,
                )
                order += 1

    def _build_table_mapping(self, columns):
        """Build the content mapping for the repeating data-table slide.

        Args:
            columns: Header labels for the table. When empty or ``None`` the
                defaults ``ID``, ``Name``, ``Total``, ``Average`` are used.

        Returns:
            A dict with the ``headers``, a fixed four-cell ``row_template``
            and the ``row_color`` rules keyed on ``avg_rating``.
        """
        headers = columns or ["ID", "Name", "Total", "Average"]

        row_cells = [
            {"field": "employee_id", "type": "text"},
            {"field": "name", "type": "text", "font_weight": "600"},
            {"field": "total_surveys", "type": "text"},
            {"field": "avg_rating", "type": "rating_bar"},
        ]

        thresholds = (
            ("gte", 4.5, "high"),
            ("gte", 3.0, "medium"),
            ("lt", 3.0, "low"),
        )
        coloring = {
            "field": "avg_rating",
            "rules": [
                {"op": operator, "value": bound, "class": css_class}
                for operator, bound, css_class in thresholds
            ],
            "highlight_top": True,
            "highlight_class": "top",
        }

        return {
            "headers": headers,
            "row_template": row_cells,
            "row_color": coloring,
        }

    def _get_default_row_colors(self):
        """Return the default row-coloring style overrides.

        Returns:
            A new dict with a single ``row_color`` entry: rows are classed
            ``high`` / ``medium`` / ``low`` by ``avg_rating`` thresholds and
            the top row is highlighted with the ``top`` class.
        """
        thresholds = (
            ("gte", 4.5, "high"),
            ("gte", 3.0, "medium"),
            ("lt", 3.0, "low"),
        )
        rating_rules = []
        for operator, bound, css_class in thresholds:
            rating_rules.append({"op": operator, "value": bound, "class": css_class})

        overrides = {}
        overrides["row_color"] = {
            "field": "avg_rating",
            "rules": rating_rules,
            "highlight_top": True,
            "highlight_class": "top",
        }
        return overrides

    def _build_content_mapping(self, page):
        """Return the empty content-mapping skeleton for a page's layout.

        Args:
            page: Page dict; only its ``layout`` value is consulted.

        Returns:
            A new dict pre-populated with the keys the layout expects.
            Layouts without specific keys (including unknown ones) get ``{}``.
        """
        kind = page.get("layout", "")

        if kind == "kpi_dashboard":
            return {"metrics": []}
        if kind == "full_chart" or kind == "chart_metrics":
            return {"chart_config": {}}
        if kind == "team_grid":
            return {"source_path": "", "sort_by": "", "limit": 10}
        if kind == "section_divider":
            return {"section_label": ""}

        # two_column, quote, cover, closing and anything unrecognized carry
        # no layout-specific mapping.
        return {}

    def _extract_style_config(self, analyses):
        colors = {}
        for page in analyses:
            page_colors = page.get("colors", {})
            for key, val in page_colors.items():
                if val and key not in colors:
                    colors[key] = val

        if colors:
            self.template.style_config = {"colors": colors}
            self.template.save(update_fields=["style_config"])