# kaauh_ats/test_word_integration.py

#!/usr/bin/env python3
"""
Test script to verify Word document integration in recruitment/tasks.py
"""

import os
import sys
import tempfile

# Make the project importable. The directory is derived from this script's own
# location rather than a machine-specific absolute path, so the script runs
# from any checkout (assumes this file sits in the project directory next to
# the `recruitment` package).
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Import the functions under test. Without them nothing below can run, so
# report the problem and exit with a non-zero status.
try:
    from recruitment.tasks import extract_text_from_document, extract_text_from_pdf, extract_text_from_word
    print("✓ Successfully imported text extraction functions")
except ImportError as e:
    print(f"✗ Failed to import functions: {e}")
    sys.exit(1)

def test_pdf_extraction():
    """Run ``extract_text_from_pdf`` against a minimal temporary PDF file.

    A small PDF-like byte sequence is written to a temporary ``.pdf`` file,
    the file is closed, and its path is handed to the extractor. The outcome
    (success with the extracted text length, or the error message) is printed;
    nothing is returned and extraction errors are not re-raised.

    The temporary file is always removed, whether extraction succeeds or fails.
    """
    print("\n--- Testing PDF Extraction ---")

    # delete=False because the extractor opens the file by path after we have
    # closed our own handle; we remove it ourselves in the ``finally`` below.
    tmp_pdf = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
    tmp_pdf_path = tmp_pdf.name

    try:
        # Closing the file (via ``with``) flushes the buffered bytes to disk so
        # the extractor sees the real content, and releases the handle so the
        # path can be reopened and deleted on every platform.
        with tmp_pdf:
            # Create a simple PDF content (this would normally be a real PDF)
            tmp_pdf.write(b"%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n")

        # Test the PDF extraction
        text = extract_text_from_pdf(tmp_pdf_path)
        print(f"✓ PDF extraction completed. Text length: {len(text)}")

    except Exception as e:
        # Top-level reporting boundary of this test script: print and move on.
        print(f"✗ PDF extraction failed: {e}")

    finally:
        # Clean up on both the success and the failure path.
        if os.path.exists(tmp_pdf_path):
            os.unlink(tmp_pdf_path)

def test_word_extraction():
    """Run ``extract_text_from_word`` against a minimal temporary ``.docx`` file.

    The test is skipped (with a printed notice) when ``DOCX_AVAILABLE`` from
    ``recruitment.tasks`` is falsy. Otherwise a temporary ``.docx`` file
    containing only the ZIP magic bytes is created, closed, and passed to the
    extractor. The outcome is printed; nothing is returned and errors are not
    re-raised.

    The temporary file is always removed, whether extraction succeeds or fails.
    """
    print("\n--- Testing Word Extraction ---")

    try:
        # Check if python-docx is available
        from recruitment.tasks import DOCX_AVAILABLE
        if not DOCX_AVAILABLE:
            print("⚠ python-docx not available. Skipping Word extraction test.")
            return

        # delete=False because the extractor opens the file by path after we
        # have closed our own handle; it is removed in the ``finally`` below.
        tmp_docx = tempfile.NamedTemporaryFile(suffix='.docx', delete=False)

    except Exception as e:
        print(f"✗ Word extraction setup failed: {e}")
        return

    tmp_docx_path = tmp_docx.name

    try:
        # Closing the file (via ``with``) flushes the buffered bytes to disk so
        # the extractor sees the real content, and releases the handle so the
        # path can be reopened and deleted on every platform.
        with tmp_docx:
            tmp_docx.write(b'PK\x03\x04')  # Basic DOCX (ZIP) header

        # Test the Word extraction
        text = extract_text_from_word(tmp_docx_path)
        print(f"✓ Word extraction completed. Text length: {len(text)}")

    except Exception as e:
        # Top-level reporting boundary of this test script: print and move on.
        print(f"✗ Word extraction failed: {e}")

    finally:
        # Single cleanup point for both the success and the failure path.
        if os.path.exists(tmp_docx_path):
            os.unlink(tmp_docx_path)

def test_unified_document_parser():
    """Check how ``extract_text_from_document`` handles invalid inputs.

    Two cases are exercised, and the result of each is printed:

    * a path that does not exist, expected to raise ``FileNotFoundError``;
    * a temporary ``.txt`` file, expected to raise ``ValueError`` as an
      unsupported file type.

    Nothing is returned and errors from the parser are not re-raised. The
    temporary file is always removed.
    """
    print("\n--- Testing Unified Document Parser ---")

    # Test with non-existent file
    try:
        extract_text_from_document('/nonexistent/file.pdf')
        print("✗ Should have failed for non-existent file")
    except FileNotFoundError:
        print("✓ Correctly handled non-existent file")
    except Exception as e:
        print(f"✗ Unexpected error for non-existent file: {e}")

    # Test with unsupported file type.
    # delete=False because the parser receives the path after we have closed
    # our own handle; the file is removed in the ``finally`` below.
    tmp_txt = tempfile.NamedTemporaryFile(suffix='.txt', delete=False)
    tmp_txt_path = tmp_txt.name

    try:
        # Closing the file (via ``with``) flushes the content to disk and
        # releases the handle before the path is used or deleted.
        with tmp_txt:
            tmp_txt.write(b'This is a text file')
    except Exception as e:
        print(f"✗ Test setup failed: {e}")
    else:
        try:
            extract_text_from_document(tmp_txt_path)
            print("✗ Should have failed for unsupported file type")
        except ValueError as e:
            print(f"✓ Correctly handled unsupported file type: {e}")
        except Exception as e:
            print(f"✗ Unexpected error for unsupported file type: {e}")
    finally:
        # Clean up regardless of how the checks above went.
        if os.path.exists(tmp_txt_path):
            os.unlink(tmp_txt_path)

def main():
    """Execute every integration check in order, then print a closing summary."""
    print("Starting Word Document Integration Tests...")

    # The checks are independent; each one reports its own result via print.
    checks = (
        test_pdf_extraction,
        test_word_extraction,
        test_unified_document_parser,
    )
    for run_check in checks:
        run_check()

    summary_lines = (
        "\n--- Test Summary ---",
        "Integration tests completed. Check the output above for any errors.",
        "\nNote: For full Word document processing, ensure python-docx is installed:",
        "pip install python-docx",
    )
    for summary_line in summary_lines:
        print(summary_line)

# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()