#!/usr/bin/env python3
"""
Test script to verify Word document integration in recruitment/tasks.py

Runs three smoke checks against the text-extraction helpers imported from
``recruitment.tasks`` and prints a ✓ / ✗ / ⚠ line for each outcome. The
script never raises on a failed check; read the printed output to judge
the result.
"""

import os
import sys
import tempfile

# Add the project directory to Python path
# NOTE(review): this absolute path is machine-specific; adjust it when the
# script is run from a different checkout.
sys.path.insert(0, '/home/ismail/projects/ats/kaauh_ats')

# Import the tasks module
try:
    from recruitment.tasks import (
        extract_text_from_document,
        extract_text_from_pdf,
        extract_text_from_word,
    )
    print("✓ Successfully imported text extraction functions")
except ImportError as e:
    print(f"✗ Failed to import functions: {e}")
    sys.exit(1)


def _write_temp_file(suffix, payload):
    """Write *payload* to a new temporary file and return its path.

    The file is closed before returning so the bytes are flushed to disk
    and the path can be reopened by the extraction functions on every
    platform. The caller is responsible for deleting the file.
    """
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
        tmp_file.write(payload)
        return tmp_file.name


def _remove_temp_file(path):
    """Delete *path* if it still exists, reporting (not raising) on failure."""
    try:
        if os.path.exists(path):
            os.unlink(path)
    except OSError as e:
        # Cleanup is best-effort: a leftover temp file must not abort the run.
        print(f"⚠ Could not remove temporary file {path}: {e}")


def test_pdf_extraction():
    """Test PDF text extraction with a sample PDF"""
    print("\n--- Testing PDF Extraction ---")

    tmp_pdf_path = None
    try:
        # Create a simple PDF content (this would normally be a real PDF)
        tmp_pdf_path = _write_temp_file(
            '.pdf',
            b"%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n",
        )

        # Test the PDF extraction
        text = extract_text_from_pdf(tmp_pdf_path)
        print(f"✓ PDF extraction completed. Text length: {len(text)}")
    except Exception as e:
        print(f"✗ PDF extraction failed: {e}")
    finally:
        # Clean up on success and on failure alike
        if tmp_pdf_path is not None:
            _remove_temp_file(tmp_pdf_path)


def test_word_extraction():
    """Test Word text extraction with a sample Word document"""
    print("\n--- Testing Word Extraction ---")

    try:
        # Check if python-docx is available
        from recruitment.tasks import DOCX_AVAILABLE

        if not DOCX_AVAILABLE:
            print("⚠ python-docx not available. Skipping Word extraction test.")
            return

        # Create a temporary Word file holding only the ZIP magic bytes
        tmp_docx_path = _write_temp_file('.docx', b'PK\x03\x04')  # Basic DOCX header
    except Exception as e:
        print(f"✗ Word extraction setup failed: {e}")
        return

    try:
        # Test the Word extraction
        text = extract_text_from_word(tmp_docx_path)
        print(f"✓ Word extraction completed. Text length: {len(text)}")
    except Exception as e:
        print(f"✗ Word extraction failed: {e}")
    finally:
        _remove_temp_file(tmp_docx_path)


def test_unified_document_parser():
    """Test the unified document parser"""
    print("\n--- Testing Unified Document Parser ---")

    # Test with non-existent file
    try:
        extract_text_from_document('/nonexistent/file.pdf')
        print("✗ Should have failed for non-existent file")
    except FileNotFoundError:
        print("✓ Correctly handled non-existent file")
    except Exception as e:
        print(f"✗ Unexpected error for non-existent file: {e}")

    # Test with unsupported file type
    try:
        tmp_txt_path = _write_temp_file('.txt', b'This is a text file')
    except Exception as e:
        print(f"✗ Test setup failed: {e}")
        return

    try:
        extract_text_from_document(tmp_txt_path)
        print("✗ Should have failed for unsupported file type")
    except ValueError as e:
        print(f"✓ Correctly handled unsupported file type: {e}")
    except Exception as e:
        print(f"✗ Unexpected error for unsupported file type: {e}")
    finally:
        _remove_temp_file(tmp_txt_path)


def main():
    """Run all tests"""
    print("Starting Word Document Integration Tests...")

    test_pdf_extraction()
    test_word_extraction()
    test_unified_document_parser()

    print("\n--- Test Summary ---")
    print("Integration tests completed. Check the output above for any errors.")
    print("\nNote: For full Word document processing, ensure python-docx is installed:")
    print("pip install python-docx")


if __name__ == "__main__":
    main()