kaauh_ats/test_word_integration.py
2025-10-21 14:19:13 +03:00

124 lines
4.2 KiB
Python

#!/usr/bin/env python3
"""
Test script to verify Word document integration in recruitment/tasks.py
"""
import os
import sys
import tempfile
# Add the project directory to the Python path.  The directory is derived
# from this script's own location instead of a developer-specific absolute
# path, so the script works from any checkout.
# NOTE(review): assumes this script sits in the project root, next to the
# `recruitment` package -- confirm.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Import the extraction helpers under test.  Nothing below can run without
# them, so an import failure is reported and the script exits with status 1.
try:
    from recruitment.tasks import (
        extract_text_from_document,
        extract_text_from_pdf,
        extract_text_from_word,
    )
except ImportError as e:
    print(f"✗ Failed to import functions: {e}")
    sys.exit(1)
else:
    print("✓ Successfully imported text extraction functions")
def test_pdf_extraction():
    """Test PDF text extraction with a sample PDF.

    Writes a short PDF-like byte string to a temporary ``.pdf`` file, passes
    the file's path to ``extract_text_from_pdf`` and prints the length of the
    returned text.  Exceptions raised while writing or extracting are
    reported on stdout instead of being propagated, and the temporary file
    is removed whether or not extraction succeeds.
    """
    print("\n--- Testing PDF Extraction ---")

    # Create a temporary PDF file for testing.  delete=False keeps the file
    # on disk after the handle is closed so it can be reopened by path.
    tmp_pdf = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
    tmp_pdf_path = tmp_pdf.name
    try:
        # Close the handle before extracting: this flushes the buffered
        # bytes to disk and lets the file be reopened by name on platforms
        # that forbid opening an already-open temporary file.
        with tmp_pdf:
            # Create a simple PDF content (this would normally be a real PDF)
            tmp_pdf.write(b"%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n")

        # Test the PDF extraction
        text = extract_text_from_pdf(tmp_pdf_path)
        print(f"✓ PDF extraction completed. Text length: {len(text)}")
    except Exception as e:
        print(f"✗ PDF extraction failed: {e}")
    finally:
        # Clean up on success and on failure alike.
        if os.path.exists(tmp_pdf_path):
            os.unlink(tmp_pdf_path)
def test_word_extraction():
    """Test Word text extraction with a sample Word document.

    Skips the test (with a message) when ``recruitment.tasks.DOCX_AVAILABLE``
    is falsy.  Otherwise writes a four-byte stub to a temporary ``.docx``
    file, passes the file's path to ``extract_text_from_word`` and prints the
    length of the returned text.  Failures are reported on stdout instead of
    being propagated, and the temporary file is always removed.
    """
    print("\n--- Testing Word Extraction ---")
    try:
        # Check if python-docx is available
        from recruitment.tasks import DOCX_AVAILABLE
        if not DOCX_AVAILABLE:
            print("⚠ python-docx not available. Skipping Word extraction test.")
            return

        # Create a temporary Word file for testing.  delete=False keeps the
        # file on disk after the handle is closed.
        tmp_docx = tempfile.NamedTemporaryFile(suffix='.docx', delete=False)
        tmp_docx_path = tmp_docx.name
        try:
            # Close the handle before extracting so the bytes are flushed to
            # disk and the file can be reopened by name on every platform.
            with tmp_docx:
                # Basic DOCX header: only the leading signature bytes, not a
                # complete document.
                # NOTE(review): the extractor will presumably reject this
                # stub and take the failure branch below -- confirm.
                tmp_docx.write(b'PK\x03\x04')

            # Test the Word extraction
            text = extract_text_from_word(tmp_docx_path)
            print(f"✓ Word extraction completed. Text length: {len(text)}")
        except Exception as e:
            print(f"✗ Word extraction failed: {e}")
        finally:
            # Clean up on success and on failure alike.
            if os.path.exists(tmp_docx_path):
                os.unlink(tmp_docx_path)
    except Exception as e:
        print(f"✗ Word extraction setup failed: {e}")
def test_unified_document_parser():
    """Test the unified document parser's error handling.

    Two cases are exercised through ``extract_text_from_document``:

    * a path that does not exist, where ``FileNotFoundError`` is the
      expected outcome;
    * a temporary ``.txt`` file, where ``ValueError`` is the expected
      outcome.

    Each outcome is reported on stdout; nothing is raised to the caller for
    parser errors, and the temporary file is always removed.
    """
    print("\n--- Testing Unified Document Parser ---")

    # Test with non-existent file
    try:
        extract_text_from_document('/nonexistent/file.pdf')
    except FileNotFoundError:
        print("✓ Correctly handled non-existent file")
    except Exception as e:
        print(f"✗ Unexpected error for non-existent file: {e}")
    else:
        print("✗ Should have failed for non-existent file")

    # Test with unsupported file type.  delete=False keeps the file on disk
    # after the handle is closed.
    tmp_txt = tempfile.NamedTemporaryFile(suffix='.txt', delete=False)
    tmp_txt_path = tmp_txt.name
    try:
        # Close the handle before calling the parser so the content is on
        # disk and the file is no longer open when it is later removed.
        with tmp_txt:
            tmp_txt.write(b'This is a text file')

        try:
            extract_text_from_document(tmp_txt_path)
        except ValueError as e:
            print(f"✓ Correctly handled unsupported file type: {e}")
        except Exception as e:
            print(f"✗ Unexpected error for unsupported file type: {e}")
        else:
            print("✗ Should have failed for unsupported file type")
    except Exception as e:
        print(f"✗ Test setup failed: {e}")
    finally:
        # Clean up on success and on failure alike.
        if os.path.exists(tmp_txt_path):
            os.unlink(tmp_txt_path)
def main():
    """Run each integration check in order, then print a closing summary."""
    print("Starting Word Document Integration Tests...")

    checks = (
        test_pdf_extraction,
        test_word_extraction,
        test_unified_document_parser,
    )
    for run_check in checks:
        run_check()

    # The checks report their own results; the summary only points the
    # reader back at that output and at the optional dependency.
    summary_lines = (
        "\n--- Test Summary ---",
        "Integration tests completed. Check the output above for any errors.",
        "\nNote: For full Word document processing, ensure python-docx is installed:",
        "pip install python-docx",
    )
    for summary_line in summary_lines:
        print(summary_line)
# Run the checks only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()