124 lines
4.2 KiB
Python
124 lines
4.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test script to verify Word document integration in recruitment/tasks.py
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
|
|
# Add the project directory to the Python path so that ``recruitment`` can be
# imported when this script is run directly.  The location may be overridden
# with the KAAUH_ATS_PROJECT_DIR environment variable; when it is unset the
# original development checkout path is used, so existing invocations behave
# exactly as before.
sys.path.insert(
    0,
    os.environ.get('KAAUH_ATS_PROJECT_DIR', '/home/ismail/projects/ats/kaauh_ats'),
)

# Import the functions under test.  Nothing below can run without them, so an
# import failure is reported and the script exits with status 1.
try:
    from recruitment.tasks import extract_text_from_document, extract_text_from_pdf, extract_text_from_word
    print("✓ Successfully imported text extraction functions")
except ImportError as e:
    print(f"✗ Failed to import functions: {e}")
    sys.exit(1)
|
|
|
|
def test_pdf_extraction():
    """Smoke-test ``extract_text_from_pdf`` against a throwaway PDF stub.

    A few bytes of PDF header are written to a temporary ``.pdf`` file, the
    file is closed, and its path is handed to the extractor.  The stub is not
    a complete PDF (it only contains a catalog object), so this checks the
    call path rather than the extracted text.

    Returns:
        None.  The outcome is reported on stdout with a ✓ or ✗ line; any
        exception raised while preparing the file or extracting from it is
        caught and printed, never propagated.
    """
    print("\n--- Testing PDF Extraction ---")

    tmp_pdf_path = None
    try:
        # delete=False keeps the file on disk after close().  Leaving the
        # ``with`` block flushes and closes the handle *before* the extractor
        # reopens the file by name, so the bytes are really on disk and the
        # reopen also works on platforms that forbid a second open.
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_pdf:
            tmp_pdf_path = tmp_pdf.name
            # Create a simple PDF content (this would normally be a real PDF)
            tmp_pdf.write(b"%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n")

        # Test the PDF extraction
        text = extract_text_from_pdf(tmp_pdf_path)
        print(f"✓ PDF extraction completed. Text length: {len(text)}")
    except Exception as e:
        print(f"✗ PDF extraction failed: {e}")
    finally:
        # Clean up on success *and* on failure so no stray temp file is left.
        if tmp_pdf_path is not None and os.path.exists(tmp_pdf_path):
            os.unlink(tmp_pdf_path)
|
|
|
|
def test_word_extraction():
    """Smoke-test ``extract_text_from_word`` against a throwaway ``.docx`` stub.

    The test is skipped (with a ⚠ line) when ``recruitment.tasks`` reports
    that python-docx is unavailable via ``DOCX_AVAILABLE``.  Otherwise the
    four-byte ZIP local-file signature is written to a temporary ``.docx``
    file, the file is closed, and its path is handed to the extractor.

    Returns:
        None.  The outcome is reported on stdout.  Extractor errors are
        printed as "Word extraction failed"; errors while importing the
        availability flag or preparing the file are printed as
        "Word extraction setup failed".  Nothing is propagated.
    """
    print("\n--- Testing Word Extraction ---")

    tmp_docx_path = None
    try:
        # Check if python-docx is available
        from recruitment.tasks import DOCX_AVAILABLE
        if not DOCX_AVAILABLE:
            print("⚠ python-docx not available. Skipping Word extraction test.")
            return

        # delete=False keeps the file after close(); leaving the ``with``
        # block flushes and closes it before the extractor reopens it by name.
        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp_docx:
            tmp_docx_path = tmp_docx.name
            tmp_docx.write(b'PK\x03\x04')  # Basic DOCX header

        try:
            # Test the Word extraction
            text = extract_text_from_word(tmp_docx_path)
            print(f"✓ Word extraction completed. Text length: {len(text)}")
        except Exception as e:
            print(f"✗ Word extraction failed: {e}")
    except Exception as e:
        print(f"✗ Word extraction setup failed: {e}")
    finally:
        # Single cleanup point covering success, failure and early return.
        if tmp_docx_path is not None and os.path.exists(tmp_docx_path):
            os.unlink(tmp_docx_path)
|
|
|
|
def test_unified_document_parser():
    """Check how ``extract_text_from_document`` reacts to two bad inputs.

    Two cases are exercised, each reported on stdout with a ✓ or ✗ line:

    1. A path that does not exist; ``FileNotFoundError`` is the expected
       outcome.
    2. A temporary ``.txt`` file; ``ValueError`` is the expected outcome.

    Returns:
        None.  Unexpected exceptions are printed, never propagated.
    """
    print("\n--- Testing Unified Document Parser ---")

    # Test with non-existent file
    try:
        extract_text_from_document('/nonexistent/file.pdf')
        print("✗ Should have failed for non-existent file")
    except FileNotFoundError:
        print("✓ Correctly handled non-existent file")
    except Exception as e:
        print(f"✗ Unexpected error for non-existent file: {e}")

    # Test with unsupported file type
    tmp_txt_path = None
    try:
        # delete=False keeps the file after close(); leaving the ``with``
        # block flushes and closes it before the parser is given the path,
        # and before it is unlinked.
        with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as tmp_txt:
            tmp_txt_path = tmp_txt.name
            tmp_txt.write(b'This is a text file')

        try:
            extract_text_from_document(tmp_txt_path)
            print("✗ Should have failed for unsupported file type")
        except ValueError as e:
            print(f"✓ Correctly handled unsupported file type: {e}")
        except Exception as e:
            print(f"✗ Unexpected error for unsupported file type: {e}")
    except Exception as e:
        print(f"✗ Test setup failed: {e}")
    finally:
        # Clean up regardless of which branch above was taken.
        if tmp_txt_path is not None and os.path.exists(tmp_txt_path):
            os.unlink(tmp_txt_path)
|
|
|
|
def main():
    """Run each integration check in order, then print the closing summary."""
    print("Starting Word Document Integration Tests...")

    checks = (
        test_pdf_extraction,
        test_word_extraction,
        test_unified_document_parser,
    )
    for run_check in checks:
        run_check()

    closing_lines = (
        "\n--- Test Summary ---",
        "Integration tests completed. Check the output above for any errors.",
        "\nNote: For full Word document processing, ensure python-docx is installed:",
        "pip install python-docx",
    )
    for line in closing_lines:
        print(line)


if __name__ == "__main__":
    main()
|