resume data extraction

This commit is contained in:
Faheed 2025-10-05 19:29:53 +03:00
parent 1aa8b6800a
commit ede6b2760b
59 changed files with 313 additions and 34 deletions

View File

@ -45,7 +45,7 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'rest_framework',
'recruitment',
'recruitment.apps.RecruitmentConfig',
'corsheaders',
'django.contrib.sites',
'allauth',

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,6 +1,6 @@
from django.contrib import messages
from . import models
from .utils import extract_summary_from_pdf
# from .utils import extract_summary_from_pdf
from django.contrib import admin
from django.contrib.auth.admin import UserAdmin as BaseUserAdmin
@ -27,20 +27,25 @@ class GroupAdmin(BaseGroupAdmin, ModelAdmin):
pass
@admin.register(models.JobPosting)
class JobPostingAdmin(ModelAdmin):
    """Admin configuration for ``JobPosting`` records."""

    # Columns rendered in the change-list table.
    list_display = (
        'title',
        'description',
        'qualifications',
    )
@admin.register(models.Job)
class JobAdmin(ModelAdmin):
    """Admin configuration for ``Job`` records."""

    # Columns rendered in the change-list table.
    list_display = (
        'title',
        'is_published',
        'posted_to_linkedin',
        'created_at',
    )
    # Sidebar filters on the two boolean-style publication flags.
    list_filter = (
        'is_published',
        'posted_to_linkedin',
    )
    # Fields covered by the admin search box (title plus both description languages).
    search_fields = (
        'title',
        'description_en',
        'description_ar',
    )
@admin.action(description="Parse selected resumes")
def parse_resumes(modeladmin, request, queryset):
    """Admin action: extract a summary from each selected candidate's resume.

    Candidates without a resume are skipped, as are resumes for which the
    extractor reports an error (it returns a dict with an ``'error'`` key when
    the file is missing). The success message reports only the resumes that
    were actually parsed.

    Args:
        modeladmin: The ``ModelAdmin`` the action was invoked from (unused).
        request: The current request, used to attach user-facing messages.
        queryset: The candidates selected in the change list.
    """
    parsed = 0
    skipped = 0
    for candidate in queryset:
        if not candidate.resume:
            skipped += 1
            continue
        summary = extract_summary_from_pdf(candidate.resume.path)
        # Do not persist an error payload as if it were a parsed summary.
        if isinstance(summary, dict) and 'error' in summary:
            skipped += 1
            continue
        candidate.parsed_summary = str(summary)
        candidate.save()
        parsed += 1

    messages.success(request, f"Parsed {parsed} resumes successfully.")
    if skipped:
        messages.warning(
            request,
            f"Skipped {skipped} candidates with a missing or unreadable resume.",
        )
# @admin.action(description="Parse selected resumes")
# def parse_resumes(modeladmin, request, queryset):
# for candidate in queryset:
# if candidate.resume:
# summary = extract_summary_from_pdf(candidate.resume.path)
# candidate.parsed_summary = str(summary)
# candidate.save()
# messages.success(request, f"Parsed {queryset.count()} resumes successfully.")
@admin.register(models.Candidate)
class CandidateAdmin(ModelAdmin):
@ -48,7 +53,7 @@ class CandidateAdmin(ModelAdmin):
list_filter = ('applied', 'job')
search_fields = ('name', 'email')
# readonly_fields = ('parsed_summary',)
actions = [parse_resumes]
@admin.register(models.TrainingMaterial)
class TrainingMaterialAdmin(ModelAdmin):

View File

@ -4,3 +4,5 @@ from django.apps import AppConfig
class RecruitmentConfig(AppConfig):
    """Application configuration for the ``recruitment`` app."""

    name = 'recruitment'
    default_auto_field = 'django.db.models.BigAutoField'

    def ready(self):
        """Import the signals module so its ``@receiver`` handlers get connected."""
        # Imported purely for its side effects; the name itself is not used.
        from recruitment import signals  # noqa: F401

View File

@ -0,0 +1,33 @@
# Generated by Django 5.2.7 on 2025-10-05 13:12
from django.db import migrations, models
# Schema migration: adds the four scoring-related columns to the Candidate model.
class Migration(migrations.Migration):
# Must be applied after migration 0012 of the recruitment app.
dependencies = [
('recruitment', '0012_form_formsubmission_uploadedfile'),
]
operations = [
# JSON column for the per-criterion checklist; defaults to an empty dict.
migrations.AddField(
model_name='candidate',
name='criteria_checklist',
field=models.JSONField(blank=True, default=dict),
),
# Nullable integer score; NULL until a score has been stored.
migrations.AddField(
model_name='candidate',
name='match_score',
field=models.IntegerField(blank=True, null=True),
),
# Free-text strengths summary; may be left blank.
migrations.AddField(
model_name='candidate',
name='strengths',
field=models.TextField(blank=True),
),
# Free-text weaknesses summary; may be left blank.
migrations.AddField(
model_name='candidate',
name='weaknesses',
field=models.TextField(blank=True),
),
]

View File

@ -0,0 +1,31 @@
# Generated by Django 5.2.7 on 2025-10-05 16:11
import django.db.models.deletion
from django.db import migrations, models
# Schema migration: creates the Source model and links JobPosting to it.
class Migration(migrations.Migration):
# Must be applied after migration 0013 of the recruitment app.
dependencies = [
('recruitment', '0013_candidate_criteria_checklist_candidate_match_score_and_more'),
]
operations = [
# New table with an auto primary key, a choice-restricted name and a creation timestamp.
migrations.CreateModel(
name='Source',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(choices=[('ATS', 'Applicant Tracking System'), ('ERP', 'ERP system')], max_length=100, verbose_name='Source Type')),
('created_at', models.DateTimeField(auto_now_add=True)),
],
options={
'verbose_name': 'Source',
'verbose_name_plural': 'Sources',
},
),
# Optional foreign key from JobPosting to Source; deleting a Source sets this column to NULL.
migrations.AddField(
model_name='jobposting',
name='source',
field=models.ForeignKey(blank=True, help_text='The system or channel from which this job posting originated or was first published.', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='job_postings', to='recruitment.source'),
),
]

View File

@ -103,6 +103,15 @@ class JobPosting(Base):
start_date = models.DateField(null=True, blank=True, help_text="Desired start date")
open_positions = models.PositiveIntegerField(default=1, help_text="Number of open positions for this job")
source = models.ForeignKey(
'Source',
on_delete=models.SET_NULL, # Recommended: If a source is deleted, job's source is set to NULL
related_name='job_postings',
null=True,
blank=True,
help_text="The system or channel from which this job posting originated or was first published."
)
class Meta:
ordering = ['-created_at']
verbose_name = "Job Posting"
@ -114,7 +123,7 @@ class JobPosting(Base):
def save(self, *args, **kwargs):
# Generate unique internal job ID if not exists
if not self.internal_job_id:
prefix = "UNIV"
prefix = "KAAUH"
year = timezone.now().year
# Get next sequential number
last_job = JobPosting.objects.filter(
@ -188,6 +197,12 @@ class Candidate(Base):
offer_status = models.CharField(choices=Status.choices,max_length=100, null=True, blank=True, verbose_name=_('Offer Status'))
join_date = models.DateField(null=True, blank=True, verbose_name=_('Join Date'))
# Scoring fields (populated by signal)
match_score = models.IntegerField(null=True, blank=True)
strengths = models.TextField(blank=True)
weaknesses = models.TextField(blank=True)
criteria_checklist = models.JSONField(default=dict, blank=True)
class Meta:
verbose_name = _('Candidate')
verbose_name_plural = _('Candidates')
@ -297,6 +312,8 @@ class Form(models.Model):
def __str__(self):
return self.title
class FormSubmission(models.Model):
form = models.ForeignKey(Form, on_delete=models.CASCADE, related_name='submissions')
submission_data = models.JSONField(default=dict) # Stores form responses
@ -312,4 +329,32 @@ class UploadedFile(models.Model):
field_id = models.CharField(max_length=100)
file = models.FileField(upload_to='form_uploads/%Y/%m/%d/')
original_filename = models.CharField(max_length=255)
uploaded_at = models.DateTimeField(auto_now_add=True)
uploaded_at = models.DateTimeField(auto_now_add=True)
class Source(models.Model):
    """A system or channel that a record can be attributed to, identified by type."""

    class SourceType(models.TextChoices):
        # Closed set of values accepted by the ``name`` field below.
        ATS = 'ATS', _('Applicant Tracking System')
        # NOTE(review): the member is named CRM but its stored value and label
        # both say ERP — confirm which one is intended before relying on it.
        CRM = 'ERP', _('ERP system')

    name = models.CharField(
        verbose_name=_('Source Type'),
        choices=SourceType.choices,
        max_length=100,
    )
    # Set once, automatically, when the row is first created.
    created_at = models.DateTimeField(auto_now_add=True)

    class Meta:
        verbose_name = _('Source')
        verbose_name_plural = _('Sources')

    def __str__(self):
        """Return the human-readable label of the selected source type."""
        return str(self.get_name_display())

71
recruitment/signals.py Normal file
View File

@ -0,0 +1,71 @@
from django.db.models.signals import post_save
from django.dispatch import receiver
from . import models
# @receiver(post_save, sender=models.Candidate)
# def parse_resume(sender, instance, created, **kwargs):
# if instance.resume and not instance.summary:
# from .utils import extract_summary_from_pdf,match_resume_with_job_description
# summary = extract_summary_from_pdf(instance.resume.path)
# if 'error' not in summary:
# instance.summary = summary
# instance.save()
# match_resume_with_job_description
import logging
logger = logging.getLogger(__name__)
import os
from .utils import extract_text_from_pdf,score_resume_with_openrouter
@receiver(post_save, sender=models.Candidate)
def score_candidate_resume(sender, instance, created, **kwargs):
    """Score a candidate's resume after the candidate has been saved.

    The resume text is extracted with ``extract_text_from_pdf`` and sent to
    ``score_resume_with_openrouter``; the returned score, strengths, weaknesses
    and criteria checklist are written back to the candidate.

    This is best-effort: any failure is logged and never propagates to the
    code that triggered the save.

    Args:
        sender: The model class that sent the signal.
        instance: The candidate that was just saved.
        created: Whether the save created a new row (unused).
        **kwargs: Remaining signal arguments; only ``update_fields`` is read.
    """
    # Assuming ``resume`` is a Django file field: the attribute is then a
    # wrapper object that is never ``None`` but is falsy when no file is set.
    # A truthiness test covers both that case and a plain ``None``.
    if not instance.resume:
        return

    # Partial saves — including this handler's own save below — pass
    # ``update_fields``. Skipping them is what prevents infinite recursion.
    if kwargs.get('update_fields') is not None:
        return

    try:
        file_path = instance.resume.path
        if not os.path.exists(file_path):
            logger.warning(f"Resume file not found: {file_path}")
            return

        resume_text = extract_text_from_pdf(file_path)
        result = score_resume_with_openrouter(resume_text)

        # The scorer may return an empty or unrelated payload when the request
        # or the parsing failed; do not wipe previously stored scores with it.
        scoring_keys = ('match_score', 'strengths', 'weaknesses', 'criteria_checklist')
        if not isinstance(result, dict) or not any(key in result for key in scoring_keys):
            logger.warning(
                f"No usable scoring result for candidate {instance.id}; "
                "keeping existing values."
            )
            return

        instance.match_score = result.get('match_score')
        instance.strengths = result.get('strengths', '')
        instance.weaknesses = result.get('weaknesses', '')
        instance.criteria_checklist = result.get('criteria_checklist', {})

        # Save only scoring-related fields so the guard above stops re-entry.
        instance.save(update_fields=list(scoring_keys))
        logger.info(f"Successfully scored resume for candidate {instance.id}")
    except Exception as e:
        # Top-level boundary of a signal handler: log with traceback and swallow
        # so a scoring problem never breaks the original save.
        logger.exception(f"Failed to score resume for candidate {instance.id}: {e}")

View File

@ -1,32 +1,124 @@
import os
import fitz # PyMuPDF
import spacy
import requests
# import os
# import fitz # PyMuPDF
# import spacy
# import requests
from recruitment import models
from django.conf import settings
nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_sm")
def extract_text_from_pdf(pdf_path):
# def extract_text_from_pdf(pdf_path):
# text = ""
# with fitz.open(pdf_path) as doc:
# for page in doc:
# text += page.get_text()
# return text
# def extract_summary_from_pdf(pdf_path):
# if not os.path.exists(pdf_path):
# return {'error': 'File not found'}
# text = extract_text_from_pdf(pdf_path)
# doc = nlp(text)
# summary = {
# 'name': doc.ents[0].text if doc.ents else '',
# 'skills': [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) > 1],
# 'summary': text[:500]
# }
# return summary
import requests
from PyPDF2 import PdfReader
import os
import json
import logging
logger = logging.getLogger(__name__)

# SECURITY: credentials must never be committed to source control. The key is
# read from the environment instead; the key that was previously hard-coded
# here has been exposed in history and must be revoked and rotated.
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')
# Model identifier sent to OpenRouter; overridable per deployment.
OPENROUTER_MODEL = os.environ.get('OPENROUTER_MODEL', 'qwen/qwen-2.5-72b-instruct:free')

if not OPENROUTER_API_KEY:
    logger.warning("OPENROUTER_API_KEY not set. Resume scoring will be skipped.")
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        file_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages joined together, with leading and trailing
        whitespace stripped. A page that yields no text contributes an
        empty string.

    Raises:
        Exception: Any error raised while opening or parsing the file is
            logged and then re-raised unchanged.
    """
    logger.debug("Extracting text from PDF: %s", file_path)
    try:
        with open(file_path, "rb") as pdf_file:
            reader = PdfReader(pdf_file)
            # extract_text() may return None/empty for a page; normalise to "".
            page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        logger.exception(f"PDF extraction failed: {e}")
        raise
    return "".join(page_texts).strip()
def extract_summary_from_pdf(pdf_path):
if not os.path.exists(pdf_path):
return {'error': 'File not found'}
def score_resume_with_openrouter(resume_text, *, timeout=60):
    """Ask the OpenRouter chat-completions API to score a resume.

    The prompt embeds ``resume_text`` together with a fixed set of job
    criteria and asks the model to answer with a JSON object.

    Args:
        resume_text: Plain text extracted from the candidate's resume.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The parsed JSON object as a dict (the prompt requests the keys
        ``match_score``, ``strengths``, ``weaknesses`` and
        ``criteria_checklist``). An empty dict is returned when no API key is
        configured, when the API answers with a non-200 status, or when the
        answer cannot be parsed into a JSON object.

    Raises:
        requests.RequestException: If the HTTP request itself fails
            (connection error, timeout, ...).
    """
    if not OPENROUTER_API_KEY:
        logger.warning("OPENROUTER_API_KEY not set. Resume scoring will be skipped.")
        return {}

    # The prompt text is flush-left on purpose: indentation inside the literal
    # would become part of the text sent to the model.
    prompt = f"""
You are an expert technical recruiter. Your task is to score the following candidate for the role of a Senior Data Analyst based on the provided job criteria.
**Job Criteria:**
- Must-Have Skills: Python, SQL, 5+ years of experience.
- Nice-to-Have Skills: Tableau, AWS.
- Experience: Must have led at least one project.
**Candidate's Extracted Resume Text:**
\"\"\"
{resume_text}
\"\"\"
**Your Task:**
Provide a response in strict JSON format with the following keys:
1. 'match_score': A score from 0 to 100 representing how well the candidate fits the role.
2. 'strengths': A brief summary of why the candidate is a strong fit, referencing specific criteria.
3. 'weaknesses': A brief summary of where the candidate falls short or what criteria are missing.
4. 'criteria_checklist': An object where you rate the candidate's match for each specific criterion (e.g., {{'Python': 'Met', 'AWS': 'Not Mentioned'}}).
Only output valid JSON. Do not include any other text.
"""

    logger.debug("Calling OpenRouter model %s", OPENROUTER_MODEL)
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
        },
        data=json.dumps({
            "model": OPENROUTER_MODEL,
            "messages": [{"role": "user", "content": prompt}],
        }),
        # Without a timeout a stalled connection would block the caller forever.
        timeout=timeout,
    )

    if response.status_code != 200:
        logger.error("OpenRouter request failed with status %s", response.status_code)
        return {}

    try:
        content = response.json()["choices"][0]['message']['content']
        # The model may wrap its JSON in a Markdown code fence; strip it.
        cleaned = content.replace("```json", "").replace("```", "").strip()
        result = json.loads(cleaned)
    except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
        # Covers a malformed response envelope as well as invalid JSON content
        # (json.JSONDecodeError is a ValueError).
        logger.error("Could not parse OpenRouter scoring response: %s", e)
        return {}

    if not isinstance(result, dict):
        logger.error("OpenRouter scoring response is not a JSON object")
        return {}
    return result
# print(f"rawraw_output)
# print(response)
# def match_resume_with_job_description(resume, job_description,prompt=""):
# resume_doc = nlp(resume)
# job_doc = nlp(job_description)
# similarity = resume_doc.similarity(job_doc)
# return similarity
def dashboard_callback(request, context):
total_jobs = models.Job.objects.count()

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.