from itertools import chain
from models import Models
#from output_model import OutputModel, WorkExperience
from segmenter import ResumeSegmenter
from flashtext import KeywordProcessor
from collections import defaultdict
import re
import wordninja
from utils import percentage_difference
from nltk import word_tokenize
class ResumeParser:
    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()
        
        
    def get_date_index(self, clean_resume_lines, date):
        # indexes of every line that mentions the given date string
        indexes = [i for i, line in enumerate(clean_resume_lines) if date in line]
        return indexes
    
    # better suited to a utils file
    def sort_tokens_table(self, tokens_data):
        table = {}
        for key, tokens in tokens_data:
            for token in tokens:
                table[token] = key
        return table
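    # Illustrative example (not in the original source):
    #   sort_tokens_table([("date", ["2020", "2021"])])
    #   -> {"2020": "date", "2021": "date"}  # each token maps back to its key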
    
    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        # locate every line that mentions a work date, then use those line
        # indexes as boundaries between individual work experiences
        dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
        dates_indexes = list(chain.from_iterable(dates_indexes))
        dates_indexes = [i + start_index for i in dates_indexes]
        dates_indexes = sorted(set([start_index + 1] + dates_indexes + [end_index]))
        individual_sections = []
        for i, index in enumerate(dates_indexes):
            section = resume_lines[index:dates_indexes[min(i + 1, len(dates_indexes) - 1)]]
            if len(section) == 0:
                continue
            individual_sections.append(section)

        return individual_sections
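    # Illustrative walk-through (hypothetical numbers): with start_index=10,
    # end_index=20 and dates found on absolute lines 12 and 16, the sorted
    # boundaries are [11, 12, 16, 20], yielding the sections
    # resume_lines[11:12], resume_lines[12:16] and resume_lines[16:20].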
    
    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        _, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        if sections is None:
            return None
        if section_header not in sections:
            return None
        start_index = sections[section_header][0]
        end_index = sections[section_header][1]
        # on the basis that dates would be unique
        return start_index, end_index
    
    def format_output(self, keywords, headlines, isWorkExp=True):
        data = []
        for section in headlines:
            extracted_data = {}
            paragraph = '\n'.join(section) if isWorkExp else ' '.join(section)
            extracted_data['description'] = paragraph
            # wordninja re-splits words whose spaces were lost during text
            # extraction; prefer the recovered text when the token counts
            # differ by more than 50%
            recovered_headlines = ' '.join(wordninja.split(paragraph))
            if percentage_difference(len(word_tokenize(paragraph)), len(word_tokenize(recovered_headlines))) > 50:
                extracted_data['description'] = recovered_headlines
            for attr in keywords:
                result = list(set([s for s in attr[1] if s in paragraph or s in recovered_headlines]))
                if len(result) > 0:
                    extracted_data[attr[0]] = result
            data.append(extracted_data)
        return data
        
      
    def parse_work_history(self, resume_lines, sections):
        start_index, end_index = sections['work_and_employment']
        text = ' '.join(resume_lines[start_index:end_index])
        recovered_text = ' '.join(wordninja.split(text))
        work_dates, companies, locations = self.models.get_ner(text, recovered_text)
        single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
        entity_dict = self.models.get_custom_ner(text)
        job_positions = entity_dict['job title']
        keywords = [("date", work_dates), ("title", job_positions), ("company", companies), ("location", locations)]
        return self.format_output(keywords, single_work_experiences)
    
    def parse_education(self, resume_lines, sections):
        start_index, end_index = sections["education_and_training"]
        text = ' '.join(resume_lines[start_index:end_index])
        dates, universities, locations = self.models.get_ner(text, ' '.join(wordninja.split(text)))
        single_education_experiences = self.split_work_exp(resume_lines, start_index, end_index, dates)
        entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index+1:end_index]))
        degrees = entity_dict['degree']
        majors = entity_dict['major']
        keywords = [("date", dates), ("major", majors), ("degree", degrees),
                    ("university", universities), ("location", locations)]
        output = self.format_output(keywords, single_education_experiences, False)
        # drop empty entries
        output = [res for res in output if res]
        return output
      
    def parse_basic_info(self, resume_lines, sections):
        start_index, end_index = sections["basics_info"]
        text = ' '.join(resume_lines[start_index:end_index])
        phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}'
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        entities = self.models.ner(text)
        if len(entities) == 0:
            # fall back to the whole resume if the basic-info section yields nothing
            entities = self.models.ner(' '.join(resume_lines))

        # keep the highest-scoring PER entity that looks like a full name
        output = {}
        score = 0
        for entity in entities:
            if entity['entity_group'] == 'PER' and entity['score'] > score and ' ' in entity['word']:
                output['name'] = entity['word']
                score = entity['score']

        email = re.findall(email_pattern, text)
        phone = re.findall(phone_pattern, text)
        # re.findall returns a list, so test for emptiness rather than ''
        if not email:
            email = re.findall(email_pattern, ' '.join(resume_lines))
        if not phone:
            phone = re.findall(phone_pattern, ' '.join(resume_lines))
        output['email'] = email[0] if len(email) > 0 else ''
        output['phone'] = phone[0] if len(phone) > 0 else ''
        return output
    
    def parse(self, resume_lines):
        # reset segmenter state so repeated calls don't accumulate results
        self.resumeSegmenter.resume_segments = {
                'objective': [],
                'work_and_employment': [],
                'education_and_training': [],
                'skills': [],
                'accomplishments': [],
                'misc': []
            }
        self.resumeSegmenter.resume_indices = []
        sections = self.resumeSegmenter.segment(resume_lines)
        if sections is None:
            return {}
        jobs = self.parse_work_history(resume_lines, sections) if 'work_and_employment' in sections else {}
        education = self.parse_education(resume_lines, sections) if 'education_and_training' in sections else {}
        basic_info = self.parse_basic_info(resume_lines, sections) if 'basics_info' in sections else {}
        result = {"basic_info": basic_info, "education": education, "work_experience": jobs}
        # any remaining section is returned as recovered plain text
        for section in sections.keys():
            if section not in ['work_and_employment', 'education_and_training', 'basics_info']:
                text = '\n'.join(resume_lines[sections[section][0]:sections[section][1]])
                result[section] = ' '.join(wordninja.split(text))
        return result
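
# Minimal usage sketch (illustrative, not part of the original module). It
# assumes a plain-text resume has already been read and split into lines, and
# that the ResumeSegmenter/Models dependencies are importable; "resume.txt"
# is a hypothetical input file.
if __name__ == "__main__":
    with open("resume.txt", encoding="utf-8") as f:  # hypothetical path
        resume_lines = [line.rstrip() for line in f]
    parser = ResumeParser()
    result = parser.parse(resume_lines)
    print(result["basic_info"])
    print(result["work_experience"])
    print(result["education"])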