from itertools import chain
from models import Models
#from output_model import OutputModel, WorkExperience
from segmenter import ResumeSegmenter
from flashtext import KeywordProcessor
from collections import defaultdict
import re
import wordninja
from utils import percentage_difference
from nltk import word_tokenize
class ResumeParser:
    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()
        
        
    def get_date_index(self, clean_resume_lines, date):
        # indexes of every line that mentions the given date string
        indexes = [i for i, line in enumerate(clean_resume_lines) if date in line]
        return indexes
    
    # better suited to a utils file
    def sort_tokens_table(self, tokens_data):
        table = {}
        for key, tokens in tokens_data:
            for token in tokens:
                table[token] = key
        return table
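    # Illustrative example (not in the original source):
    #   sort_tokens_table([("date", ["2020", "2021"])])
    #   -> {"2020": "date", "2021": "date"}  # each token maps back to its key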
    
    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        # locate every line that mentions a work date, then use those line
        # indexes as boundaries between individual work experiences
        dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
        dates_indexes = list(chain.from_iterable(dates_indexes))
        dates_indexes = [i + start_index for i in dates_indexes]
        dates_indexes = sorted(set([start_index + 1] + dates_indexes + [end_index]))
        individual_sections = []
        for i, index in enumerate(dates_indexes):
            section = resume_lines[index:dates_indexes[min(i + 1, len(dates_indexes) - 1)]]
            if len(section) == 0:
                continue
            individual_sections.append(section)

        return individual_sections
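    # Illustrative walk-through (hypothetical numbers): with start_index=10,
    # end_index=20 and dates found on absolute lines 12 and 16, the sorted
    # boundaries are [11, 12, 16, 20], yielding the sections
    # resume_lines[11:12], resume_lines[12:16] and resume_lines[16:20].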
    
    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        _, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        if sections is None:
            return None
        if section_header not in sections:
            return None
        start_index = sections[section_header][0]
        end_index = sections[section_header][1]
        # on the basis that dates would be unique
        return start_index, end_index
    
    def format_output(self, keywords, headlines, isWorkExp=True):
        data = []
        for section in headlines:
            extracted_data = {}
            paragraph = '\n'.join(section) if isWorkExp else ' '.join(section)
            extracted_data['description'] = paragraph
            # wordninja re-splits words whose spaces were lost during text
            # extraction; prefer the recovered text when the token counts
            # differ by more than 50%
            recovered_headlines = ' '.join(wordninja.split(paragraph))
            if percentage_difference(len(word_tokenize(paragraph)), len(word_tokenize(recovered_headlines))) > 50:
                extracted_data['description'] = recovered_headlines
            for attr in keywords:
                result = list(set([s for s in attr[1] if s in paragraph or s in recovered_headlines]))
                if len(result) > 0:
                    extracted_data[attr[0]] = result
            data.append(extracted_data)
        return data
        
      
    def parse_work_history(self, resume_lines, sections):
        start_index, end_index = sections['work_and_employment']
        text = ' '.join(resume_lines[start_index:end_index])
        recovered_text = ' '.join(wordninja.split(text))
        work_dates, companies, locations = self.models.get_ner(text, recovered_text)
        single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
        entity_dict = self.models.get_custom_ner(text)
        job_positions = entity_dict['job title']
        keywords = [("date", work_dates), ("title", job_positions), ("company", companies), ("location", locations)]
        return self.format_output(keywords, single_work_experiences)
    
    def parse_education(self, resume_lines, sections):
        start_index, end_index = sections["education_and_training"]
        text = ' '.join(resume_lines[start_index:end_index])
        dates, universities, locations = self.models.get_ner(text, ' '.join(wordninja.split(text)))
        single_education_experiences = self.split_work_exp(resume_lines, start_index, end_index, dates)
        entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index+1:end_index]))
        degrees = entity_dict['degree']
        majors = entity_dict['major']
        keywords = [("date", dates), ("major", majors), ("degree", degrees),
                    ("university", universities), ("location", locations)]
        output = self.format_output(keywords, single_education_experiences, False)
        # drop empty entries
        output = [res for res in output if res]
        return output
      
    def parse_basic_info(self, resume_lines, sections):
        start_index, end_index = sections["basics_info"]
        text = ' '.join(resume_lines[start_index:end_index])
        phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}'
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        entities = self.models.ner(text)
        if len(entities) == 0:
            # fall back to the whole resume if the basic-info section yields nothing
            entities = self.models.ner(' '.join(resume_lines))

        # keep the highest-scoring PER entity that looks like a full name
        output = {}
        score = 0
        for entity in entities:
            if entity['entity_group'] == 'PER' and entity['score'] > score and ' ' in entity['word']:
                output['name'] = entity['word']
                score = entity['score']

        email = re.findall(email_pattern, text)
        phone = re.findall(phone_pattern, text)
        # re.findall returns a list, so test for emptiness rather than ''
        if not email:
            email = re.findall(email_pattern, ' '.join(resume_lines))
        if not phone:
            phone = re.findall(phone_pattern, ' '.join(resume_lines))
        output['email'] = email[0] if len(email) > 0 else ''
        output['phone'] = phone[0] if len(phone) > 0 else ''
        return output
    
    def parse(self, resume_lines):
        # reset segmenter state so repeated calls don't accumulate results
        self.resumeSegmenter.resume_segments = {
                'objective': [],
                'work_and_employment': [],
                'education_and_training': [],
                'skills': [],
                'accomplishments': [],
                'misc': []
            }
        self.resumeSegmenter.resume_indices = []
        sections = self.resumeSegmenter.segment(resume_lines)
        if sections is None:
            return {}
        jobs = self.parse_work_history(resume_lines, sections) if 'work_and_employment' in sections else {}
        education = self.parse_education(resume_lines, sections) if 'education_and_training' in sections else {}
        basic_info = self.parse_basic_info(resume_lines, sections) if 'basics_info' in sections else {}
        result = {"basic_info": basic_info, "education": education, "work_experience": jobs}
        # any remaining section is returned as recovered plain text
        for section in sections.keys():
            if section not in ['work_and_employment', 'education_and_training', 'basics_info']:
                text = '\n'.join(resume_lines[sections[section][0]:sections[section][1]])
                result[section] = ' '.join(wordninja.split(text))
        return result
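
# Minimal usage sketch (illustrative, not part of the original module). It
# assumes a plain-text resume has already been read and split into lines, and
# that the ResumeSegmenter/Models dependencies are importable; "resume.txt"
# is a hypothetical input file.
if __name__ == "__main__":
    with open("resume.txt", encoding="utf-8") as f:  # hypothetical path
        resume_lines = [line.rstrip() for line in f]
    parser = ResumeParser()
    result = parser.parse(resume_lines)
    print(result["basic_info"])
    print(result["work_experience"])
    print(result["education"])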