Nikhil Singh committed on
Commit
b10c920
1 Parent(s): d90af9d
Files changed (2) hide show
  1. app.py +19 -4
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
 
 
3
  from mailparser import parse_from_file
4
  from bs4 import BeautifulSoup
5
  from gliner import GLiNER
@@ -11,6 +12,9 @@ import os
11
  import en_core_web_sm
12
  nlp = en_core_web_sm.load()
13
 
 
 
 
14
  _MODEL = {}
15
  _CACHE_DIR = os.environ.get("CACHE_DIR", None)
16
 
@@ -58,6 +62,13 @@ def parse_query(sentences: List[str], labels: List[str], threshold: float = 0.3,
58
 
59
  return results
60
 
 
 
 
 
 
 
 
61
  def present(email_file, labels, multilingual=False):
62
  email = accept_mail(email_file)
63
  cleaned_text = clean_email(email)
@@ -67,16 +78,18 @@ def present(email_file, labels, multilingual=False):
67
  entities = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False, model_name="urchade/gliner_base", multilingual=multilingual)
68
 
69
  # Format entities for DataFrame: Convert list of dicts to list of lists
70
- entities_data = [[entity['text'], entity['label']] for entity in entities]
 
 
71
 
72
  email_info = {
73
  "Subject": email.subject,
74
  "From": email.from_,
75
  "To": email.to,
76
  "Date": email.date,
77
- "Extracted Entities": entities_data # Adjusted for DataFrame
78
  }
79
- return [email_info[key] for key in ["Subject", "From", "To", "Date"]] + [entities_data]
80
 
81
  labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
82
 
@@ -96,8 +109,10 @@ demo = gr.Interface(
96
  gr.components.Textbox(label="From"),
97
  gr.components.Textbox(label="To"),
98
  gr.components.Textbox(label="Date"),
99
- gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities")
 
100
  ],
 
101
  title="Email Info Extractor",
102
  description="Upload an email file (.eml) to extract its details and detected entities."
103
  )
 
1
  import gradio as gr
2
 
3
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
4
  from mailparser import parse_from_file
5
  from bs4 import BeautifulSoup
6
  from gliner import GLiNER
 
12
  import en_core_web_sm
13
  nlp = en_core_web_sm.load()
14
 
15
+ t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
16
+ t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
17
+
18
  _MODEL = {}
19
  _CACHE_DIR = os.environ.get("CACHE_DIR", None)
20
 
 
62
 
63
  return results
64
 
65
def refine_entities_with_t5(entities):
    """Run the extracted entities through the module-level T5 model.

    Builds a single prompt of the form
    ``"refine entities: <text> as <label> ; <text> as <label> ..."`` and
    returns the decoded text that ``t5_model.generate`` produces for it.

    Args:
        entities: Iterable of entities. Each item may be either a mapping
            with ``'text'`` and ``'label'`` keys or a ``[text, label]``
            pair. The caller in this file converts the entity dicts into
            ``[text, label]`` lists before calling this function, and
            indexing those with string keys raised ``TypeError``, so both
            shapes are accepted here.

    Returns:
        The decoded model output as a string, or ``""`` when there are no
        entities (the model is not invoked on an empty prompt).
    """
    fragments = []
    for entity in entities:
        if isinstance(entity, dict):
            text, label = entity['text'], entity['label']
        else:
            # Sequence form: [text, label].
            text, label = entity[0], entity[1]
        fragments.append(f"{text} as {label}")

    # Nothing to refine: skip the model call instead of generating text
    # from the bare "refine entities: " prefix.
    if not fragments:
        return ""

    inputs = "refine entities: " + " ; ".join(fragments)
    input_ids = t5_tokenizer.encode(inputs, return_tensors="pt", add_special_tokens=True)
    # generate() is called with no extra generation arguments.
    outputs = t5_model.generate(input_ids)
    return t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
71
+
72
  def present(email_file, labels, multilingual=False):
73
  email = accept_mail(email_file)
74
  cleaned_text = clean_email(email)
 
78
  entities = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False, model_name="urchade/gliner_base", multilingual=multilingual)
79
 
80
  # Format entities for DataFrame: Convert list of dicts to list of lists
81
+ entities = [[entity['text'], entity['label']] for entity in entities]
82
+
83
+ refined_entities = refine_entities_with_t5(entities)
84
 
85
  email_info = {
86
  "Subject": email.subject,
87
  "From": email.from_,
88
  "To": email.to,
89
  "Date": email.date,
90
+ "Extracted Entities": refined_entities
91
  }
92
+ return [email_info[key] for key in email_info]
93
 
94
  labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
95
 
 
109
  gr.components.Textbox(label="From"),
110
  gr.components.Textbox(label="To"),
111
  gr.components.Textbox(label="Date"),
112
+ gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities"),
113
+ gr.components.Textbox(label="Refined Entities")
114
  ],
115
+ layout="horizontal",
116
  title="Email Info Extractor",
117
  description="Upload an email file (.eml) to extract its details and detected entities."
118
  )
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gliner
 
2
  mail-parser
3
  gradio
4
  beautifulsoup4
 
1
  gliner
2
+ transformers
3
  mail-parser
4
  gradio
5
  beautifulsoup4