Spaces:
Sleeping
Sleeping
Nikhil Singh
committed on
Commit
•
b10c920
1
Parent(s):
d90af9d
added T5
Browse files- app.py +19 -4
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
|
|
|
3 |
from mailparser import parse_from_file
|
4 |
from bs4 import BeautifulSoup
|
5 |
from gliner import GLiNER
|
@@ -11,6 +12,9 @@ import os
|
|
11 |
import en_core_web_sm
|
12 |
nlp = en_core_web_sm.load()
|
13 |
|
|
|
|
|
|
|
14 |
_MODEL = {}
|
15 |
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
|
16 |
|
@@ -58,6 +62,13 @@ def parse_query(sentences: List[str], labels: List[str], threshold: float = 0.3,
|
|
58 |
|
59 |
return results
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
def present(email_file, labels, multilingual=False):
|
62 |
email = accept_mail(email_file)
|
63 |
cleaned_text = clean_email(email)
|
@@ -67,16 +78,18 @@ def present(email_file, labels, multilingual=False):
|
|
67 |
entities = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False, model_name="urchade/gliner_base", multilingual=multilingual)
|
68 |
|
69 |
# Format entities for DataFrame: Convert list of dicts to list of lists
|
70 |
-
|
|
|
|
|
71 |
|
72 |
email_info = {
|
73 |
"Subject": email.subject,
|
74 |
"From": email.from_,
|
75 |
"To": email.to,
|
76 |
"Date": email.date,
|
77 |
-
"Extracted Entities":
|
78 |
}
|
79 |
-
return [email_info[key] for key in
|
80 |
|
81 |
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
|
82 |
|
@@ -96,8 +109,10 @@ demo = gr.Interface(
|
|
96 |
gr.components.Textbox(label="From"),
|
97 |
gr.components.Textbox(label="To"),
|
98 |
gr.components.Textbox(label="Date"),
|
99 |
-
gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities")
|
|
|
100 |
],
|
|
|
101 |
title="Email Info Extractor",
|
102 |
description="Upload an email file (.eml) to extract its details and detected entities."
|
103 |
)
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
4 |
from mailparser import parse_from_file
|
5 |
from bs4 import BeautifulSoup
|
6 |
from gliner import GLiNER
|
|
|
12 |
import en_core_web_sm
|
13 |
nlp = en_core_web_sm.load()
|
14 |
|
15 |
+
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
16 |
+
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
17 |
+
|
18 |
_MODEL = {}
|
19 |
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
|
20 |
|
|
|
62 |
|
63 |
return results
|
64 |
|
65 |
+
def refine_entities_with_t5(entities):
    """Ask the T5 model to rewrite the extracted entities as a single string.

    Builds the prompt ``"refine entities: <text> as <label> ; ..."`` and runs
    it through the module-level ``t5_tokenizer`` / ``t5_model``.

    Args:
        entities: Iterable of entities. Each item may be a mapping with
            ``"text"`` and ``"label"`` keys, or a ``[text, label]`` pair.
            Pairs must be accepted because ``present`` converts the entity
            dicts to two-element lists before calling this function;
            indexing those lists with string keys raised ``TypeError``.

    Returns:
        The decoded text generated by T5, or ``""`` when ``entities`` is
        empty (there is nothing to refine, so the model is not invoked).
    """
    pairs = []
    for entity in entities:
        if isinstance(entity, dict):
            pairs.append((entity["text"], entity["label"]))
        else:
            # Already flattened to [text, label] by the caller.
            text, label = entity
            pairs.append((text, label))

    if not pairs:
        return ""

    prompt = "refine entities: " + " ; ".join(
        f"{text} as {label}" for text, label in pairs
    )
    input_ids = t5_tokenizer.encode(
        prompt, return_tensors="pt", add_special_tokens=True
    )
    outputs = t5_model.generate(input_ids)
    return t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
71 |
+
|
72 |
def present(email_file, labels, multilingual=False):
|
73 |
email = accept_mail(email_file)
|
74 |
cleaned_text = clean_email(email)
|
|
|
78 |
entities = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False, model_name="urchade/gliner_base", multilingual=multilingual)
|
79 |
|
80 |
# Format entities for DataFrame: Convert list of dicts to list of lists
|
81 |
+
entities = [[entity['text'], entity['label']] for entity in entities]
|
82 |
+
|
83 |
+
refined_entities = refine_entities_with_t5(entities)
|
84 |
|
85 |
email_info = {
|
86 |
"Subject": email.subject,
|
87 |
"From": email.from_,
|
88 |
"To": email.to,
|
89 |
"Date": email.date,
|
90 |
+
"Extracted Entities": refined_entities
|
91 |
}
|
92 |
+
return [email_info[key] for key in email_info]
|
93 |
|
94 |
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
|
95 |
|
|
|
109 |
gr.components.Textbox(label="From"),
|
110 |
gr.components.Textbox(label="To"),
|
111 |
gr.components.Textbox(label="Date"),
|
112 |
+
gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities"),
|
113 |
+
gr.components.Textbox(label="Refined Entities")
|
114 |
],
|
115 |
+
layout="horizontal",
|
116 |
title="Email Info Extractor",
|
117 |
description="Upload an email file (.eml) to extract its details and detected entities."
|
118 |
)
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
gliner
|
|
|
2 |
mail-parser
|
3 |
gradio
|
4 |
beautifulsoup4
|
|
|
1 |
gliner
|
2 |
+
transformers
|
3 |
mail-parser
|
4 |
gradio
|
5 |
beautifulsoup4
|