New RAG approach

- .streamlit/config.toml +0 -3
- Dockerfile +12 -10
- RAG.py +22 -10
- app.py +4 -11
- classification.py +2 -44
- cropped_1099-Div.jpg +0 -0
- cropped_1099-Int.jpg +0 -0
- cropped_w2.jpg +0 -0
- cropped_w3.jpg +0 -0
- donut_inference.py +9 -16
- non_form_llama_parse.py +2 -0
.streamlit/config.toml
DELETED
@@ -1,3 +0,0 @@
-[server]
-enableXsrfProtection = false
-enableCORS = false
Dockerfile
CHANGED
@@ -1,8 +1,7 @@
-
 FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
 
 # Set the working directory in the container
-WORKDIR /
+WORKDIR /app
 
 # Install system dependencies, including Python and utilities
 RUN apt-get update && apt-get install -y \
@@ -12,9 +11,6 @@ RUN apt-get update && apt-get install -y \
     poppler-utils \
     && apt-get clean && rm -rf /var/lib/apt/lists/*
 
-
-
-
 # Add a new user to avoid running as root
 RUN useradd -m -u 1000 user
 
@@ -28,16 +24,22 @@ WORKDIR $HOME/app
 
 # Copy the requirements.txt first to leverage Docker cache
 COPY --chown=user requirements.txt $HOME/app/
+RUN pip install torch --extra-index-url https://download.pytorch.org/whl/cu124
 RUN pip install --no-cache-dir -r requirements.txt
-RUN pip uninstall --y faiss-cpu & pip install faiss-gpu
-RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu124
 
 # Copy the rest of the application's code to the container
-COPY --chown=user . $HOME/app
+COPY --chown=user src/app.py $HOME/app
+COPY --chown=user src/classification.py $HOME/app
+COPY --chown=user src/donut_inference.py $HOME/app
+COPY --chown=user src/non_form_llama_parse.py $HOME/app
+COPY --chown=user src/RAG.py $HOME/app
+COPY --chown=user src/.env $HOME/app
+COPY --chown=user images $HOME/app/images
+COPY --chown=user Model $HOME/app/Model
+COPY --chown=user best_resnet152_model.h5 $HOME/app
 
 # Expose the port the app runs on
 EXPOSE 8501
 
 # Set the entry point to run the application
-ENTRYPOINT ["streamlit", "run", "app.py", "--server.enableXsrfProtection", "false"]
-
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.enableXsrfProtection", "false"]
RAG.py
CHANGED
@@ -1,4 +1,8 @@
-from ragatouille import RAGPretrainedModel
+# from ragatouille import RAGPretrainedModel
+from langchain_voyageai import VoyageAIEmbeddings
+# from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_community.vectorstores import FAISS
 from langchain_groq import ChatGroq
 from langchain.chains import RetrievalQA
 from langchain.memory import ConversationBufferMemory
@@ -6,13 +10,17 @@ from langchain.prompts import PromptTemplate
 from dotenv import load_dotenv
 import os
 import streamlit as st
-import asyncio
+# import asyncio
 
 load_dotenv()
 GROQ_API_KEY = os.getenv('GROQ_API_KEY')
+VOYAGE_EMBEDDINGS = os.getenv('VOYAGE_EMBEDDINGS')
 
 llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama3-70b-8192")
-RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
+# RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
+embeddings = VoyageAIEmbeddings(
+    voyage_api_key=VOYAGE_EMBEDDINGS, model="voyage-large-2-instruct"
+)
 system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions.
 Read the given context before answering questions and think step by step. If you can not answer a user question based on
 the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question."""
@@ -30,14 +38,18 @@ memory = ConversationBufferMemory(input_key="question", memory_key="history")
 
 def rag(full_string):
 
-    RAG.index(
-        collection=[full_string],
-        index_name="vector_db",
-        max_document_length=512,
-        split_documents=True,
+    # RAG.index(
+    #     collection=[full_string],
+    #     index_name="vector_db",
+    #     max_document_length=512,
+    #     split_documents=True,
 
-    )
-
+    # )
+    text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
+    texts = text_splitter.create_documents([full_string])
+    db = FAISS.from_documents(texts, embeddings)
+    retriever = db.as_retriever(search_kwargs={"k": 5})
+    # retriever = RAG.as_langchain_retriever(k=5)
     qa = RetrievalQA.from_chain_type(
         llm=llm,
         chain_type="stuff",  # try other chains types as well. refine, map_reduce, map_rerank
app.py
CHANGED
@@ -135,7 +135,6 @@ Context:\n
         messages=context,  # Pass conversation context directly
         model="llama3-70b-8192",
         temperature=0,
-        max_tokens=1024,
         top_p=1,
         stop=None,
         stream=True,
@@ -181,10 +180,10 @@ def upload():
 
     # Define the paths to your images
     image_paths = [
-        "cropped_1099-Div.jpg",
-        "cropped_1099-Int.jpg",
-        "cropped_w2.jpg",
-        "cropped_w3.jpg"
+        "images/cropped_1099-Div.jpg",
+        "images/cropped_1099-Int.jpg",
+        "images/cropped_w2.jpg",
+        "images/cropped_w3.jpg"
     ]
 
     # Define the captions for your images
@@ -197,17 +196,11 @@ def upload():
 
     st.markdown('''
 # Instructions:
-
 1. **Ensure all uploads are in PDF format**. This ensures compatibility and uniform processing across documents.
-
 2. **Submit forms in portrait orientation only**. Landscape formats are not supported and may result in processing errors.
-
 3. **Forms must have a minimum resolution of 1864x1440**. This is crucial for the clarity and legibility necessary for accurate parsing.
-
 4. **Multiple documents can be uploaded simultaneously**; however, the combined size of these documents should not exceed 10MB.
-
 5. **Donut model parses specific forms**: 1099-Div, 1099-Int, W2, and W3. Non-form documents are also processable.
-
 6. **Upload only Forms at a time or Non forms at a time**: we dont accept both forms and Non forms simultaneoulsy.
     ''')
     st.subheader("Try it out")
classification.py
CHANGED
@@ -1,6 +1,7 @@
 import numpy as np
 import time
 from tensorflow.keras.preprocessing import image
+import streamlit as st
 # from tensorflow.keras.preprocessing.image import ImageDataGenerator
 import tensorflow as tf
 gpus = tf.config.experimental.list_physical_devices('GPU')
@@ -11,7 +12,6 @@ if gpus:
     except RuntimeError as e:
         # Memory growth must be set before GPUs have been initialized
         print(e)
-import streamlit as st
 # with tf.device('/cpu:0'):
 # Load the saved model
 model = tf.keras.models.load_model('best_resnet152_model.h5')
@@ -39,46 +39,4 @@ def predict(pil_img):
     predicted_class_name = class_names[predicted_class_index]
     print("Predicted class:", predicted_class_name)
     print("Execution time: ", end_time - start_time)
-    return predicted_class_name
-# import numpy as np
-# import time
-# from PIL import Image  # Import for PIL image handling
-# from torchvision import transforms  # Import for image preprocessing
-
-# import torch
-# import torch.nn as nn  # Import for PyTorch neural networks
-# import streamlit as st
-
-# # Load the PyTorch model (assuming it's saved in PyTorch format)
-# model = torch.load('./best_resnet152_model.pt')  # Replace with your model filename
-
-# # Define class names dictionary
-# class_names = {0: '1099_Div', 1: '1099_Int', 2: 'Non_Form', 3: 'w_2', 4: 'w_3'}
-
-
-# # Define a function for prediction using PyTorch
-# @st.cache_resource
-# def predict(pil_img):
-#     # Preprocess the image
-#     preprocess = transforms.Compose([
-#         transforms.ToTensor(),  # Convert to PyTorch tensor
-#         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize based on ImageNet statistics
-#     ])
-#     img_tensor = preprocess(pil_img)
-#     img_tensor.unsqueeze_(0)  # Add batch dimension
-
-#     # Predict with PyTorch
-#     start_time = time.time()
-#     with torch.no_grad():  # Disable gradient calculation for prediction
-#         predictions = model(img_tensor)
-#     end_time = time.time()
-
-#     # Get the predicted class
-#     predicted_class_index = torch.argmax(predictions, dim=1).item()
-#     predicted_class_name = class_names[predicted_class_index]
-
-#     # Print results (optional for debugging)
-#     print("Predicted class:", predicted_class_name)
-#     print("Execution time: ", end_time - start_time)
-
-#     return predicted_class_name
+    return predicted_class_name
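For reference, classification.py still exposes the Keras predict() helper while the commented-out PyTorch variant is deleted above. A hypothetical call, assuming predict() accepts a PIL image and that the label set matches the class_names dictionary shown in the removed comment block:

```python
# Hypothetical usage of the Keras classifier; the image path and label set are
# assumptions taken from elsewhere in this commit, not verified against the model.
from PIL import Image
from classification import predict

page = Image.open("images/cropped_w2.jpg")
label = predict(page)  # expected one of: '1099_Div', '1099_Int', 'Non_Form', 'w_2', 'w_3'
print(label)
```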
cropped_1099-Div.jpg
DELETED
Binary file (721 kB)

cropped_1099-Int.jpg
DELETED
Binary file (279 kB)

cropped_w2.jpg
DELETED
Binary file (198 kB)

cropped_w3.jpg
DELETED
Binary file (292 kB)
donut_inference.py
CHANGED
@@ -1,33 +1,27 @@
 import torch, re
 from PIL import Image
 from transformers import DonutProcessor, VisionEncoderDecoderModel
-
-from dotenv import load_dotenv
-import os
-import time
-load_dotenv()
+
 # image_path = '/app/Datasplit/test/1099_Div/filled_form_43.jpg'
 # image = Image.open(image_path)
 # imgae = image.resize((1864, 1440))
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# device = "cpu"
 # Load the processor from the local directory
-processor = DonutProcessor.from_pretrained("
-
+processor = DonutProcessor.from_pretrained("Model")
+
 # Load the model from the local directory
-model = VisionEncoderDecoderModel.from_pretrained("
+model = VisionEncoderDecoderModel.from_pretrained("Model")
 model.to(device)
 
-@st.cache_resource
 def inference(image):
     pixel_values = processor(image, return_tensors="pt").pixel_values
     task_prompt = "<s>"
     decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
 
-
-
-
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
+
     outputs = model.generate(pixel_values.to(device),
                              decoder_input_ids=decoder_input_ids.to(device),
                              max_length=model.decoder.config.max_position_embeddings,
@@ -39,12 +33,11 @@ def inference(image):
                              bad_words_ids=[[processor.tokenizer.unk_token_id]],
                              return_dict_in_generate=True,
                              output_scores=True,)
-
+
     sequence = processor.batch_decode(outputs.sequences)[0]
     sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
     sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
-    print(processor.token2json(sequence))
-    print(f"Donut Inference time {end_time-start_time}")
+    # print(processor.token2json(sequence))
     return processor.token2json(sequence)
 
 # data = inference(image)
non_form_llama_parse.py
CHANGED
@@ -2,6 +2,7 @@ from llama_parse import LlamaParse
 from dotenv import load_dotenv
 import os
 import streamlit as st
+
 load_dotenv()
 LLAMA_PARSE = os.getenv('LLAMA_PARSE')
 
@@ -12,6 +13,7 @@ parser = LlamaParse(
     verbose=True,
     language="en"  # Optionaly you can define a language, default=en
 )
+
 @st.cache_data
 def extract_text(pdf_path):
     documents = parser.load_data(pdf_path)
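Taken together with RAG.py, the non-form path is: LlamaParse extracts the PDF text, and the result is handed to rag() to build the FAISS-backed QA chain. A hypothetical end-to-end sketch, assuming extract_text() returns the parsed text as a single string and rag() returns the RetrievalQA chain (neither return statement is visible in this diff):

```python
# Hypothetical glue code; the PDF path, the question, and both return values are assumptions.
from non_form_llama_parse import extract_text
from RAG import rag

full_string = extract_text("statement.pdf")   # assumed: concatenated parsed text
qa = rag(full_string)                         # assumed: the RetrievalQA chain
print(qa.invoke({"query": "Summarize this document."}))  # "query" is RetrievalQA's default input key
```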
|