Spaces:
Running
Running
File size: 2,789 Bytes
bf12aca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import os
import pickle
import tempfile
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
class Embedder:
    """Create, cache and reload FAISS vector stores for uploaded documents.

    Each document is embedded once; the resulting vector store is pickled
    to ``<PATH>/<original_filename>.pkl`` and reused on later requests for
    the same filename.
    """

    # Extensions storeDocEmbeds has a loader for.
    SUPPORTED_EXTENSIONS = (".csv", ".pdf", ".txt")

    def __init__(self):
        # Directory (relative to the working directory) holding the pickles.
        self.PATH = "embeddings"
        self.createEmbeddingsDir()

    def createEmbeddingsDir(self):
        """Create the directory that stores the embedding vectors.

        Safe to call repeatedly; missing parent directories are created too.
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.PATH, exist_ok=True)

    def _embeddings_file(self, original_filename):
        """Return the pickle path used to cache vectors for a filename."""
        # NOTE(review): original_filename is interpolated as-is; if it can
        # come from an untrusted client, confirm it cannot contain path
        # separators.
        return f"{self.PATH}/{original_filename}.pkl"

    def _load_documents(self, tmp_file_path, file_extension):
        """Load the file at tmp_file_path with the loader for its extension."""
        if file_extension == ".csv":
            # CSV content is loaded via CSVLoader.load() and is not passed
            # through the text splitter.
            loader = CSVLoader(
                file_path=tmp_file_path,
                encoding="utf-8",
                csv_args={
                    "delimiter": ",",
                },
            )
            return loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=100,
            length_function=len,
        )
        if file_extension == ".pdf":
            loader = PyPDFLoader(file_path=tmp_file_path)
        else:  # ".txt" -- the caller has already validated the extension
            loader = TextLoader(file_path=tmp_file_path, encoding="utf-8")
        return loader.load_and_split(text_splitter)

    def storeDocEmbeds(self, file, original_filename):
        """Embed a document and pickle the resulting FAISS vector store.

        Args:
            file: Raw bytes of the uploaded document.
            original_filename: Name of the uploaded file. Its extension
                selects the loader and the name is used for the pickle file.

        Raises:
            ValueError: If the extension is not one of SUPPORTED_EXTENSIONS.
        """
        file_extension = os.path.splitext(original_filename)[1].lower()
        # Fail fast, before touching the disk, instead of hitting an
        # unbound-variable error after the temp file was already written.
        if file_extension not in self.SUPPORTED_EXTENSIONS:
            raise ValueError(
                f"Unsupported file type {file_extension!r} for "
                f"{original_filename!r}; expected one of "
                f"{', '.join(self.SUPPORTED_EXTENSIONS)}"
            )

        # The loaders take a file path, so the bytes are spilled to disk.
        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp_file:
            tmp_file.write(file)
            tmp_file_path = tmp_file.name

        try:
            data = self._load_documents(tmp_file_path, file_extension)

            # Imported here, as in the original code, so the dependency is
            # only needed when embeddings are actually computed.
            from langchain.embeddings import HuggingFaceEmbeddings

            modelpath = "intfloat/e5-large-v2"
            embeddings = HuggingFaceEmbeddings(model_name=modelpath)
            vectors = FAISS.from_documents(data, embeddings)
        finally:
            # delete=False means cleanup is our job, including on failure.
            os.remove(tmp_file_path)

        # Save the vectors to a pickle file
        with open(self._embeddings_file(original_filename), "wb") as f:
            pickle.dump(vectors, f)

    def getDocEmbeds(self, file, original_filename):
        """Return the vector store for a document, building it if needed.

        Args:
            file: Raw bytes of the uploaded document; only used when no
                cached pickle exists for original_filename.
            original_filename: Name of the uploaded file (cache key).

        Returns:
            The unpickled vector store saved by storeDocEmbeds.

        Raises:
            ValueError: If the embeddings must be built and the extension
                is unsupported.
        """
        embeddings_file = self._embeddings_file(original_filename)
        if not os.path.isfile(embeddings_file):
            self.storeDocEmbeds(file, original_filename)

        # SECURITY: pickle.load executes arbitrary code from the file; the
        # embeddings directory must only ever contain files written by
        # storeDocEmbeds.
        with open(embeddings_file, "rb") as f:
            vectors = pickle.load(f)
        return vectors
|