import pickle

from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS


# Ingestion script: fetch a fixed list of understandingwar.org campaign
# assessment pages, split them into chunks, embed the chunks, index them in
# FAISS, and pickle the resulting vector store to ``vectorstore.pkl``.

# Pages to ingest.
urls = [
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-4-2023",
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-6-2023",
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-7-2023",
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023",
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023",
]

# Load the raw documents for every URL.
loader = UnstructuredURLLoader(urls=urls)
raw_documents = loader.load()

# Split the loaded documents into chunks using the splitter's default settings.
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(raw_documents)

# Fail early with a clear message: with no chunks there is nothing to embed
# or index, and no API call should be attempted.
if not documents:
    raise RuntimeError(
        "No documents were produced from the configured URLs; "
        "nothing to embed or index."
    )

# Embed the chunks and build the FAISS index from them.
# NOTE(review): OpenAIEmbeddings presumably needs OpenAI credentials to be
# configured in the environment - confirm before running.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

# Persist the vector store to disk.
# NOTE: the output is a pickle file; consumers should only unpickle it when
# it comes from a trusted source.
with open("vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)