Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- app.py +37 -0
- fn.py +190 -0
- install.bat +56 -0
- main.py +71 -0
- requirements.txt +10 -0
- venv.sh +7 -0
app.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import fn
|
2 |
+
import gradio as gr
|
3 |
+
|
4 |
+
# Gradio front end: a "write" tab that currently only holds an empty Markdown
# element, and a "search" tab whose button calls fn.search.
# NOTE(review): the nesting below was reconstructed (the indentation of this
# script was not available); confirm which widgets belong inside the Row.
with gr.Blocks() as demo:
    gr.Markdown('# vector_search')

    with gr.Tab('write'):
        info = gr.Markdown()

    with gr.Tab('search'):
        with gr.Row():
            name = gr.Textbox(
                lines=1,
                label='name',
                interactive=True,
                show_copy_button=True,
            )
            query = gr.Textbox(
                lines=1,
                label='query',
                interactive=True,
                show_copy_button=True,
            )
        result = gr.Textbox(
            label='result',
            lines=20,
            show_copy_button=True,
        )
        search_button = gr.Button(value='search')

    # (name, query) -> fn.search -> result textbox.
    search_button.click(
        fn=fn.search,
        inputs=[name, query],
        outputs=[result],
    )

if __name__ == '__main__':
    demo.launch()
|
fn.py
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import struct
|
4 |
+
import binascii
|
5 |
+
import datetime
|
6 |
+
import csv
|
7 |
+
import json
|
8 |
+
import requests
|
9 |
+
from transformers import AutoTokenizer, AutoModel
|
10 |
+
import torch
|
11 |
+
from torch import Tensor
|
12 |
+
import torch.nn.functional as F
|
13 |
+
import numpy as np
|
14 |
+
from scipy.spatial.distance import cdist
|
15 |
+
from duckduckgo_search import DDGS
|
16 |
+
from bs4 import BeautifulSoup
|
17 |
+
|
18 |
+
# Hugging Face model id loaded by load_model().
model_name = "intfloat/multilingual-e5-large"
# Source CSVs are read from input/<name>/<file>; their embedded counterparts
# are written to vectors/<name>/<file>.
input_dir = 'input'
vectors_dir = 'vectors'

# Filled in by load_model().
model = None
tokenizer = None
device = None

# Filled in by load_vectors(): name -> list of [chunk, output, vector].
vectors = {}

# Make sure both data directories exist before anything lists them.
for _data_dir in (input_dir, vectors_dir):
    os.makedirs(_data_dir, exist_ok=True)
|
30 |
+
|
31 |
+
def ddg(text, max_results=5):
    """Run a DuckDuckGo text search for *text* and return the hits as a list.

    At most *max_results* hits are requested. The hits are also printed to
    stdout before being returned.
    """
    with DDGS() as client:
        hits = list(client.text(text, max_results=max_results))
    print(hits)
    return hits
|
36 |
+
|
37 |
+
def bs4(url, timeout=30):
    """Download *url* and return its visible text, one phrase per line.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Every line is stripped, split into phrases on double spaces,
    and blank results are dropped.

    Args:
        url: Address to fetch with ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        The cleaned text with phrases joined by ``"\\n"``.

    Raises:
        requests.RequestException: On connection errors or timeout.
    """
    # NOTE(review): the URL is fetched as given and this function is reachable
    # through the /bs4 API endpoint, so internal addresses can be requested
    # (SSRF). Restrict the allowed hosts/schemes if the API is exposed.
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, features="html.parser")

    # Drop all script and style elements so their contents do not end up in
    # the extracted text.
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text()

    # Strip leading and trailing space on each line.
    lines = (line.strip() for line in text.splitlines())
    # Break multi-headlines into a line each. The separator is TWO spaces:
    # splitting on a single space would put every word on its own line.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank lines.
    return '\n'.join(chunk for chunk in chunks if chunk)
|
56 |
+
|
57 |
+
def _resolve_inside(base_dir, relative):
    """Return the real path of ``base_dir/relative``, refusing escapes.

    Raises:
        ValueError: If the resolved path is not located under *base_dir*
            (for example because *relative* contains ``..`` components).
    """
    base = os.path.realpath(base_dir)
    target = os.path.realpath(f"{base_dir}/{relative}")
    if os.path.commonpath([base, target]) != base:
        raise ValueError(f"path escapes {base_dir!r}: {relative!r}")
    return target


def upload(name, filename, content):
    """Write *content* as UTF-8 text to ``input_dir/name/filename``.

    The ``input_dir/name`` directory is created when missing. *name* and
    *filename* come from API callers, so both are checked to stay inside
    ``input_dir`` before anything is created or written.

    Raises:
        ValueError: If *name* or *filename* would place the file outside
            ``input_dir``.
    """
    srcpath = _resolve_inside(input_dir, f"{name}/{filename}")
    os.makedirs(_resolve_inside(input_dir, f"{name}"), exist_ok=True)
    with open(srcpath, 'w', encoding='utf-8') as f:
        f.write(content)


def delete(name, filename):
    """Remove ``name/filename`` from both ``input_dir`` and ``vectors_dir``.

    Files that do not exist are ignored.

    Raises:
        ValueError: If *name* or *filename* would address a file outside the
            data directories; nothing is deleted in that case.
    """
    relative = f"{name}/{filename}"
    # Validate both locations before deleting anything.
    targets = [_resolve_inside(base, relative) for base in (input_dir, vectors_dir)]
    for path in targets:
        if os.path.exists(path):
            os.unlink(path)
|
70 |
+
|
71 |
+
def load_model():
    """Load the tokenizer and model for ``model_name`` into module globals.

    Sets ``tokenizer``, ``device`` and ``model``. The selected device is
    printed to stdout.
    """
    global model, tokenizer, device

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Use CUDA when it is available, otherwise fall back to the CPU.
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(backend)
    print(f"Using device: {device}")

    # Load the model and move it to the selected device.
    loaded = AutoModel.from_pretrained(model_name)
    model = loaded.to(device)
|
82 |
+
|
83 |
+
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Average the hidden states over the positions the mask marks as real.

    Positions where *attention_mask* is zero are zeroed out, the states are
    summed along dim 1 and divided by the per-row mask sum.
    Presumably *last_hidden_states* is (batch, seq, hidden) and
    *attention_mask* is (batch, seq) - confirm against the caller.
    """
    is_padding = ~attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(is_padding, 0.0).sum(dim=1)
    token_counts = attention_mask.sum(dim=1)[..., None]
    return summed / token_counts
|
86 |
+
|
87 |
+
def cosine_similarity(v1, v2):
    """Return the cosine similarity of *v1* and *v2*.

    Computed as one minus SciPy's cosine distance between the two vectors.
    """
    distances = cdist([v1], [v2], 'cosine')
    return 1 - distances[0][0]
|
89 |
+
|
90 |
+
def embedding():
    """Embed every file found under ``input_dir/<name>/``.

    For each sub-directory of ``input_dir`` the matching directory under
    ``vectors_dir`` is created and ``embedding_file`` is called once per
    entry. Regular files placed directly in ``input_dir`` are skipped; they
    previously made ``os.makedirs``/``os.listdir`` raise.
    """
    for name in os.listdir(input_dir):
        src_dir = f"{input_dir}/{name}"
        if not os.path.isdir(src_dir):
            continue
        os.makedirs(f"{vectors_dir}/{name}", exist_ok=True)
        for filename in os.listdir(src_dir):
            embedding_file(name, filename)
|
96 |
+
|
97 |
+
def embedding_file(name, filename):
    """Embed one input CSV and write the result to the vectors directory.

    Reads ``input_dir/name/filename`` as CSV, where column 0 is the chunk to
    embed and column 1 an optional output text. Empty rows and rows whose
    first cell is ``'chunk'`` (the header) are skipped. The result is written
    to ``vectors_dir/name/filename`` with the columns ``chunk``, ``output``
    and ``vector``.

    Nothing happens when the source is a directory or the destination
    already exists.
    """
    srcpath = f"{input_dir}/{name}/{filename}"
    dstpath = f"{vectors_dir}/{name}/{filename}"
    if os.path.isdir(srcpath) or os.path.exists(dstpath):
        return

    print(srcpath)
    chunks = []
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(srcpath, 'r', encoding='utf-8', newline='') as csv_file:
        for r in csv.reader(csv_file):
            if not r or r[0] == 'chunk':  # blank row or header
                continue
            if len(r) == 1:
                r.append('')
            chunks.append(r)

    # Compute every vector BEFORE creating the destination file: an existing
    # destination is treated as "already embedded", so a failure half-way
    # through must not leave a truncated file behind.
    rows = [
        {'chunk': r[0], 'output': r[1], 'vector': get_vector_string(r[0])}
        for r in chunks
    ]

    try:
        with open(dstpath, mode='w', encoding='utf-8', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=['chunk', 'output', 'vector'])
            writer.writeheader()
            writer.writerows(rows)
    except BaseException:
        # Remove the partial output so the next run retries this file.
        if os.path.exists(dstpath):
            os.unlink(dstpath)
        raise
|
125 |
+
|
126 |
+
def get_vector_string(chunk):
    """Embed *chunk* and return the vector as comma-separated hex strings.

    The text is tokenized (truncated to 512 tokens), run through the model,
    mean-pooled with ``average_pool`` and L2-normalized. Each component is
    then packed as a big-endian double and only its leading 9 hex digits
    (the top 36 of 64 bits) are kept; ``load_vectors`` restores a value by
    appending seven ``'0'`` digits.

    Requires ``load_model()`` to have populated ``tokenizer``, ``model`` and
    ``device``.
    """
    # NOTE(review): the E5 model card asks for "query: " / "passage: "
    # prefixes on inputs; none is added here - confirm whether that is
    # intended before changing it, since stored vectors would need rebuilding.
    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():  # no gradients needed for inference
        outputs = model(**inputs)

    embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)

    # Format as exactly 16 hex digits before truncating. hex() drops leading
    # zeros, so 0.0 (and very small values) used to produce strings that
    # could not be decoded again.
    return ",".join(
        f"{struct.unpack('>Q', struct.pack('>d', x))[0]:016x}"[:9]
        for x in embeddings[0].cpu().numpy()
    )
|
139 |
+
|
140 |
+
def load_vectors():
    """Rebuild the global ``vectors`` index from the CSVs in ``vectors_dir``.

    Every sub-directory ``<name>`` becomes a key whose value is a list of
    ``[chunk, output, vector]`` entries, one per CSV row across all files in
    that directory. The ``vector`` column holds comma-separated 9-digit hex
    strings; each is padded with seven ``'0'`` digits and decoded as a
    big-endian double.

    Regular files placed directly in ``vectors_dir`` are skipped. The new
    index is built completely before it replaces the old one, so readers
    never observe a half-loaded index.
    """
    global vectors

    loaded = {}
    for name in os.listdir(vectors_dir):
        name_dir = f"{vectors_dir}/{name}"
        if not os.path.isdir(name_dir):
            continue
        entries = loaded[name] = []
        for filename in os.listdir(name_dir):
            filepath = f"{name_dir}/{filename}"
            with open(filepath, mode='r', encoding='utf-8', newline='') as csv_file:
                for row in csv.DictReader(csv_file):
                    vector = np.array([
                        struct.unpack('>d', binascii.unhexlify(x + '0000000'))[0]
                        for x in row['vector'].split(',')
                    ])
                    entries.append([row['chunk'], row['output'], vector])

    vectors = loaded
|
153 |
+
|
154 |
+
def search(name, query_text):
    """Return the three stored entries most similar to *query_text*.

    The query is embedded the same way as the stored chunks (tokenize,
    mean-pool, L2-normalize) and compared by cosine similarity against every
    entry stored under *name* in ``vectors``.

    Args:
        name: Key in ``vectors`` to search in.
        query_text: Text to look up.

    Returns:
        A string with one ``"#<rank> <similarity>%"`` line per match followed
        by the entry's output text (or its chunk when the output is empty).
        Empty when *name* is unknown or has no entries. The result and the
        elapsed time are also printed to stdout.
    """
    started = datetime.datetime.now()

    # Embed the query text.
    inputs = tokenizer(query_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    query_embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
    query_embeddings = F.normalize(query_embeddings, p=2, dim=1).cpu().numpy()[0]

    # Score every stored entry against the query. An unknown name yields no
    # candidates instead of raising KeyError.
    scored = [
        (row, cosine_similarity(query_embeddings, row[2]))
        for row in vectors.get(name, [])
    ]

    # Sort by similarity and keep the top three.
    top_matches = sorted(scored, key=lambda x: x[1], reverse=True)[:3]

    # Fall back to the chunk when no output text is stored, WITHOUT writing
    # the fallback into the cached row (the rows are shared by all searches).
    result = ''.join(
        f"#{i} {similarity*100:.2f}%\n{row[1] or row[0]}\n\n"
        for i, (row, similarity) in enumerate(top_matches, 1)
    )

    print(result)
    print(datetime.datetime.now() - started)

    return result
|
185 |
+
|
186 |
+
# Importing this module loads the model and the stored vectors right away.
load_model()
load_vectors()

if __name__ == '__main__':
    # Run as a script: embed the input files that have no vector file yet.
    embedding()
|
install.bat
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@echo off

rem -------------------------------------------
rem NOT guaranteed to work on Windows

set APPDIR=vector_search
set REPOS=https://huggingface.co/spaces/aka7774/%APPDIR%
set VENV=venv

rem -------------------------------------------

rem Work from the directory this script lives in (quoted: it may contain spaces).
set INSTALL_DIR=%~dp0
cd /d "%INSTALL_DIR%"

rem Clone the app with the system git; fall back to a portable git when that
rem did not produce the application directory.
:git_clone
set DL_URL=%REPOS%
set DL_DST=%APPDIR%
git clone %DL_URL% %APPDIR%
if exist %DL_DST% goto install_python

set DL_URL=https://github.com/git-for-windows/git/releases/download/v2.41.0.windows.3/PortableGit-2.41.0.3-64-bit.7z.exe
set DL_DST=PortableGit-2.41.0.3-64-bit.7z.exe
curl -L -o %DL_DST% %DL_URL%
if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
%DL_DST% -y
del %DL_DST%

set GIT=%INSTALL_DIR%PortableGit\bin\git
%GIT% clone %REPOS%

rem Download and unpack a standalone CPython build next to this script.
:install_python
set DL_URL=https://github.com/indygreg/python-build-standalone/releases/download/20240415/cpython-3.10.14+20240415-x86_64-pc-windows-msvc-shared-install_only.tar.gz
set DL_DST="%INSTALL_DIR%python.tar.gz"
curl -L -o %DL_DST% %DL_URL%
if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
tar -xzf %DL_DST%

set PYTHON=%INSTALL_DIR%python\python.exe
rem The interpreter above is used from the "python" directory, so its Scripts
rem directory is python\Scripts (not python310\Scripts).
set PATH=%PATH%;%INSTALL_DIR%python\Scripts

rem Create the virtual environment inside the application directory.
:install_venv
cd %APPDIR%
%PYTHON% -m venv %VENV%
set PYTHON=%VENV%\Scripts\python.exe

rem Bootstrap pip inside the venv, then install the dependencies.
:install_pip
set DL_URL=https://bootstrap.pypa.io/get-pip.py
set DL_DST="%INSTALL_DIR%get-pip.py"
curl -o %DL_DST% %DL_URL%
if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
%PYTHON% %DL_DST%

%PYTHON% -m pip install gradio
%PYTHON% -m pip install -r requirements.txt

pause
|
main.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import time
|
4 |
+
import signal
|
5 |
+
import io
|
6 |
+
|
7 |
+
from fastapi import FastAPI, Request, status, Form, UploadFile
|
8 |
+
from fastapi.staticfiles import StaticFiles
|
9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
10 |
+
from pydantic import BaseModel, Field
|
11 |
+
from fastapi.exceptions import RequestValidationError
|
12 |
+
from fastapi.responses import JSONResponse, StreamingResponse
|
13 |
+
import fn
|
14 |
+
import gradio as gr
|
15 |
+
from app import demo
|
16 |
+
|
17 |
+
app = FastAPI()

# NOTE(review): every origin, method and header is allowed AND credentials
# are enabled. Confirm this is intended before exposing the API publicly.
cors_options = dict(
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(CORSMiddleware, **cors_options)

# Serve the Gradio UI imported from app.py under /gradio.
gr.mount_gradio_app(app, demo, path="/gradio")
|
28 |
+
|
29 |
+
@app.post("/ddg")
async def api_ddg(args: dict):
    """Run a DuckDuckGo search through ``fn.ddg``.

    Body keys:
        query: Search text (required).
        max_results: Optional result limit; when omitted, ``fn.ddg`` applies
            its own default instead of the request failing.

    Returns ``{"results": [...]}``, or ``{"error": "<message>"}`` when
    anything goes wrong (including a missing ``query``).
    """
    try:
        optional = {key: args[key] for key in ('max_results',) if key in args}
        results = fn.ddg(args['query'], **optional)

        return {"results": results}
    except Exception as e:
        return {"error": str(e)}
|
37 |
+
|
38 |
+
@app.post("/bs4")
async def api_bs4(args: dict):
    """Return the text that ``fn.bs4`` extracts from ``args['url']``.

    Responds with ``{"text": ...}``, or ``{"error": "<message>"}`` when any
    exception is raised.
    """
    try:
        return {"text": fn.bs4(args['url'])}
    except Exception as exc:
        return {"error": str(exc)}
|
46 |
+
|
47 |
+
@app.post("/reload")
async def api_reload(args: dict):
    """Reload the stored vectors via ``fn.load_vectors``; the body is unused."""
    fn.load_vectors()
    return dict(status=0)
|
51 |
+
|
52 |
+
@app.post("/upload")
async def api_upload(args: dict):
    """Store ``args['content']`` via ``fn.upload`` under name/filename."""
    name, filename, content = (args[key] for key in ('name', 'filename', 'content'))
    fn.upload(name, filename, content)
    return dict(status=0)
|
56 |
+
|
57 |
+
@app.post("/delete")
async def api_delete(args: dict):
    """Delete the file named by ``args['name']``/``args['filename']`` via ``fn.delete``."""
    name, filename = (args[key] for key in ('name', 'filename'))
    fn.delete(name, filename)
    return dict(status=0)
|
61 |
+
|
62 |
+
@app.post("/embedding")
async def api_embedding(args: dict):
    """Run ``fn.embedding`` and then reload the vectors; the body is unused."""
    for step in (fn.embedding, fn.load_vectors):
        step()
    return dict(status=0)
|
67 |
+
|
68 |
+
@app.post("/search")
async def api_search(args: dict):
    """Return ``fn.search`` output for ``args['name']`` and ``args['query']``."""
    name, query = (args[key] for key in ('name', 'query'))
    return {'result': fn.search(name, query)}
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi
|
2 |
+
uvicorn
|
3 |
+
numpy
|
4 |
+
transformers
|
5 |
+
torch
|
6 |
+
scipy
|
7 |
+
duckduckgo_search
|
8 |
+
requests
|
9 |
+
beautifulsoup4
|
10 |
+
python-multipart
|
venv.sh
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash
# Create ./venv and install the application's dependencies into it.

# Stop at the first failing step, including a failed download in a pipeline.
set -euo pipefail

python3 -m venv venv
# The downloaded script is executed directly, so TLS certificate checks must
# stay enabled (no -k/--insecure); -f makes curl fail on HTTP errors.
curl -fL https://bootstrap.pypa.io/get-pip.py | venv/bin/python

venv/bin/python -m pip install gradio
venv/bin/python -m pip install -r requirements.txt
|