aka7774 committed on
Commit 30ef6e4
1 Parent(s): d004d00

Upload 6 files

Files changed (6)
  1. app.py +37 -0
  2. fn.py +190 -0
  3. install.bat +56 -0
  4. main.py +71 -0
  5. requirements.txt +10 -0
  6. venv.sh +7 -0
app.py ADDED
@@ -0,0 +1,37 @@
+ import fn
+ import gradio as gr
+
+ with gr.Blocks() as demo:
+     gr.Markdown('# vector_search')
+     with gr.Tab('write'):
+         info = gr.Markdown()
+
+     with gr.Tab('search'):
+         with gr.Row():
+             name = gr.Textbox(
+                 lines=1,
+                 label='name',
+                 interactive=True,
+                 show_copy_button=True,
+                 )
+             query = gr.Textbox(
+                 lines=1,
+                 label='query',
+                 interactive=True,
+                 show_copy_button=True,
+                 )
+         result = gr.Textbox(
+             label='result',
+             lines=20,
+             show_copy_button=True,
+             )
+         search_button = gr.Button(value='search')
+
+     search_button.click(
+         fn=fn.search,
+         inputs=[name, query],
+         outputs=[result],
+         )
+
+ if __name__ == '__main__':
+     demo.launch()
fn.py ADDED
@@ -0,0 +1,190 @@
+ import os
+ import re
+ import struct
+ import binascii
+ import datetime
+ import csv
+ import json
+ import requests
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ from torch import Tensor
+ import torch.nn.functional as F
+ import numpy as np
+ from scipy.spatial.distance import cdist
+ from duckduckgo_search import DDGS
+ from bs4 import BeautifulSoup
+
+ model_name = "intfloat/multilingual-e5-large"
+ input_dir = 'input'
+ vectors_dir = 'vectors'
+
+ model = None
+ tokenizer = None
+ device = None
+
+ vectors = {}
+
+ os.makedirs(input_dir, exist_ok=True)
+ os.makedirs(vectors_dir, exist_ok=True)
+
+ def ddg(text, max_results=5):
+     with DDGS() as ddgs:
+         results = [r for r in ddgs.text(text, max_results=max_results)]
+         print(results)
+         return results
+
+ def bs4(url):
+     html = requests.get(url).text
+     soup = BeautifulSoup(html, features="html.parser")
+
+     # kill all script and style elements
+     for script in soup(["script", "style"]):
+         script.extract()  # rip it out
+
+     # get text
+     text = soup.get_text()
+
+     # break into lines and remove leading and trailing space on each
+     lines = (line.strip() for line in text.splitlines())
+     # break multi-headlines into a line each
+     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+     # drop blank lines
+     text = '\n'.join(chunk for chunk in chunks if chunk)
+
+     return text
+
+ def upload(name, filename, content):
+     os.makedirs(f"{input_dir}/{name}", exist_ok=True)
+     srcpath = f"{input_dir}/{name}/{filename}"
+     with open(srcpath, 'w', encoding='utf-8') as f:
+         f.write(content)
+
+ def delete(name, filename):
+     srcpath = f"{input_dir}/{name}/{filename}"
+     dstpath = f"{vectors_dir}/{name}/{filename}"
+     if os.path.exists(srcpath):
+         os.unlink(srcpath)
+     if os.path.exists(dstpath):
+         os.unlink(dstpath)
+
+ def load_model():
+     global model, tokenizer, device
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     # Check whether CUDA is available and use it if so
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"Using device: {device}")
+
+     # Move the model to the selected device
+     model = AutoModel.from_pretrained(model_name).to(device)
+
+ def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
+     last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+
+ def cosine_similarity(v1, v2):
+     return 1 - cdist([v1], [v2], 'cosine')[0][0]
+
+ def embedding():
+     for name in os.listdir(input_dir):
+         os.makedirs(f"{input_dir}/{name}", exist_ok=True)
+         os.makedirs(f"{vectors_dir}/{name}", exist_ok=True)
+         for filename in os.listdir(f"{input_dir}/{name}"):
+             embedding_file(name, filename)
+
+ def embedding_file(name, filename):
+     srcpath = f"{input_dir}/{name}/{filename}"
+     dstpath = f"{vectors_dir}/{name}/{filename}"
+     if os.path.isdir(srcpath):
+         return
+     if os.path.exists(dstpath):
+         return
+
+     print(srcpath)
+     chunks = []
+     with open(srcpath, 'r', encoding='utf-8') as csv_file:
+         reader = csv.reader(csv_file)
+         for r in reader:
+             if not r:
+                 continue
+             if r[0] == 'chunk':  # header
+                 continue
+             if len(r) == 1:
+                 r.append('')
+             chunks.append(r)
+
+     # Open the destination CSV and write each chunk with its embedding
+     with open(dstpath, mode='w', encoding='utf-8', newline='') as csv_file:
+         fieldnames = ['chunk', 'output', 'vector']
+         writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+         writer.writeheader()
+         for r in chunks:
+             writer.writerow({'chunk': r[0], 'output': r[1], 'vector': get_vector_string(r[0])})
+
+ def get_vector_string(chunk):
+     global model, tokenizer, device
+
+     inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+
+     with torch.no_grad():  # no gradient computation needed for inference
+         outputs = model(**inputs)
+
+     embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
+     embeddings = F.normalize(embeddings, p=2, dim=1)
+     vector_string = ",".join([hex(struct.unpack('>Q', struct.pack('>d', x))[0])[2:-7] for x in embeddings[0].cpu().numpy()])  # serialize the vector as comma-separated, truncated hex doubles
+
+     return vector_string
+
+ def load_vectors():
+     global vectors
+
+     vectors = {}
+     for name in os.listdir(vectors_dir):
+         vectors[name] = []
+         for filename in os.listdir(f"{vectors_dir}/{name}"):
+             filepath = f"{vectors_dir}/{name}/{filename}"
+             with open(filepath, mode='r', encoding='utf-8') as csv_file:
+                 reader = csv.DictReader(csv_file)
+                 for row in reader:
+                     vector = np.array([struct.unpack('>d', binascii.unhexlify(x + '0000000'))[0] for x in row['vector'].split(',')])
+                     vectors[name].append([row['chunk'], row['output'], vector])
+
+ def search(name, query_text):
+     dt = datetime.datetime.now()
+
+     # Embed the query text
+     inputs = tokenizer(query_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     query_embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])
+     query_embeddings = F.normalize(query_embeddings, p=2, dim=1).cpu().numpy()[0]
+
+     # Compute the similarity between the query and every stored record
+     similarities = []
+
+     for row in vectors[name]:
+         similarity = cosine_similarity(query_embeddings, row[2])
+         similarities.append((row, similarity))
+
+     # Sort by similarity and keep the top 3 results
+     top_matches = sorted(similarities, key=lambda x: x[1], reverse=True)[:3]
+
+     result = ''
+     for i, (row, similarity) in enumerate(top_matches, 1):
+         if not row[1]:
+             row[1] = row[0]
+         result += f"#{i} {similarity*100:.2f}%\n{row[1]}\n\n"
+
+     print(result)
+     print(datetime.datetime.now() - dt)
+
+     return result
+
+ load_model()
+ load_vectors()
+
+ if __name__ == '__main__':
+     embedding()
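fn.py reads plain CSV files from input/<name>/ with a 'chunk' column and an optional 'output' column, writes the embedded rows to vectors/<name>/, and search() returns the three closest chunks. A minimal local usage sketch of that flow; the collection name 'docs', the file name 'faq.csv', and the CSV rows are made-up examples, and note that importing fn downloads the e5 model on first use:

import fn  # loads the model and any existing vectors at import time

# Hypothetical CSV: a 'chunk,output' header, then one row per chunk.
# 'output' is the text returned for a match; if it is empty, the chunk
# itself is returned instead.
csv_content = 'chunk,output\nhow do I reset my password,Open Settings and choose Reset password\n'

fn.upload('docs', 'faq.csv', csv_content)         # writes input/docs/faq.csv
fn.embedding()                                    # embeds it into vectors/docs/faq.csv
fn.load_vectors()                                 # refreshes the in-memory cache
print(fn.search('docs', 'I forgot my password'))  # top-3 matches with similarity scores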
install.bat ADDED
@@ -0,0 +1,56 @@
+ @echo off
+
+ rem -------------------------------------------
+ rem NOT guaranteed to work on Windows
+
+ set APPDIR=vector_search
+ set REPOS=https://huggingface.co/spaces/aka7774/%APPDIR%
+ set VENV=venv
+
+ rem -------------------------------------------
+
+ set INSTALL_DIR=%~dp0
+ cd /d %INSTALL_DIR%
+
+ :git_clone
+ set DL_URL=%REPOS%
+ set DL_DST=%APPDIR%
+ git clone %DL_URL% %APPDIR%
+ if exist %DL_DST% goto install_python
+
+ set DL_URL=https://github.com/git-for-windows/git/releases/download/v2.41.0.windows.3/PortableGit-2.41.0.3-64-bit.7z.exe
+ set DL_DST=PortableGit-2.41.0.3-64-bit.7z.exe
+ curl -L -o %DL_DST% %DL_URL%
+ if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
+ %DL_DST% -y
+ del %DL_DST%
+
+ set GIT=%INSTALL_DIR%PortableGit\bin\git
+ %GIT% clone %REPOS%
+
+ :install_python
+ set DL_URL=https://github.com/indygreg/python-build-standalone/releases/download/20240415/cpython-3.10.14+20240415-x86_64-pc-windows-msvc-shared-install_only.tar.gz
+ set DL_DST="%INSTALL_DIR%python.tar.gz"
+ curl -L -o %DL_DST% %DL_URL%
+ if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
+ tar -xzf %DL_DST%
+
+ set PYTHON=%INSTALL_DIR%python\python.exe
+ set PATH=%PATH%;%INSTALL_DIR%python310\Scripts
+
+ :install_venv
+ cd %APPDIR%
+ %PYTHON% -m venv %VENV%
+ set PYTHON=%VENV%\Scripts\python.exe
+
+ :install_pip
+ set DL_URL=https://bootstrap.pypa.io/get-pip.py
+ set DL_DST=%INSTALL_DIR%get-pip.py
+ curl -o %DL_DST% %DL_URL%
+ if not exist %DL_DST% bitsadmin /transfer dl %DL_URL% %DL_DST%
+ %PYTHON% %DL_DST%
+
+ %PYTHON% -m pip install gradio
+ %PYTHON% -m pip install -r requirements.txt
+
+ pause
main.py ADDED
@@ -0,0 +1,71 @@
+ import os
+ import sys
+ import time
+ import signal
+ import io
+
+ from fastapi import FastAPI, Request, status, Form, UploadFile
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel, Field
+ from fastapi.exceptions import RequestValidationError
+ from fastapi.responses import JSONResponse, StreamingResponse
+ import fn
+ import gradio as gr
+ from app import demo
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=['*'],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ gr.mount_gradio_app(app, demo, path="/gradio")
+
+ @app.post("/ddg")
+ async def api_ddg(args: dict):
+     try:
+         results = fn.ddg(args['query'], args['max_results'])
+
+         return {"results": results}
+     except Exception as e:
+         return {"error": str(e)}
+
+ @app.post("/bs4")
+ async def api_bs4(args: dict):
+     try:
+         text = fn.bs4(args['url'])
+
+         return {"text": text}
+     except Exception as e:
+         return {"error": str(e)}
+
+ @app.post("/reload")
+ async def api_reload(args: dict):
+     fn.load_vectors()
+     return {'status': 0}
+
+ @app.post("/upload")
+ async def api_upload(args: dict):
+     fn.upload(args['name'], args['filename'], args['content'])
+     return {'status': 0}
+
+ @app.post("/delete")
+ async def api_delete(args: dict):
+     fn.delete(args['name'], args['filename'])
+     return {'status': 0}
+
+ @app.post("/embedding")
+ async def api_embedding(args: dict):
+     fn.embedding()
+     fn.load_vectors()
+     return {'status': 0}
+
+ @app.post("/search")
+ async def api_search(args: dict):
+     result = fn.search(args['name'], args['query'])
+     return {'result': result}
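main.py exposes the same operations over HTTP: every endpoint takes a JSON body and returns JSON. A hedged sketch of driving it with the requests library; it assumes the app is served by uvicorn at the default http://127.0.0.1:8000 (e.g. uvicorn main:app) and reuses the hypothetical 'docs' collection from the fn.py example above:

import requests

base = 'http://127.0.0.1:8000'  # assumption: uvicorn main:app on the default host/port

# upload a CSV, embed everything under input/, then run a search
requests.post(f'{base}/upload', json={
    'name': 'docs',
    'filename': 'faq.csv',
    'content': 'chunk,output\nhow do I reset my password,Open Settings and choose Reset password\n',
})
requests.post(f'{base}/embedding', json={})
r = requests.post(f'{base}/search', json={'name': 'docs', 'query': 'I forgot my password'})
print(r.json()['result'])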
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ fastapi
+ uvicorn
+ numpy
+ transformers
+ torch
+ scipy
+ duckduckgo_search
+ requests
+ beautifulsoup4
+ python-multipart
venv.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/bash
+
+ python3 -m venv venv
+ curl -kL https://bootstrap.pypa.io/get-pip.py | venv/bin/python
+
+ venv/bin/python -m pip install gradio
+ venv/bin/python -m pip install -r requirements.txt
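venv.sh only prepares the virtual environment; main.py defines the FastAPI app but never starts a server, so it still needs an ASGI runner. A minimal launch sketch, assuming it is run from the repository root inside the venv and that the host and port are free choices:

import uvicorn
from main import app  # importing main also loads the model and existing vectors via fn

uvicorn.run(app, host='0.0.0.0', port=8000)  # the Gradio UI is then mounted at /gradio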