yonkasoft committed on
Commit
b9ab29b
1 Parent(s): 02cf58a

Upload 11 files

Files changed (11)
  1. Dockerfile +14 -0
  2. bart.txt +5 -0
  3. load.ipynb +694 -0
  4. load2.ipynb +378 -0
  5. merged_train.parquet +3 -0
  6. model.ipynb +371 -0
  7. mongoDb.py +39 -0
  8. mongoDb_2.py +24 -0
  9. mongodb_egitim.py +1 -0
  10. pyvenv.cfg +5 -0
  11. requirements.txt +6 -2
Dockerfile ADDED
@@ -0,0 +1,14 @@
1
+ # Base image
2
+ FROM python:3.11.4
3
+
4
+ # Set the working directory
5
+ WORKDIR /deneme
6
+
7
+ # Copy the requirements file
8
+ COPY requirements.txt /deneme/requirements.txt
9
+
10
+ # Install the requirements
11
+ RUN pip install --no-cache-dir --upgrade -r /deneme/requirements.txt
12
+
13
+ # Command to start the application
14
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
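Note: the CMD above assumes an importable module named app that exposes an ASGI application called app (hence "app:app"); that module is not part of this upload. A minimal hypothetical sketch of such a module, assuming FastAPI is among the packages in requirements.txt, could look like:

# app.py -- hypothetical minimal ASGI app matching the CMD above (not included in this commit)
from fastapi import FastAPI

app = FastAPI()

@app.get("/")
def health():
    # simple liveness endpoint so `uvicorn app:app --host 0.0.0.0 --port 7860` has something to serve
    return {"status": "ok"}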
bart.txt ADDED
@@ -0,0 +1,5 @@
1
+ MNLI: a bitext classification task to predict whether one sentence entails another. The fine-tuned model concatenates the two sentences with an appended EOS token and passes them to both the BART encoder and decoder. In contrast to BERT, the representation of the EOS token is used to classify the relation between the sentences.
2
+
3
+ ELI5: a long-form abstractive question answering dataset. Models generate answers conditioned on the concatenation of a question and supporting documents.
4
+ ConvAI2: a dialogue response generation task, conditioned on context and a persona.
5
+ CNN/DM: a news summarization dataset. Summaries here are typically closely related to source sentences.
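For illustration, the MNLI setup described above can be exercised with the transformers library; the sketch below uses the public facebook/bart-large-mnli checkpoint as an assumed example (it is not part of this commit) and feeds the premise and hypothesis to the model as a single concatenated pair:

# Minimal MNLI-style entailment check with a BART sequence-classification head.
# Assumes the public facebook/bart-large-mnli checkpoint; illustrative only.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

premise = "A soccer game with multiple males playing."
hypothesis = "Some men are playing a sport."

# The two sentences are encoded together and passed through both encoder and decoder.
inputs = tokenizer(premise, hypothesis, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# For this checkpoint the labels are [contradiction, neutral, entailment].
labels = ["contradiction", "neutral", "entailment"]
print(labels[logits.softmax(dim=-1).argmax(dim=-1).item()])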
load.ipynb ADDED
@@ -0,0 +1,694 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 10,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "#birleştirilcek dosyaların listesi \n",
10
+ "train_files=['C:\\\\gitProjects\\\\oak\\\\data\\\\train-00000-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00001-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00002-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00003-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00004-of-00007.parquet']\n",
11
+ "test_files=['C:\\\\gitProjects\\\\oak\\\\data\\\\train-00005-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00006-of-00007.parquet']\n",
12
+ "\n"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 11,
18
+ "metadata": {},
19
+ "outputs": [
20
+ {
21
+ "ename": "ImportError",
22
+ "evalue": "cannot import name 'Automodel' from 'transformers' (c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\__init__.py)",
23
+ "output_type": "error",
24
+ "traceback": [
25
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
26
+ "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)",
27
+ "Cell \u001b[1;32mIn[11], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Automodel \n",
28
+ "\u001b[1;31mImportError\u001b[0m: cannot import name 'Automodel' from 'transformers' (c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\__init__.py)"
29
+ ]
30
+ }
31
+ ],
32
+ "source": [
33
+ "import datasets\n",
34
+ "import transformers\n",
35
+ "from datasets import Dataset\n",
36
+ "from transformers import Automodel "
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 7,
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stdout",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "Package Version\n",
49
+ "----------------- -----------\n",
50
+ "asttokens 2.4.1\n",
51
+ "colorama 0.4.6\n",
52
+ "comm 0.2.2\n",
53
+ "debugpy 1.8.2\n",
54
+ "decorator 5.1.1\n",
55
+ "executing 2.0.1\n",
56
+ "ipykernel 6.29.5\n",
57
+ "ipython 8.26.0\n",
58
+ "jedi 0.19.1\n",
59
+ "jupyter_client 8.6.2\n",
60
+ "jupyter_core 5.7.2\n",
61
+ "matplotlib-inline 0.1.7\n",
62
+ "nest-asyncio 1.6.0\n",
63
+ "packaging 24.1\n",
64
+ "parso 0.8.4\n",
65
+ "pip 24.2\n",
66
+ "platformdirs 4.2.2\n",
67
+ "prompt_toolkit 3.0.47\n",
68
+ "psutil 6.0.0\n",
69
+ "pure_eval 0.2.3\n",
70
+ "Pygments 2.18.0\n",
71
+ "python-dateutil 2.9.0.post0\n",
72
+ "pywin32 306\n",
73
+ "pyzmq 26.0.3\n",
74
+ "setuptools 65.5.0\n",
75
+ "six 1.16.0\n",
76
+ "stack-data 0.6.3\n",
77
+ "tornado 6.4.1\n",
78
+ "traitlets 5.14.3\n",
79
+ "typing_extensions 4.12.2\n",
80
+ "wcwidth 0.2.13\n",
81
+ "Collecting transformers\n",
82
+ " Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)\n",
83
+ "Collecting filelock (from transformers)\n",
84
+ " Using cached filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)\n",
85
+ "Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)\n",
86
+ " Using cached huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)\n",
87
+ "Collecting numpy>=1.17 (from transformers)\n",
88
+ " Using cached numpy-2.0.1-cp311-cp311-win_amd64.whl.metadata (60 kB)\n",
89
+ "Requirement already satisfied: packaging>=20.0 in c:\\gitprojects\\deneme\\.venv\\lib\\site-packages (from transformers) (24.1)\n",
90
+ "Collecting pyyaml>=5.1 (from transformers)\n",
91
+ " Using cached PyYAML-6.0.1-cp311-cp311-win_amd64.whl.metadata (2.1 kB)\n",
92
+ "Collecting regex!=2019.12.17 (from transformers)\n",
93
+ " Downloading regex-2024.7.24-cp311-cp311-win_amd64.whl.metadata (41 kB)\n",
94
+ "Collecting requests (from transformers)\n",
95
+ " Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)\n",
96
+ "Collecting safetensors>=0.4.1 (from transformers)\n",
97
+ " Downloading safetensors-0.4.3-cp311-none-win_amd64.whl.metadata (3.9 kB)\n",
98
+ "Collecting tokenizers<0.20,>=0.19 (from transformers)\n",
99
+ " Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl.metadata (6.9 kB)\n",
100
+ "Collecting tqdm>=4.27 (from transformers)\n",
101
+ " Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)\n",
102
+ "Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.23.2->transformers)\n",
103
+ " Using cached fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)\n",
104
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\gitprojects\\deneme\\.venv\\lib\\site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
105
+ "Requirement already satisfied: colorama in c:\\gitprojects\\deneme\\.venv\\lib\\site-packages (from tqdm>=4.27->transformers) (0.4.6)\n",
106
+ "Collecting charset-normalizer<4,>=2 (from requests->transformers)\n",
107
+ " Using cached charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl.metadata (34 kB)\n",
108
+ "Collecting idna<4,>=2.5 (from requests->transformers)\n",
109
+ " Using cached idna-3.7-py3-none-any.whl.metadata (9.9 kB)\n",
110
+ "Collecting urllib3<3,>=1.21.1 (from requests->transformers)\n",
111
+ " Using cached urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)\n",
112
+ "Collecting certifi>=2017.4.17 (from requests->transformers)\n",
113
+ " Using cached certifi-2024.7.4-py3-none-any.whl.metadata (2.2 kB)\n",
114
+ "Downloading transformers-4.43.3-py3-none-any.whl (9.4 MB)\n",
115
+ " ---------------------------------------- 0.0/9.4 MB ? eta -:--:--\n",
116
+ " ---------------------------------------- 0.0/9.4 MB ? eta -:--:--\n",
117
+ " - -------------------------------------- 0.3/9.4 MB ? eta -:--:--\n",
118
+ " -- ------------------------------------- 0.5/9.4 MB 932.9 kB/s eta 0:00:10\n",
119
+ " -- ------------------------------------- 0.5/9.4 MB 932.9 kB/s eta 0:00:10\n",
120
+ " --- ------------------------------------ 0.8/9.4 MB 838.9 kB/s eta 0:00:11\n",
121
+ " ---- ----------------------------------- 1.0/9.4 MB 825.2 kB/s eta 0:00:11\n",
122
+ " ---- ----------------------------------- 1.0/9.4 MB 825.2 kB/s eta 0:00:11\n",
123
+ " ----- ---------------------------------- 1.3/9.4 MB 818.6 kB/s eta 0:00:10\n",
124
+ " ------ --------------------------------- 1.6/9.4 MB 822.8 kB/s eta 0:00:10\n",
125
+ " ------ --------------------------------- 1.6/9.4 MB 822.8 kB/s eta 0:00:10\n",
126
+ " ------- -------------------------------- 1.8/9.4 MB 838.9 kB/s eta 0:00:10\n",
127
+ " -------- ------------------------------- 2.1/9.4 MB 851.1 kB/s eta 0:00:09\n",
128
+ " -------- ------------------------------- 2.1/9.4 MB 851.1 kB/s eta 0:00:09\n",
129
+ " ---------- ----------------------------- 2.4/9.4 MB 860.5 kB/s eta 0:00:09\n",
130
+ " ----------- ---------------------------- 2.6/9.4 MB 878.0 kB/s eta 0:00:08\n",
131
+ " ------------ --------------------------- 2.9/9.4 MB 897.4 kB/s eta 0:00:08\n",
132
+ " ------------- -------------------------- 3.1/9.4 MB 913.7 kB/s eta 0:00:07\n",
133
+ " -------------- ------------------------- 3.4/9.4 MB 911.0 kB/s eta 0:00:07\n",
134
+ " -------------- ------------------------- 3.4/9.4 MB 911.0 kB/s eta 0:00:07\n",
135
+ " --------------- ------------------------ 3.7/9.4 MB 908.8 kB/s eta 0:00:07\n",
136
+ " ---------------- ----------------------- 3.9/9.4 MB 910.4 kB/s eta 0:00:07\n",
137
+ " ----------------- ---------------------- 4.2/9.4 MB 918.5 kB/s eta 0:00:06\n",
138
+ " ----------------- ---------------------- 4.2/9.4 MB 918.5 kB/s eta 0:00:06\n",
139
+ " ------------------ --------------------- 4.5/9.4 MB 916.2 kB/s eta 0:00:06\n",
140
+ " -------------------- ------------------- 4.7/9.4 MB 926.1 kB/s eta 0:00:06\n",
141
+ " --------------------- ------------------ 5.0/9.4 MB 935.1 kB/s eta 0:00:05\n",
142
+ " ---------------------- ----------------- 5.2/9.4 MB 940.5 kB/s eta 0:00:05\n",
143
+ " ----------------------- ---------------- 5.5/9.4 MB 950.7 kB/s eta 0:00:05\n",
144
+ " ------------------------ --------------- 5.8/9.4 MB 957.4 kB/s eta 0:00:04\n",
145
+ " ------------------------- -------------- 6.0/9.4 MB 966.3 kB/s eta 0:00:04\n",
146
+ " -------------------------- ------------- 6.3/9.4 MB 974.5 kB/s eta 0:00:04\n",
147
+ " --------------------------- ------------ 6.6/9.4 MB 984.6 kB/s eta 0:00:03\n",
148
+ " ---------------------------- ----------- 6.8/9.4 MB 991.6 kB/s eta 0:00:03\n",
149
+ " ------------------------------ --------- 7.1/9.4 MB 1.0 MB/s eta 0:00:03\n",
150
+ " ------------------------------- -------- 7.3/9.4 MB 1.0 MB/s eta 0:00:03\n",
151
+ " -------------------------------- ------- 7.6/9.4 MB 1.0 MB/s eta 0:00:02\n",
152
+ " --------------------------------- ------ 7.9/9.4 MB 1.0 MB/s eta 0:00:02\n",
153
+ " ---------------------------------- ----- 8.1/9.4 MB 1.0 MB/s eta 0:00:02\n",
154
+ " ----------------------------------- ---- 8.4/9.4 MB 1.0 MB/s eta 0:00:01\n",
155
+ " ------------------------------------ --- 8.7/9.4 MB 1.1 MB/s eta 0:00:01\n",
156
+ " ------------------------------------- -- 8.9/9.4 MB 1.1 MB/s eta 0:00:01\n",
157
+ " ---------------------------------------- 9.4/9.4 MB 1.1 MB/s eta 0:00:00\n",
158
+ "Using cached huggingface_hub-0.24.5-py3-none-any.whl (417 kB)\n",
159
+ "Using cached numpy-2.0.1-cp311-cp311-win_amd64.whl (16.6 MB)\n",
160
+ "Using cached PyYAML-6.0.1-cp311-cp311-win_amd64.whl (144 kB)\n",
161
+ "Downloading regex-2024.7.24-cp311-cp311-win_amd64.whl (269 kB)\n",
162
+ "Downloading safetensors-0.4.3-cp311-none-win_amd64.whl (287 kB)\n",
163
+ "Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl (2.2 MB)\n",
164
+ " ---------------------------------------- 0.0/2.2 MB ? eta -:--:--\n",
165
+ " ---------------------------------------- 0.0/2.2 MB ? eta -:--:--\n",
166
+ " --------- ------------------------------ 0.5/2.2 MB 1.4 MB/s eta 0:00:02\n",
167
+ " -------------- ------------------------- 0.8/2.2 MB 1.3 MB/s eta 0:00:02\n",
168
+ " ------------------ --------------------- 1.0/2.2 MB 1.3 MB/s eta 0:00:01\n",
169
+ " ----------------------- ---------------- 1.3/2.2 MB 1.4 MB/s eta 0:00:01\n",
170
+ " ---------------------------- ----------- 1.6/2.2 MB 1.4 MB/s eta 0:00:01\n",
171
+ " --------------------------------- ------ 1.8/2.2 MB 1.4 MB/s eta 0:00:01\n",
172
+ " ---------------------------------------- 2.2/2.2 MB 1.4 MB/s eta 0:00:00\n",
173
+ "Using cached tqdm-4.66.4-py3-none-any.whl (78 kB)\n",
174
+ "Using cached filelock-3.15.4-py3-none-any.whl (16 kB)\n",
175
+ "Using cached requests-2.32.3-py3-none-any.whl (64 kB)\n",
176
+ "Using cached certifi-2024.7.4-py3-none-any.whl (162 kB)\n",
177
+ "Using cached charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl (99 kB)\n",
178
+ "Using cached fsspec-2024.6.1-py3-none-any.whl (177 kB)\n",
179
+ "Using cached idna-3.7-py3-none-any.whl (66 kB)\n",
180
+ "Using cached urllib3-2.2.2-py3-none-any.whl (121 kB)\n",
181
+ "Installing collected packages: urllib3, tqdm, safetensors, regex, pyyaml, numpy, idna, fsspec, filelock, charset-normalizer, certifi, requests, huggingface-hub, tokenizers, transformers\n",
182
+ "Successfully installed certifi-2024.7.4 charset-normalizer-3.3.2 filelock-3.15.4 fsspec-2024.6.1 huggingface-hub-0.24.5 idna-3.7 numpy-2.0.1 pyyaml-6.0.1 regex-2024.7.24 requests-2.32.3 safetensors-0.4.3 tokenizers-0.19.1 tqdm-4.66.4 transformers-4.43.3 urllib3-2.2.2\n"
183
+ ]
184
+ }
185
+ ],
186
+ "source": [
187
+ "!pip list dataset\n",
188
+ "!pip install transformers"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "#dosyaları yükleyin ve birleştirin\n",
198
+ "train_dfs=[pd.read_parquet(file) for file in train_files]\n",
199
+ "test_dfs=[pd.read_parquet(file) for file in test_files]"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": null,
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": [
208
+ "#parque dosyalarının birleştirilmesi\n",
209
+ "train_df=pd.concat(train_dfs,ignore_index=True)\n",
210
+ "test_df=pd.concat(test_dfs,ignore_index=True)\n",
211
+ "\n",
212
+ "print(train_df.head())\n",
213
+ "print(train_df.head())\n"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "metadata": {},
220
+ "outputs": [],
221
+ "source": [
222
+ "#train ve test dosyaları oluşturma \n",
223
+ "train_df.to_parquet('C:\\\\gitProjects\\\\train_Egitim\\\\merged_train.parquet')\n",
224
+ "test_df.to_parquet('C:\\\\gitProjects\\\\test_Egitim\\\\merged_train.parquet')"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
+ "#test ve train yollarını belirleme ve test, traindeki önemli sütunları alma\n",
234
+ "train_file_path=('C:\\\\gitProjects\\\\train_Egitim\\\\merged_train.parquet')\n",
235
+ "test_file_path=('C:\\\\gitProjects\\\\test_Egitim\\\\merged_train.parquet')\n",
236
+ "\n",
237
+ "train_df=pd.read_parquet(train_file_path,columns=['Prompt_ID','Prompt','Response','Category','Subcategory','Prompt_token_length'])\n",
238
+ "test_df=pd.read_parquet(test_file_path,columns=['Prompt_ID','Prompt','Response','Category','Subcategory','Prompt_token_length'])\n",
239
+ "\n",
240
+ "print(train_df.head())\n",
241
+ "print(test_df.head())"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": null,
247
+ "metadata": {},
248
+ "outputs": [],
249
+ "source": [
250
+ "#verileri bart ile eğitme burada koleksiyon içerisindeki veriler tanımlanmalı \n",
251
+ "# Load model directly\n",
252
+ "from transformers import AutoModel,AutoTokenizer\n",
253
+ "from transformers import (WEIGHTS_NAME, BertConfig,\n",
254
+ " BertForQuestionAnswering, BertTokenizer)\n",
255
+ "from torch.utils.data import DataLoader, SequentialSampler, TensorDataset\n",
256
+ "\n",
257
+ "#from utils import (get_answer, input_to_squad_example,squad_examples_to_features, to_list)\n",
258
+ "import collections\n",
259
+ "# Load model directly\n",
260
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
261
+ "\n",
262
+ "tokenizer = AutoTokenizer.from_pretrained(\"philschmid/bart-large-cnn-samsum\")\n",
263
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\"philschmid/bart-large-cnn-samsum\")\n"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": null,
269
+ "metadata": {},
270
+ "outputs": [],
271
+ "source": [
272
+ "from pymongo import MongoClient\n",
273
+ "import pandas as pd\n",
274
+ "\n",
275
+ "# MongoDB connection settings\n",
276
+ "\n",
277
+ "def get_mongodb(database_name='yeniDatabase', collection_name='train', host='localhost', port=27017):\n",
278
+ " \"\"\"\n",
279
+ " MongoDB connection and collection selection\n",
280
+ " \"\"\"\n",
281
+ " client = MongoClient(f'mongodb://{host}:{port}/')\n",
282
+ " db = client[database_name]\n",
283
+ " collection = db[collection_name]\n",
284
+ " return collection\n",
285
+ "\n",
286
+ "# Function to load dataset into MongoDB\n",
287
+ "def dataset_read():\n",
288
+ " train_file_path = ('C:\\\\gitProjects\\\\train_Egitim\\\\merged_train.parquet')\n",
289
+ " data = pd.read_parquet(train_file_path, columns=['Prompt_ID', 'Prompt', 'Response', 'Category', 'Subcategory', 'Prompt_token_length'])\n",
290
+ " data_dict = data.to_dict(\"records\")\n",
291
+ "\n",
292
+ " # Get the MongoDB collection\n",
293
+ " source_collection = get_mongodb(database_name='yeniDatabase', collection_name='train') # Collection for translation\n",
294
+ "\n",
295
+ " # Insert data into MongoDB\n",
296
+ " source_collection.insert_many(data_dict)\n",
297
+ "\n",
298
+ " print(\"Data successfully loaded into MongoDB.\")\n",
299
+ " return source_collection\n",
300
+ "\n",
301
+ "# Call the function to load the dataset into MongoDB\n",
302
+ "source_collection = dataset_read()"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "markdown",
307
+ "metadata": {},
308
+ "source": [
309
+ "Test ve train verilerini mongodb ye yükleme"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": null,
315
+ "metadata": {},
316
+ "outputs": [],
317
+ "source": [
318
+ "def get_mongodb(database_name='yeniDatabase', collection_name='test', mongo_url='mongodb://localhost:27017/'):\n",
319
+ " \"\"\"\n",
320
+ " MongoDB connection and collection selection\n",
321
+ " \"\"\"\n",
322
+ " client = MongoClient(mongo_url)\n",
323
+ " db = client[database_name]\n",
324
+ " collection = db[collection_name]\n",
325
+ " return collection\n",
326
+ "\n",
327
+ "# Function to load dataset into MongoDB\n",
328
+ "def dataset_read():\n",
329
+ " train_file_path = ('C:\\\\gitProjects\\\\test_Egitim\\\\merged_train.parquet')\n",
330
+ " data = pd.read_parquet(train_file_path, columns=['Prompt_ID', 'Prompt', 'Response', 'Category', 'Subcategory', 'Prompt_token_length'])\n",
331
+ " data_dict = data.to_dict(\"records\")\n",
332
+ "\n",
333
+ " # Get the MongoDB collection\n",
334
+ " source_collection = get_mongodb(database_name='yeniDatabase', collection_name='test') # Collection for translation\n",
335
+ "\n",
336
+ " # Insert data into MongoDB\n",
337
+ " source_collection.insert_many(data_dict)\n",
338
+ "\n",
339
+ " print(\"Data successfully loaded into MongoDB.\")\n",
340
+ " return source_collection\n",
341
+ "\n",
342
+ "# Call the function to load the dataset into MongoDB\n",
343
+ "source_collection = dataset_read()"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "markdown",
348
+ "metadata": {},
349
+ "source": [
350
+ "Model eğitimi \n"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "# uygulama için kullanılcak olan özelliklerin tanımlanması\n",
360
+ "from transformers import BertTokenizer,BertForQuestionAnswering,BertConfig\n",
361
+ "class QA:\n",
362
+ " def __init__(self,model_path: str):\n",
363
+ " self.max_seq_length = 384 #max seq\n",
364
+ " self.doc_stride = 128 #stride \n",
365
+ " self.do_lower_case = False\n",
366
+ " self.max_query_length = 30\n",
367
+ " self.n_best_size = 3\n",
368
+ " self.max_answer_length = 30\n",
369
+ " self.version_2_with_negative = False\n",
370
+ " #modelin yüklenmesi\n",
371
+ " self.model, self.tokenizer = self.load_model(model_path)\n",
372
+ " #hangi işlmecinin kullanıldığının belirlenmesi\n",
373
+ " if torch.cuda.is_available():\n",
374
+ " self.device = 'cuda'\n",
375
+ " else:\n",
376
+ " self.device = 'cpu'\n",
377
+ " self.model.to(self.device)\n",
378
+ " self.model.eval()\n",
379
+ " \n",
380
+ " # This function is used to load the model\n",
381
+ " def load_model(self,model_path: str,do_lower_case=False):\n",
382
+ " config = BertConfig.from_pretrained(model_path + \"C:\\\\gitProjects\\\\train_Egitim\")\n",
383
+ " tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=do_lower_case)\n",
384
+ " model = BertForQuestionAnswering.from_pretrained(model_path, from_tf=False, config=config)\n",
385
+ " return model, tokenizer\n"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": null,
391
+ "metadata": {},
392
+ "outputs": [],
393
+ "source": [
394
+ "from pymongo import MongoClient\n",
395
+ "\n",
396
+ "def get_mongodb():\n",
397
+ " # MongoDB bağlantı bilgilerini döndürecek şekilde tanımlanmalıdır.\n",
398
+ " return 'mongodb://localhost:27017/', 'yeniDatabase', 'test'\n",
399
+ "\n",
400
+ "def get_average_prompt_token_length():\n",
401
+ " # MongoDB bağlantı bilgilerini alma\n",
402
+ " mongo_url, db_name, collection_name = get_mongodb()\n",
403
+ "\n",
404
+ " # MongoDB'ye bağlanma\n",
405
+ " client = MongoClient(mongo_url)\n",
406
+ " db = client[db_name]\n",
407
+ " collection = db[collection_name]\n",
408
+ "\n",
409
+ " # Tüm dökümanları çekme ve 'prompt_token_length' alanını alma\n",
410
+ " docs = collection.find({}, {'Prompt_token_length': 1})\n",
411
+ "\n",
412
+ " # 'prompt_token_length' değerlerini toplama ve sayma\n",
413
+ " total_length = 0\n",
414
+ " count = 0\n",
415
+ "\n",
416
+ " for doc in docs:\n",
417
+ " if 'Prompt_token_length' in doc:\n",
418
+ " total_length += doc['Prompt_token_length']\n",
419
+ " count += 1\n",
420
+ " \n",
421
+ " # Ortalama hesaplama\n",
422
+ " if count > 0:\n",
423
+ " average_length = total_length / count\n",
424
+ " else:\n",
425
+ " average_length = 0 # Eğer 'prompt_token_length' alanı olan döküman yoksa\n",
426
+ "\n",
427
+ " return int(average_length)\n",
428
+ "\n",
429
+ "# Ortalama prompt token uzunluğunu al ve yazdır\n",
430
+ "average_length = get_average_prompt_token_length()\n",
431
+ "print(f\"Ortalama prompt token uzunluğu: {average_length}\")\n"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "execution_count": null,
437
+ "metadata": {},
438
+ "outputs": [],
439
+ "source": [
440
+ "from pymongo import MongoClient\n",
441
+ "from transformers import BertTokenizer\n",
442
+ "\n",
443
+ "#getmongodb oluştumak yerine içeriği değiştirilmeli \n",
444
+ "def get_mongodb():\n",
445
+ " # MongoDB bağlantı bilgilerini döndürecek şekilde tanımlanmalıdır.\n",
446
+ " return 'mongodb://localhost:27017/', 'yeniDatabase', 'train'\n",
447
+ "\n",
448
+ "def get_input_texts():\n",
449
+ " # MongoDB bağlantı bilgilerini alma\n",
450
+ " mongo_url, db_name, collection_name = get_mongodb()\n",
451
+ "\n",
452
+ " # MongoDB'ye bağlanma\n",
453
+ " client = MongoClient(mongo_url)\n",
454
+ " db = client[db_name]\n",
455
+ " collection = db[collection_name]\n",
456
+ " \n",
457
+ " #input texleri mongodb üzerinde 'Prompt' lara denk gelir.\n",
458
+ "\n",
459
+ " # Sorguyu tanımlama\n",
460
+ " query = {\"Prompt\": {\"$exists\": True}}\n",
461
+ "\n",
462
+ " # Sorguyu çalıştırma ve dökümanları çekme\n",
463
+ " cursor = collection.find(query, {\"Prompt\": 1, \"_id\": 0}) # 'input_text' alanını almak için \"_id\": 0 ekleyin\n",
464
+ "\n",
465
+ " # Cursor'ı döküman listesine dönüştürme\n",
466
+ " input_texts_from_db = list(cursor)\n",
467
+ "\n",
468
+ " # Input text'leri döndürme\n",
469
+ " return input_texts_from_db\n",
470
+ "\n",
471
+ "input_texts_from_db= get_input_texts()\n",
472
+ "# Input text'leri al ve yazdır\n",
473
+ "\n",
474
+ "#tokenizer ı yükle\n",
475
+ "tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')\n",
476
+ " \n",
477
+ "#encode etmek için gerekli olan bilgiler \n",
478
+ "input_texts=[doc[\"Prompt\"] for doc in input_texts_from_db ]\n",
479
+ "\n",
480
+ "#encoding işleminde inputlar \n",
481
+ "\n",
482
+ "# Tokenize the input texts\n",
483
+ "encoded_inputs = tokenizer.batch_encode_plus(\n",
484
+ " input_texts,\n",
485
+ " padding=True,\n",
486
+ " truncation=True,\n",
487
+ " max_length=100,\n",
488
+ " return_attention_mask=True,\n",
489
+ " return_tensors='pt'\n",
490
+ ")"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "code",
495
+ "execution_count": null,
496
+ "metadata": {},
497
+ "outputs": [],
498
+ "source": [
499
+ "print(f\"encoded_inputs:{encoded_inputs}\")"
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "code",
504
+ "execution_count": null,
505
+ "metadata": {},
506
+ "outputs": [],
507
+ "source": [
508
+ "\n",
509
+ "#maskeleme yönetmiyle eğitim\n",
510
+ "# Define the number of epochs and learning rate\n",
511
+ "num_epochs = 3\n",
512
+ "learning_rate = 1e-4\n",
513
+ "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n",
514
+ "\n",
515
+ "#Iterate over the epochs\n",
516
+ "for epoch in range(num_epochs):\n",
517
+ " total_loss = 0\n",
518
+ " for input_ids, attention_mask, labels in encoded_inputs:\n",
519
+ " #reset gradients\n",
520
+ " optimizer.zero_grad()\n",
521
+ " #forward pass \n",
522
+ " outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)\n",
523
+ " loss = outputs.loss\n",
524
+ " #backward pass \n",
525
+ " loss.backward()\n",
526
+ " #update optimizer \n",
527
+ " optimizer.step()\n",
528
+ " #accumulate total loss\n",
529
+ " total_loss += loss.item()\n",
530
+ " #calculate average loss\n",
531
+ " average_loss = total_loss / len(encoded_inputs)\n",
532
+ " #print the loss for current epoch\n",
533
+ " print(f\"Epoch {epoch+1} - Loss: {average_loss:.4f}\")\n",
534
+ "\n",
535
+ " #tüm bu verileri tutan bir \"batch_of_attention_masks\" verisini tanımlamam gerek"
536
+ ]
537
+ },
538
+ {
539
+ "cell_type": "code",
540
+ "execution_count": null,
541
+ "metadata": {},
542
+ "outputs": [],
543
+ "source": [
544
+ "from torch.utils.data import DataLoader,TensorDataset\n",
545
+ "import torch\n",
546
+ "from transformers import BertTokenizer\n",
547
+ "\n",
548
+ "#hdef değerlerle karşılaştırma yapabilmek için ve doğruluğu ölçmek için\n",
549
+ "\n",
550
+ "# Assuming you have tokenized input texts and labels\n",
551
+ "#attetion mask bert dilinde modelin sadece gerçek tokenler üzerinde çalışmasını sağlar.\n",
552
+ "input_ids = encoded_inputs['input_ids'] # Replace with your tokenized input texts\n",
553
+ "attention_masks = encoded_inputs['attention_mask']\n",
554
+ "\n",
555
+ "\n",
556
+ "labels = torch.tensor([1]*len(input_ids))\n",
557
+ "\n",
558
+ "# Create a TensorDataset\n",
559
+ "dataset = TensorDataset(input_ids, attention_masks, labels)\n",
560
+ "\n",
561
+ "batch_size=10000\n",
562
+ "# Create a data loader\n",
563
+ "data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)\n",
564
+ "\n",
565
+ "for batch in data_loader:\n",
566
+ " input_ids,attention_masks,labels\n",
567
+ " print(f\"ınput ıds :{input_texts}\")\n",
568
+ " print(f\"attetion masks: {attention_masks}\")\n",
569
+ " print(f\"labels:{labels}\")\n",
570
+ " break"
571
+ ]
572
+ },
573
+ {
574
+ "cell_type": "code",
575
+ "execution_count": null,
576
+ "metadata": {},
577
+ "outputs": [],
578
+ "source": [
579
+ " # This function performs the prediction and return the reponse to the flask app\n",
580
+ " # This function performs the prediction and return the reponse to the flask app\n",
581
+ "RawResult = collection.namedtuple(\"RawResult\",[\"unique_id\", \"start_logits\", \"end_logits\"])\n",
582
+ "\n",
583
+ "def predict(self,passage :str,question :str): \n",
584
+ " example = input_to_squad_example(passage,question) \n",
585
+ " features = squad_examples_to_features(example,self.tokenizer,self.max_seq_length,self.doc_stride,self.max_query_length) \n",
586
+ " all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n",
587
+ " all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n",
588
+ " all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)\n",
589
+ " all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
590
+ " dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,\n",
591
+ " all_example_index)\n",
592
+ " eval_sampler = SequentialSampler(dataset)\n",
593
+ " eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=1)\n",
594
+ " \n",
595
+ " all_results = []\n",
596
+ " for batch in eval_dataloader:\n",
597
+ " batch = tuple(t.to(self.device) for t in batch)\n",
598
+ " with torch.no_grad():\n",
599
+ " inputs = {'input_ids': batch[0],\n",
600
+ " 'attention_mask': batch[1],\n",
601
+ " 'token_type_ids': batch[2] \n",
602
+ " } \n",
603
+ " example_indices = batch[3] \n",
604
+ " outputs = self.model(**inputs)\n",
605
+ " \n",
606
+ " for i, example_index in enumerate(example_indices):\n",
607
+ " eval_feature = features[example_index.item()]\n",
608
+ " unique_id = int(eval_feature.unique_id)\n",
609
+ " result = RawResult(unique_id = unique_id,\n",
610
+ " start_logits = to_list(outputs[0][i]),\n",
611
+ " end_logits = to_list(outputs[1][i]))\n",
612
+ " all_results.append(result)\n",
613
+ " \n",
614
+ " answer = get_answer(example,features,all_results,self.n_best_size,self.max_answer_length,self.do_lower_case)\n",
615
+ " \n",
616
+ " return answer"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": null,
622
+ "metadata": {},
623
+ "outputs": [],
624
+ "source": [
625
+ "tokenizer.batch_encode_plus()\n",
626
+ "torch.utils.data.DataLoader\n",
627
+ "input_ids = torch.tensor(batch_of_tokenized_input_texts)\n",
628
+ "attention_mask = torch.tensor(batch_of_attention_masks)\n",
629
+ "labels = torch.tensor(batch_of_labels)"
630
+ ]
631
+ },
632
+ {
633
+ "cell_type": "code",
634
+ "execution_count": null,
635
+ "metadata": {},
636
+ "outputs": [],
637
+ "source": [
638
+ "model.save_pretrained(output_model_path)\n",
639
+ "tokenizer.save_pretrained(output_model_path)"
640
+ ]
641
+ },
642
+ {
643
+ "cell_type": "code",
644
+ "execution_count": null,
645
+ "metadata": {},
646
+ "outputs": [],
647
+ "source": [
648
+ "from app import train_model_route\n",
649
+ "\n",
650
+ "#ön yüzle ilişkilendirme\n",
651
+ "\n",
652
+ "train_model_route\n",
653
+ "\n",
654
+ "#title category ile ilişkilendirlecek\n",
655
+ "\n",
656
+ "\n",
657
+ "#subheadingler subcategroy ile ilişkilendirieck\n",
658
+ "\n",
659
+ "#prompt token uzunlukları kontrol edilerek bütün tokenlerin aynı uzunlukta olması sağlanmalıdır.\n",
660
+ "\n"
661
+ ]
662
+ },
663
+ {
664
+ "cell_type": "code",
665
+ "execution_count": null,
666
+ "metadata": {},
667
+ "outputs": [],
668
+ "source": [
669
+ "\n"
670
+ ]
671
+ }
672
+ ],
673
+ "metadata": {
674
+ "kernelspec": {
675
+ "display_name": "myenv",
676
+ "language": "python",
677
+ "name": "python3"
678
+ },
679
+ "language_info": {
680
+ "codemirror_mode": {
681
+ "name": "ipython",
682
+ "version": 3
683
+ },
684
+ "file_extension": ".py",
685
+ "mimetype": "text/x-python",
686
+ "name": "python",
687
+ "nbconvert_exporter": "python",
688
+ "pygments_lexer": "ipython3",
689
+ "version": "3.11.9"
690
+ }
691
+ },
692
+ "nbformat": 4,
693
+ "nbformat_minor": 2
694
+ }
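The training-related cells above are still work in progress: the epoch loop iterates directly over the tokenizer output, and several names (model, labels, batch_of_attention_masks, output_model_path) are undefined when the cells run. A hedged sketch of the pattern those cells appear to be aiming for, with placeholder texts, labels, and a bert-base-uncased classification head chosen purely for illustration, might look like:

# Illustrative sketch only: batch the encoded prompts and run a simple training loop.
# The texts, labels and model choice here are placeholders, not the committed code.
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

texts = ["example prompt one", "example prompt two"]  # stand-in for the 'Prompt' fields from MongoDB
encoded = tokenizer.batch_encode_plus(
    texts, padding=True, truncation=True, max_length=100,
    return_attention_mask=True, return_tensors="pt",
)
labels = torch.ones(len(texts), dtype=torch.long)  # placeholder labels

dataset = TensorDataset(encoded["input_ids"], encoded["attention_mask"], labels)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
model.train()
for epoch in range(3):
    total_loss = 0.0
    for input_ids, attention_mask, batch_labels in loader:
        optimizer.zero_grad()
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        out.loss.backward()
        optimizer.step()
        total_loss += out.loss.item()
    print(f"Epoch {epoch + 1} - loss: {total_loss / len(loader):.4f}")

model.save_pretrained("output_model")      # corresponds to the save_pretrained cell above
tokenizer.save_pretrained("output_model")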
load2.ipynb ADDED
@@ -0,0 +1,378 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "Kütüphanelerin Yüklenmesi"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "ename": "ModuleNotFoundError",
17
+ "evalue": "No module named 'datasets'",
18
+ "output_type": "error",
19
+ "traceback": [
20
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
21
+ "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
22
+ "Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m \n",
23
+ "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'datasets'"
24
+ ]
25
+ }
26
+ ],
27
+ "source": [
28
+ "import datasets\n",
29
+ "from datasets import load_dataset\n",
30
+ "import pandas as pd \n"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 7,
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "ename": "OSError",
40
+ "evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
41
+ "output_type": "error",
42
+ "traceback": [
43
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
44
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
45
+ "Cell \u001b[1;32mIn[7], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#verileri bart ile eğitme burada koleksiyon içerisindeki veriler tanımlanmalı \u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# Load model directly\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AutoModel,AutoTokenizer\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (WEIGHTS_NAME, BertConfig,\n\u001b[0;32m 5\u001b[0m BertForQuestionAnswering, BertTokenizer)\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataLoader, SequentialSampler, TensorDataset\n",
46
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
47
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
48
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
49
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
50
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
51
+ "\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
52
+ ]
53
+ }
54
+ ],
55
+ "source": [
56
+ "#verileri bart ile eğitme burada koleksiyon içerisindeki veriler tanımlanmalı \n",
57
+ "# Load model directly\n",
58
+ "from transformers import AutoModel,AutoTokenizer\n",
59
+ "from transformers import (WEIGHTS_NAME, BertConfig,\n",
60
+ " BertForQuestionAnswering, BertTokenizer)\n",
61
+ "from torch.utils.data import DataLoader, SequentialSampler, TensorDataset\n",
62
+ "\n",
63
+ "#from utils import (get_answer, input_to_squad_example,squad_examples_to_features, to_list)\n",
64
+ "import collections\n",
65
+ "# Load model directly\n",
66
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "markdown",
71
+ "metadata": {},
72
+ "source": [
73
+ "Train ve Test Verilerine İlişkin Databaselerin İçerisindeki Bilgilerin Alınması "
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 8,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "#birleştirilcek dosyaların listesi \n",
83
+ "train_files=['C:\\\\gitProjects\\\\oak\\\\data\\\\train-00000-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00001-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00002-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00003-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00004-of-00007.parquet']\n",
84
+ "test_files=['C:\\\\gitProjects\\\\oak\\\\data\\\\train-00005-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00006-of-00007.parquet']"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "#dosyaları yükleyin ve birleştirin\n",
94
+ "train_dfs=[pd.read_parquet(file) for file in train_files]\n",
95
+ "test_dfs=[pd.read_parquet(file) for file in test_files]"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "#parque dosyalarının birleştirilmesi\n",
105
+ "train_df=pd.concat(train_dfs,ignore_index=True)\n",
106
+ "test_df=pd.concat(test_dfs,ignore_index=True)\n",
107
+ "\n",
108
+ "print(train_df.head())\n",
109
+ "print(train_df.head())"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 9,
115
+ "metadata": {},
116
+ "outputs": [
117
+ {
118
+ "ename": "NameError",
119
+ "evalue": "name 'train_df' is not defined",
120
+ "output_type": "error",
121
+ "traceback": [
122
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
123
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
124
+ "Cell \u001b[1;32mIn[9], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#train ve test dosyaları oluşturma \u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43mtrain_df\u001b[49m\u001b[38;5;241m.\u001b[39mto_parquet(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mC:\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mgitProjects\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mdeneme\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124megitim\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mtrain_Egitim\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mmerged_train.parquet\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 3\u001b[0m test_df\u001b[38;5;241m.\u001b[39mto_parquet(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mC:\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mgitProjects\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mdeneme\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mtest_Egitim\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mmerged_train.parquet\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
125
+ "\u001b[1;31mNameError\u001b[0m: name 'train_df' is not defined"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
130
+ "#train ve test dosyaları oluşturma \n",
131
+ "train_df.to_parquet('C:\\\\gitProjects\\\\deneme\\\\egitim\\\\train_Egitim\\\\merged_train.parquet')\n",
132
+ "test_df.to_parquet('C:\\\\gitProjects\\\\deneme\\\\test_Egitim\\\\merged_train.parquet')"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 10,
138
+ "metadata": {},
139
+ "outputs": [
140
+ {
141
+ "name": "stdout",
142
+ "output_type": "stream",
143
+ "text": [
144
+ " Prompt_ID \\\n",
145
+ "0 bb26c95639b18fd88857bf0964cd1fb5 \n",
146
+ "1 56743c1870327184e058292a34ce12a8 \n",
147
+ "2 88aa2f72d37cb8671ff68a6f481e382b \n",
148
+ "3 703c086f7ffd9d8cc0497e82732860c7 \n",
149
+ "4 a310cb6ed3f48e721473ec0525239e4e \n",
150
+ "\n",
151
+ " Prompt \\\n",
152
+ "0 What were the crucial factors that contributed... \n",
153
+ "1 Create a comprehensive guide to understanding ... \n",
154
+ "2 Explore the historical significance and impact... \n",
155
+ "3 How can advanced data analytics be leveraged t... \n",
156
+ "4 Design a comprehensive diversity training prog... \n",
157
+ "\n",
158
+ " Response \\\n",
159
+ "0 **Crucial Factors Contributing to the Success ... \n",
160
+ "1 ## Comprehensive Guide to Weather Front Types:... \n",
161
+ "2 ## The Fall of the Berlin Wall: Historical Sig... \n",
162
+ "3 **1. Real-Time Sentiment Analysis:**\\n\\n* Anal... \n",
163
+ "4 **Phase 1: Awareness and Self-Reflection**\\n\\n... \n",
164
+ "\n",
165
+ " Category Subcategory \\\n",
166
+ "0 Voskhod program Voskhod 1 mission \n",
167
+ "1 Science mnemonics Weather front types \n",
168
+ "2 Political history The Fall of the Berlin Wall \n",
169
+ "3 Test matches Data analytics \n",
170
+ "4 Majority–minority relations Diversity training \n",
171
+ "\n",
172
+ " Prompt_token_length \n",
173
+ "0 34 \n",
174
+ "1 48 \n",
175
+ "2 67 \n",
176
+ "3 78 \n",
177
+ "4 55 \n",
178
+ " Prompt_ID \\\n",
179
+ "0 e75b977d9abe55f0d4b33d7ee6a77e43 \n",
180
+ "1 da7b42506d0c24c5f1d2371e0f53b8fe \n",
181
+ "2 dc1e302eb77f44f32623f958bdf5b1f5 \n",
182
+ "3 3e276bb9e578d719809b9654d710d6f5 \n",
183
+ "4 3efc98322cc67bcf32abcf25576d6ba1 \n",
184
+ "\n",
185
+ " Prompt \\\n",
186
+ "0 In the grand arena of intellectual discourse, ... \n",
187
+ "1 Amidst the tapestry of human knowledge, we inv... \n",
188
+ "2 In a world teeming with ideas and viewpoints, ... \n",
189
+ "3 Amidst the tapestry of human knowledge, we inv... \n",
190
+ "4 In the grand odyssey of intellectual discourse... \n",
191
+ "\n",
192
+ " Response Category Subcategory \\\n",
193
+ "0 In the spirit of the renowned English physicia... None None \n",
194
+ "1 Title: The Interplay of Politics and Psycholog... None None \n",
195
+ "2 Energy conservation has become a critical topi... None None \n",
196
+ "3 Title: Workplace Bullying: A Silent Epidemic\\n... None None \n",
197
+ "4 Title: The Grand Odyssey of Grito: A Historica... None None \n",
198
+ "\n",
199
+ " Prompt_token_length \n",
200
+ "0 134 \n",
201
+ "1 121 \n",
202
+ "2 191 \n",
203
+ "3 128 \n",
204
+ "4 190 \n"
205
+ ]
206
+ }
207
+ ],
208
+ "source": [
209
+ "#test ve train yollarını belirleme ve test, traindeki önemli sütunları alma\n",
210
+ "train_file_path=('C:\\\\gitProjects\\\\deneme\\\\egitim\\\\train_Egitim\\\\merged_train.parquet')\n",
211
+ "test_file_path=('C:\\\\gitProjects\\\\deneme\\\\egitim\\\\test_Egitim\\\\merged_train.parquet')\n",
212
+ "\n",
213
+ "train_df=pd.read_parquet(train_file_path,columns=['Prompt_ID','Prompt','Response','Category','Subcategory','Prompt_token_length'])\n",
214
+ "test_df=pd.read_parquet(test_file_path,columns=['Prompt_ID','Prompt','Response','Category','Subcategory','Prompt_token_length'])\n",
215
+ "\n",
216
+ "print(train_df.head())\n",
217
+ "print(test_df.head())"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "markdown",
222
+ "metadata": {},
223
+ "source": [
224
+ "Modelin Tokenizer ve İsminin Girilmesi"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 13,
230
+ "metadata": {},
231
+ "outputs": [
232
+ {
233
+ "ename": "OSError",
234
+ "evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
235
+ "output_type": "error",
236
+ "traceback": [
237
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
238
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
239
+ "Cell \u001b[1;32mIn[13], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AutoModel,AutoTokenizer,AutoModelForSeq2SeqLM\n\u001b[0;32m 2\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mphilschmid/bart-large-cnn-samsum\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 3\u001b[0m model \u001b[38;5;241m=\u001b[39m AutoModelForSeq2SeqLM\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mphilschmid/bart-large-cnn-samsum\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
240
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
241
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
242
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
243
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
244
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
245
+ "\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
246
+ ]
247
+ }
248
+ ],
249
+ "source": [
250
+ "from transformers import AutoModel,AutoTokenizer,AutoModelForSeq2SeqLM\n",
251
+ "tokenizer = AutoTokenizer.from_pretrained(\"philschmid/bart-large-cnn-samsum\")\n",
252
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\"philschmid/bart-large-cnn-samsum\")\n"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "markdown",
257
+ "metadata": {},
258
+ "source": [
259
+ "MongoDb üzerinden önemli sütunların çekilmesi"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 17,
265
+ "metadata": {},
266
+ "outputs": [
267
+ {
268
+ "name": "stdout",
269
+ "output_type": "stream",
270
+ "text": [
271
+ "Data successfully loaded into MongoDB.\n"
272
+ ]
273
+ }
274
+ ],
275
+ "source": [
276
+ "from pymongo import MongoClient\n",
277
+ "import pandas as pd\n",
278
+ "\n",
279
+ "# MongoDB connection settings\n",
280
+ "\n",
281
+ "def get_mongodb(database_name='yeniDatabase', collection_name='train', host='localhost', port=27017):\n",
282
+ " \"\"\"\n",
283
+ " MongoDB connection and collection selection\n",
284
+ " \"\"\"\n",
285
+ " client = MongoClient(f'mongodb://{host}:{port}/')\n",
286
+ " db = client[database_name]\n",
287
+ " collection = db[collection_name]\n",
288
+ " return collection\n",
289
+ "\n",
290
+ "# Function to load dataset into MongoDB\n",
291
+ "def dataset_read():\n",
292
+ " train_file_path = ('C:\\\\gitProjects\\\\deneme\\\\egitim\\\\train_Egitim\\\\merged_train.parquet')\n",
293
+ " data = pd.read_parquet(train_file_path, columns=['Prompt_ID', 'Prompt', 'Response', 'Category', 'Subcategory', 'Prompt_token_length'])\n",
294
+ " data_dict = data.to_dict(\"records\")\n",
295
+ "\n",
296
+ " # Get the MongoDB collection\n",
297
+ " source_collection = get_mongodb(database_name='yeniDatabase', collection_name='train') # Collection for translation\n",
298
+ "\n",
299
+ " # Insert data into MongoDB\n",
300
+ " source_collection.insert_many(data_dict)\n",
301
+ "\n",
302
+ " print(\"Data successfully loaded into MongoDB.\")\n",
303
+ " return source_collection\n",
304
+ "\n",
305
+ "# Call the function to load the dataset into MongoDB\n",
306
+ "source_collection = dataset_read()"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": 18,
312
+ "metadata": {},
313
+ "outputs": [
314
+ {
315
+ "name": "stdout",
316
+ "output_type": "stream",
317
+ "text": [
318
+ "Data successfully loaded into MongoDB.\n"
319
+ ]
320
+ }
321
+ ],
322
+ "source": [
323
+ "from pymongo import MongoClient\n",
324
+ "import pandas as pd\n",
325
+ "\n",
326
+ "# MongoDB connection settings\n",
327
+ "\n",
328
+ "def get_mongodb(database_name='yeniDatabase', collection_name='test', host='localhost', port=27017):\n",
329
+ " \"\"\"\n",
330
+ " MongoDB connection and collection selection\n",
331
+ " \"\"\"\n",
332
+ " client = MongoClient(f'mongodb://{host}:{port}/')\n",
333
+ " db = client[database_name]\n",
334
+ " collection = db[collection_name]\n",
335
+ " return collection\n",
336
+ "\n",
337
+ "# Function to load dataset into MongoDB\n",
338
+ "def dataset_read():\n",
339
+ " train_file_path = ('C:\\\\gitProjects\\\\deneme\\\\egitim\\\\test_Egitim\\\\merged_train.parquet')\n",
340
+ " data = pd.read_parquet(train_file_path, columns=['Prompt_ID', 'Prompt', 'Response', 'Category', 'Subcategory', 'Prompt_token_length'])\n",
341
+ " data_dict = data.to_dict(\"records\")\n",
342
+ "\n",
343
+ " # Get the MongoDB collection\n",
344
+ " source_collection = get_mongodb(database_name='yeniDatabase', collection_name='test') # Collection for translation\n",
345
+ "\n",
346
+ " # Insert data into MongoDB\n",
347
+ " source_collection.insert_many(data_dict)\n",
348
+ "\n",
349
+ " print(\"Data successfully loaded into MongoDB.\")\n",
350
+ " return source_collection\n",
351
+ "\n",
352
+ "# Call the function to load the dataset into MongoDB\n",
353
+ "source_collection = dataset_read()"
354
+ ]
355
+ }
356
+ ],
357
+ "metadata": {
358
+ "kernelspec": {
359
+ "display_name": ".venv",
360
+ "language": "python",
361
+ "name": "python3"
362
+ },
363
+ "language_info": {
364
+ "codemirror_mode": {
365
+ "name": "ipython",
366
+ "version": 3
367
+ },
368
+ "file_extension": ".py",
369
+ "mimetype": "text/x-python",
370
+ "name": "python",
371
+ "nbconvert_exporter": "python",
372
+ "pygments_lexer": "ipython3",
373
+ "version": "3.11.9"
374
+ }
375
+ },
376
+ "nbformat": 4,
377
+ "nbformat_minor": 2
378
+ }
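Note: the two notebook cells above only cover the write direction (parquet → MongoDB). As a minimal sketch of the step the markdown cell announces — pulling the important columns back out of MongoDB — the collection could be read into a DataFrame as below (assuming the same local instance, database and field names; the projection is illustrative):

from pymongo import MongoClient
import pandas as pd

def fetch_columns(database_name='yeniDatabase', collection_name='train',
                  host='localhost', port=27017):
    # Connect to the same local MongoDB instance the notebook writes to.
    client = MongoClient(f'mongodb://{host}:{port}/')
    collection = client[database_name][collection_name]
    # Project only the fields of interest and drop MongoDB's internal _id.
    cursor = collection.find({}, {'_id': 0, 'Prompt': 1, 'Response': 1,
                                  'Category': 1, 'Subcategory': 1})
    return pd.DataFrame(list(cursor))

train_df = fetch_columns()  # use collection_name='test' for the test split
print(train_df.head())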
merged_train.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f808996f2ad6145efb8d94e05c4cace910ed6b0f64ac205c87d876ff43673b7
3
+ size 1271727822
model.ipynb ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "ename": "ModuleNotFoundError",
10
+ "evalue": "No module named 'bs4'",
11
+ "output_type": "error",
12
+ "traceback": [
13
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
14
+ "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
15
+ "Cell \u001b[1;32mIn[2], line 6\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpymongo\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MongoClient\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrequests\u001b[39;00m\n\u001b[1;32m----> 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mbs4\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BeautifulSoup\n",
16
+ "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'bs4'"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "import csv\n",
22
+ "import pandas as pd \n",
23
+ "from pymongo import MongoClient\n",
24
+ "\n",
25
+ "import requests\n",
26
+ "from bs4 import BeautifulSoup\n"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 2,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "# Connect to MongoDB\n",
36
+ "client = MongoClient(\"mongodb://localhost:27017/\")\n",
37
+ "db = client[\"myDatabase\"]\n",
38
+ "source_collection = db[\"data\"]"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 9,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "# Export translated data to a CSV file #bu dosyayı json olarak indirdim\n",
48
+ "\"\"\"yeni_data = list(source_collection.find())\n",
49
+ "print(yeni_data)\"\"\""
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 3,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "name": "stdout",
59
+ "output_type": "stream",
60
+ "text": [
61
+ " _id title \\\n",
62
+ "0 {'$oid': '66a1020f29abc84d21689044'} Mental Note Vol. 24 \n",
63
+ "1 {'$oid': '66a1020f29abc84d21689045'} Your Brain On Coronavirus \n",
64
+ "2 {'$oid': '66a1020f29abc84d21689046'} Mind Your Nose \n",
65
+ "3 {'$oid': '66a1020f29abc84d21689047'} The 4 Purposes of Dreams \n",
66
+ "4 {'$oid': '66a1020f29abc84d21689048'} Surviving a Rod Through the Head \n",
67
+ "\n",
68
+ " url authors \\\n",
69
+ "0 https://medium.com/invisible-illness/mental-no... ['Ryan Fan'] \n",
70
+ "1 https://medium.com/age-of-awareness/how-the-pa... ['Simon Spichak'] \n",
71
+ "2 https://medium.com/neodotlife/mind-your-nose-f... [] \n",
72
+ "3 https://medium.com/science-for-real/the-4-purp... ['Eshan Samaranayake'] \n",
73
+ "4 https://medium.com/live-your-life-on-purpose/s... ['Rishav Sinha'] \n",
74
+ "\n",
75
+ " timestamp \\\n",
76
+ "0 2020-12-26 03:38:10.479000+00:00 \n",
77
+ "1 2020-09-23 22:10:17.126000+00:00 \n",
78
+ "2 2020-10-10 20:17:37.132000+00:00 \n",
79
+ "3 2020-12-21 16:05:19.524000+00:00 \n",
80
+ "4 2020-02-26 00:01:01.576000+00:00 \n",
81
+ "\n",
82
+ " tags \n",
83
+ "0 ['Mental Health', 'Health', 'Psychology', 'Sci... \n",
84
+ "1 ['Mental Health', 'Coronavirus', 'Science', 'P... \n",
85
+ "2 ['Biotechnology', 'Neuroscience', 'Brain', 'We... \n",
86
+ "3 ['Health', 'Neuroscience', 'Mental Health', 'P... \n",
87
+ "4 ['Brain', 'Health', 'Development', 'Psychology... \n"
88
+ ]
89
+ }
90
+ ],
91
+ "source": [
92
+ "#csv dosyası olarak yüklenmesi\n",
93
+ "df=pd.read_json('myDatabase.data.json')\n",
94
+ "print(df.head())"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 4,
100
+ "metadata": {},
101
+ "outputs": [
102
+ {
103
+ "data": {
104
+ "text/html": [
105
+ "<div>\n",
106
+ "<style scoped>\n",
107
+ " .dataframe tbody tr th:only-of-type {\n",
108
+ " vertical-align: middle;\n",
109
+ " }\n",
110
+ "\n",
111
+ " .dataframe tbody tr th {\n",
112
+ " vertical-align: top;\n",
113
+ " }\n",
114
+ "\n",
115
+ " .dataframe thead th {\n",
116
+ " text-align: right;\n",
117
+ " }\n",
118
+ "</style>\n",
119
+ "<table border=\"1\" class=\"dataframe\">\n",
120
+ " <thead>\n",
121
+ " <tr style=\"text-align: right;\">\n",
122
+ " <th></th>\n",
123
+ " <th>_id</th>\n",
124
+ " <th>title</th>\n",
125
+ " <th>url</th>\n",
126
+ " <th>authors</th>\n",
127
+ " <th>timestamp</th>\n",
128
+ " <th>tags</th>\n",
129
+ " </tr>\n",
130
+ " </thead>\n",
131
+ " <tbody>\n",
132
+ " <tr>\n",
133
+ " <th>0</th>\n",
134
+ " <td>{'$oid': '66a1020f29abc84d21689044'}</td>\n",
135
+ " <td>Mental Note Vol. 24</td>\n",
136
+ " <td>https://medium.com/invisible-illness/mental-no...</td>\n",
137
+ " <td>['Ryan Fan']</td>\n",
138
+ " <td>2020-12-26 03:38:10.479000+00:00</td>\n",
139
+ " <td>['Mental Health', 'Health', 'Psychology', 'Sci...</td>\n",
140
+ " </tr>\n",
141
+ " <tr>\n",
142
+ " <th>1</th>\n",
143
+ " <td>{'$oid': '66a1020f29abc84d21689045'}</td>\n",
144
+ " <td>Your Brain On Coronavirus</td>\n",
145
+ " <td>https://medium.com/age-of-awareness/how-the-pa...</td>\n",
146
+ " <td>['Simon Spichak']</td>\n",
147
+ " <td>2020-09-23 22:10:17.126000+00:00</td>\n",
148
+ " <td>['Mental Health', 'Coronavirus', 'Science', 'P...</td>\n",
149
+ " </tr>\n",
150
+ " <tr>\n",
151
+ " <th>2</th>\n",
152
+ " <td>{'$oid': '66a1020f29abc84d21689046'}</td>\n",
153
+ " <td>Mind Your Nose</td>\n",
154
+ " <td>https://medium.com/neodotlife/mind-your-nose-f...</td>\n",
155
+ " <td>[]</td>\n",
156
+ " <td>2020-10-10 20:17:37.132000+00:00</td>\n",
157
+ " <td>['Biotechnology', 'Neuroscience', 'Brain', 'We...</td>\n",
158
+ " </tr>\n",
159
+ " <tr>\n",
160
+ " <th>3</th>\n",
161
+ " <td>{'$oid': '66a1020f29abc84d21689047'}</td>\n",
162
+ " <td>The 4 Purposes of Dreams</td>\n",
163
+ " <td>https://medium.com/science-for-real/the-4-purp...</td>\n",
164
+ " <td>['Eshan Samaranayake']</td>\n",
165
+ " <td>2020-12-21 16:05:19.524000+00:00</td>\n",
166
+ " <td>['Health', 'Neuroscience', 'Mental Health', 'P...</td>\n",
167
+ " </tr>\n",
168
+ " <tr>\n",
169
+ " <th>4</th>\n",
170
+ " <td>{'$oid': '66a1020f29abc84d21689048'}</td>\n",
171
+ " <td>Surviving a Rod Through the Head</td>\n",
172
+ " <td>https://medium.com/live-your-life-on-purpose/s...</td>\n",
173
+ " <td>['Rishav Sinha']</td>\n",
174
+ " <td>2020-02-26 00:01:01.576000+00:00</td>\n",
175
+ " <td>['Brain', 'Health', 'Development', 'Psychology...</td>\n",
176
+ " </tr>\n",
177
+ " </tbody>\n",
178
+ "</table>\n",
179
+ "</div>"
180
+ ],
181
+ "text/plain": [
182
+ " _id title \\\n",
183
+ "0 {'$oid': '66a1020f29abc84d21689044'} Mental Note Vol. 24 \n",
184
+ "1 {'$oid': '66a1020f29abc84d21689045'} Your Brain On Coronavirus \n",
185
+ "2 {'$oid': '66a1020f29abc84d21689046'} Mind Your Nose \n",
186
+ "3 {'$oid': '66a1020f29abc84d21689047'} The 4 Purposes of Dreams \n",
187
+ "4 {'$oid': '66a1020f29abc84d21689048'} Surviving a Rod Through the Head \n",
188
+ "\n",
189
+ " url authors \\\n",
190
+ "0 https://medium.com/invisible-illness/mental-no... ['Ryan Fan'] \n",
191
+ "1 https://medium.com/age-of-awareness/how-the-pa... ['Simon Spichak'] \n",
192
+ "2 https://medium.com/neodotlife/mind-your-nose-f... [] \n",
193
+ "3 https://medium.com/science-for-real/the-4-purp... ['Eshan Samaranayake'] \n",
194
+ "4 https://medium.com/live-your-life-on-purpose/s... ['Rishav Sinha'] \n",
195
+ "\n",
196
+ " timestamp \\\n",
197
+ "0 2020-12-26 03:38:10.479000+00:00 \n",
198
+ "1 2020-09-23 22:10:17.126000+00:00 \n",
199
+ "2 2020-10-10 20:17:37.132000+00:00 \n",
200
+ "3 2020-12-21 16:05:19.524000+00:00 \n",
201
+ "4 2020-02-26 00:01:01.576000+00:00 \n",
202
+ "\n",
203
+ " tags \n",
204
+ "0 ['Mental Health', 'Health', 'Psychology', 'Sci... \n",
205
+ "1 ['Mental Health', 'Coronavirus', 'Science', 'P... \n",
206
+ "2 ['Biotechnology', 'Neuroscience', 'Brain', 'We... \n",
207
+ "3 ['Health', 'Neuroscience', 'Mental Health', 'P... \n",
208
+ "4 ['Brain', 'Health', 'Development', 'Psychology... "
209
+ ]
210
+ },
211
+ "execution_count": 4,
212
+ "metadata": {},
213
+ "output_type": "execute_result"
214
+ }
215
+ ],
216
+ "source": [
217
+ "df.head()"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 5,
223
+ "metadata": {},
224
+ "outputs": [
225
+ {
226
+ "data": {
227
+ "text/plain": [
228
+ "_id object\n",
229
+ "title object\n",
230
+ "url object\n",
231
+ "authors object\n",
232
+ "timestamp object\n",
233
+ "tags object\n",
234
+ "dtype: object"
235
+ ]
236
+ },
237
+ "execution_count": 5,
238
+ "metadata": {},
239
+ "output_type": "execute_result"
240
+ }
241
+ ],
242
+ "source": [
243
+ "df.dtypes"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 8,
249
+ "metadata": {},
250
+ "outputs": [
251
+ {
252
+ "name": "stdout",
253
+ "output_type": "stream",
254
+ "text": [
255
+ "0 <class 'dict'>\n",
256
+ "Name: _id, dtype: object\n",
257
+ "0 <class 'str'>\n",
258
+ "Name: title, dtype: object\n",
259
+ "0 <class 'str'>\n",
260
+ "Name: url, dtype: object\n",
261
+ "0 <class 'str'>\n",
262
+ "Name: authors, dtype: object\n",
263
+ "0 <class 'str'>\n",
264
+ "Name: timestamp, dtype: object\n",
265
+ "0 <class 'str'>\n",
266
+ "Name: tags, dtype: object\n"
267
+ ]
268
+ }
269
+ ],
270
+ "source": [
271
+ "for i in df.columns:\n",
272
+ " print(df[i].apply(lambda x:type(x)).head(1))"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": 9,
278
+ "metadata": {},
279
+ "outputs": [],
280
+ "source": [
281
+ "#içeriklerin saklanacağı bir liste oluştrun\n",
282
+ "contents=[]\n",
283
+ "#her url için içeriği çekin \n",
284
+ "\n",
285
+ "for url in df['url']:\n",
286
+ " try:\n",
287
+ " response=requests.get(url)\n",
288
+ " soup=BeautifulSoup(response.content,'html.parser')\n",
289
+ "\n",
290
+ " #medium içeriğini çekmek için uygun seçiciyi kullanın\n",
291
+ " article_content=soup.find('articles')\n",
292
+ " content=article_content.get_text(separator='') if article_content else 'content not found'\n",
293
+ "\n",
294
+ " contents.append(content)\n",
295
+ " except Exception as e:\n",
296
+ " contents.append(f'error retrieving content: {e}')\n",
297
+ "\n",
298
+ "#içerikleri veri çerçevesine ekleyin.\n",
299
+ "df['content']= contents\n",
300
+ "\n",
301
+ "#yeni veri kümesini kontrol edin\n",
302
+ "print(df.head())"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": null,
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": [
311
+ "\n",
312
+ "#modeleğitimi için test valid değerleriğ oluşturma \n",
313
+ "\n",
314
+ "from sklearn.model_selection import train_test_split\n",
315
+ "\n",
316
+ "X_train, X_val, y_train, y_val = train_test_split(translated_data, translated_data, test_size=0.2, random_state=42)"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": null,
322
+ "metadata": {},
323
+ "outputs": [],
324
+ "source": [
325
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
326
+ "from sklearn.svm import SVC\n",
327
+ "\n",
328
+ "vectorizer = TfidfVectorizer()\n",
329
+ "X_train_transformed = vectorizer.fit_transform(X_train)\n",
330
+ "X_val_transformed = vectorizer.transform(X_val)\n",
331
+ "\n",
332
+ "model = SVC()\n",
333
+ "model.fit(X_train_transformed, y_train)"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "metadata": {},
340
+ "outputs": [],
341
+ "source": [
342
+ "from sklearn.metrics import accuracy_score\n",
343
+ "\n",
344
+ "y_pred = model.predict(X_val_transformed)\n",
345
+ "accuracy = accuracy_score(y_val, y_pred)\n",
346
+ "print(f\"Accuracy: {accuracy:.2f}\")"
347
+ ]
348
+ }
349
+ ],
350
+ "metadata": {
351
+ "kernelspec": {
352
+ "display_name": "myenv",
353
+ "language": "python",
354
+ "name": "python3"
355
+ },
356
+ "language_info": {
357
+ "codemirror_mode": {
358
+ "name": "ipython",
359
+ "version": 3
360
+ },
361
+ "file_extension": ".py",
362
+ "mimetype": "text/x-python",
363
+ "name": "python",
364
+ "nbconvert_exporter": "python",
365
+ "pygments_lexer": "ipython3",
366
+ "version": "3.11.9"
367
+ }
368
+ },
369
+ "nbformat": 4,
370
+ "nbformat_minor": 2
371
+ }
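Note: the final cells of model.ipynb split and train on `translated_data`, which is never defined in the notebook, and the same array is passed as both features and labels. A self-contained sketch of the intended TF-IDF + SVC flow could look like the following; the `content` and `label` columns are hypothetical stand-ins for whatever text/target pair is actually meant:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Hypothetical inputs: one text string and one categorical label per row.
texts = df['content'].fillna('').tolist()
labels = df['label'].tolist()

X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)  # fit the vocabulary on training text only
X_val_tfidf = vectorizer.transform(X_val)          # reuse the fitted vocabulary

model = SVC()
model.fit(X_train_tfidf, y_train)
print(f"Accuracy: {accuracy_score(y_val, model.predict(X_val_tfidf)):.2f}")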
mongoDb.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo import MongoClient
2
+ import pandas as pd
3
+
4
+
5
+
6
+ # MongoDB connection settings
7
+
8
+ def get_mongodb(database_name='myDatabase',collection_name='new',host='localhost',port=27017):
9
+ "MongoDB connection and collection selection"
10
+ client=MongoClient(f'mongodb://{host}:{port}/')
11
+ db=client[database_name]
12
+ collection=db[collection_name]
13
+ return collection
14
+
15
+
16
+
17
+ """#koleksiyonun varlığını kontrol eder.
18
+ def get_collection(self, collection_name):
19
+ #Get a collection if it exists, otherwise return None.
20
+ if self.check_collection_exists(collection_name):
21
+ return self.db[collection_name]
22
+ else:
23
+ print(f"Collection '{collection_name}' does not exist.")
24
+ return None"""
25
+
26
+ # Load the dataset into MongoDB
27
+ def dataset_read():
28
+ data = pd.read_csv(r'C:\gitProjects\medium-articles\medium_articles_no_text.csv')
29
+ data_dict=data.to_dict("records")
30
+ source_collection = get_mongodb(database_name='myDatabase', collection_name='data') # source collection used for translation
31
+ source_collection.insert_many(data_dict)
32
+ print("Records loaded into MongoDB.")
33
+ return source_collection
34
+
35
+ if __name__=='__main__':
36
+ dataset_read()
37
+
38
+
39
+
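Note: dataset_read() in mongoDb.py calls insert_many unconditionally, so every rerun of the script duplicates the whole CSV in the collection. A small guard — a sketch assuming the same database and collection names as above — keeps reruns idempotent:

from pymongo import MongoClient
import pandas as pd

def load_once(csv_path=r'C:\gitProjects\medium-articles\medium_articles_no_text.csv',
              database_name='myDatabase', collection_name='data',
              host='localhost', port=27017):
    client = MongoClient(f'mongodb://{host}:{port}/')
    collection = client[database_name][collection_name]
    # Insert only when the collection is still empty, so reruns do not duplicate records.
    if collection.estimated_document_count() == 0:
        data = pd.read_csv(csv_path)
        collection.insert_many(data.to_dict('records'))
        print('Records loaded into MongoDB.')
    else:
        print('Collection already populated; skipping load.')
    return collection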
mongoDb_2.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo import MongoClient
2
+ import pandas as pd
3
+
4
+
5
+ # MongoDB connection settings
6
+
7
+ def get_mongodb(database_name='yedekDatabase',collection_name='yeni',host='localhost',port=27017):
8
+ "MongoDB connection and collection selection"
9
+ client=MongoClient(f'mongodb://{host}:{port}/')
10
+ db=client[database_name]
11
+ collection=db[collection_name]
12
+ return collection
13
+
14
+ # Load the dataset into MongoDB
15
+ def dataset_read():
16
+ data = pd.read_csv(r'C:\gitProjects\medium-articles\medium_articles_no_text.csv')
17
+ data_dict=data.to_dict("records")
18
+ source_collection = get_mongodb(database_name='yedekDatabase', collection_name='yeni') # source collection used for translation
19
+ source_collection.insert_many(data_dict)
20
+ print("Records loaded into MongoDB.")
21
+ return source_collection
22
+
23
+ if __name__=='__main__':
24
+ dataset_read()
mongodb_egitim.py ADDED
@@ -0,0 +1 @@
 
 
1
+
pyvenv.cfg ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ home = C:\Users\info\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0
2
+ include-system-site-packages = false
3
+ version = 3.11.9
4
+ executable = C:\Users\info\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe
5
+ command = C:\Users\info\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m venv c:\gitProjects\deneme\.venv
requirements.txt CHANGED
@@ -1,2 +1,6 @@
1
- huggingface_hub==0.22.2
2
- gradio
 
 
 
 
 
1
+ gradio==4.40.0.*
2
+ pymongo==4.8.0.*
3
+ pandas==2.2.2.*
4
+ datasets==2.20.0.*
5
+ torch==2.4.0.*
6
+ transformers==4.43.4.*