yonkasoft committed on
Commit
fa8e9f4
1 Parent(s): 8e61200

Upload 2 files

Files changed (3)
  1. .gitattributes +1 -0
  2. combined.ipynb +512 -79
  3. processed_data.csv +3 -0
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  combined_output.csv filter=lfs diff=lfs merge=lfs -text
37
  combined_texts.csv filter=lfs diff=lfs merge=lfs -text
38
+ processed_data.csv filter=lfs diff=lfs merge=lfs -text
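Note: the added attribute routes processed_data.csv through Git LFS, which matches the LFS pointer file added for it at the bottom of this diff; it is the entry that `git lfs track "processed_data.csv"` would normally append to .gitattributes.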
combined.ipynb CHANGED
@@ -9,7 +9,7 @@
9
  },
10
  {
11
  "cell_type": "code",
12
- "execution_count": 4,
13
  "metadata": {},
14
  "outputs": [],
15
  "source": [
@@ -35,87 +35,392 @@
35
  "cell_type": "markdown",
36
  "metadata": {},
37
  "source": [
38
- "MongoDb'den database'in çekilmesi"
39
  ]
40
  },
41
  {
42
  "cell_type": "code",
43
- "execution_count": 8,
44
  "metadata": {},
45
  "outputs": [],
46
  "source": [
47
  "#mongodb üzerinden combined_textleri çek\n",
48
  "\n",
49
- "def mongo_db_combined_texts(database_name='combined', collection_name='combined_output', host='localhost', port=27017,batch_size=1000):\n",
50
  " client = MongoClient(f'mongodb://{host}:{port}/')\n",
51
  " db = client[database_name]\n",
52
  " collection = db[collection_name]\n",
53
  " \n",
54
  " #toplam döküman sayısını al\n",
55
  " total_documents = collection.count_documents({})\n",
56
- " batch_documents = []\n",
57
  "\n",
58
  " # Belirtilen batch_size kadar dökümanları almak için döngü\n",
59
  " for i in range(0, total_documents, batch_size):\n",
60
  " cursor = collection.find({}, {\"combined\":1, \"_id\": 0}).skip(i).limit(batch_size)\n",
61
- " combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc]\n",
62
- " batch_documents.append((combined_texts, len(combined_texts)))\n",
63
- " \n",
64
- " return batch_documents\n",
65
  "\n",
66
  "# Dökümanları ve döküman sayısını batch olarak çekin\n",
67
- "batch_documents = mongo_db_combined_texts(batch_size=1000)\n",
68
  "\n",
69
  "# Her batch'i ayrı ayrı işleyebilirsiniz\n",
70
- "for batch_index, (combined_texts, document_count) in enumerate(batch_documents):\n",
71
- " print(f\"Batch {batch_index + 1}: {document_count} documents\")\n",
72
  "\n",
73
  " "
74
  ]
75
  },
76
  {
77
- "cell_type": "markdown",
78
  "metadata": {},
79
- "source": [
80
- "Gereksiz kelimelerin 'gereksiz_kelimeler.txt' üzerinden import edilmesi"
81
- ]
82
  },
83
  {
84
  "cell_type": "code",
85
- "execution_count": 3,
86
  "metadata": {},
87
  "outputs": [],
88
  "source": [
89
- "\"\"\"\"\"\"\n",
90
- "#- burada turkish_stop_words'ü alıyoruz\n",
91
- "def load_stop_words(file_path, existing_stop_words='gereksiz_kelimeler.txt'):\n",
92
- " \"\"\"Stop words'leri dosyadan okuyarak bir liste oluşturur. \n",
93
- " Mevcut stop words'ler varsa bunları dikkate alır.\"\"\"\n",
94
  " \n",
95
- " if existing_stop_words is None:\n",
96
- " existing_stop_words = set()\n",
97
- " else:\n",
98
- " existing_stop_words = set(existing_stop_words)\n",
99
  " \n",
100
- " with open(file_path, 'r', encoding='utf-8') as file:\n",
101
- " for line in file:\n",
102
- " word = line.strip()\n",
103
- " if word and word not in existing_stop_words:\n",
104
- " existing_stop_words.add(word)\n",
105
  " \n",
106
- " return list(existing_stop_words)\n",
107
  "\n",
108
- "# Mevcut stop words'leri kontrol ederek Türkçe stop words dosyasını yükleyin\n",
109
- "stop_words_list = load_stop_words('gereksiz_kelimeler.txt')\n",
110
  "\n",
111
- "#----------------------------------------------------------------------------------------------------"
112
  ]
113
  },
114
  {
115
- "cell_type": "markdown",
116
  "metadata": {},
117
  "source": [
118
- "TF-IDF Skorları "
119
  ]
120
  },
121
  {
@@ -123,6 +428,89 @@
123
  "execution_count": null,
124
  "metadata": {},
125
  "outputs": [],
126
  "source": [
127
  "\n",
128
  "# BERT Tokenizer ve Model'i yükleyin\n",
@@ -132,17 +520,23 @@
132
  "def extract_keywords_tfidf(combined_texts, stop_words_list,top_n=5):\n",
133
  " \"\"\"TF-IDF ile anahtar kelimeleri çıkarır, stop words listesi ile birlikte kullanır.\"\"\"\n",
134
  " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
135
- " X = vectorizer.fit_transform(combined_texts)\n",
136
  " feature_names = vectorizer.get_feature_names_out()\n",
137
  " #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
138
- " top_keywords_per_document = []\n",
139
- "\n",
140
  " for row in X:\n",
141
  " tfidf_scores = row.toarray().flatten()\n",
142
  " top_indices = tfidf_scores.argsort()[-top_n:][::-1] # En yüksek n skoru bul\n",
143
  " top_keywords = [feature_names[i] for i in top_indices]\n",
144
- " top_keywords_per_document.append(top_keywords)\n",
145
  "\n",
146
  " return top_keywords_per_document\n",
147
  "\n",
148
  "# Anahtar kelimeleri çıkar ve BERT ile embedding oluştur\n",
@@ -151,7 +545,7 @@
151
  " \n",
152
  " for text in combined_texts:\n",
153
  " # Anahtar kelimeleri çıkar\n",
154
- " keywords = extract_keywords_tfidf(text, stop_words_list,top_n)\n",
155
  " \n",
156
  " # BERT ile embedding oluştur\n",
157
  " inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)\n",
@@ -165,62 +559,101 @@
165
  " 'embedding': embeddings\n",
166
  " })\n",
167
  " \n",
168
- " return results"
169
  ]
170
  },
171
  {
172
  "cell_type": "code",
173
- "execution_count": 16,
174
  "metadata": {},
175
  "outputs": [
176
  {
177
  "name": "stdout",
178
  "output_type": "stream",
179
  "text": [
180
- "Keyword: test, Similarity: 0.19360324687858618\n"
181
  ]
182
  }
183
  ],
184
  "source": [
185
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
186
- "from sklearn.metrics.pairwise import cosine_similarity\n",
187
- "\n",
188
- "def calculate_keyword_similarity(text, keywords):\n",
189
- " # Metin ve anahtar kelimelerden oluşan bir liste oluştur\n",
190
- " similarity_array = []\n",
191
  "\n",
192
- " for keyword in keywords:\n",
193
- " # Metin ve anahtar kelimeyi bir listeye ekle\n",
194
- " documents = [text, keyword]\n",
195
- " \n",
196
- " # TF-IDF matrisini oluştur\n",
197
- " vectorizer = TfidfVectorizer()\n",
198
- " tfidf_matrix = vectorizer.fit_transform(documents)\n",
199
- " \n",
200
- " # Metin vektörünü ve anahtar kelimeler vektörünü al\n",
201
- " text_vector = tfidf_matrix[0]\n",
202
- " keywords_vector = tfidf_matrix[1]\n",
203
  " \n",
204
- " # Cosine similarity ile benzerlik hesapla\n",
205
- " similarity = cosine_similarity(text_vector, keywords_vector)[0][0]\n",
206
  "\n",
207
  "\n",
208
- " similarity_array.append((keyword,similarity))\n",
209
- " \n",
210
- " return similarity_array\n",
211
- "# Örnek metin ve anahtar kelimeler\n",
212
- "#combined verileri \n",
213
- "text = \"Bu bir örnek metindir ve bu metin üzerinde anahtar kelimeleri test ediyoruz.\"\n",
214
- "keywords = [\"başka\", \"bir\", \"anahtar\", \"kelimeleri\", \"test\"]\n",
215
- "# Uygunluk skorunu hesapla\n",
216
- "similarity_results = calculate_keyword_similarity(text, keywords)\n",
217
- "top_5_keywords = sorted(similarity_results, key=lambda x: x[1], reverse=True)[:5]\n",
218
- "# Her bir anahtar kelimenin uyumluluk skorunu yazdır\n",
219
  "\n",
220
- "for keyword, similarity in top_5_keywords:\n",
221
- " print(f\"Keyword: {keyword}, Similarity: {similarity}\")\n",
222
- " #print(f\"Keyword: '{keyword}' - Relevance score: {score:.4f}\")\n",
223
- "\n"
224
  ]
225
  },
226
  {
9
  },
10
  {
11
  "cell_type": "code",
12
+ "execution_count": 3,
13
  "metadata": {},
14
  "outputs": [],
15
  "source": [
35
  "cell_type": "markdown",
36
  "metadata": {},
37
  "source": [
38
+ "Turkish stop wordslerin tanımlanması"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 4,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "\"\"\"\"\"\"\n",
48
+ "#- burada turkish_stop_words'ü alıyoruz\n",
49
+ "def load_stop_words(file_path, existing_stop_words='gereksiz_kelimeler.txt'):\n",
50
+ " \"\"\"Stop words'leri dosyadan okuyarak bir liste oluşturur. \n",
51
+ " Mevcut stop words'ler varsa bunları dikkate alır.\"\"\"\n",
52
+ " \n",
53
+ " if existing_stop_words is None:\n",
54
+ " existing_stop_words = set()\n",
55
+ " else:\n",
56
+ " existing_stop_words = set(existing_stop_words)\n",
57
+ " \n",
58
+ " with open(file_path, 'r', encoding='utf-8') as file:\n",
59
+ " for line in file:\n",
60
+ " word = line.strip()\n",
61
+ " if word and word not in existing_stop_words:\n",
62
+ " existing_stop_words.add(word)\n",
63
+ " \n",
64
+ " return list(existing_stop_words)\n",
65
+ "\n",
66
+ "# Mevcut stop words'leri kontrol ederek Türkçe stop words dosyasını yükleyin\n",
67
+ "stop_words_list = load_stop_words('gereksiz_kelimeler.txt')\n",
68
+ "\n",
69
+ "#----------------------------------------------------------------------------------------------------"
70
+ ]
71
+ },
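Note on the cell above: because the default is the string 'gereksiz_kelimeler.txt', `set(existing_stop_words)` builds a set of that filename's characters rather than of words. A minimal sketch with the default set to None, assuming "no pre-existing stop words" was the intent:

# Minimal sketch, assuming the intended default is "no pre-existing stop words".
def load_stop_words(file_path, existing_stop_words=None):
    """Read stop words from a file, merging with any existing collection."""
    stop_words = set(existing_stop_words) if existing_stop_words else set()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            word = line.strip()
            if word:
                stop_words.add(word)
    return list(stop_words)

stop_words_list = load_stop_words('gereksiz_kelimeler.txt')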
72
+ {
73
+ "cell_type": "markdown",
74
+ "metadata": {},
75
+ "source": [
76
+ "MongoDb'deki combined_text koleksiyonunun verilerini csv ye çekme "
77
  ]
78
  },
79
  {
80
  "cell_type": "code",
81
+ "execution_count": null,
82
  "metadata": {},
83
  "outputs": [],
84
  "source": [
85
  "#mongodb üzerinden combined_textleri çek\n",
86
+ "import csv\n",
87
+ "from pymongo import MongoClient\n",
88
  "\n",
89
+ "def mongo_db_combined_texts_to_csv(database_name='combined_text', collection_name='text', host='localhost', port=27017,batch_size=100,output_file='combined_texts.csv'):\n",
90
  " client = MongoClient(f'mongodb://{host}:{port}/')\n",
91
  " db = client[database_name]\n",
92
  " collection = db[collection_name]\n",
93
  " \n",
94
  " #toplam döküman sayısını al\n",
95
  " total_documents = collection.count_documents({})\n",
96
+ " #batch_documents = []\n",
97
+ "\n",
98
+ " # CSV dosyasını aç ve yazmaya hazırla\n",
99
+ " with open(output_file, mode='w', newline='', encoding='utf-8') as file:\n",
100
+ " writer = csv.writer(file)\n",
101
+ " writer.writerow([\"combined\"]) # CSV başlığı\n",
102
  "\n",
103
  " # Belirtilen batch_size kadar dökümanları almak için döngü\n",
104
  " for i in range(0, total_documents, batch_size):\n",
105
  " cursor = collection.find({}, {\"combined\":1, \"_id\": 0}).skip(i).limit(batch_size)\n",
106
+ " combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc] #combined sütununa ilişkin verileri çeker \n",
107
+ "\n",
108
+ " # Batch verilerini CSV'ye yaz\n",
109
+ " with open(output_file, mode='a', newline='', encoding='utf-8') as file:\n",
110
+ " writer = csv.writer(file)\n",
111
+ " \n",
112
+ " for text in combined_texts:\n",
113
+ " writer.writerow([text])\n",
114
+ " \n",
115
+ " \n",
116
+ "\n",
117
+ " print(f\"combined metinler '{output_file}' dosyasına başarıyla yazıldı.\")\n",
118
+ "\n",
119
+ "# Dökümanları CSV dosyasına yazdır\n",
120
+ "text=mongo_db_combined_texts_to_csv(batch_size=100)\n",
121
+ " #batch_documents.extend((combined_texts, len(combined_texts)))\n",
122
+ " #append fonksiyonu listenin içerisine tek bir eleman gibi ekler yani list1 = [1, 2, 3, [4, 5]]\n",
123
+ " #fakat extend fonksiyonu list1 = [1, 2, 3, 4, 5] bir listeye yeni bir liste eklemeyi teker teker gerçekleştirir.\n",
124
+ " #return batch_documents\n",
125
  "\n",
126
  "# Dökümanları ve döküman sayısını batch olarak çekin\n",
127
+ "#combined_texts = mongo_db_combined_texts(batch_size=1000)\n",
128
  "\n",
129
  "# Her batch'i ayrı ayrı işleyebilirsiniz\n",
130
+ "#print(f\"Toplam döküman sayısı:{len(combined_texts)}\")\n",
131
+ "\n",
132
+ "#for index, text in enumerate (combined_texts[:10]):\n",
133
+ " #print(f\"Döküman {index + 1}: {text}\")\n",
134
+ "\n",
135
+ "#print(combined_texts)\n",
136
  "\n",
137
  " "
138
  ]
139
  },
140
  {
141
+ "cell_type": "code",
142
+ "execution_count": null,
143
  "metadata": {},
144
+ "outputs": [],
145
+ "source": []
146
  },
147
  {
148
  "cell_type": "code",
149
+ "execution_count": 11,
150
  "metadata": {},
151
  "outputs": [],
152
  "source": [
153
+ "import csv\n",
154
+ "from pymongo import MongoClient\n",
155
+ "import pandas as pd\n",
156
+ "\n",
157
+ "def fetch_from_database(database_name='combined_text', collection_name='text', host='localhost', port=27017, batch_size=100):\n",
158
+ " client = MongoClient(f'mongodb://{host}:{port}/')\n",
159
+ " db = client[database_name]\n",
160
+ " collection = db[collection_name]\n",
161
  " \n",
162
+ " # Toplam döküman sayısını al\n",
163
+ " total_documents = collection.count_documents({})\n",
164
+ " combined_texts = []\n",
165
+ "\n",
166
+ " # Belirtilen batch_size kadar dökümanları almak için döngü\n",
167
+ " for i in range(0, total_documents, batch_size):\n",
168
+ " cursor = collection.find({}, {\"combined\": 1, \"_id\": 0}).skip(i).limit(batch_size)\n",
169
+ " combined_texts.extend([doc['combined'] for doc in cursor if 'combined' in doc]) # combined sütununa ilişkin verileri çeker \n",
170
+ "\n",
171
+ " return combined_texts\n",
172
+ "\n",
173
+ "# Metinleri kısaltma fonksiyonu\n",
174
+ "def truncate_text(text, max_words=300):\n",
175
+ " words = text.split() # Metni kelimelere böler\n",
176
+ " return ' '.join(words[:max_words]) # İlk max_words kadar kelimeyi alır\n",
177
+ "\n",
178
+ "# Veritabanından veri çekme ve kısaltma\n",
179
+ "def fetch_and_truncate_data(database_name, collection_name, host, port, max_words=300):\n",
180
+ " # Veritabanından veri çekme\n",
181
+ " combined_texts = fetch_from_database(database_name, collection_name, host, port)\n",
182
  " \n",
183
+ " # Metinleri kısaltma\n",
184
+ " truncated_texts = [truncate_text(text, max_words) for text in combined_texts]\n",
185
  " \n",
186
+ " return truncated_texts\n",
187
+ "\n",
188
+ "# Kısaltılmış veriyi CSV'ye kaydetme\n",
189
+ "def save_to_csv(data, file_path):\n",
190
+ " df = pd.DataFrame(data, columns=['combined'])\n",
191
+ " df.to_csv(file_path, encoding='utf-8', index=False)\n",
192
+ "\n",
193
+ "# Doğru değişken tanımlamaları\n",
194
+ "database_name = 'combined_text'\n",
195
+ "collection_name = 'text'\n",
196
+ "host = 'localhost'\n",
197
+ "port = 27017\n",
198
+ "batch_size = 100\n",
199
+ "max_words = 300\n",
200
+ "output_file = 'processed_data.csv'\n",
201
+ "\n",
202
+ "# Veriyi çekme ve işleme\n",
203
+ "truncated_texts = fetch_and_truncate_data(database_name, collection_name, host, port, max_words)\n",
204
+ "save_to_csv(truncated_texts, output_file)\n"
205
+ ]
206
+ },
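A quick sanity check for the truncation step defined in the cell above (assumes truncate_text is in scope):

sample = "kelime " * 500                                   # 500-word dummy text
print(len(truncate_text(sample, max_words=300).split()))  # expected: 300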
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": []
213
+ },
214
+ {
215
+ "cell_type": "markdown",
216
+ "metadata": {},
217
+ "source": [
218
+ "Tf-Idf ile keywordsleri alma "
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 11,
224
+ "metadata": {},
225
+ "outputs": [
226
+ {
227
+ "name": "stderr",
228
+ "output_type": "stream",
229
+ "text": [
230
+ "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:406: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['leh'] not in stop_words.\n",
231
+ " warnings.warn(\n"
232
+ ]
233
+ },
234
+ {
235
+ "ename": "KeyboardInterrupt",
236
+ "evalue": "",
237
+ "output_type": "error",
238
+ "traceback": [
239
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
240
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
241
+ "Cell \u001b[1;32mIn[11], line 33\u001b[0m\n\u001b[0;32m 30\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m top_keywords_per_document, top_tfidf_scores_per_document\n\u001b[0;32m 32\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeleri çıkar ve sonuçları al\u001b[39;00m\n\u001b[1;32m---> 33\u001b[0m top_keywords_per_document, top_tfidf_scores_per_document \u001b[38;5;241m=\u001b[39m \u001b[43mextract_keywords_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcombined\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_words_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_n\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 36\u001b[0m \u001b[38;5;66;03m# Sonuçları görüntüleme\u001b[39;00m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, (keywords, scores) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mzip\u001b[39m(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
242
+ "Cell \u001b[1;32mIn[11], line 21\u001b[0m, in \u001b[0;36mextract_keywords_tfidf\u001b[1;34m(combined, stop_words_list, top_n)\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m X:\n\u001b[0;32m 20\u001b[0m tfidf_scores \u001b[38;5;241m=\u001b[39m row\u001b[38;5;241m.\u001b[39mtoarray()\u001b[38;5;241m.\u001b[39mflatten() \u001b[38;5;66;03m#değişkenleri düz bir değişken haline getirme\u001b[39;00m\n\u001b[1;32m---> 21\u001b[0m top_indices \u001b[38;5;241m=\u001b[39m \u001b[43mtfidf_scores\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margsort\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m-\u001b[39mtop_n:][::\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m] \u001b[38;5;66;03m# En yüksek n skoru bul\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m#en yüksek skorlu kelimleri ve skorları bul\u001b[39;00m\n\u001b[0;32m 24\u001b[0m top_keywords \u001b[38;5;241m=\u001b[39m [feature_names[i] \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m top_indices]\n",
243
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
244
+ ]
245
+ }
246
+ ],
247
+ "source": [
248
+ "import csv\n",
249
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
250
+ "from joblib import Parallel, delayed\n",
251
+ "import pandas as pd\n",
252
  "\n",
253
+ "df=pd.read_csv('combined_texts.csv')\n",
254
+ "combined= df['combined'].tolist()\n",
255
+ "def extract_keywords_tfidf(combined, stop_words_list,top_n=10):\n",
256
+ " \"\"\"TF-IDF ile anahtar kelimeleri çıkarır, stop words listesi ile birlikte kullanır.\"\"\"\n",
257
+ " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
258
+ " X = vectorizer.fit_transform(combined) #bunu csv den oku \n",
259
+ " feature_names = vectorizer.get_feature_names_out() #her kelimenin tf-ıdf vektöründeki karşılığını tutar \n",
260
+ " #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
261
+ " \n",
262
+ " top_keywords_per_document = [] #her döküman için en iyi keywordsleri alır\n",
263
+ " top_tfidf_scores_per_document = [] #tf-ıdf değeri en yüksek olan dökümanlar\n",
264
  "\n",
265
+ " # Her dökümanı işleme\n",
266
+ " for row in X:\n",
267
+ " tfidf_scores = row.toarray().flatten() #değişkenleri düz bir değişken haline getirme\n",
268
+ " top_indices = tfidf_scores.argsort()[-top_n:][::-1] # En yüksek n skoru bul\n",
269
+ " \n",
270
+ " #en yüksek skorlu kelimleri ve skorları bul\n",
271
+ " top_keywords = [feature_names[i] for i in top_indices]\n",
272
+ " top_tfidf_scores = [tfidf_scores[i] for i in top_indices]\n",
273
+ " \n",
274
+ " top_keywords_per_document.append(top_keywords)\n",
275
+ " top_tfidf_scores_per_document.append(top_tfidf_scores)\n",
276
+ " \n",
277
+ " return top_keywords_per_document, top_tfidf_scores_per_document\n",
278
+ "\n",
279
+ "# Anahtar kelimeleri çıkar ve sonuçları al\n",
280
+ "top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10)\n",
281
+ " \n",
282
+ "\n",
283
+ "# Sonuçları görüntüleme\n",
284
+ "for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
285
+ " print(f\"Döküman {i+1}:\")\n",
286
+ " for keyword, score in zip(keywords, scores):\n",
287
+ " print(f\"{keyword}: {score:.4f}\")\n",
288
+ " print(\"\\n\")\n"
289
  ]
290
  },
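The interrupted run above densifies every row with toarray() inside the loop. A sketch that reads the top-n terms straight from the sparse CSR rows instead, assuming X and feature_names come from the TfidfVectorizer in that cell:

import numpy as np

def top_n_from_sparse(X, feature_names, top_n=10):
    keywords_per_doc, scores_per_doc = [], []
    for i in range(X.shape[0]):
        row = X.getrow(i)                           # stays sparse
        order = np.argsort(row.data)[::-1][:top_n]  # positions of the largest nonzero scores
        cols = row.indices[order]                   # map back to vocabulary indices
        keywords_per_doc.append([feature_names[j] for j in cols])
        scores_per_doc.append(row.data[order].tolist())
    return keywords_per_doc, scores_per_doc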
291
  {
292
+ "cell_type": "code",
293
+ "execution_count": 5,
294
  "metadata": {},
295
+ "outputs": [
296
+ {
297
+ "name": "stderr",
298
+ "output_type": "stream",
299
+ "text": [
300
+ "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:406: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['leh'] not in stop_words.\n",
301
+ " warnings.warn(\n"
302
+ ]
303
+ },
304
+ {
305
+ "ename": "KeyboardInterrupt",
306
+ "evalue": "",
307
+ "output_type": "error",
308
+ "traceback": [
309
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
310
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
311
+ "Cell \u001b[1;32mIn[5], line 53\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m top_keywords_per_document, top_tfidf_scores_per_document\n\u001b[0;32m 52\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeleri çıkar ve sonuçları al\u001b[39;00m\n\u001b[1;32m---> 53\u001b[0m top_keywords_per_document, top_tfidf_scores_per_document \u001b[38;5;241m=\u001b[39m \u001b[43mextract_keywords_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcombined\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_words_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_n\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;66;03m# Sonuçları görüntüleme\u001b[39;00m\n\u001b[0;32m 56\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, (keywords, scores) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mzip\u001b[39m(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
312
+ "Cell \u001b[1;32mIn[5], line 45\u001b[0m, in \u001b[0;36mextract_keywords_tfidf\u001b[1;34m(combined, stop_words_list, top_n, n_jobs)\u001b[0m\n\u001b[0;32m 42\u001b[0m top_tfidf_scores \u001b[38;5;241m=\u001b[39m [tfidf_scores[i] \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m top_indices]\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m top_keywords, top_tfidf_scores\n\u001b[1;32m---> 45\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mParallel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_row\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrow\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 47\u001b[0m \u001b[38;5;66;03m# Sonuçları listelere ayırma\u001b[39;00m\n\u001b[0;32m 48\u001b[0m top_keywords_per_document, top_tfidf_scores_per_document \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;241m*\u001b[39mresults)\n",
313
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\joblib\\parallel.py:2007\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 2001\u001b[0m \u001b[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[0;32m 2002\u001b[0m \u001b[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001b[39;00m\n\u001b[0;32m 2003\u001b[0m \u001b[38;5;66;03m# reaches the first `yield` statement. This starts the asynchronous\u001b[39;00m\n\u001b[0;32m 2004\u001b[0m \u001b[38;5;66;03m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[0;32m 2005\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 2007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n",
314
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\joblib\\parallel.py:1650\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[1;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[0;32m 1647\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[0;32m 1649\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backend\u001b[38;5;241m.\u001b[39mretrieval_context():\n\u001b[1;32m-> 1650\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retrieve()\n\u001b[0;32m 1652\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mGeneratorExit\u001b[39;00m:\n\u001b[0;32m 1653\u001b[0m \u001b[38;5;66;03m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[0;32m 1654\u001b[0m \u001b[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[0;32m 1655\u001b[0m \u001b[38;5;66;03m# the user if necessary.\u001b[39;00m\n\u001b[0;32m 1656\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
315
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\joblib\\parallel.py:1762\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1757\u001b[0m \u001b[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[0;32m 1758\u001b[0m \u001b[38;5;66;03m# async callbacks to progress.\u001b[39;00m\n\u001b[0;32m 1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ((\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m\n\u001b[0;32m 1760\u001b[0m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mget_status(\n\u001b[0;32m 1761\u001b[0m timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtimeout) \u001b[38;5;241m==\u001b[39m TASK_PENDING)):\n\u001b[1;32m-> 1762\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0.01\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1763\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m 1765\u001b[0m \u001b[38;5;66;03m# We need to be careful: the job list can be filling up as\u001b[39;00m\n\u001b[0;32m 1766\u001b[0m \u001b[38;5;66;03m# we empty it and Python list are not thread-safe by\u001b[39;00m\n\u001b[0;32m 1767\u001b[0m \u001b[38;5;66;03m# default hence the use of the lock\u001b[39;00m\n",
316
+ "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
317
+ ]
318
+ }
319
+ ],
320
  "source": [
321
+ "import re \n",
322
+ "import pandas as pd\n",
323
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
324
+ "from joblib import Parallel, delayed\n",
325
+ "\n",
326
+ "\n",
327
+ "# CSV dosyasını okuma\n",
328
+ "df = pd.read_csv('combined_texts.csv')\n",
329
+ "combined = df['combined'].tolist()\n",
330
+ "\n",
331
+ "\n",
332
+ "def pad_or_truncate(texts,max_lenght):\n",
333
+ " \"metinleri belirli bir uzunluğua kısaltır ve padler\"\n",
334
+ " padded_texts=[]\n",
335
+ " for text in texts:\n",
336
+ " words= text.split()\n",
337
+ " if len(words)> max_lenght:\n",
338
+ " padded_texts.append(''.join(words[:max_lenght]))\n",
339
+ " else:\n",
340
+ " padded_texts.append(' '.join(words + [''] * (max_length - len(words))))\n",
341
+ " return padded_texts\n",
342
+ "\n",
343
+ "# Padding uzunluğu\n",
344
+ "max_length = 300 # Örneğin, metin uzunluğunu 300 kelimeyle sınırlandırma\n",
345
+ "\n",
346
+ "# Metinleri pad etme veya kısaltma\n",
347
+ "combined_padded = pad_or_truncate(combined, max_length)\n",
348
+ "\n",
349
+ "def parse_text(text):\n",
350
+ " \"\"\"Verilen metni ayrıştırarak düzenli bir yapıya dönüştürür.\"\"\"\n",
351
+ " # Satırları ayır\n",
352
+ " lines = text.split('|-')\n",
353
+ " \n",
354
+ " data = []\n",
355
+ " for line in lines:\n",
356
+ " line = line.strip()\n",
357
+ " if not line or line.startswith(\"align\"):\n",
358
+ " continue\n",
359
+ "\n",
360
+ " # Satırı parçalara ayır\n",
361
+ " parts = re.split(r'\\s*\\|\\s*', line) #satırları nasıl parçalara ayırır ??\n",
362
+ " \n",
363
+ " # Verileri temizle ve yapıyı oluştur\n",
364
+ " if len(parts) >= 2: # season ve team neler ve neden değişken olarak tanadı.\n",
365
+ " season = parts[0].strip()\n",
366
+ " team = parts[1].strip()\n",
367
+ " stats = [p.strip() for p in parts[2:] if p.strip()]\n",
368
+ " data.append([season, team] + stats)\n",
369
+ "\n",
370
+ " return data \n",
371
+ "\n",
372
+ "def clean_data(file_path):\n",
373
+ " \"\"\"CSV dosyasını okur ve veriyi düzenler.\"\"\"\n",
374
+ " with open(file_path, 'r') as file:\n",
375
+ " raw_text = file.read()\n",
376
+ " \n",
377
+ " data = parse_text(raw_text)\n",
378
+ " \n",
379
+ " # Veri çerçevesi oluştur\n",
380
+ " df = pd.DataFrame(data, columns=['kaynakça'])\n",
381
+ " \n",
382
+ " return df\n",
383
+ "\n",
384
+ "# CSV dosyasını temizleyip düzenli bir DataFrame oluştur\n",
385
+ "cleaned_df = clean_data('combined_texts.csv')\n",
386
+ "\n",
387
+ "# Düzenlenmiş veriyi kontrol et\n",
388
+ "print(cleaned_df.head())\n",
389
+ "\n",
390
+ "def extract_keywords_tfidf(combined, stop_words_list, top_n=10, n_jobs=-1):\n",
391
+ " \"\"\"TF-IDF ile anahtar kelimeleri çıkarır, stop words listesi ile birlikte kullanır ve paralel işlem yapar.\"\"\"\n",
392
+ " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
393
+ " \n",
394
+ " # TF-IDF matrisini oluşturma (CPU kullanımını optimize etmek için n_jobs kullanılır)\n",
395
+ " X = vectorizer.fit_transform(combined) # bunu csv'den oku\n",
396
+ " feature_names = vectorizer.get_feature_names_out() # Her kelimenin tf-idf vektöründeki karşılığını tutar\n",
397
+ "\n",
398
+ " # Her döküman için en iyi keywords'leri ve tf-idf değerlerini paralel işlemeyle bulma\n",
399
+ " def process_row(row):\n",
400
+ " tfidf_scores = row.toarray().flatten() # Düz bir değişken haline getirme\n",
401
+ " top_indices = tfidf_scores.argsort()[-top_n:][::-1] # En yüksek n skoru bul\n",
402
+ " \n",
403
+ " # En yüksek skorlu kelimeleri ve skorları bul\n",
404
+ " top_keywords = [feature_names[i] for i in top_indices]\n",
405
+ " top_tfidf_scores = [tfidf_scores[i] for i in top_indices]\n",
406
+ " return top_keywords, top_tfidf_scores\n",
407
+ "\n",
408
+ " results = Parallel(n_jobs=n_jobs)(delayed(process_row)(row) for row in X)\n",
409
+ "\n",
410
+ " # Sonuçları listelere ayırma\n",
411
+ " top_keywords_per_document, top_tfidf_scores_per_document = zip(*results)\n",
412
+ "\n",
413
+ " return top_keywords_per_document, top_tfidf_scores_per_document\n",
414
+ "\n",
415
+ "# Anahtar kelimeleri çıkar ve sonuçları al\n",
416
+ "top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10, n_jobs=-1)\n",
417
+ "\n",
418
+ "# Sonuçları görüntüleme\n",
419
+ "for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
420
+ " print(f\"Döküman {i+1}:\")\n",
421
+ " for keyword, score in zip(keywords, scores):\n",
422
+ " print(f\"{keyword}: {score:.4f}\")\n",
423
+ " print(\"\\n\")\n"
424
  ]
425
  },
426
  {
428
  "execution_count": null,
429
  "metadata": {},
430
  "outputs": [],
431
+ "source": [
432
+ "from sentence_transformers import SentenceTransformer\n",
433
+ "\n",
434
+ "model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')\n",
435
+ "\n",
436
+ "# Top_keywords embedding\n",
437
+ "keyword_embeddings = model.encode(top_keywords_per_document)\n"
438
+ ]
439
+ },
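Note on the cell above: top_keywords_per_document is a list of keyword lists, while SentenceTransformer.encode expects strings, so each document's keywords likely need to be joined first. A sketch assuming one embedding per document is the intent:

keyword_sentences = [' '.join(keywords) for keywords in top_keywords_per_document]
keyword_embeddings = model.encode(keyword_sentences)
print(keyword_embeddings.shape)  # (number_of_documents, embedding_dimension)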
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 3,
443
+ "metadata": {},
444
+ "outputs": [
445
+ {
446
+ "name": "stdout",
447
+ "output_type": "stream",
448
+ "text": [
449
+ "Keyword: bir, Similarity: 0.26726124191242445\n",
450
+ "Keyword: anahtar, Similarity: 0.26726124191242445\n",
451
+ "Keyword: kelimeleri, Similarity: 0.26726124191242445\n",
452
+ "Keyword: test, Similarity: 0.26726124191242445\n",
453
+ "Keyword: başka, Similarity: 0.0\n"
454
+ ]
455
+ }
456
+ ],
457
+ "source": [
458
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
459
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
460
+ "\n",
461
+ "def calculate_keyword_similarity(text, keywords):\n",
462
+ " # TF-IDF matrisini oluştur\n",
463
+ " tfidf_vectorizer = TfidfVectorizer()\n",
464
+ "\n",
465
+ " #texti ve anahtar kelimeleri tf-ıdf vektörlerine dönüştür\n",
466
+ " text_tfidf = tfidf_vectorizer.fit_transform([text]) #burayı combined sütunundan almalıyım\n",
467
+ " #benzerlik hesaplama \n",
468
+ " similarity_array = []\n",
469
+ " for keyword in keywords:\n",
470
+ " # Her bir anahtar kelimeyi TF-IDF vektörüne dönüştür\n",
471
+ " keyword_tfidf = tfidf_vectorizer.transform([keyword]) #keywordleri teker teker alma fonksiyonu\n",
472
+ " \n",
473
+ " # Cosine similarity ile benzerlik hesapla\n",
474
+ " similarity = cosine_similarity(text_tfidf, keyword_tfidf)[0][0]\n",
475
+ " \n",
476
+ " # Anahtar kelime ve benzerlik skorunu kaydet\n",
477
+ " similarity_array.append((keyword, similarity))\n",
478
+ " \n",
479
+ " return similarity_array\n",
480
+ " \n",
481
+ "\n",
482
+ "# Örnek metin ve anahtar kelimeler\n",
483
+ "#combined verileri \n",
484
+ "text = \"Bu bir örnek metindir ve bu metin üzerinde anahtar kelimeleri test ediyoruz.\"\n",
485
+ "keywords = [\"başka\", \"bir\", \"anahtar\", \"kelimeleri\", \"test\"] #bu keywordsler tf-değerinden alınarak arraylere çevrilmeli \n",
486
+ " \n",
487
+ "# Uygunluk skorunu hesapla\n",
488
+ "similarity_results = calculate_keyword_similarity(text, keywords)\n",
489
+ "top_5_keywords = sorted(similarity_results, key=lambda x: x[1], reverse=True)[:5]\n",
490
+ "# Her bir anahtar kelimenin uyumluluk skorunu yazdır\n",
491
+ "\n",
492
+ "for keyword, similarity in top_5_keywords:\n",
493
+ " print(f\"Keyword: {keyword}, Similarity: {similarity}\")\n",
494
+ " #print(f\"Keyword: '{keyword}' - Relevance score: {score:.4f}\")\n",
495
+ "\n"
496
+ ]
497
+ },
498
+ {
499
+ "cell_type": "code",
500
+ "execution_count": 10,
501
+ "metadata": {},
502
+ "outputs": [
503
+ {
504
+ "data": {
505
+ "text/plain": [
506
+ "<function __main__.process_texts(combined_texts, stop_words_list, top_n)>"
507
+ ]
508
+ },
509
+ "execution_count": 10,
510
+ "metadata": {},
511
+ "output_type": "execute_result"
512
+ }
513
+ ],
514
  "source": [
515
  "\n",
516
  "# BERT Tokenizer ve Model'i yükleyin\n",
520
  "def extract_keywords_tfidf(combined_texts, stop_words_list,top_n=5):\n",
521
  " \"\"\"TF-IDF ile anahtar kelimeleri çıkarır, stop words listesi ile birlikte kullanır.\"\"\"\n",
522
  " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
523
+ " X = vectorizer.fit_transform(combined_texts) #bunu csv den oku \n",
524
  " feature_names = vectorizer.get_feature_names_out()\n",
525
  " #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
526
+ " \n",
527
+ " top_keywords_per_document = [] #her döküman için en iyi keywordsleri alır\n",
528
+ " top_tfidf_scores_per_document = [] #tf-ıdf değeri en yüksek olan dökümanlar\n",
529
+ "#------------------------------------------------------------------------------------------\n",
530
  " for row in X:\n",
531
  " tfidf_scores = row.toarray().flatten()\n",
532
  " top_indices = tfidf_scores.argsort()[-top_n:][::-1] # En yüksek n skoru bul\n",
533
  " top_keywords = [feature_names[i] for i in top_indices]\n",
534
+ " top_tfidf_scores_per_document = [tfidf_scores[i] for i in top_indices]\n",
535
+ "\n",
536
  "\n",
537
+ " top_tfidf_scores_per_document.append(top_tfidf_scores)\n",
538
+ " top_keywords_per_document.append(top_keywords)\n",
539
+ " \n",
540
  " return top_keywords_per_document\n",
541
  "\n",
542
  "# Anahtar kelimeleri çıkar ve BERT ile embedding oluştur\n",
545
  " \n",
546
  " for text in combined_texts:\n",
547
  " # Anahtar kelimeleri çıkar\n",
548
+ " keywords = extract_keywords_tfidf(text, stop_words_list,top_n=15)\n",
549
  " \n",
550
  " # BERT ile embedding oluştur\n",
551
  " inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)\n",
559
  " 'embedding': embeddings\n",
560
  " })\n",
561
  " \n",
562
+ " return results\n",
563
+ "\n",
564
+ "results=process_texts\n",
565
+ "results\n",
566
+ "#tüm metinleri işle\n"
567
+ ]
568
+ },
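The execute_result above shows <function __main__.process_texts(...)> because `results = process_texts` binds the function object without calling it. A sketch of the intended call, using the argument names from that signature (the actual text list and stop-word list still have to be supplied by the notebook):

results = process_texts(combined_texts, stop_words_list, top_n=15)
print(len(results), "documents with keywords and embeddings")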
569
+ {
570
+ "cell_type": "markdown",
571
+ "metadata": {},
572
+ "source": [
573
+ "MongoDb'den database'in çekilmesi"
574
  ]
575
  },
576
  {
577
  "cell_type": "code",
578
+ "execution_count": 5,
579
  "metadata": {},
580
  "outputs": [
581
  {
582
  "name": "stdout",
583
  "output_type": "stream",
584
  "text": [
585
+ "combined metinler 'combined_texts.csv' dosyasına başarıyla yazıldı.\n"
586
  ]
587
  }
588
  ],
589
  "source": [
590
+ "#mongodb üzerinden combined_textleri çek\n",
591
+ "import csv\n",
592
+ "from pymongo import MongoClient\n",
593
  "\n",
594
+ "def mongo_db_combined_texts_to_csv(database_name='combined_text', collection_name='text', host='localhost', port=27017,batch_size=1000,output_file='combined_texts.csv'):\n",
595
+ " client = MongoClient(f'mongodb://{host}:{port}/')\n",
596
+ " db = client[database_name]\n",
597
+ " collection = db[collection_name]\n",
598
  " \n",
599
+ " #toplam döküman sayısını al\n",
600
+ " total_documents = collection.count_documents({})\n",
601
+ " #batch_documents = []\n",
602
  "\n",
603
+ " # CSV dosyasını aç ve yazmaya hazırla\n",
604
+ " with open(output_file, mode='w', newline='', encoding='utf-8') as file:\n",
605
+ " writer = csv.writer(file)\n",
606
+ " writer.writerow([\"combined\"]) # CSV başlığı\n",
607
  "\n",
608
+ " # Belirtilen batch_size kadar dökümanları almak için döngü\n",
609
+ " for i in range(0, total_documents, batch_size):\n",
610
+ " cursor = collection.find({}, {\"combined\":1, \"_id\": 0}).skip(i).limit(batch_size)\n",
611
+ " combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc] #combined sütununa ilişkin verileri çeker \n",
612
+ "\n",
613
+ " # Batch verilerini CSV'ye yaz\n",
614
+ " with open(output_file, mode='a', newline='', encoding='utf-8') as file:\n",
615
+ " writer = csv.writer(file)\n",
616
+ " \n",
617
+ " for text in combined_texts:\n",
618
+ " writer.writerow([text])\n",
619
+ " \n",
620
+ " \n",
621
  "\n",
622
+ " print(f\"combined metinler '{output_file}' dosyasına başarıyla yazıldı.\")\n",
623
+ "\n",
624
+ "# Dökümanları CSV dosyasına yazdır\n",
625
+ "text=mongo_db_combined_texts_to_csv(batch_size=1000)\n",
626
+ " #batch_documents.extend((combined_texts, len(combined_texts)))\n",
627
+ " #append fonksiyonu listenin içerisine tek bir eleman gibi ekler yani list1 = [1, 2, 3, [4, 5]]\n",
628
+ " #fakat extend fonksiyonu list1 = [1, 2, 3, 4, 5] bir listeye yeni bir liste eklemeyi teker teker gerçekleştirir.\n",
629
+ " #return batch_documents\n",
630
+ "\n",
631
+ "# Dökümanları ve döküman sayısını batch olarak çekin\n",
632
+ "#combined_texts = mongo_db_combined_texts(batch_size=1000)\n",
633
+ "\n",
634
+ "# Her batch'i ayrı ayrı işleyebilirsiniz\n",
635
+ "#print(f\"Toplam döküman sayısı:{len(combined_texts)}\")\n",
636
+ "\n",
637
+ "#for index, text in enumerate (combined_texts[:10]):\n",
638
+ " #print(f\"Döküman {index + 1}: {text}\")\n",
639
+ "\n",
640
+ "#print(combined_texts)\n",
641
+ "\n",
642
+ " "
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "markdown",
647
+ "metadata": {},
648
+ "source": [
649
+ "Gereksiz kelimelerin 'gereksiz_kelimeler.txt' üzerinden import edilmesi"
650
+ ]
651
+ },
652
+ {
653
+ "cell_type": "markdown",
654
+ "metadata": {},
655
+ "source": [
656
+ "TF-IDF Skorları "
657
  ]
658
  },
659
  {
processed_data.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97993e92396f03aa90162dad808bbd3c655a988b37d7ba45704b0058371c6172
3
+ size 419458630