Upload 2 files
- .gitattributes +1 -0
- combined.ipynb +512 -79
- processed_data.csv +3 -0
.gitattributes
CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35   *tfevents* filter=lfs diff=lfs merge=lfs -text
36   combined_output.csv filter=lfs diff=lfs merge=lfs -text
37   combined_texts.csv filter=lfs diff=lfs merge=lfs -text
38 + processed_data.csv filter=lfs diff=lfs merge=lfs -text
combined.ipynb
CHANGED
@@ -9,7 +9,7 @@
9    },
10   {
11   "cell_type": "code",
12 - "execution_count":
13   "metadata": {},
14   "outputs": [],
15   "source": [
@@ -35,87 +35,392 @@
35   "cell_type": "markdown",
36   "metadata": {},
37   "source": [
38 - "
39   ]
40   },
41   {
42   "cell_type": "code",
43 - "execution_count":
44   "metadata": {},
45   "outputs": [],
46   "source": [
47   "# fetch the combined texts from MongoDB\n",
48   "\n",
49 - "def
50   " client = MongoClient(f'mongodb://{host}:{port}/')\n",
51   " db = client[database_name]\n",
52   " collection = db[collection_name]\n",
53   " \n",
54   " # get the total document count\n",
55   " total_documents = collection.count_documents({})\n",
56 - " batch_documents = []\n",
57   "\n",
58   " # Loop to fetch documents batch_size at a time\n",
59   " for i in range(0, total_documents, batch_size):\n",
60   " cursor = collection.find({}, {\"combined\":1, \"_id\": 0}).skip(i).limit(batch_size)\n",
61 - " combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc]\n",
62 - "
63 - "
64 - "
65   "\n",
66   "# Fetch the documents and the document count in batches\n",
67 - "
68   "\n",
69   "# You can process each batch separately\n",
70 - "
71 - "
72   "\n",
73   " "
74   ]
75   },
76   {
77 - "cell_type": "
78   "metadata": {},
79 - "
80 -
81 - ]
82   },
83   {
84   "cell_type": "code",
85 - "execution_count":
86   "metadata": {},
87   "outputs": [],
88   "source": [
89 - "\
90 - "
91 - "
92 - "
93 - "
94   " \n",
95 - "
96 - "
97 - "
98 - "
99   " \n",
100 - "
101 - "
102 - " word = line.strip()\n",
103 - " if word and word not in existing_stop_words:\n",
104 - " existing_stop_words.add(word)\n",
105   " \n",
106 - " return
107   "\n",
108 - "
109 - "
110   "\n",
111 - "
112   ]
113   },
114   {
115 - "cell_type": "
116   "metadata": {},
117   "source": [
118 - "
119   ]
120   },
121   {
@@ -123,6 +428,89 @@
123   "execution_count": null,
124   "metadata": {},
125   "outputs": [],
126   "source": [
127   "\n",
128   "# Load the BERT tokenizer and model\n",
@@ -132,17 +520,23 @@
132   "def extract_keywords_tfidf(combined_texts, stop_words_list,top_n=5):\n",
133   " \"\"\"Extracts keywords with TF-IDF, using the stop words list.\"\"\"\n",
134   " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
135 - " X = vectorizer.fit_transform(combined_texts)\n",
136   " feature_names = vectorizer.get_feature_names_out()\n",
137   " #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
138 - "
139 - "\n",
140   " for row in X:\n",
141   " tfidf_scores = row.toarray().flatten()\n",
142   " top_indices = tfidf_scores.argsort()[-top_n:][::-1] # find the top n scores\n",
143   " top_keywords = [feature_names[i] for i in top_indices]\n",
144 - "
145   "\n",
146   " return top_keywords_per_document\n",
147   "\n",
148   "# Extract the keywords and create embeddings with BERT\n",
@@ -151,7 +545,7 @@
151   " \n",
152   " for text in combined_texts:\n",
153   " # Extract the keywords\n",
154 - " keywords = extract_keywords_tfidf(text, stop_words_list,top_n)\n",
155   " \n",
156   " # Create an embedding with BERT\n",
157   " inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)\n",
@@ -165,62 +559,101 @@
165   " 'embedding': embeddings\n",
166   " })\n",
167   " \n",
168 - " return results"
169   ]
170   },
171   {
172   "cell_type": "code",
173 - "execution_count":
174   "metadata": {},
175   "outputs": [
176   {
177   "name": "stdout",
178   "output_type": "stream",
179   "text": [
180 - "
181   ]
182   }
183   ],
184   "source": [
185 - "
186 - "
187 - "\n",
188 - "def calculate_keyword_similarity(text, keywords):\n",
189 - " # Build a list consisting of the text and the keywords\n",
190 - " similarity_array = []\n",
191   "\n",
192 - "
193 - "
194 - "
195 - " \n",
196 - " # Build the TF-IDF matrix\n",
197 - " vectorizer = TfidfVectorizer()\n",
198 - " tfidf_matrix = vectorizer.fit_transform(documents)\n",
199 - " \n",
200 - " # Get the text vector and the keywords vector\n",
201 - " text_vector = tfidf_matrix[0]\n",
202 - " keywords_vector = tfidf_matrix[1]\n",
203   " \n",
204 - " #
205 - "
206   "\n",
207   "\n",
208 - "
209 - "
210 - "
211 - "#
212 - "
213 - "
214 - "
215 - "
216 - "
217 - "
218 - "
219   "\n",
220 - "
221 - "
222 - "
223 - "\n"
224   ]
225   },
226   {
9    },
10   {
11   "cell_type": "code",
12 + "execution_count": 3,
13   "metadata": {},
14   "outputs": [],
15   "source": [
35   "cell_type": "markdown",
36   "metadata": {},
37   "source": [
38 + "Defining the Turkish stop words"
39 + ]
40 + },
41 + {
42 + "cell_type": "code",
43 + "execution_count": 4,
44 + "metadata": {},
45 + "outputs": [],
46 + "source": [
47 + "\"\"\"\"\"\"\n",
48 + "#- here we load the turkish_stop_words\n",
49 + "def load_stop_words(file_path, existing_stop_words='gereksiz_kelimeler.txt'):\n",
50 + " \"\"\"Reads the stop words from a file and builds a list. \n",
51 + " Takes any existing stop words into account.\"\"\"\n",
52 + " \n",
53 + " if existing_stop_words is None:\n",
54 + " existing_stop_words = set()\n",
55 + " else:\n",
56 + " existing_stop_words = set(existing_stop_words)\n",
57 + " \n",
58 + " with open(file_path, 'r', encoding='utf-8') as file:\n",
59 + " for line in file:\n",
60 + " word = line.strip()\n",
61 + " if word and word not in existing_stop_words:\n",
62 + " existing_stop_words.add(word)\n",
63 + " \n",
64 + " return list(existing_stop_words)\n",
65 + "\n",
66 + "# Load the Turkish stop words file, checking against the existing stop words\n",
67 + "stop_words_list = load_stop_words('gereksiz_kelimeler.txt')\n",
68 + "\n",
69 + "#----------------------------------------------------------------------------------------------------"
70 + ]
71 + },
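Note: later in this notebook scikit-learn warns that tokenizing the stop words produced a token ('leh') that is not itself in the list. A minimal sketch, not part of the commit, of one way to keep the stop word list consistent with TfidfVectorizer's own tokenization (the helper name normalize_stop_words is an illustrative assumption):

from sklearn.feature_extraction.text import TfidfVectorizer

def normalize_stop_words(stop_words_list):
    # Re-tokenize each stop word with the same analyzer TfidfVectorizer will use,
    # so suffixed or multi-word entries reduce to the tokens sklearn compares against.
    analyzer = TfidfVectorizer().build_analyzer()
    normalized = set()
    for word in stop_words_list:
        normalized.update(analyzer(word))
    return sorted(normalized)

# Example usage: stop_words_list = normalize_stop_words(stop_words_list)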
72 + {
73 + "cell_type": "markdown",
74 + "metadata": {},
75 + "source": [
76 + "Exporting the data in the MongoDB combined_text collection to CSV"
77   ]
78   },
79   {
80   "cell_type": "code",
81 + "execution_count": null,
82   "metadata": {},
83   "outputs": [],
84   "source": [
85   "# fetch the combined texts from MongoDB\n",
86 + "import csv\n",
87 + "from pymongo import MongoClient\n",
88   "\n",
89 + "def mongo_db_combined_texts_to_csv(database_name='combined_text', collection_name='text', host='localhost', port=27017,batch_size=100,output_file='combined_texts.csv'):\n",
90   " client = MongoClient(f'mongodb://{host}:{port}/')\n",
91   " db = client[database_name]\n",
92   " collection = db[collection_name]\n",
93   " \n",
94   " # get the total document count\n",
95   " total_documents = collection.count_documents({})\n",
96 + " #batch_documents = []\n",
97 + "\n",
98 + " # Open the CSV file and prepare to write\n",
99 + " with open(output_file, mode='w', newline='', encoding='utf-8') as file:\n",
100 + " writer = csv.writer(file)\n",
101 + " writer.writerow([\"combined\"]) # CSV header\n",
102   "\n",
103   " # Loop to fetch documents batch_size at a time\n",
104   " for i in range(0, total_documents, batch_size):\n",
105   " cursor = collection.find({}, {\"combined\":1, \"_id\": 0}).skip(i).limit(batch_size)\n",
106 + " combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc] # pulls the data for the combined column \n",
107 + "\n",
108 + " # Write the batch data to the CSV\n",
109 + " with open(output_file, mode='a', newline='', encoding='utf-8') as file:\n",
110 + " writer = csv.writer(file)\n",
111 + " \n",
112 + " for text in combined_texts:\n",
113 + " writer.writerow([text])\n",
114 + " \n",
115 + " \n",
116 + "\n",
117 + " print(f\"combined metinler '{output_file}' dosyasına başarıyla yazıldı.\")\n",
118 + "\n",
119 + "# Write the documents to the CSV file\n",
120 + "text=mongo_db_combined_texts_to_csv(batch_size=100)\n",
121 + " #batch_documents.extend((combined_texts, len(combined_texts)))\n",
122 + " #append adds the new list as a single element, i.e. list1 = [1, 2, 3, [4, 5]]\n",
123 + " #extend, on the other hand, adds the new list's items one by one, i.e. list1 = [1, 2, 3, 4, 5]\n",
124 + " #return batch_documents\n",
125   "\n",
126   "# Fetch the documents and the document count in batches\n",
127 + "#combined_texts = mongo_db_combined_texts(batch_size=1000)\n",
128   "\n",
129   "# You can process each batch separately\n",
130 + "#print(f\"Toplam döküman sayısı:{len(combined_texts)}\")\n",
131 + "\n",
132 + "#for index, text in enumerate (combined_texts[:10]):\n",
133 + " #print(f\"Döküman {index + 1}: {text}\")\n",
134 + "\n",
135 + "#print(combined_texts)\n",
136   "\n",
137   " "
138   ]
139   },
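A minimal sketch, not part of the commit: the cell above re-opens output_file in append mode on every batch while the outer handle is still open; streaming every document through the one writer that is already open does the same job with a single file handle (the helper name export_combined and its defaults are illustrative assumptions):

import csv
from pymongo import MongoClient

def export_combined(collection, output_file='combined_texts.csv', batch_size=1000):
    # Stream the 'combined' field of every document into the CSV, one row per document.
    with open(output_file, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['combined'])
        cursor = collection.find({}, {'combined': 1, '_id': 0}).batch_size(batch_size)
        for doc in cursor:
            if 'combined' in doc:
                writer.writerow([doc['combined']])

# Example usage (connection details assumed from the cell above):
# client = MongoClient('mongodb://localhost:27017/')
# export_combined(client['combined_text']['text'])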
140   {
141 + "cell_type": "code",
142 + "execution_count": null,
143   "metadata": {},
144 + "outputs": [],
145 + "source": []
146   },
147   {
148   "cell_type": "code",
149 + "execution_count": 11,
150   "metadata": {},
151   "outputs": [],
152   "source": [
153 + "import csv\n",
154 + "from pymongo import MongoClient\n",
155 + "import pandas as pd\n",
156 + "\n",
157 + "def fetch_from_database(database_name='combined_text', collection_name='text', host='localhost', port=27017, batch_size=100):\n",
158 + " client = MongoClient(f'mongodb://{host}:{port}/')\n",
159 + " db = client[database_name]\n",
160 + " collection = db[collection_name]\n",
161   " \n",
162 + " # Get the total document count\n",
163 + " total_documents = collection.count_documents({})\n",
164 + " combined_texts = []\n",
165 + "\n",
166 + " # Loop to fetch documents batch_size at a time\n",
167 + " for i in range(0, total_documents, batch_size):\n",
168 + " cursor = collection.find({}, {\"combined\": 1, \"_id\": 0}).skip(i).limit(batch_size)\n",
169 + " combined_texts.extend([doc['combined'] for doc in cursor if 'combined' in doc]) # pulls the data for the combined column \n",
170 + "\n",
171 + " return combined_texts\n",
172 + "\n",
173 + "# Function to truncate the texts\n",
174 + "def truncate_text(text, max_words=300):\n",
175 + " words = text.split() # Split the text into words\n",
176 + " return ' '.join(words[:max_words]) # Take the first max_words words\n",
177 + "\n",
178 + "# Fetch data from the database and truncate it\n",
179 + "def fetch_and_truncate_data(database_name, collection_name, host, port, max_words=300):\n",
180 + " # Fetch the data from the database\n",
181 + " combined_texts = fetch_from_database(database_name, collection_name, host, port)\n",
182   " \n",
183 + " # Truncate the texts\n",
184 + " truncated_texts = [truncate_text(text, max_words) for text in combined_texts]\n",
185   " \n",
186 + " return truncated_texts\n",
187 + "\n",
188 + "# Save the truncated data to CSV\n",
189 + "def save_to_csv(data, file_path):\n",
190 + " df = pd.DataFrame(data, columns=['combined'])\n",
191 + " df.to_csv(file_path, encoding='utf-8', index=False)\n",
192 + "\n",
193 + "# Variable definitions\n",
194 + "database_name = 'combined_text'\n",
195 + "collection_name = 'text'\n",
196 + "host = 'localhost'\n",
197 + "port = 27017\n",
198 + "batch_size = 100\n",
199 + "max_words = 300\n",
200 + "output_file = 'processed_data.csv'\n",
201 + "\n",
202 + "# Fetch and process the data\n",
203 + "truncated_texts = fetch_and_truncate_data(database_name, collection_name, host, port, max_words)\n",
204 + "save_to_csv(truncated_texts, output_file)\n"
205 + ]
206 + },
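A minimal sketch, not part of the commit: the processed_data.csv written here is committed as a roughly 419 MB LFS file (see the pointer at the end of this diff), so reading it back lazily keeps memory bounded; the helper name iter_processed_texts is an illustrative assumption:

import pandas as pd

def iter_processed_texts(path='processed_data.csv', chunksize=10_000):
    # Yield the 'combined' column chunk by chunk instead of loading the whole CSV at once.
    for chunk in pd.read_csv(path, chunksize=chunksize):
        yield from chunk['combined'].astype(str)

# Example usage:
# for text in iter_processed_texts():
#     handle(text)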
207 + {
208 + "cell_type": "code",
209 + "execution_count": null,
210 + "metadata": {},
211 + "outputs": [],
212 + "source": []
213 + },
214 + {
215 + "cell_type": "markdown",
216 + "metadata": {},
217 + "source": [
218 + "Extracting keywords with TF-IDF"
219 + ]
220 + },
221 + {
222 + "cell_type": "code",
223 + "execution_count": 11,
224 + "metadata": {},
225 + "outputs": [
226 + {
227 + "name": "stderr",
228 + "output_type": "stream",
229 + "text": [
230 + "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:406: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['leh'] not in stop_words.\n",
231 + " warnings.warn(\n"
232 + ]
233 + },
234 + {
235 + "ename": "KeyboardInterrupt",
236 + "evalue": "",
237 + "output_type": "error",
238 + "traceback": [
239 + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
240 + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
241 +
"Cell \u001b[1;32mIn[11], line 33\u001b[0m\n\u001b[0;32m 30\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m top_keywords_per_document, top_tfidf_scores_per_document\n\u001b[0;32m 32\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeleri çıkar ve sonuçları al\u001b[39;00m\n\u001b[1;32m---> 33\u001b[0m top_keywords_per_document, top_tfidf_scores_per_document \u001b[38;5;241m=\u001b[39m \u001b[43mextract_keywords_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcombined\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_words_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_n\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 36\u001b[0m \u001b[38;5;66;03m# Sonuçları görüntüleme\u001b[39;00m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, (keywords, scores) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mzip\u001b[39m(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
242 +
"Cell \u001b[1;32mIn[11], line 21\u001b[0m, in \u001b[0;36mextract_keywords_tfidf\u001b[1;34m(combined, stop_words_list, top_n)\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m X:\n\u001b[0;32m 20\u001b[0m tfidf_scores \u001b[38;5;241m=\u001b[39m row\u001b[38;5;241m.\u001b[39mtoarray()\u001b[38;5;241m.\u001b[39mflatten() \u001b[38;5;66;03m#değişkenleri düz bir değişken haline getirme\u001b[39;00m\n\u001b[1;32m---> 21\u001b[0m top_indices \u001b[38;5;241m=\u001b[39m \u001b[43mtfidf_scores\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margsort\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m-\u001b[39mtop_n:][::\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m] \u001b[38;5;66;03m# En yüksek n skoru bul\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m#en yüksek skorlu kelimleri ve skorları bul\u001b[39;00m\n\u001b[0;32m 24\u001b[0m top_keywords \u001b[38;5;241m=\u001b[39m [feature_names[i] \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m top_indices]\n",
243 + "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
244 + ]
245 + }
246 + ],
247 + "source": [
248 + "import csv\n",
249 + "from sklearn.feature_extraction.text import TfidfVectorizer\n",
250 + "from joblib import Parallel, delayed\n",
251 + "import pandas as pd\n",
252   "\n",
253 + "df=pd.read_csv('combined_texts.csv')\n",
254 + "combined= df['combined'].tolist()\n",
255 + "def extract_keywords_tfidf(combined, stop_words_list,top_n=10):\n",
256 + " \"\"\"Extracts keywords with TF-IDF, using the stop words list.\"\"\"\n",
257 + " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
258 + " X = vectorizer.fit_transform(combined) # read this from the csv \n",
259 + " feature_names = vectorizer.get_feature_names_out() # holds each word's entry in the TF-IDF vocabulary \n",
260 + " #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
261 + " \n",
262 + " top_keywords_per_document = [] # the best keywords for each document\n",
263 + " top_tfidf_scores_per_document = [] # the highest TF-IDF scores for each document\n",
264   "\n",
265 + " # Process each document\n",
266 + " for row in X:\n",
267 + " tfidf_scores = row.toarray().flatten() # flatten into a plain array\n",
268 + " top_indices = tfidf_scores.argsort()[-top_n:][::-1] # find the top n scores\n",
269 + " \n",
270 + " # find the highest-scoring words and their scores\n",
271 + " top_keywords = [feature_names[i] for i in top_indices]\n",
272 + " top_tfidf_scores = [tfidf_scores[i] for i in top_indices]\n",
273 + " \n",
274 + " top_keywords_per_document.append(top_keywords)\n",
275 + " top_tfidf_scores_per_document.append(top_tfidf_scores)\n",
276 + " \n",
277 + " return top_keywords_per_document, top_tfidf_scores_per_document\n",
278 + "\n",
279 + "# Extract the keywords and collect the results\n",
280 + "top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10)\n",
281 + " \n",
282 + "\n",
283 + "# Display the results\n",
284 + "for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
285 + " print(f\"Döküman {i+1}:\")\n",
286 + " for keyword, score in zip(keywords, scores):\n",
287 + " print(f\"{keyword}: {score:.4f}\")\n",
288 + " print(\"\\n\")\n"
289   ]
290   },
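A minimal sketch, not part of the commit: the run above was interrupted while looping over the matrix, and the per-row toarray() call is the usual bottleneck because it densifies every row; reading the sparse row's data and indices directly avoids that (the helper name top_keywords_sparse is an illustrative assumption):

import numpy as np

def top_keywords_sparse(X, feature_names, top_n=10):
    # X is the sparse TF-IDF matrix returned by TfidfVectorizer.fit_transform.
    keywords_per_doc, scores_per_doc = [], []
    for row in X:                                    # each row stays sparse (1 x n_features)
        order = np.argsort(row.data)[-top_n:][::-1]  # highest stored scores first
        keywords_per_doc.append([feature_names[row.indices[i]] for i in order])
        scores_per_doc.append([float(row.data[i]) for i in order])
    return keywords_per_doc, scores_per_doc

# Example usage: top_kw, top_scores = top_keywords_sparse(X, feature_names, top_n=10)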
291   {
292 + "cell_type": "code",
293 + "execution_count": 5,
294   "metadata": {},
295 + "outputs": [
296 + {
297 + "name": "stderr",
298 + "output_type": "stream",
299 + "text": [
300 + "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:406: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['leh'] not in stop_words.\n",
301 + " warnings.warn(\n"
302 + ]
303 + },
304 + {
305 + "ename": "KeyboardInterrupt",
306 + "evalue": "",
307 + "output_type": "error",
308 + "traceback": [
309 + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
310 + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
311 +
"Cell \u001b[1;32mIn[5], line 53\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m top_keywords_per_document, top_tfidf_scores_per_document\n\u001b[0;32m 52\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeleri çıkar ve sonuçları al\u001b[39;00m\n\u001b[1;32m---> 53\u001b[0m top_keywords_per_document, top_tfidf_scores_per_document \u001b[38;5;241m=\u001b[39m \u001b[43mextract_keywords_tfidf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcombined\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop_words_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_n\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 55\u001b[0m \u001b[38;5;66;03m# Sonuçları görüntüleme\u001b[39;00m\n\u001b[0;32m 56\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, (keywords, scores) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mzip\u001b[39m(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
312 +
"Cell \u001b[1;32mIn[5], line 45\u001b[0m, in \u001b[0;36mextract_keywords_tfidf\u001b[1;34m(combined, stop_words_list, top_n, n_jobs)\u001b[0m\n\u001b[0;32m 42\u001b[0m top_tfidf_scores \u001b[38;5;241m=\u001b[39m [tfidf_scores[i] \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m top_indices]\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m top_keywords, top_tfidf_scores\n\u001b[1;32m---> 45\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mParallel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_row\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrow\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 47\u001b[0m \u001b[38;5;66;03m# Sonuçları listelere ayırma\u001b[39;00m\n\u001b[0;32m 48\u001b[0m top_keywords_per_document, top_tfidf_scores_per_document \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mzip\u001b[39m(\u001b[38;5;241m*\u001b[39mresults)\n",
313 +
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\joblib\\parallel.py:2007\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 2001\u001b[0m \u001b[38;5;66;03m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[0;32m 2002\u001b[0m \u001b[38;5;66;03m# progress until it enters the Try/Except block of the generator and\u001b[39;00m\n\u001b[0;32m 2003\u001b[0m \u001b[38;5;66;03m# reaches the first `yield` statement. This starts the asynchronous\u001b[39;00m\n\u001b[0;32m 2004\u001b[0m \u001b[38;5;66;03m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[0;32m 2005\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 2007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n",
314 +
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\joblib\\parallel.py:1650\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[1;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[0;32m 1647\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[0;32m 1649\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backend\u001b[38;5;241m.\u001b[39mretrieval_context():\n\u001b[1;32m-> 1650\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_retrieve()\n\u001b[0;32m 1652\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mGeneratorExit\u001b[39;00m:\n\u001b[0;32m 1653\u001b[0m \u001b[38;5;66;03m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[0;32m 1654\u001b[0m \u001b[38;5;66;03m# consumed. This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[0;32m 1655\u001b[0m \u001b[38;5;66;03m# the user if necessary.\u001b[39;00m\n\u001b[0;32m 1656\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
315 +
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\joblib\\parallel.py:1762\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1757\u001b[0m \u001b[38;5;66;03m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[0;32m 1758\u001b[0m \u001b[38;5;66;03m# async callbacks to progress.\u001b[39;00m\n\u001b[0;32m 1759\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ((\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m\n\u001b[0;32m 1760\u001b[0m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jobs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mget_status(\n\u001b[0;32m 1761\u001b[0m timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtimeout) \u001b[38;5;241m==\u001b[39m TASK_PENDING)):\n\u001b[1;32m-> 1762\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0.01\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1763\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m 1765\u001b[0m \u001b[38;5;66;03m# We need to be careful: the job list can be filling up as\u001b[39;00m\n\u001b[0;32m 1766\u001b[0m \u001b[38;5;66;03m# we empty it and Python list are not thread-safe by\u001b[39;00m\n\u001b[0;32m 1767\u001b[0m \u001b[38;5;66;03m# default hence the use of the lock\u001b[39;00m\n",
316 + "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
317 + ]
318 + }
319 + ],
320   "source": [
321 + "import re \n",
322 + "import pandas as pd\n",
323 + "from sklearn.feature_extraction.text import TfidfVectorizer\n",
324 + "from joblib import Parallel, delayed\n",
325 + "\n",
326 + "\n",
327 + "# Read the CSV file\n",
328 + "df = pd.read_csv('combined_texts.csv')\n",
329 + "combined = df['combined'].tolist()\n",
330 + "\n",
331 + "\n",
332 + "def pad_or_truncate(texts, max_length):\n",
333 + " \"Truncates or pads the texts to a fixed length\"\n",
334 + " padded_texts=[]\n",
335 + " for text in texts:\n",
336 + " words= text.split()\n",
337 + " if len(words)> max_length:\n",
338 + " padded_texts.append(' '.join(words[:max_length]))\n",
339 + " else:\n",
340 + " padded_texts.append(' '.join(words + [''] * (max_length - len(words))))\n",
341 + " return padded_texts\n",
342 + "\n",
343 + "# Padding length\n",
344 + "max_length = 300 # e.g. cap the text length at 300 words\n",
345 + "\n",
346 + "# Pad or truncate the texts\n",
347 + "combined_padded = pad_or_truncate(combined, max_length)\n",
348 + "\n",
349 + "def parse_text(text):\n",
350 + " \"\"\"Parses the given text into a regular structure.\"\"\"\n",
351 + " # Split into rows\n",
352 + " lines = text.split('|-')\n",
353 + " \n",
354 + " data = []\n",
355 + " for line in lines:\n",
356 + " line = line.strip()\n",
357 + " if not line or line.startswith(\"align\"):\n",
358 + " continue\n",
359 + "\n",
360 + " # Split the row into parts\n",
361 + " parts = re.split(r'\\s*\\|\\s*', line) # how does this split the rows into parts ??\n",
362 + " \n",
363 + " # Clean the values and build the structure\n",
364 + " if len(parts) >= 2: # what are season and team, and why were they defined as variables?\n",
365 + " season = parts[0].strip()\n",
366 + " team = parts[1].strip()\n",
367 + " stats = [p.strip() for p in parts[2:] if p.strip()]\n",
368 + " data.append([season, team] + stats)\n",
369 + "\n",
370 + " return data \n",
371 + "\n",
372 + "def clean_data(file_path):\n",
373 + " \"\"\"Reads the CSV file and tidies the data.\"\"\"\n",
374 + " with open(file_path, 'r') as file:\n",
375 + " raw_text = file.read()\n",
376 + " \n",
377 + " data = parse_text(raw_text)\n",
378 + " \n",
379 + " # Build the data frame\n",
380 + " df = pd.DataFrame(data, columns=['kaynakça'])\n",
381 + " \n",
382 + " return df\n",
383 + "\n",
384 + "# Clean the CSV file and build a tidy DataFrame\n",
385 + "cleaned_df = clean_data('combined_texts.csv')\n",
386 + "\n",
387 + "# Inspect the cleaned data\n",
388 + "print(cleaned_df.head())\n",
389 + "\n",
390 + "def extract_keywords_tfidf(combined, stop_words_list, top_n=10, n_jobs=-1):\n",
391 + " \"\"\"Extracts keywords with TF-IDF, using the stop words list, and processes rows in parallel.\"\"\"\n",
392 + " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
393 + " \n",
394 + " # Build the TF-IDF matrix (n_jobs is used to make better use of the CPU)\n",
395 + " X = vectorizer.fit_transform(combined) # read this from the csv\n",
396 + " feature_names = vectorizer.get_feature_names_out() # holds each word's entry in the TF-IDF vocabulary\n",
397 + "\n",
398 + " # Find each document's top keywords and TF-IDF scores in parallel\n",
399 + " def process_row(row):\n",
400 + " tfidf_scores = row.toarray().flatten() # flatten into a plain array\n",
401 + " top_indices = tfidf_scores.argsort()[-top_n:][::-1] # find the top n scores\n",
402 + " \n",
403 + " # Find the highest-scoring words and their scores\n",
404 + " top_keywords = [feature_names[i] for i in top_indices]\n",
405 + " top_tfidf_scores = [tfidf_scores[i] for i in top_indices]\n",
406 + " return top_keywords, top_tfidf_scores\n",
407 + "\n",
408 + " results = Parallel(n_jobs=n_jobs)(delayed(process_row)(row) for row in X)\n",
409 + "\n",
410 + " # Split the results into separate lists\n",
411 + " top_keywords_per_document, top_tfidf_scores_per_document = zip(*results)\n",
412 + "\n",
413 + " return top_keywords_per_document, top_tfidf_scores_per_document\n",
414 + "\n",
415 + "# Extract the keywords and collect the results\n",
416 + "top_keywords_per_document, top_tfidf_scores_per_document = extract_keywords_tfidf(combined, stop_words_list, top_n=10, n_jobs=-1)\n",
417 + "\n",
418 + "# Display the results\n",
419 + "for i, (keywords, scores) in enumerate(zip(top_keywords_per_document, top_tfidf_scores_per_document)):\n",
420 + " print(f\"Döküman {i+1}:\")\n",
421 + " for keyword, score in zip(keywords, scores):\n",
422 + " print(f\"{keyword}: {score:.4f}\")\n",
423 + " print(\"\\n\")\n"
424   ]
425   },
426   {
428   "execution_count": null,
429   "metadata": {},
430   "outputs": [],
431 + "source": [
432 + "from sentence_transformers import SentenceTransformer\n",
433 + "\n",
434 + "model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')\n",
435 + "\n",
436 + "# Top_keywords embedding\n",
437 + "keyword_embeddings = model.encode(top_keywords_per_document)\n"
438 + ]
439 + },
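A minimal sketch, not part of the commit, and an assumption about the intended use: SentenceTransformer.encode expects a list of strings, while top_keywords_per_document holds one list of keywords per document, so joining each document's keywords into a single string before encoding is one way to get a per-document keyword embedding:

# Join each document's keyword list into one string, then embed it.
keyword_sentences = [' '.join(keywords) for keywords in top_keywords_per_document]
keyword_embeddings = model.encode(keyword_sentences)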
440 + {
441 + "cell_type": "code",
442 + "execution_count": 3,
443 + "metadata": {},
444 + "outputs": [
445 + {
446 + "name": "stdout",
447 + "output_type": "stream",
448 + "text": [
449 + "Keyword: bir, Similarity: 0.26726124191242445\n",
450 + "Keyword: anahtar, Similarity: 0.26726124191242445\n",
451 + "Keyword: kelimeleri, Similarity: 0.26726124191242445\n",
452 + "Keyword: test, Similarity: 0.26726124191242445\n",
453 + "Keyword: başka, Similarity: 0.0\n"
454 + ]
455 + }
456 + ],
457 + "source": [
458 + "from sklearn.feature_extraction.text import TfidfVectorizer\n",
459 + "from sklearn.metrics.pairwise import cosine_similarity\n",
460 + "\n",
461 + "def calculate_keyword_similarity(text, keywords):\n",
462 + " # Build the TF-IDF matrix\n",
463 + " tfidf_vectorizer = TfidfVectorizer()\n",
464 + "\n",
465 + " # convert the text and the keywords into TF-IDF vectors\n",
466 + " text_tfidf = tfidf_vectorizer.fit_transform([text]) # this should come from the combined column\n",
467 + " # similarity computation \n",
468 + " similarity_array = []\n",
469 + " for keyword in keywords:\n",
470 + " # Convert each keyword into a TF-IDF vector\n",
471 + " keyword_tfidf = tfidf_vectorizer.transform([keyword]) # takes the keywords one by one\n",
472 + " \n",
473 + " # Compute the similarity with cosine similarity\n",
474 + " similarity = cosine_similarity(text_tfidf, keyword_tfidf)[0][0]\n",
475 + " \n",
476 + " # Record the keyword and its similarity score\n",
477 + " similarity_array.append((keyword, similarity))\n",
478 + " \n",
479 + " return similarity_array\n",
480 + " \n",
481 + "\n",
482 + "# Example text and keywords\n",
483 + "# the combined data \n",
484 + "text = \"Bu bir örnek metindir ve bu metin üzerinde anahtar kelimeleri test ediyoruz.\"\n",
485 + "keywords = [\"başka\", \"bir\", \"anahtar\", \"kelimeleri\", \"test\"] # these keywords should come from the TF-IDF step and be turned into arrays \n",
486 + " \n",
487 + "# Compute the relevance score\n",
488 + "similarity_results = calculate_keyword_similarity(text, keywords)\n",
489 + "top_5_keywords = sorted(similarity_results, key=lambda x: x[1], reverse=True)[:5]\n",
490 + "# Print each keyword's relevance score\n",
491 + "\n",
492 + "for keyword, similarity in top_5_keywords:\n",
493 + " print(f\"Keyword: {keyword}, Similarity: {similarity}\")\n",
494 + " #print(f\"Keyword: '{keyword}' - Relevance score: {score:.4f}\")\n",
495 + "\n"
496 + ]
497 + },
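A minimal sketch, not part of the commit: the cell above scores keywords against a hard-coded example sentence with a freshly fitted TF-IDF, while the keyword_embeddings computed earlier with the SentenceTransformer are not reused; ranking keywords by cosine similarity between their embeddings and the document embedding is one alternative (the helper name rank_keywords_by_embedding is an illustrative assumption):

from sklearn.metrics.pairwise import cosine_similarity

def rank_keywords_by_embedding(model, text, keywords, top_k=5):
    # Embed the document and the candidate keywords with the same SentenceTransformer.
    text_vec = model.encode([text])          # shape (1, dim)
    keyword_vecs = model.encode(keywords)    # shape (len(keywords), dim)
    scores = cosine_similarity(text_vec, keyword_vecs)[0]
    return sorted(zip(keywords, scores), key=lambda p: p[1], reverse=True)[:top_k]

# Example usage: rank_keywords_by_embedding(model, combined[0], top_keywords_per_document[0])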
498 + {
499 + "cell_type": "code",
500 + "execution_count": 10,
501 + "metadata": {},
502 + "outputs": [
503 + {
504 + "data": {
505 + "text/plain": [
506 + "<function __main__.process_texts(combined_texts, stop_words_list, top_n)>"
507 + ]
508 + },
509 + "execution_count": 10,
510 + "metadata": {},
511 + "output_type": "execute_result"
512 + }
513 + ],
514   "source": [
515   "\n",
516   "# Load the BERT tokenizer and model\n",
520   "def extract_keywords_tfidf(combined_texts, stop_words_list,top_n=5):\n",
521   " \"\"\"Extracts keywords with TF-IDF, using the stop words list.\"\"\"\n",
522   " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
523 + " X = vectorizer.fit_transform(combined_texts) # read this from the csv \n",
524   " feature_names = vectorizer.get_feature_names_out()\n",
525   " #sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
526 + " \n",
527 + " top_keywords_per_document = [] # the best keywords for each document\n",
528 + " top_tfidf_scores_per_document = [] # the highest TF-IDF scores for each document\n",
529 + "#------------------------------------------------------------------------------------------\n",
530   " for row in X:\n",
531   " tfidf_scores = row.toarray().flatten()\n",
532   " top_indices = tfidf_scores.argsort()[-top_n:][::-1] # find the top n scores\n",
533   " top_keywords = [feature_names[i] for i in top_indices]\n",
534 + " top_tfidf_scores = [tfidf_scores[i] for i in top_indices]\n",
535 + "\n",
536   "\n",
537 + " top_tfidf_scores_per_document.append(top_tfidf_scores)\n",
538 + " top_keywords_per_document.append(top_keywords)\n",
539 + " \n",
540   " return top_keywords_per_document\n",
541   "\n",
542   "# Extract the keywords and create embeddings with BERT\n",
545   " \n",
546   " for text in combined_texts:\n",
547   " # Extract the keywords\n",
548 + " keywords = extract_keywords_tfidf(text, stop_words_list,top_n=15)\n",
549   " \n",
550   " # Create an embedding with BERT\n",
551   " inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)\n",
559   " 'embedding': embeddings\n",
560   " })\n",
561   " \n",
562 + " return results\n",
563 + "\n",
564 + "results=process_texts\n",
565 + "results\n",
566 + "# process all the texts\n"
567 + ]
568 + },
569 + {
570 + "cell_type": "markdown",
571 + "metadata": {},
572 + "source": [
573 + "Fetching the database from MongoDB"
574   ]
575   },
576   {
577   "cell_type": "code",
578 + "execution_count": 5,
579   "metadata": {},
580   "outputs": [
581   {
582   "name": "stdout",
583   "output_type": "stream",
584   "text": [
585 + "combined metinler 'combined_texts.csv' dosyasına başarıyla yazıldı.\n"
586   ]
587   }
588   ],
589   "source": [
590 + "# fetch the combined texts from MongoDB\n",
591 + "import csv\n",
592 + "from pymongo import MongoClient\n",
593   "\n",
594 + "def mongo_db_combined_texts_to_csv(database_name='combined_text', collection_name='text', host='localhost', port=27017,batch_size=1000,output_file='combined_texts.csv'):\n",
595 + " client = MongoClient(f'mongodb://{host}:{port}/')\n",
596 + " db = client[database_name]\n",
597 + " collection = db[collection_name]\n",
598   " \n",
599 + " # get the total document count\n",
600 + " total_documents = collection.count_documents({})\n",
601 + " #batch_documents = []\n",
602   "\n",
603 + " # Open the CSV file and prepare to write\n",
604 + " with open(output_file, mode='w', newline='', encoding='utf-8') as file:\n",
605 + " writer = csv.writer(file)\n",
606 + " writer.writerow([\"combined\"]) # CSV header\n",
607   "\n",
608 + " # Loop to fetch documents batch_size at a time\n",
609 + " for i in range(0, total_documents, batch_size):\n",
610 + " cursor = collection.find({}, {\"combined\":1, \"_id\": 0}).skip(i).limit(batch_size)\n",
611 + " combined_texts = [doc['combined'] for doc in cursor if 'combined' in doc] # pulls the data for the combined column \n",
612 + "\n",
613 + " # Write the batch data to the CSV\n",
614 + " with open(output_file, mode='a', newline='', encoding='utf-8') as file:\n",
615 + " writer = csv.writer(file)\n",
616 + " \n",
617 + " for text in combined_texts:\n",
618 + " writer.writerow([text])\n",
619 + " \n",
620 + " \n",
621   "\n",
622 + " print(f\"combined metinler '{output_file}' dosyasına başarıyla yazıldı.\")\n",
623 + "\n",
624 + "# Write the documents to the CSV file\n",
625 + "text=mongo_db_combined_texts_to_csv(batch_size=1000)\n",
626 + " #batch_documents.extend((combined_texts, len(combined_texts)))\n",
627 + " #append adds the new list as a single element, i.e. list1 = [1, 2, 3, [4, 5]]\n",
628 + " #extend, on the other hand, adds the new list's items one by one, i.e. list1 = [1, 2, 3, 4, 5]\n",
629 + " #return batch_documents\n",
630 + "\n",
631 + "# Fetch the documents and the document count in batches\n",
632 + "#combined_texts = mongo_db_combined_texts(batch_size=1000)\n",
633 + "\n",
634 + "# You can process each batch separately\n",
635 + "#print(f\"Toplam döküman sayısı:{len(combined_texts)}\")\n",
636 + "\n",
637 + "#for index, text in enumerate (combined_texts[:10]):\n",
638 + " #print(f\"Döküman {index + 1}: {text}\")\n",
639 + "\n",
640 + "#print(combined_texts)\n",
641 + "\n",
642 + " "
643 + ]
644 + },
645 + {
646 + "cell_type": "markdown",
647 + "metadata": {},
648 + "source": [
649 + "Importing the stop words from 'gereksiz_kelimeler.txt'"
650 + ]
651 + },
652 + {
653 + "cell_type": "markdown",
654 + "metadata": {},
655 + "source": [
656 + "TF-IDF scores"
657   ]
658   },
659   {
processed_data.csv
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:97993e92396f03aa90162dad808bbd3c655a988b37d7ba45704b0058371c6172
3 + size 419458630