Spaces: Build error
Upload 9 files
- .gitattributes +1 -0
- MLM.ipynb +539 -0
- alternatif.txt +10 -0
- combined_output.csv +3 -0
- datasets.ipynb +749 -166
- deneme.ipynb +69 -0
- gereksiz_kelimeler.txt +0 -0
- kelimeler.txt +0 -0
- requirements.txt +127 -6
- turkish_stop_words.txt +428 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+combined_output.csv filter=lfs diff=lfs merge=lfs -text
MLM.ipynb
ADDED
@@ -0,0 +1,539 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "OSError",
"evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n",
"File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
"File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
"File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
"File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
"\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
]
}
],
"source": [
"from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n",
"import torch "
]
},
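
Note on the failed cell above: the import dies before any MLM code runs because Windows cannot load torch's fbgemm.dll (WinError 126), which usually points at a broken PyTorch install or a missing Visual C++ runtime rather than at the notebook itself. A minimal sanity check after repairing the environment might look like this (hypothetical snippet, not part of the committed notebook):

    import torch
    print(torch.__version__)                     # confirms the package imports at all
    print(torch.rand(2, 2) @ torch.rand(2, 2))   # a tiny op to confirm the native libraries load

The later cells were evidently re-run in a different environment (c:\gitProjects\makaleChatUI\myenv), where the import succeeds.
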
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\gitProjects\\makaleChatUI\\myenv\\Lib\\site-packages\\huggingface_hub\\file_download.py:159: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\info\\.cache\\huggingface\\hub\\models--bert-base-uncased. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
" warnings.warn(message)\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
"- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
}
],
"source": [
"tokenizer= BertTokenizer.from_pretrained('bert-base-uncased')\n",
"model=BertForMaskedLM.from_pretrained('bert-base-uncased')\n",
"\n",
"text=(\"After reading these reports,\"\n",
" \"we start an outline of the application of ML.\"\n",
" \"It includes the [MASK] process \"\n",
" \"and various applications (from various software development to hardware development), to [MASK] of IT systems, and various approaches on analytics.\"\n",
" \"The approach incorporates [MASK] as well as computing and data mining.\"\n",
" \"For example, software developers and manufacturing engineers used AI \"\n",
" \"in manufacturing to develop their applications.\"\n",
" )"
]
},
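
The text above carries three hand-placed [MASK] tokens, but the notebook never decodes what the model would put there. A short sketch of how that could be done with the tokenizer and model loaded in this cell (an illustration, not part of the committed notebook):

    inputs = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        logits = model(**inputs).logits                      # shape: (1, seq_len, vocab_size)
    mask_positions = (inputs.input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
    predicted_ids = logits[0, mask_positions].argmax(dim=-1)  # best guess for each [MASK]
    print(tokenizer.decode(predicted_ids))
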
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# after placing the masks, we convert the text to tokens\n",
"inputs= tokenizer(text,return_tensors='pt')\n",
"inputs.keys()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
" 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 103,\n",
" 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
" 8051, 2458, 1007, 1010, 2000, 103, 1997, 2009, 3001, 1010,\n",
" 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 103,\n",
" 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
" 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
" 2000, 4503, 2037, 5097, 1012, 102]])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inputs.input_ids"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"text_normal= (\"After reading these reports,\"\n",
" \"we start an outline of the application of ML.\"\n",
" \"It includes the learning process \"\n",
" \"and various applications (from various software development to hardware development), to analysis of IT systems, and various approaches on analytics.\"\n",
" \"The approach incorporates AI as well as computing and data mining.\"\n",
" \"For example, software developers and manufacturing engineers used AI \"\n",
" \"in manufacturing to develop their applications.\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# convert the (unmasked) text to tokens\n",
"inputs_2= tokenizer(text_normal,return_tensors='pt')\n",
"inputs_2.keys()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
" 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
" 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
" 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
" 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
" 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
" 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
" 2000, 4503, 2037, 5097, 1012, 102]])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inputs_2.input_ids"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"inputs_2['labels']= inputs_2.input_ids.detach().clone()"
]
},
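
Cloning input_ids into labels before any masking is what makes the loss meaningful later: the labels keep the original token ids, while input_ids will have some positions overwritten with [MASK]. In the usual Hugging Face recipe the unmasked positions are additionally set to -100 so the loss ignores them; a hedged sketch of that variant (mask_arr is computed a few cells below):

    inputs_2['labels'] = inputs_2.input_ids.detach().clone()
    # optional refinement: once mask_arr is known, score only the masked positions
    # inputs_2['labels'][~mask_arr] = -100    # -100 is the ignore_index of the MLM loss
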
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input_ids': tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
" 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
" 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
" 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
" 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
" 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
" 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
" 2000, 4503, 2037, 5097, 1012, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1]]), 'labels': tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
" 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
" 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
" 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
" 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
" 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
" 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
" 2000, 4503, 2037, 5097, 1012, 102]])}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inputs_2"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([1, 76])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# draw a random value per token for mask selection; the labels stay untouched\n",
"rand=torch.rand(inputs_2.input_ids.shape)\n",
"rand.shape"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[0.9397, 0.1325, 0.1893, 0.8258, 0.7453, 0.1766, 0.9338, 0.0806, 0.0626,\n",
" 0.6665, 0.4240, 0.3946, 0.5413, 0.3799, 0.4023, 0.8699, 0.8159, 0.1511,\n",
" 0.6842, 0.0242, 0.7235, 0.0063, 0.1857, 0.9684, 0.8930, 0.8208, 0.5711,\n",
" 0.0345, 0.9919, 0.1140, 0.7597, 0.4546, 0.6478, 0.2295, 0.2846, 0.6314,\n",
" 0.3640, 0.9291, 0.3843, 0.3553, 0.1125, 0.0790, 0.4261, 0.4307, 0.6724,\n",
" 0.8569, 0.4476, 0.8032, 0.0241, 0.0152, 0.4196, 0.5609, 0.0010, 0.7240,\n",
" 0.4531, 0.5834, 0.5232, 0.3602, 0.6575, 0.9012, 0.1519, 0.2255, 0.0799,\n",
" 0.5673, 0.7244, 0.4387, 0.2713, 0.4243, 0.8435, 0.1670, 0.8664, 0.6261,\n",
" 0.4090, 0.2988, 0.3379, 0.7784]])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rand"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[False, True, False, False, False, False, False, True, True, False,\n",
" False, False, False, False, False, False, False, False, False, True,\n",
" False, True, False, False, False, False, False, True, False, True,\n",
" False, False, False, False, False, False, False, False, False, False,\n",
" True, True, False, False, False, False, False, False, True, True,\n",
" False, False, True, False, False, False, False, False, False, False,\n",
" False, False, True, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# select about 15 percent of the tokens in the sentence\n",
"# mask_arr = rand < 0.15 draws a random number for every token and checks whether it is below 0.15 to decide if that token gets masked. If the number is below 0.15 the token is masked; otherwise it is left unchanged.\n",
"mask_arr = rand < 0.15\n",
"mask_arr"
]
},
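
One thing the plain rand < 0.15 rule does not do is protect the special tokens: [CLS] (id 101) and [SEP] (id 102) can be selected for masking like any other position. A common refinement, using the same tensors as above (sketch, not part of the committed notebook):

    rand = torch.rand(inputs_2.input_ids.shape)
    mask_arr = (rand < 0.15) & (inputs_2.input_ids != 101) & (inputs_2.input_ids != 102)

The original BERT recipe also replaces only 80% of the selected tokens with [MASK], keeps 10% unchanged and swaps 10% for random tokens; the all-[MASK] variant used here is fine for a demonstration.
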
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1, 7, 8, 19, 21, 27, 29, 40, 41, 48, 49, 52, 62]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# nonzero() gives the indices of the positions selected for masking (the True entries)\n",
"# torch.flatten drops the extra nesting so we end up with a single flat list of indices\n",
"selection= torch.flatten(mask_arr[0].nonzero()).tolist()\n",
"selection"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 101, 103, 3752, 2122, 4311, 1010, 2057, 103, 103, 12685,\n",
" 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 103,\n",
" 2832, 103, 2536, 5097, 1006, 2013, 2536, 103, 2458, 103,\n",
" 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
" 103, 103, 8107, 2006, 25095, 1012, 1996, 3921, 103, 103,\n",
" 2004, 2092, 103, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
" 1010, 4007, 103, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
" 2000, 4503, 2037, 5097, 1012, 102]])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# assign the [MASK] id (103) to the selected positions of input_ids\n",
"inputs_2.input_ids[0,selection]=103\n",
"inputs_2.input_ids"
]
},
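
The hard-coded 103 is the [MASK] token id of bert-base-uncased; looking it up from the tokenizer keeps the cell correct for any checkpoint (sketch):

    inputs_2.input_ids[0, selection] = tokenizer.mask_token_id   # 103 for bert-base-uncased
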
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"outputs= model(**inputs_2)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"odict_keys(['loss', 'logits'])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"outputs.keys()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor(0.8399, grad_fn=<NllLossBackward0>)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"outputs.loss"
]
},
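
The loss of about 0.84 returned above is the cross-entropy between the model's predictions and the labels tensor (computed over every position here, since no label was set to -100). If the goal were to fine-tune rather than just inspect the loss, the usual next step is an optimizer update; a minimal sketch, where the optimizer choice and learning rate are assumptions and not part of the notebook:

    from torch.optim import AdamW

    optimizer = AdamW(model.parameters(), lr=5e-5)
    model.train()
    outputs = model(**inputs_2)   # forward pass; passing labels makes the model return a loss
    outputs.loss.backward()       # backpropagate
    optimizer.step()              # update the weights
    optimizer.zero_grad()
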
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\gitProjects\\makaleChatUI\\myenv\\Lib\\site-packages\\huggingface_hub\\file_download.py:159: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\info\\.cache\\huggingface\\hub\\models--facebook--dpr-ctx_encoder-single-nq-base. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
" warnings.warn(message)\n",
"Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.output.LayerNorm.bias', 'bert_model.encoder.layer.0.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.1.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.1.attention.output.dense.bias', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.1.attention.self.key.bias', 'bert_model.encoder.layer.1.attention.self.key.weight', 'bert_model.encoder.layer.1.attention.self.query.bias', 'bert_model.encoder.layer.1.attention.self.query.weight', 'bert_model.encoder.layer.1.attention.self.value.bias', 'bert_model.encoder.layer.1.attention.self.value.weight', 'bert_model.encoder.layer.1.intermediate.dense.bias', 'bert_model.encoder.layer.1.intermediate.dense.weight', 'bert_model.encoder.layer.1.output.LayerNorm.bias', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.1.output.dense.bias', 'bert_model.encoder.layer.1.output.dense.weight', 'bert_model.encoder.layer.10.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.10.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.output.dense.weight', 'bert_model.encoder.layer.10.attention.self.key.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.10.attention.self.query.bias', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.10.attention.self.value.bias', 'bert_model.encoder.layer.10.attention.self.value.weight', 'bert_model.encoder.layer.10.intermediate.dense.bias', 'bert_model.encoder.layer.10.intermediate.dense.weight', 'bert_model.encoder.layer.10.output.LayerNorm.bias', 'bert_model.encoder.layer.10.output.LayerNorm.weight', 'bert_model.encoder.layer.10.output.dense.bias', 'bert_model.encoder.layer.10.output.dense.weight', 'bert_model.encoder.layer.11.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.output.dense.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.11.attention.self.key.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 
'bert_model.encoder.layer.11.attention.self.query.weight', 'bert_model.encoder.layer.11.attention.self.value.bias', 'bert_model.encoder.layer.11.attention.self.value.weight', 'bert_model.encoder.layer.11.intermediate.dense.bias', 'bert_model.encoder.layer.11.intermediate.dense.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.11.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.dense.bias', 'bert_model.encoder.layer.11.output.dense.weight', 'bert_model.encoder.layer.2.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.2.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.2.attention.output.dense.bias', 'bert_model.encoder.layer.2.attention.output.dense.weight', 'bert_model.encoder.layer.2.attention.self.key.bias', 'bert_model.encoder.layer.2.attention.self.key.weight', 'bert_model.encoder.layer.2.attention.self.query.bias', 'bert_model.encoder.layer.2.attention.self.query.weight', 'bert_model.encoder.layer.2.attention.self.value.bias', 'bert_model.encoder.layer.2.attention.self.value.weight', 'bert_model.encoder.layer.2.intermediate.dense.bias', 'bert_model.encoder.layer.2.intermediate.dense.weight', 'bert_model.encoder.layer.2.output.LayerNorm.bias', 'bert_model.encoder.layer.2.output.LayerNorm.weight', 'bert_model.encoder.layer.2.output.dense.bias', 'bert_model.encoder.layer.2.output.dense.weight', 'bert_model.encoder.layer.3.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.3.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.3.attention.output.dense.weight', 'bert_model.encoder.layer.3.attention.self.key.bias', 'bert_model.encoder.layer.3.attention.self.key.weight', 'bert_model.encoder.layer.3.attention.self.query.bias', 'bert_model.encoder.layer.3.attention.self.query.weight', 'bert_model.encoder.layer.3.attention.self.value.bias', 'bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.3.intermediate.dense.bias', 'bert_model.encoder.layer.3.intermediate.dense.weight', 'bert_model.encoder.layer.3.output.LayerNorm.bias', 'bert_model.encoder.layer.3.output.LayerNorm.weight', 'bert_model.encoder.layer.3.output.dense.bias', 'bert_model.encoder.layer.3.output.dense.weight', 'bert_model.encoder.layer.4.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.4.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.4.attention.output.dense.bias', 'bert_model.encoder.layer.4.attention.output.dense.weight', 'bert_model.encoder.layer.4.attention.self.key.bias', 'bert_model.encoder.layer.4.attention.self.key.weight', 'bert_model.encoder.layer.4.attention.self.query.bias', 'bert_model.encoder.layer.4.attention.self.query.weight', 'bert_model.encoder.layer.4.attention.self.value.bias', 'bert_model.encoder.layer.4.attention.self.value.weight', 'bert_model.encoder.layer.4.intermediate.dense.bias', 'bert_model.encoder.layer.4.intermediate.dense.weight', 'bert_model.encoder.layer.4.output.LayerNorm.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.4.output.dense.bias', 'bert_model.encoder.layer.4.output.dense.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.dense.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.5.attention.self.key.bias', 'bert_model.encoder.layer.5.attention.self.key.weight', 
'bert_model.encoder.layer.5.attention.self.query.bias', 'bert_model.encoder.layer.5.attention.self.query.weight', 'bert_model.encoder.layer.5.attention.self.value.bias', 'bert_model.encoder.layer.5.attention.self.value.weight', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_model.encoder.layer.5.output.LayerNorm.bias', 'bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.5.output.dense.bias', 'bert_model.encoder.layer.5.output.dense.weight', 'bert_model.encoder.layer.6.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.6.attention.output.dense.bias', 'bert_model.encoder.layer.6.attention.output.dense.weight', 'bert_model.encoder.layer.6.attention.self.key.bias', 'bert_model.encoder.layer.6.attention.self.key.weight', 'bert_model.encoder.layer.6.attention.self.query.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.6.attention.self.value.bias', 'bert_model.encoder.layer.6.attention.self.value.weight', 'bert_model.encoder.layer.6.intermediate.dense.bias', 'bert_model.encoder.layer.6.intermediate.dense.weight', 'bert_model.encoder.layer.6.output.LayerNorm.bias', 'bert_model.encoder.layer.6.output.LayerNorm.weight', 'bert_model.encoder.layer.6.output.dense.bias', 'bert_model.encoder.layer.6.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.7.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.7.attention.output.dense.bias', 'bert_model.encoder.layer.7.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.self.key.bias', 'bert_model.encoder.layer.7.attention.self.key.weight', 'bert_model.encoder.layer.7.attention.self.query.bias', 'bert_model.encoder.layer.7.attention.self.query.weight', 'bert_model.encoder.layer.7.attention.self.value.bias', 'bert_model.encoder.layer.7.attention.self.value.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.7.intermediate.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.7.output.LayerNorm.weight', 'bert_model.encoder.layer.7.output.dense.bias', 'bert_model.encoder.layer.7.output.dense.weight', 'bert_model.encoder.layer.8.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.8.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.8.attention.output.dense.bias', 'bert_model.encoder.layer.8.attention.output.dense.weight', 'bert_model.encoder.layer.8.attention.self.key.bias', 'bert_model.encoder.layer.8.attention.self.key.weight', 'bert_model.encoder.layer.8.attention.self.query.bias', 'bert_model.encoder.layer.8.attention.self.query.weight', 'bert_model.encoder.layer.8.attention.self.value.bias', 'bert_model.encoder.layer.8.attention.self.value.weight', 'bert_model.encoder.layer.8.intermediate.dense.bias', 'bert_model.encoder.layer.8.intermediate.dense.weight', 'bert_model.encoder.layer.8.output.LayerNorm.bias', 'bert_model.encoder.layer.8.output.LayerNorm.weight', 'bert_model.encoder.layer.8.output.dense.bias', 'bert_model.encoder.layer.8.output.dense.weight', 'bert_model.encoder.layer.9.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.9.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.9.attention.output.dense.bias', 'bert_model.encoder.layer.9.attention.output.dense.weight', 'bert_model.encoder.layer.9.attention.self.key.bias', 
'bert_model.encoder.layer.9.attention.self.key.weight', 'bert_model.encoder.layer.9.attention.self.query.bias', 'bert_model.encoder.layer.9.attention.self.query.weight', 'bert_model.encoder.layer.9.attention.self.value.bias', 'bert_model.encoder.layer.9.attention.self.value.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.intermediate.dense.weight', 'bert_model.encoder.layer.9.output.LayerNorm.bias', 'bert_model.encoder.layer.9.output.LayerNorm.weight', 'bert_model.encoder.layer.9.output.dense.bias', 'bert_model.encoder.layer.9.output.dense.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.output.LayerNorm.bias', 'bert_model.encoder.layer.0.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.1.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.1.attention.output.dense.bias', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.1.attention.self.key.bias', 'bert_model.encoder.layer.1.attention.self.key.weight', 'bert_model.encoder.layer.1.attention.self.query.bias', 'bert_model.encoder.layer.1.attention.self.query.weight', 'bert_model.encoder.layer.1.attention.self.value.bias', 'bert_model.encoder.layer.1.attention.self.value.weight', 'bert_model.encoder.layer.1.intermediate.dense.bias', 'bert_model.encoder.layer.1.intermediate.dense.weight', 'bert_model.encoder.layer.1.output.LayerNorm.bias', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.1.output.dense.bias', 'bert_model.encoder.layer.1.output.dense.weight', 'bert_model.encoder.layer.10.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.10.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.output.dense.weight', 'bert_model.encoder.layer.10.attention.self.key.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.10.attention.self.query.bias', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.10.attention.self.value.bias', 'bert_model.encoder.layer.10.attention.self.value.weight', 'bert_model.encoder.layer.10.intermediate.dense.bias', 'bert_model.encoder.layer.10.intermediate.dense.weight', 'bert_model.encoder.layer.10.output.LayerNorm.bias', 'bert_model.encoder.layer.10.output.LayerNorm.weight', 'bert_model.encoder.layer.10.output.dense.bias', 'bert_model.encoder.layer.10.output.dense.weight', 'bert_model.encoder.layer.11.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.output.dense.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.11.attention.self.key.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 
'bert_model.encoder.layer.11.attention.self.query.weight', 'bert_model.encoder.layer.11.attention.self.value.bias', 'bert_model.encoder.layer.11.attention.self.value.weight', 'bert_model.encoder.layer.11.intermediate.dense.bias', 'bert_model.encoder.layer.11.intermediate.dense.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.11.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.dense.bias', 'bert_model.encoder.layer.11.output.dense.weight', 'bert_model.encoder.layer.2.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.2.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.2.attention.output.dense.bias', 'bert_model.encoder.layer.2.attention.output.dense.weight', 'bert_model.encoder.layer.2.attention.self.key.bias', 'bert_model.encoder.layer.2.attention.self.key.weight', 'bert_model.encoder.layer.2.attention.self.query.bias', 'bert_model.encoder.layer.2.attention.self.query.weight', 'bert_model.encoder.layer.2.attention.self.value.bias', 'bert_model.encoder.layer.2.attention.self.value.weight', 'bert_model.encoder.layer.2.intermediate.dense.bias', 'bert_model.encoder.layer.2.intermediate.dense.weight', 'bert_model.encoder.layer.2.output.LayerNorm.bias', 'bert_model.encoder.layer.2.output.LayerNorm.weight', 'bert_model.encoder.layer.2.output.dense.bias', 'bert_model.encoder.layer.2.output.dense.weight', 'bert_model.encoder.layer.3.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.3.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.3.attention.output.dense.weight', 'bert_model.encoder.layer.3.attention.self.key.bias', 'bert_model.encoder.layer.3.attention.self.key.weight', 'bert_model.encoder.layer.3.attention.self.query.bias', 'bert_model.encoder.layer.3.attention.self.query.weight', 'bert_model.encoder.layer.3.attention.self.value.bias', 'bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.3.intermediate.dense.bias', 'bert_model.encoder.layer.3.intermediate.dense.weight', 'bert_model.encoder.layer.3.output.LayerNorm.bias', 'bert_model.encoder.layer.3.output.LayerNorm.weight', 'bert_model.encoder.layer.3.output.dense.bias', 'bert_model.encoder.layer.3.output.dense.weight', 'bert_model.encoder.layer.4.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.4.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.4.attention.output.dense.bias', 'bert_model.encoder.layer.4.attention.output.dense.weight', 'bert_model.encoder.layer.4.attention.self.key.bias', 'bert_model.encoder.layer.4.attention.self.key.weight', 'bert_model.encoder.layer.4.attention.self.query.bias', 'bert_model.encoder.layer.4.attention.self.query.weight', 'bert_model.encoder.layer.4.attention.self.value.bias', 'bert_model.encoder.layer.4.attention.self.value.weight', 'bert_model.encoder.layer.4.intermediate.dense.bias', 'bert_model.encoder.layer.4.intermediate.dense.weight', 'bert_model.encoder.layer.4.output.LayerNorm.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.4.output.dense.bias', 'bert_model.encoder.layer.4.output.dense.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.dense.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.5.attention.self.key.bias', 'bert_model.encoder.layer.5.attention.self.key.weight', 
'bert_model.encoder.layer.5.attention.self.query.bias', 'bert_model.encoder.layer.5.attention.self.query.weight', 'bert_model.encoder.layer.5.attention.self.value.bias', 'bert_model.encoder.layer.5.attention.self.value.weight', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_model.encoder.layer.5.output.LayerNorm.bias', 'bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.5.output.dense.bias', 'bert_model.encoder.layer.5.output.dense.weight', 'bert_model.encoder.layer.6.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.6.attention.output.dense.bias', 'bert_model.encoder.layer.6.attention.output.dense.weight', 'bert_model.encoder.layer.6.attention.self.key.bias', 'bert_model.encoder.layer.6.attention.self.key.weight', 'bert_model.encoder.layer.6.attention.self.query.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.6.attention.self.value.bias', 'bert_model.encoder.layer.6.attention.self.value.weight', 'bert_model.encoder.layer.6.intermediate.dense.bias', 'bert_model.encoder.layer.6.intermediate.dense.weight', 'bert_model.encoder.layer.6.output.LayerNorm.bias', 'bert_model.encoder.layer.6.output.LayerNorm.weight', 'bert_model.encoder.layer.6.output.dense.bias', 'bert_model.encoder.layer.6.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.7.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.7.attention.output.dense.bias', 'bert_model.encoder.layer.7.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.self.key.bias', 'bert_model.encoder.layer.7.attention.self.key.weight', 'bert_model.encoder.layer.7.attention.self.query.bias', 'bert_model.encoder.layer.7.attention.self.query.weight', 'bert_model.encoder.layer.7.attention.self.value.bias', 'bert_model.encoder.layer.7.attention.self.value.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.7.intermediate.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.7.output.LayerNorm.weight', 'bert_model.encoder.layer.7.output.dense.bias', 'bert_model.encoder.layer.7.output.dense.weight', 'bert_model.encoder.layer.8.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.8.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.8.attention.output.dense.bias', 'bert_model.encoder.layer.8.attention.output.dense.weight', 'bert_model.encoder.layer.8.attention.self.key.bias', 'bert_model.encoder.layer.8.attention.self.key.weight', 'bert_model.encoder.layer.8.attention.self.query.bias', 'bert_model.encoder.layer.8.attention.self.query.weight', 'bert_model.encoder.layer.8.attention.self.value.bias', 'bert_model.encoder.layer.8.attention.self.value.weight', 'bert_model.encoder.layer.8.intermediate.dense.bias', 'bert_model.encoder.layer.8.intermediate.dense.weight', 'bert_model.encoder.layer.8.output.LayerNorm.bias', 'bert_model.encoder.layer.8.output.LayerNorm.weight', 'bert_model.encoder.layer.8.output.dense.bias', 'bert_model.encoder.layer.8.output.dense.weight', 'bert_model.encoder.layer.9.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.9.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.9.attention.output.dense.bias', 'bert_model.encoder.layer.9.attention.output.dense.weight', 'bert_model.encoder.layer.9.attention.self.key.bias', 
'bert_model.encoder.layer.9.attention.self.key.weight', 'bert_model.encoder.layer.9.attention.self.query.bias', 'bert_model.encoder.layer.9.attention.self.query.weight', 'bert_model.encoder.layer.9.attention.self.value.bias', 'bert_model.encoder.layer.9.attention.self.value.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.intermediate.dense.weight', 'bert_model.encoder.layer.9.output.LayerNorm.bias', 'bert_model.encoder.layer.9.output.LayerNorm.weight', 'bert_model.encoder.layer.9.output.dense.bias', 'bert_model.encoder.layer.9.output.dense.weight']\n",
|
473 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
|
474 |
+
"Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']\n",
|
475 |
+
"- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
|
476 |
+
"- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
|
477 |
+
]
|
478 |
+
}
|
479 |
+
],
|
480 |
+
"source": [
|
481 |
+
"from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer\n",
|
482 |
+
"\n",
|
483 |
+
"ctx_model=DPRQuestionEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')\n",
|
484 |
+
"ctx_tokenizer=DPRQuestionEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')\n",
|
485 |
+
"\n",
|
486 |
+
"question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')\n",
|
487 |
+
"question_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')\n",
|
488 |
+
"\n"
|
489 |
+
]
|
490 |
+
},
|
491 |
+
{
|
492 |
+
"cell_type": "code",
|
493 |
+
"execution_count": null,
|
494 |
+
"metadata": {},
|
495 |
+
"outputs": [],
|
496 |
+
"source": [
|
497 |
+
"\"\"\"title = [\"2024 Yılında Mobil Teknoloji Trendleri\"]\n",
|
498 |
+
"keywords = [\"mobil teknoloji\", \"2024 trendleri\", \"akıllı telefon yenilikleri\", \"5G teknolojisi\", \"giyilebilir cihazlar\"]\n",
|
499 |
+
"subheading = [\n",
|
500 |
+
" \"2024'te Akıllı Telefonlardaki Yenilikler\",\n",
|
501 |
+
" \"Giyilebilir Teknolojiler: Sağlık ve Fitness Trendleri\",\n",
|
502 |
+
" \"5G'nin Mobil Cihazlar Üzerindeki Etkisi\",\n",
|
503 |
+
" \"Mobil Güvenlikte Yeni Yaklaşımlar\"\n",
|
504 |
+
"]\"\"\"\n"
|
505 |
+
]
|
506 |
+
},
|
507 |
+
{
|
508 |
+
"cell_type": "code",
|
509 |
+
"execution_count": null,
|
510 |
+
"metadata": {},
|
511 |
+
"outputs": [],
|
512 |
+
"source": [
|
513 |
+
"\n",
|
514 |
+
"xb_tokens=ctx_tokenizer()\n"
|
515 |
+
]
|
516 |
+
}
|
517 |
+
],
|
518 |
+
"metadata": {
|
519 |
+
"kernelspec": {
|
520 |
+
"display_name": "myenv",
|
521 |
+
"language": "python",
|
522 |
+
"name": "python3"
|
523 |
+
},
|
524 |
+
"language_info": {
|
525 |
+
"codemirror_mode": {
|
526 |
+
"name": "ipython",
|
527 |
+
"version": 3
|
528 |
+
},
|
529 |
+
"file_extension": ".py",
|
530 |
+
"mimetype": "text/x-python",
|
531 |
+
"name": "python",
|
532 |
+
"nbconvert_exporter": "python",
|
533 |
+
"pygments_lexer": "ipython3",
|
534 |
+
"version": "3.12.4"
|
535 |
+
}
|
536 |
+
},
|
537 |
+
"nbformat": 4,
|
538 |
+
"nbformat_minor": 2
|
539 |
+
}
|
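The DPR cell above loads a context-encoder checkpoint and a question-encoder checkpoint; a minimal sketch of how the two encoders are typically used together for dense retrieval (the passages and the question below are placeholders, not taken from the uploaded files):

    from transformers import (DPRContextEncoder, DPRContextEncoderTokenizer,
                              DPRQuestionEncoder, DPRQuestionEncoderTokenizer)
    import torch

    ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
    ctx_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
    q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
    q_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')

    passages = ["5G teknolojisi mobil cihazlarda veri hizini artirir.",   # placeholder passages
                "Giyilebilir cihazlar saglik verisi toplar."]
    question = "5G mobil cihazlari nasil etkiler?"                        # placeholder question

    with torch.no_grad():
        xb = ctx_model(**ctx_tokenizer(passages, return_tensors='pt', padding=True, truncation=True)).pooler_output
        xq = q_model(**q_tokenizer(question, return_tensors='pt')).pooler_output

    scores = xq @ xb.T   # dot-product relevance of the question to each passage, shape (1, 2)
    print(scores)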
alternatif.txt
ADDED
@@ -0,0 +1,10 @@
1 |
+
Alternative Approaches
|
2 |
+
LDA (Latent Dirichlet Allocation): A topic-modeling method that can be used to identify the main themes in documents.
|
3 |
+
|
4 |
+
BERT and Transformer Models: The proposed transformer-based models are strong at contextual analysis and can be used to understand texts and extract keywords.
|
5 |
+
|
6 |
+
LDA Combined with TF-IDF: The combination of TF-IDF and LDA can be used for both keyword extraction and topic modeling.
|
7 |
+
|
8 |
+
Sentence Embeddings and Similarity Modeling: Using models such as Sentence-BERT, you can measure similarities between texts and combine those similarities with keyword extraction.
|
9 |
+
|
10 |
+
Pretrained Language Models: Using pretrained models from libraries such as Hugging Face enables more accurate, context-aware keyword extraction and text analysis.
|
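The "LDA combined with TF-IDF" idea above can be prototyped in a few lines with scikit-learn; a minimal sketch, where the two sample documents and the topic count are only illustrative:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    docs = ["mobil teknoloji ve 5G cihazlari hizla gelisiyor",   # placeholder documents
            "tarih ve biyografi metinleri arsivlerde saklanir"]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(docs)                 # TF-IDF weighted document-term matrix

    lda = LatentDirichletAllocation(n_components=2, random_state=0)
    lda.fit(X)

    terms = vectorizer.get_feature_names_out()
    for topic_idx, weights in enumerate(lda.components_):
        top_terms = [terms[i] for i in weights.argsort()[::-1][:3]]   # 3 strongest terms per topic
        print(f"Topic {topic_idx}: {top_terms}")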
combined_output.csv
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3d132586c17782ba7dabc58b6b8c11b797a65890e95a605b38b208caa08d984e
|
3 |
+
size 1476218231
|
datasets.ipynb
CHANGED
@@ -437,9 +437,16 @@
|
|
437 |
"TF-IDF HESAPLAMA"
|
438 |
]
|
439 |
},
|
440 |
{
|
441 |
"cell_type": "code",
|
442 |
-
"execution_count":
|
443 |
"metadata": {},
|
444 |
"outputs": [
|
445 |
{
|
@@ -447,165 +454,104 @@
|
|
447 |
"output_type": "stream",
|
448 |
"text": [
|
449 |
"Tokens: ['[CLS]', 'Biy', '##ografi', 'İsim', 'P', '##şı', '##q', '##o', 'ismi', 'Ah', '##ec', '##a', '##q', '##o', 'soy', 'ismi', '##dir', '.', 'Çerkes', '##lerin', '\"', '-', 'q', '##o', '\"', 'son', '##eki', 'ile', 'biten', 'hem', 'soya', '##d', '##ları', 'hem', 'de', 'lak', '##ap', '##ları', 'vardı', '.', 'Bu', 'ek', 'Türkçe', 'ad', '##lardaki', '\"', '-', 'oğlu', '\"', 'eki', '##yle', 'eş', 'anlamlı', '##dır', '.', 'P', '##şı', '##q', '##o', 'Türkçe', '\"', 'Beyoğlu', '\"', 'anlamına', 'gelen', 'bir', 'lak', '##ap', '##tır', '.', 'Erken', 'dönem', 'Çerkes', '##ler', 'tarihleri', '##ni', 'yazma', '##dıkları', 've', 'tüm', 'bilgiler', 'Rus', 'kaynaklarından', 'geldiği', 'için', 'Ah', '##ec', '##a', '##q', '##o', 'hakkında', 'pek', 'bir', 'şey', 'kayded', '##ilme', '##di', '.', '177', '##7', \"'\", 'de', 'Çerkes', '##ya', \"'\", 'nın', 'B', '##je', '##duğ', 'bölgesinde', 'doğdu', '.', 'Asker', '##î', 'eğitim', 'ile', 'büyüt', '##üldü', '.', 'Rus', '-', 'Çerkes', 'Savaşı', '##na', 'Katılım', '##ı', 'Birkaç', 'kaynak', ',', 'Ah', '##ec', '##a', '##q', '##o', \"'\", 'nun', 'tüm', 'Çerkes', '##ya', \"'\", 'da', 'saygı', 'duyulan', 'bir', 'kişi', 'olduğunu', 'belirtir', '.', 'En', 'az', '6', '.', '000', 'at', '##lı', '##dan', 'oluşan', 'kalıcı', 'bir', 'ordusu', 'vardı', 've', 'çatışmalar', 'sırasında', 'müfre', '##ze', '##si', '12', '.', '000', 'at', '##lı', '##ya', 'ulaşıyor', '##du', '.', 'Rus', 'birlikleri', '##ne', 'karşı', 'kazandığı', 'zafer', '##lerle', 'ünlü', '##y', '##dü', '.', 'Askeri', 'becerisi', '##nin', 'yanı', 'sıra', 'yetenekli', 'bir', 'devlet', 'adamı', '##ydı', '.', 'Ölüm', '18', '##37', 'yılında', 'Rus', 'tarafına', 'geçti', 've', 'bir', 'yıl', 'sonra', 'hastalık', '##tan', 'öldü', '.', 'Kaynak', '##ça', 'Çerkes', 'soylu', '##lar', '177', '##7', 'doğumlu', '##lar', '18', '##38', 'yılında', 'ölen', '##ler', 'Kafkas', 'Savaşı', \"'\", 'nda', 'kişiler', '[SEP]']\n",
|
[removed output rows 450-525: the old "Average Embedding Shape: torch.Size([..." and "Average Embedding: tensor([..." lines followed by the embedding values; their contents are truncated in this diff view]
526 |
-
" -4.6655e-01, -4.3796e-01, 6.7476e-02, 3.4367e-01, 1.8640e-01,\n",
|
527 |
-
" 3.3172e-01, -4.1092e-01, 2.6630e-02, -4.9168e-02, -3.4948e-01,\n",
|
528 |
-
" 1.6500e-02, -4.3398e-01, 2.6911e-01, 3.4227e-02, -2.1475e-01,\n",
|
529 |
-
" 9.7154e-01, -2.9554e-01, 8.5149e-01, -6.0231e-01, 1.0421e-01,\n",
|
530 |
-
" 6.2897e-01, 1.8700e-02, 1.6866e-01, -7.0568e-03, -6.9820e-01,\n",
|
531 |
-
" -1.3916e-01, 3.2686e-01, -1.5017e-01, 6.5600e-01, 2.9388e-02,\n",
|
532 |
-
" -6.0431e-01, 3.8548e-02, -1.2187e-01, -4.8818e-01, 1.5922e-01,\n",
|
533 |
-
" -2.1494e-02, -2.1316e-01, -1.5983e-01, -3.7928e-01, 5.6203e-01,\n",
|
534 |
-
" 3.1285e-01, -4.0310e-01, 3.8763e-01, -4.1886e-01, 1.6276e-01,\n",
|
535 |
-
" 1.2610e-01, 3.5952e-01, 1.3288e-01, 6.0504e-01, -3.4769e-01,\n",
|
536 |
-
" -1.5976e-01, 2.9626e-01, -2.2079e-01, -1.5934e-01, -5.8491e-01,\n",
|
537 |
-
" -5.7811e-02, -4.7510e-01, 2.7285e-03, 3.7191e-01, 4.7557e-01,\n",
|
538 |
-
" 9.2435e-02, 2.3198e-01, -5.8704e-01, -1.9506e-01, -5.3740e-01,\n",
|
539 |
-
" 1.8715e-01, -3.5691e-01, 2.5481e-01, 3.2795e-01, -9.4206e-02,\n",
|
540 |
-
" -2.2492e-01, -3.1406e-01, 4.5814e-01, -1.7896e-01, -3.9470e-01,\n",
|
541 |
-
" 1.9183e-01, -4.3177e-01, 2.7146e-01, 1.9477e-01, -1.7568e-02,\n",
|
542 |
-
" -2.0134e-01, 5.7984e-03, 3.0490e-01, -2.7846e-01, 9.8830e-03,\n",
|
543 |
-
" -3.0119e-01, -4.1994e-01, -1.0905e-02, 6.9638e-01, 9.4965e-02,\n",
|
544 |
-
" -2.6103e-01, 8.8206e-02, -1.0292e-01, -1.2342e-01, -2.2317e-03,\n",
|
545 |
-
" -5.2474e-02, -2.1636e-01, -1.6554e-01, 2.3173e-01, 1.2170e-01,\n",
|
546 |
-
" 4.5793e-01, -1.1033e-01, 1.4489e-01, 2.2540e-01, 5.2360e-01,\n",
|
547 |
-
" -3.6468e-01, -1.5081e-01, -2.3761e-02, 2.7475e-01, 5.3707e-01,\n",
|
548 |
-
" 9.3503e-02, -4.9759e-01, 1.5903e-01, -1.2017e-01, 3.4478e-01,\n",
|
549 |
-
" -2.1399e-01, 3.9456e-01, -3.2861e-01, 1.7182e-01, -1.1697e-01,\n",
|
550 |
-
" 5.6727e-03, -1.9770e-01, -2.3682e-01, 2.7554e-01, -3.9236e-01,\n",
|
551 |
-
" 2.0691e-01, 1.6439e-01, -3.7138e-01, -7.8304e-01, -1.9874e-01,\n",
|
552 |
-
" 6.4637e-01, -2.4494e-01, -4.1920e-01, -3.7675e-01, 1.3178e-01,\n",
|
553 |
-
" 1.9076e-01, -1.2906e-01, -6.4864e-04, -9.7821e-03, -1.2172e-01,\n",
|
554 |
-
" -5.5357e-02, 2.2997e-01, -3.2848e-01, -4.1649e-01, 9.9676e-04,\n",
|
555 |
-
" -4.5320e-01, -2.2864e-01, -1.6760e-01, -7.9657e-02, -6.0780e-02,\n",
|
556 |
-
" -1.7627e-01, -4.1947e-02, 2.3884e-01, -4.7784e-03, -3.1593e-01,\n",
|
557 |
-
" -1.0243e-01, 5.3464e-01, 2.7388e-01, -4.2258e-02, -1.5521e-01,\n",
|
558 |
-
" -1.0183e-01, -2.9342e-01, -1.0132e+00, 2.3122e-01, -3.3482e-01,\n",
|
559 |
-
" 3.2136e-01, -2.3603e-01, -1.4938e-01, -2.3986e-01, 6.1094e-02,\n",
|
560 |
-
" 1.6784e-01, -3.8075e-02, 5.6459e-01, -2.0828e-02, -1.7406e-01,\n",
|
561 |
-
" -2.9475e-01, -5.0143e-01, -1.6885e-01, 4.4070e-01, 3.1866e-01,\n",
|
562 |
-
" -2.7534e-01, 4.1410e-01, -7.2704e-02, -2.9659e-01, 3.0922e-01,\n",
|
563 |
-
" -5.1553e-01, -2.7293e-01, -1.2403e-01, 5.3698e-01, 8.8994e-02,\n",
|
564 |
-
" 4.1334e-01, 2.5389e-01, 6.0110e-01, -2.3192e-01, -9.9463e+00,\n",
|
565 |
-
" 3.8342e-01, -3.4833e-01, 3.5175e-02, -3.3336e-01, 2.5660e-01,\n",
|
566 |
-
" 8.5744e-01, -3.4563e-01, 3.0483e-03, 3.4735e-01, 3.8450e-01,\n",
|
567 |
-
" 3.9665e-01, 2.2100e-01, 6.5109e-02, -5.5761e-01, -6.2348e-01,\n",
|
568 |
-
" -1.8679e-01, 1.9003e-01, 7.4262e-02, -5.9655e-02, -3.9839e-01,\n",
|
569 |
-
" -2.2625e-02, -7.6319e-02, 2.9763e-01, 1.4098e-01, -2.8759e-01,\n",
|
570 |
-
" -4.0783e-01, 1.1544e-01, 3.2446e-01, -2.9828e-01, 1.4054e-02,\n",
|
571 |
-
" 1.6943e-01, -2.0345e-01, -2.1174e-02, 1.1417e-01, 3.3420e-01,\n",
|
572 |
-
" -1.0892e-01, -3.1187e-01, -5.7087e-01, -1.1561e-02, 4.2107e-02,\n",
|
573 |
-
" 4.9406e-01, -3.7056e-01, -3.2354e-01, 5.4846e-02, 2.4392e-01,\n",
|
574 |
-
" -1.2840e-01, -4.3743e-01, 2.4391e-01, 2.1046e-01, -6.3811e-01,\n",
|
575 |
-
" 3.5563e-01, -2.0561e-01, -3.0996e-01, 1.6479e-01, -5.1947e-02,\n",
|
576 |
-
" 3.2559e-01, -6.3670e-03, -2.7855e-01, -4.2847e-01, -1.2022e-01,\n",
|
577 |
-
" 4.0702e-01, 9.6086e-01, 1.3305e-01, -2.0369e-01, 7.5751e-02,\n",
|
578 |
-
" -1.2915e-01, -8.5741e-02, 2.7087e-01, 9.1068e-02, -1.5946e-01,\n",
|
579 |
-
" 4.7289e-01, 1.0613e-01, 1.3504e-01, 2.7304e-01, -7.9823e-01,\n",
|
580 |
-
" 1.1986e-01, 4.7432e-01, -1.4133e-01, 3.9729e-01, -1.6949e-01,\n",
|
581 |
-
" -9.2290e-01, -1.9302e-01, -7.9017e-02, -6.5796e-01, 1.3385e-02,\n",
|
582 |
-
" 1.6185e-01, -3.4487e-01, 5.8601e-01, -1.5023e-01, 5.8034e-01,\n",
|
583 |
-
" -2.8326e-01, -1.6494e-01, -2.9796e-01, 6.7479e-03, -6.3622e-01,\n",
|
584 |
-
" -1.7732e-02, -1.6043e-01, -8.2452e-01, -2.4934e-02, -1.3969e-01,\n",
|
585 |
-
" -1.2475e-01, 2.1235e-01, 6.9211e-02, 1.1795e-01, -2.5098e-02,\n",
|
586 |
-
" 4.8630e-01, 3.5354e-01, 4.4272e-01, 2.5360e-01, 2.7441e-01,\n",
|
587 |
-
" -2.6457e-01, -3.3007e-01, -3.1083e-01, 4.9623e-01, -2.7829e-01,\n",
|
588 |
-
" -3.0000e-01, -2.5620e-01, 2.1623e-01, -1.0724e-01, -5.0995e-01,\n",
|
589 |
-
" -4.9460e-01, 8.4283e-02, -3.2844e-01, -6.0080e-01, -1.1809e-01,\n",
|
590 |
-
" 1.1040e-01, 3.7749e-02, 3.9097e-01, 2.7157e-02, -3.5270e-01,\n",
|
591 |
-
" -1.0008e-01, -3.1026e-01, -1.9041e-01, 3.7090e-01, -4.5056e-01,\n",
|
592 |
-
" -8.3087e-02, -3.6450e-01, -1.0154e+00, -1.3134e-01, -5.0261e-02,\n",
|
593 |
-
" 3.6961e-01, -1.1989e-01, -1.2336e-01, 2.6829e-01, -6.0926e-01,\n",
|
594 |
-
" -3.0037e-01, -1.0460e+00, -2.1501e-01, 1.7171e-01, 1.7970e-02,\n",
|
595 |
-
" -2.0708e-01, -1.3656e-01, -3.2854e-01, 1.2158e-01, -3.0438e-01,\n",
|
596 |
-
" -4.6487e-02, 1.8717e-01, -2.3236e-01, -1.4668e-01, -6.9169e-01,\n",
|
597 |
-
" -2.1502e-01, -1.2722e-01, 3.5600e-01, 1.5203e-03, -3.7041e-01,\n",
|
598 |
-
" -6.5877e-01, 2.1490e-01, -5.1359e-02, 2.2720e-01, -1.6363e-01,\n",
|
599 |
-
" -1.0862e-01, 1.4914e-02, 3.7205e-01, 3.4950e-01, -2.5987e-01,\n",
|
600 |
-
" -2.0222e-01, 3.4466e-02, 5.8733e-01, -1.6877e-01, -4.8642e-01,\n",
|
601 |
-
" -7.8254e-03, 1.2950e-01, -5.6791e-01, -6.6342e-01, -1.5021e-01,\n",
|
602 |
-
" -4.4367e-01, -2.8434e-01, -1.7593e-01, -4.2538e-01, -3.7350e-01,\n",
|
603 |
-
" -4.0185e-02, -6.1727e-01, 2.3771e-01, -4.1247e-01, 3.9440e-01,\n",
|
604 |
-
" 1.0506e-01, -4.0222e-01, 5.9232e-01])\n",
|
605 |
-
"TF-IDF Keywords: [('rus', np.float64(0.33567254331867563)), ('ahecaqo', np.float64(0.25175440748900674)), ('000', np.float64(0.16783627165933782)), ('1777', np.float64(0.16783627165933782)), ('ile', np.float64(0.16783627165933782)), ('pşıqo', np.float64(0.16783627165933782)), ('türkçe', np.float64(0.16783627165933782)), ('vardı', np.float64(0.16783627165933782)), ('çerkes', np.float64(0.16783627165933782)), ('çerkesya', np.float64(0.16783627165933782)), ('12', np.float64(0.08391813582966891)), ('1837', np.float64(0.08391813582966891)), ('1838', np.float64(0.08391813582966891)), ('adamıydı', np.float64(0.08391813582966891)), ('adlardaki', np.float64(0.08391813582966891)), ('anlamlıdır', np.float64(0.08391813582966891)), ('anlamına', np.float64(0.08391813582966891)), ('askeri', np.float64(0.08391813582966891)), ('askerî', np.float64(0.08391813582966891)), ('atlıdan', np.float64(0.08391813582966891)), ('atlıya', np.float64(0.08391813582966891)), ('az', np.float64(0.08391813582966891)), ('becerisinin', np.float64(0.08391813582966891)), ('belirtir', np.float64(0.08391813582966891)), ('beyoğlu', np.float64(0.08391813582966891)), ('bilgiler', np.float64(0.08391813582966891)), ('birliklerine', np.float64(0.08391813582966891)), ('biyografi', np.float64(0.08391813582966891)), ('bjeduğ', np.float64(0.08391813582966891)), ('bölgesinde', np.float64(0.08391813582966891)), ('büyütüldü', np.float64(0.08391813582966891)), ('devlet', np.float64(0.08391813582966891)), ('doğdu', np.float64(0.08391813582966891)), ('doğumlular', np.float64(0.08391813582966891)), ('duyulan', np.float64(0.08391813582966891)), ('dönem', np.float64(0.08391813582966891)), ('ek', np.float64(0.08391813582966891)), ('ekiyle', np.float64(0.08391813582966891)), ('erken', np.float64(0.08391813582966891)), ('eğitim', np.float64(0.08391813582966891)), ('eş', np.float64(0.08391813582966891)), ('geldiği', np.float64(0.08391813582966891)), ('gelen', np.float64(0.08391813582966891)), ('geçti', np.float64(0.08391813582966891)), ('hakkında', np.float64(0.08391813582966891)), ('hastalıktan', np.float64(0.08391813582966891)), ('ismi', np.float64(0.08391813582966891)), ('ismidir', np.float64(0.08391813582966891)), ('için', np.float64(0.08391813582966891)), ('kafkas', np.float64(0.08391813582966891)), ('kalıcı', np.float64(0.08391813582966891)), ('katılımı', np.float64(0.08391813582966891)), ('kaydedilmedi', np.float64(0.08391813582966891)), ('kaynak', np.float64(0.08391813582966891)), ('kaynaklarından', np.float64(0.08391813582966891)), ('kaynakça', np.float64(0.08391813582966891)), ('kazandığı', np.float64(0.08391813582966891)), ('kişi', np.float64(0.08391813582966891)), ('kişiler', np.float64(0.08391813582966891)), ('lakapları', np.float64(0.08391813582966891)), ('lakaptır', np.float64(0.08391813582966891)), ('müfrezesi', np.float64(0.08391813582966891)), ('nda', np.float64(0.08391813582966891)), ('nun', np.float64(0.08391813582966891)), ('nın', np.float64(0.08391813582966891)), ('olduğunu', np.float64(0.08391813582966891)), ('oluşan', np.float64(0.08391813582966891)), ('ordusu', np.float64(0.08391813582966891)), ('oğlu', np.float64(0.08391813582966891)), ('pek', np.float64(0.08391813582966891)), ('qo', np.float64(0.08391813582966891)), ('savaşı', np.float64(0.08391813582966891)), ('savaşına', np.float64(0.08391813582966891)), ('saygı', np.float64(0.08391813582966891)), ('sim', np.float64(0.08391813582966891)), ('soneki', np.float64(0.08391813582966891)), ('sonra', np.float64(0.08391813582966891)), ('soy', np.float64(0.08391813582966891)), 
('soyadları', np.float64(0.08391813582966891)), ('soylular', np.float64(0.08391813582966891)), ('sıra', np.float64(0.08391813582966891)), ('sırasında', np.float64(0.08391813582966891)), ('tarafına', np.float64(0.08391813582966891)), ('tarihlerini', np.float64(0.08391813582966891)), ('ulaşıyordu', np.float64(0.08391813582966891)), ('yazmadıkları', np.float64(0.08391813582966891)), ('yıl', np.float64(0.08391813582966891)), ('zaferlerle', np.float64(0.08391813582966891)), ('çatışmalar', np.float64(0.08391813582966891)), ('çerkesler', np.float64(0.08391813582966891)), ('çerkeslerin', np.float64(0.08391813582966891)), ('öldü', np.float64(0.08391813582966891)), ('ölenler', np.float64(0.08391813582966891)), ('ölüm', np.float64(0.08391813582966891)), ('ünlüydü', np.float64(0.08391813582966891))]\n",
|
606 |
"BERT Embeddings:\n",
|
607 |
"Text 1 embedding shape: torch.Size([233, 768])\n"
|
608 |
]
|
609 |
}
|
610 |
],
|
611 |
"source": [
|
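The "Average Embedding" output in this hunk is produced by mean-pooling BERT token vectors into a single text-level vector; a minimal sketch of that step, assuming the same dbmdz/bert-base-turkish-cased checkpoint the notebook loads:

    import torch
    from transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
    model = BertModel.from_pretrained('dbmdz/bert-base-turkish-cased')

    def mean_pool(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # average over the token axis: (1, seq_len, 768) -> (768,)
        return outputs.last_hidden_state.mean(dim=1).squeeze(0)

    vec = mean_pool("Pşıqo Ahecaqo Çerkes tarihinde bilinen bir isimdir.")   # placeholder sentence
    print(vec.shape)   # torch.Size([768])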
@@ -617,7 +563,9 @@
|
|
617 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
618 |
"from nltk.corpus import stopwords as nltk_stopwords\n",
|
619 |
"from transformers import BertTokenizer, BertModel\n",
|
|
|
620 |
"import torch\n",
|
|
|
621 |
"\n",
|
622 |
"# BERT Tokenizer ve Model'i yükleyin\n",
|
623 |
"tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
|
@@ -661,9 +609,10 @@
|
|
661 |
" vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
|
662 |
" X = vectorizer.fit_transform(corpus)\n",
|
663 |
" feature_names = vectorizer.get_feature_names_out()\n",
|
664 |
-
" scores = np.asarray(X.sum(axis=0)).flatten()\n",
|
665 |
-
"
|
666 |
-
"
|
|
|
667 |
" return sorted_keywords\n",
|
668 |
"\n",
|
669 |
"#tokenleri kelimelere dönüştürür ve listeler \n",
|
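This hunk rewrites the scoring lines inside extract_keywords_tfidf; for comparison, a minimal sketch that keeps explicit (term, score) pairs sorted by total TF-IDF weight, assuming the caller supplies the corpus and the stop-word list:

    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer

    def extract_keywords_tfidf(corpus, stop_words_list):
        vectorizer = TfidfVectorizer(stop_words=stop_words_list)
        X = vectorizer.fit_transform(corpus)
        feature_names = vectorizer.get_feature_names_out()
        scores = np.asarray(X.sum(axis=0)).flatten()          # summed TF-IDF weight per term
        order = scores.argsort()[::-1]                         # descending
        return [(feature_names[i], scores[i]) for i in order]

    print(extract_keywords_tfidf(["rus çerkes tarihi", "rus ordusu ve savaşı"], ["ve"])[:3])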
@@ -695,9 +644,6 @@
|
|
695 |
" \n",
|
696 |
" \n",
|
697 |
"\n",
|
698 |
-
" \n",
|
699 |
-
"\n",
|
700 |
-
"\n",
|
701 |
"#token ıd leri ve bert gömme vektörleri\n",
|
702 |
"for text in texts:\n",
|
703 |
" input_ids,embeddings= get_bert_embeddings(text)\n",
|
@@ -708,10 +654,18 @@
|
|
708 |
" # Tokenları ve ortalama vektörleri al\n",
|
709 |
" tokens = decode_tokens(input_ids)\n",
|
710 |
" avg_embedding = average_embeddings(embeddings)\n",
|
|
|
|
|
|
|
|
|
|
|
711 |
" print(f\"Tokens: {tokens}\")\n",
|
712 |
-
" print(f\"Average Embedding Shape: {
|
713 |
-
" print(f\"Average Embedding: {
|
|
|
|
|
714 |
"\n",
|
|
|
715 |
"# TF-IDF anahtar kelimelerini çıkar\n",
|
716 |
"keywords = extract_keywords_tfidf(texts,stop_words_list)\n",
|
717 |
"print(\"TF-IDF Keywords:\", keywords)\n",
|
@@ -721,10 +675,566 @@
|
|
721 |
"for i, emb in enumerate(embeddings):\n",
|
722 |
" print(f\"Text {i+1} embedding shape: {emb.shape}\")\n",
|
723 |
"\n",
|
724 |
-
"\n",
|
|
|
|
|
|
|
|
|
725 |
"\n"
|
726 |
]
|
727 |
},
|
|
|
|
|
|
|
|
|
|
|
|
728 |
{
|
729 |
"cell_type": "code",
|
730 |
"execution_count": 8,
|
@@ -790,6 +1300,79 @@
|
|
790 |
"test_stop_words_effectiveness(texts, stop_words_list)\n"
|
791 |
]
|
792 |
},
|
|
|
|
|
|
|
|
|
|
|
793 |
{
|
794 |
"cell_type": "code",
|
795 |
"execution_count": 20,
|
|
|
437 |
"TF-IDF HESAPLAMA"
|
438 |
]
|
439 |
},
|
440 |
+
{
|
441 |
+
"cell_type": "markdown",
|
442 |
+
"metadata": {},
|
443 |
+
"source": [
|
444 |
+
"Token vektörlerinin ortalamasını alarak metin düzeyinde özet oluşturacak şekilde k-means ve tf-ıdf algoritmalarını kullanarak keyword oluşturmak "
|
445 |
+
]
|
446 |
+
},
|
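The markdown cell above mentions k-means next to TF-IDF, but no clustering code appears in this diff; a minimal sketch of picking keyword candidates by clustering token embeddings (the cluster count is illustrative, and token_embeddings is assumed to be a (seq_len, 768) NumPy array such as last_hidden_state.squeeze(0).numpy()):

    import numpy as np
    from sklearn.cluster import KMeans

    def cluster_keywords(tokens, token_embeddings, n_clusters=5):
        km = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
        labels = km.fit_predict(token_embeddings)
        keywords = []
        for c in range(n_clusters):
            idx = np.where(labels == c)[0]
            # take the token closest to the cluster centre as that cluster's representative
            dists = np.linalg.norm(token_embeddings[idx] - km.cluster_centers_[c], axis=1)
            keywords.append(tokens[idx[np.argmin(dists)]])
        return keywords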
447 |
{
|
448 |
"cell_type": "code",
|
449 |
+
"execution_count": 31,
|
450 |
"metadata": {},
|
451 |
"outputs": [
|
452 |
{
|
|
|
454 |
"output_type": "stream",
|
455 |
"text": [
|
456 |
"Tokens: ['[CLS]', 'Biy', '##ografi', 'İsim', 'P', '##şı', '##q', '##o', 'ismi', 'Ah', '##ec', '##a', '##q', '##o', 'soy', 'ismi', '##dir', '.', 'Çerkes', '##lerin', '\"', '-', 'q', '##o', '\"', 'son', '##eki', 'ile', 'biten', 'hem', 'soya', '##d', '##ları', 'hem', 'de', 'lak', '##ap', '##ları', 'vardı', '.', 'Bu', 'ek', 'Türkçe', 'ad', '##lardaki', '\"', '-', 'oğlu', '\"', 'eki', '##yle', 'eş', 'anlamlı', '##dır', '.', 'P', '##şı', '##q', '##o', 'Türkçe', '\"', 'Beyoğlu', '\"', 'anlamına', 'gelen', 'bir', 'lak', '##ap', '##tır', '.', 'Erken', 'dönem', 'Çerkes', '##ler', 'tarihleri', '##ni', 'yazma', '##dıkları', 've', 'tüm', 'bilgiler', 'Rus', 'kaynaklarından', 'geldiği', 'için', 'Ah', '##ec', '##a', '##q', '##o', 'hakkında', 'pek', 'bir', 'şey', 'kayded', '##ilme', '##di', '.', '177', '##7', \"'\", 'de', 'Çerkes', '##ya', \"'\", 'nın', 'B', '##je', '##duğ', 'bölgesinde', 'doğdu', '.', 'Asker', '##î', 'eğitim', 'ile', 'büyüt', '##üldü', '.', 'Rus', '-', 'Çerkes', 'Savaşı', '##na', 'Katılım', '##ı', 'Birkaç', 'kaynak', ',', 'Ah', '##ec', '##a', '##q', '##o', \"'\", 'nun', 'tüm', 'Çerkes', '##ya', \"'\", 'da', 'saygı', 'duyulan', 'bir', 'kişi', 'olduğunu', 'belirtir', '.', 'En', 'az', '6', '.', '000', 'at', '##lı', '##dan', 'oluşan', 'kalıcı', 'bir', 'ordusu', 'vardı', 've', 'çatışmalar', 'sırasında', 'müfre', '##ze', '##si', '12', '.', '000', 'at', '##lı', '##ya', 'ulaşıyor', '##du', '.', 'Rus', 'birlikleri', '##ne', 'karşı', 'kazandığı', 'zafer', '##lerle', 'ünlü', '##y', '##dü', '.', 'Askeri', 'becerisi', '##nin', 'yanı', 'sıra', 'yetenekli', 'bir', 'devlet', 'adamı', '##ydı', '.', 'Ölüm', '18', '##37', 'yılında', 'Rus', 'tarafına', 'geçti', 've', 'bir', 'yıl', 'sonra', 'hastalık', '##tan', 'öldü', '.', 'Kaynak', '##ça', 'Çerkes', 'soylu', '##lar', '177', '##7', 'doğumlu', '##lar', '18', '##38', 'yılında', 'ölen', '##ler', 'Kafkas', 'Savaşı', \"'\", 'nda', 'kişiler', '[SEP]']\n",
|
457 |
+
"Positive Average Embedding Shape: torch.Size([353])\n",
|
458 |
+
"Positive Average Embedding: tensor([3.1219e-01, 1.1118e-02, 1.3312e-02, 8.7684e-02, 6.0835e-01, 4.2102e-01,\n",
|
459 |
+
" 3.7467e-01, 2.5975e-01, 6.8351e-02, 1.1375e-01, 6.9892e-02, 3.4909e-01,\n",
|
460 |
+
" 4.2718e-02, 6.9431e-01, 8.2034e-02, 4.9043e-01, 4.0028e-02, 2.4516e-02,\n",
|
461 |
+
" 4.0203e-01, 1.7956e-01, 2.7692e-01, 4.2539e-01, 3.9989e-01, 1.1785e-01,\n",
|
462 |
+
" 1.2440e-01, 1.7583e-01, 4.7179e-01, 3.4876e-01, 4.3870e-01, 3.8414e-01,\n",
|
463 |
+
" 3.6902e-01, 2.5584e-01, 2.0225e-01, 1.4411e-01, 2.9933e-01, 3.6910e-01,\n",
|
464 |
+
" 2.3893e-01, 6.0434e-01, 1.5669e-01, 3.6258e-01, 4.5186e-01, 3.8370e-01,\n",
|
465 |
+
" 4.9858e-01, 1.1362e-02, 2.1302e-01, 3.4201e-01, 7.4201e-01, 7.6336e-02,\n",
|
466 |
+
" 2.6290e-01, 2.3984e-01, 4.8434e-01, 4.7557e-01, 2.2432e-01, 3.5924e-01,\n",
|
467 |
+
" 5.3896e-02, 1.0477e-01, 3.8852e-01, 2.9142e-01, 3.6626e-01, 7.9898e-02,\n",
|
468 |
+
" 2.2686e-01, 2.3253e-02, 3.2550e+00, 1.1168e-01, 4.2853e-01, 7.7213e-02,\n",
|
469 |
+
" 3.1671e-01, 5.6494e-01, 2.0392e-01, 5.1432e-02, 2.2806e-01, 4.0886e-01,\n",
|
470 |
+
" 2.2627e-02, 2.4151e-01, 5.5605e-01, 2.1589e-01, 8.9567e-02, 4.1183e-01,\n",
|
471 |
+
" 3.7691e-01, 6.1995e-02, 5.1504e-01, 4.9226e-01, 1.0083e-01, 1.9789e-01,\n",
|
472 |
+
" 6.5205e-01, 3.4597e-02, 9.5440e-02, 6.5158e-01, 1.9009e-01, 1.1314e-01,\n",
|
473 |
+
" 1.0752e-01, 4.7765e-01, 2.5196e-01, 1.3468e-01, 4.2977e-01, 2.7336e-01,\n",
|
474 |
+
" 4.7672e-02, 2.3097e-01, 1.5998e-01, 3.8424e-01, 2.9264e-02, 7.9061e-02,\n",
|
475 |
+
" 2.8095e-01, 2.0505e-01, 8.8469e-02, 1.6993e-01, 2.5519e-01, 5.7010e-01,\n",
|
476 |
+
" 6.1551e-03, 7.0113e-02, 1.1820e-01, 5.2899e-01, 1.3287e-01, 1.0696e+00,\n",
|
477 |
+
" 3.1219e-01, 1.1373e-01, 2.6080e-01, 6.1457e-03, 5.5064e-02, 5.2089e-01,\n",
|
478 |
+
" 1.3195e-01, 4.0164e-01, 8.4919e-01, 1.8478e-02, 1.6000e-01, 3.3307e-01,\n",
|
479 |
+
" 2.8522e-01, 1.7133e-01, 2.4794e-02, 1.7487e-01, 1.0915e-01, 2.5974e-01,\n",
|
480 |
+
" 1.8174e-02, 8.9919e-02, 1.6508e+00, 4.9391e-01, 7.9321e-02, 3.2023e-02,\n",
|
481 |
+
" 3.1216e-01, 3.5055e-01, 2.4602e-01, 4.0553e-01, 1.3428e-02, 4.7906e-01,\n",
|
482 |
+
" 2.2494e-01, 3.5909e-01, 1.2861e-01, 9.8253e-02, 2.3110e-01, 3.1276e-01,\n",
|
483 |
+
" 6.4092e-02, 2.7386e-01, 6.7687e-02, 3.0518e-02, 3.8880e-01, 2.8110e-01,\n",
|
484 |
+
" 5.7723e-02, 4.2425e-01, 6.5768e-01, 8.4208e-02, 3.2153e-01, 5.6956e-01,\n",
|
485 |
+
" 1.2256e-01, 4.2261e-01, 7.9419e-02, 1.5746e-01, 1.8869e-01, 4.1413e-01,\n",
|
486 |
+
" 3.7192e-01, 5.4023e-02, 1.1605e-01, 4.2643e-01, 1.6004e-01, 2.1577e-01,\n",
|
487 |
+
" 6.6576e-03, 4.4046e-01, 2.4404e-01, 8.1931e-02, 2.2825e-01, 8.8104e-02,\n",
|
488 |
+
" 4.0676e-01, 1.6295e-01, 5.8565e-01, 3.9977e-01, 5.0630e-02, 6.7476e-02,\n",
|
489 |
+
" 3.4367e-01, 1.8640e-01, 3.3172e-01, 2.6630e-02, 1.6500e-02, 2.6911e-01,\n",
|
490 |
+
" 3.4227e-02, 9.7154e-01, 8.5149e-01, 1.0421e-01, 6.2897e-01, 1.8700e-02,\n",
|
491 |
+
" 1.6866e-01, 3.2686e-01, 6.5600e-01, 2.9388e-02, 3.8548e-02, 1.5922e-01,\n",
|
492 |
+
" 5.6203e-01, 3.1285e-01, 3.8763e-01, 1.6276e-01, 1.2610e-01, 3.5952e-01,\n",
|
493 |
+
" 1.3288e-01, 6.0504e-01, 2.9626e-01, 2.7285e-03, 3.7191e-01, 4.7557e-01,\n",
|
494 |
+
" 9.2435e-02, 2.3198e-01, 1.8715e-01, 2.5481e-01, 3.2795e-01, 4.5814e-01,\n",
|
495 |
+
" 1.9183e-01, 2.7146e-01, 1.9477e-01, 5.7984e-03, 3.0490e-01, 9.8830e-03,\n",
|
496 |
+
" 6.9638e-01, 9.4965e-02, 8.8206e-02, 2.3173e-01, 1.2170e-01, 4.5793e-01,\n",
|
497 |
+
" 1.4489e-01, 2.2540e-01, 5.2360e-01, 2.7475e-01, 5.3707e-01, 9.3503e-02,\n",
|
498 |
+
" 1.5903e-01, 3.4478e-01, 3.9456e-01, 1.7182e-01, 5.6727e-03, 2.7554e-01,\n",
|
499 |
+
" 2.0691e-01, 1.6439e-01, 6.4637e-01, 1.3178e-01, 1.9076e-01, 2.2997e-01,\n",
|
500 |
+
" 9.9676e-04, 2.3884e-01, 5.3464e-01, 2.7388e-01, 2.3122e-01, 3.2136e-01,\n",
|
501 |
+
" 6.1094e-02, 1.6784e-01, 5.6459e-01, 4.4070e-01, 3.1866e-01, 4.1410e-01,\n",
|
502 |
+
" 3.0922e-01, 5.3698e-01, 8.8994e-02, 4.1334e-01, 2.5389e-01, 6.0110e-01,\n",
|
503 |
+
" 3.8342e-01, 3.5175e-02, 2.5660e-01, 8.5744e-01, 3.0483e-03, 3.4735e-01,\n",
|
504 |
+
" 3.8450e-01, 3.9665e-01, 2.2100e-01, 6.5109e-02, 1.9003e-01, 7.4262e-02,\n",
|
505 |
+
" 2.9763e-01, 1.4098e-01, 1.1544e-01, 3.2446e-01, 1.4054e-02, 1.6943e-01,\n",
|
506 |
+
" 1.1417e-01, 3.3420e-01, 4.2107e-02, 4.9406e-01, 5.4846e-02, 2.4392e-01,\n",
|
507 |
+
" 2.4391e-01, 2.1046e-01, 3.5563e-01, 1.6479e-01, 3.2559e-01, 4.0702e-01,\n",
|
508 |
+
" 9.6086e-01, 1.3305e-01, 7.5751e-02, 2.7087e-01, 9.1068e-02, 4.7289e-01,\n",
|
509 |
+
" 1.0613e-01, 1.3504e-01, 2.7304e-01, 1.1986e-01, 4.7432e-01, 3.9729e-01,\n",
|
510 |
+
" 1.3385e-02, 1.6185e-01, 5.8601e-01, 5.8034e-01, 6.7479e-03, 2.1235e-01,\n",
|
511 |
+
" 6.9211e-02, 1.1795e-01, 4.8630e-01, 3.5354e-01, 4.4272e-01, 2.5360e-01,\n",
|
512 |
+
" 2.7441e-01, 4.9623e-01, 2.1623e-01, 8.4283e-02, 1.1040e-01, 3.7749e-02,\n",
|
513 |
+
" 3.9097e-01, 2.7157e-02, 3.7090e-01, 3.6961e-01, 2.6829e-01, 1.7171e-01,\n",
|
514 |
+
" 1.7970e-02, 1.2158e-01, 1.8717e-01, 3.5600e-01, 1.5203e-03, 2.1490e-01,\n",
|
515 |
+
" 2.2720e-01, 1.4914e-02, 3.7205e-01, 3.4950e-01, 3.4466e-02, 5.8733e-01,\n",
|
516 |
+
" 1.2950e-01, 2.3771e-01, 3.9440e-01, 1.0506e-01, 5.9232e-01])\n",
|
517 |
+
"TF-IDF Keywords: [array([['rus', 'ahecaqo', 'türkçe', 'pşıqo', '1777', 'çerkes', '000',\n",
|
518 |
+
" 'çerkesya', 'ölenler', 'ünlüydü', 'ölüm', 'yazmadıkları',\n",
|
519 |
+
" 'ulaşıyordu', 'tarihlerini', 'çerkeslerin', 'çerkesler',\n",
|
520 |
+
" 'çatışmalar', 'zaferlerle', 'öldü', 'soneki', 'soy', 'soyadları',\n",
|
521 |
+
" 'soylular', 'sıra', 'savaşı', 'sim', 'saygı', 'ordusu', 'oluşan',\n",
|
522 |
+
" 'olduğunu', 'müfrezesi', 'lakaptır', 'savaşına', 'qo', 'oğlu',\n",
|
523 |
+
" 'kazandığı', 'kaynakça', 'kaynaklarından', 'kaynak',\n",
|
524 |
+
" 'kaydedilmedi', 'katılımı', 'kalıcı', 'kafkas', 'ismidir',\n",
|
525 |
+
" 'ismi', 'hastalıktan', 'hakkında', 'geçti', 'lakapları',\n",
|
526 |
+
" 'kişiler', 'kişi', 'eş', 'geldiği', 'gelen', 'eğitim', 'dönem',\n",
|
527 |
+
" 'erken', 'ekiyle', 'ek', 'devlet', 'büyütüldü', 'bölgesinde',\n",
|
528 |
+
" 'bjeduğ', 'biyografi', 'duyulan', 'doğumlular', 'doğdu',\n",
|
529 |
+
" 'beyoğlu', 'bilgiler', 'birliklerine', 'belirtir', 'askerî',\n",
|
530 |
+
" 'becerisinin', 'atlıya', 'atlıdan', 'anlamlıdır', 'anlamına',\n",
|
531 |
+
" 'askeri', 'adlardaki', '1838', 'adamıydı', '1837', '12']],\n",
|
532 |
+
" dtype=object)]\n",
|
|
|
|
|
|
|
|
|
|
|
|
533 |
"BERT Embeddings:\n",
|
534 |
"Text 1 embedding shape: torch.Size([233, 768])\n"
|
535 |
]
|
536 |
+
},
|
537 |
+
{
|
538 |
+
"ename": "ValueError",
|
539 |
+
"evalue": "setting an array element with a sequence.",
|
540 |
+
"output_type": "error",
|
541 |
+
"traceback": [
|
542 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
543 |
+
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
544 |
+
"\u001b[1;31mTypeError\u001b[0m: float() argument must be a string or a real number, not 'csr_matrix'",
|
545 |
+
"\nThe above exception was the direct cause of the following exception:\n",
|
546 |
+
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
547 |
+
"Cell \u001b[1;32mIn[31], line 151\u001b[0m\n\u001b[0;32m 146\u001b[0m \u001b[38;5;124;03m\"\"\"# Liste halindeki TF-IDF değerlerini yazdırma\u001b[39;00m\n\u001b[0;32m 147\u001b[0m \u001b[38;5;124;03mprint(\"TF-IDF List:\")\u001b[39;00m\n\u001b[0;32m 148\u001b[0m \u001b[38;5;124;03mfor row in tfidf_list:\u001b[39;00m\n\u001b[0;32m 149\u001b[0m \u001b[38;5;124;03m print(row)\"\"\"\u001b[39;00m\n\u001b[0;32m 150\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeler ve metin arasındaki cosine similarity hesaplama\u001b[39;00m\n\u001b[1;32m--> 151\u001b[0m similarity_score \u001b[38;5;241m=\u001b[39m \u001b[43mcosine_similarity\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkeywords_vector\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mdocument_vector\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 153\u001b[0m \u001b[38;5;66;03m# Her bir kelime için TF-IDF değerlerini yazdırma\u001b[39;00m\n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc_idx, doc \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(tfidf_scores):\n",
|
548 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n",
|
549 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\metrics\\pairwise.py:1679\u001b[0m, in \u001b[0;36mcosine_similarity\u001b[1;34m(X, Y, dense_output)\u001b[0m\n\u001b[0;32m 1635\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Compute cosine similarity between samples in X and Y.\u001b[39;00m\n\u001b[0;32m 1636\u001b[0m \n\u001b[0;32m 1637\u001b[0m \u001b[38;5;124;03mCosine similarity, or the cosine kernel, computes similarity as the\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1675\u001b[0m \u001b[38;5;124;03m [0.57..., 0.81...]])\u001b[39;00m\n\u001b[0;32m 1676\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1677\u001b[0m \u001b[38;5;66;03m# to avoid recursive import\u001b[39;00m\n\u001b[1;32m-> 1679\u001b[0m X, Y \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_pairwise_arrays\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1681\u001b[0m X_normalized \u001b[38;5;241m=\u001b[39m normalize(X, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 1682\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m X \u001b[38;5;129;01mis\u001b[39;00m Y:\n",
|
550 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\metrics\\pairwise.py:185\u001b[0m, in \u001b[0;36mcheck_pairwise_arrays\u001b[1;34m(X, Y, precomputed, dtype, accept_sparse, force_all_finite, ensure_2d, copy)\u001b[0m\n\u001b[0;32m 175\u001b[0m X \u001b[38;5;241m=\u001b[39m Y \u001b[38;5;241m=\u001b[39m check_array(\n\u001b[0;32m 176\u001b[0m X,\n\u001b[0;32m 177\u001b[0m accept_sparse\u001b[38;5;241m=\u001b[39maccept_sparse,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 182\u001b[0m ensure_2d\u001b[38;5;241m=\u001b[39mensure_2d,\n\u001b[0;32m 183\u001b[0m )\n\u001b[0;32m 184\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 185\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_array\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 186\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maccept_sparse\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 189\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_all_finite\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_all_finite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mestimator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[43mensure_2d\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mensure_2d\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 193\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 194\u001b[0m Y \u001b[38;5;241m=\u001b[39m check_array(\n\u001b[0;32m 195\u001b[0m Y,\n\u001b[0;32m 196\u001b[0m accept_sparse\u001b[38;5;241m=\u001b[39maccept_sparse,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 201\u001b[0m ensure_2d\u001b[38;5;241m=\u001b[39mensure_2d,\n\u001b[0;32m 202\u001b[0m )\n\u001b[0;32m 204\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m precomputed:\n",
|
551 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\utils\\validation.py:1012\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 1010\u001b[0m array \u001b[38;5;241m=\u001b[39m xp\u001b[38;5;241m.\u001b[39mastype(array, dtype, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 1011\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1012\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43m_asarray_with_order\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morder\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mxp\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mxp\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1013\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ComplexWarning \u001b[38;5;28;01mas\u001b[39;00m complex_warning:\n\u001b[0;32m 1014\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 1015\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mComplex data not supported\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(array)\n\u001b[0;32m 1016\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcomplex_warning\u001b[39;00m\n",
|
552 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\utils\\_array_api.py:751\u001b[0m, in \u001b[0;36m_asarray_with_order\u001b[1;34m(array, dtype, order, copy, xp, device)\u001b[0m\n\u001b[0;32m 749\u001b[0m array \u001b[38;5;241m=\u001b[39m numpy\u001b[38;5;241m.\u001b[39marray(array, order\u001b[38;5;241m=\u001b[39morder, dtype\u001b[38;5;241m=\u001b[39mdtype)\n\u001b[0;32m 750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 751\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mnumpy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43masarray\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morder\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 753\u001b[0m \u001b[38;5;66;03m# At this point array is a NumPy ndarray. We convert it to an array\u001b[39;00m\n\u001b[0;32m 754\u001b[0m \u001b[38;5;66;03m# container that is consistent with the input's namespace.\u001b[39;00m\n\u001b[0;32m 755\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m xp\u001b[38;5;241m.\u001b[39masarray(array)\n",
|
553 |
+
"\u001b[1;31mValueError\u001b[0m: setting an array element with a sequence."
|
554 |
+
]
|
555 |
}
|
556 |
],
|
557 |
"source": [
|
|
|
563 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
564 |
"from nltk.corpus import stopwords as nltk_stopwords\n",
|
565 |
"from transformers import BertTokenizer, BertModel\n",
|
566 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
567 |
"import torch\n",
|
568 |
+
"import torch.nn.functional as F\n",
|
569 |
"\n",
|
570 |
"# BERT Tokenizer ve Model'i yükleyin\n",
|
571 |
"tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
|
|
|
609 |
" vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
|
610 |
" X = vectorizer.fit_transform(corpus)\n",
|
611 |
" feature_names = vectorizer.get_feature_names_out()\n",
|
612 |
+
" #scores = np.asarray(X.sum(axis=0)).flatten()\n",
|
613 |
+
" sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
|
614 |
+
" #keywords = {feature_names[i]: scores[i] for i in range(len(feature_names))}\n",
|
615 |
+
" #sorted_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)\n",
|
616 |
" return sorted_keywords\n",
|
617 |
"\n",
|
618 |
"#tokenleri kelimelere dönüştürür ve listeler \n",
|
|
|
644 |
" \n",
|
645 |
" \n",
|
646 |
"\n",
|
|
|
|
|
|
|
647 |
"#token ıd leri ve bert gömme vektörleri\n",
|
648 |
"for text in texts:\n",
|
649 |
" input_ids,embeddings= get_bert_embeddings(text)\n",
|
|
|
654 |
" # Tokenları ve ortalama vektörleri al\n",
|
655 |
" tokens = decode_tokens(input_ids)\n",
|
656 |
" avg_embedding = average_embeddings(embeddings)\n",
|
657 |
+
" #ortalama embedding değerlerinden sadece 0'dan büyük olanları alma\n",
|
658 |
+
" positive_avg_embedding= avg_embedding[avg_embedding>0]\n",
|
659 |
+
" # Eğer pozitif embedding değerleri varsa, çıktıyı yazdır\n",
|
660 |
+
"\n",
|
661 |
+
"if len(positive_avg_embedding) > 0:\n",
|
662 |
" print(f\"Tokens: {tokens}\")\n",
|
663 |
+
" print(f\"Positive Average Embedding Shape: {positive_avg_embedding.shape}\")\n",
|
664 |
+
" print(f\"Positive Average Embedding: {positive_avg_embedding}\")\n",
|
665 |
+
"else:\n",
|
666 |
+
" print(\"No positive embedding values found.\")\n",
|
667 |
"\n",
|
668 |
+
" \n",
|
669 |
"# TF-IDF anahtar kelimelerini çıkar\n",
|
670 |
"keywords = extract_keywords_tfidf(texts,stop_words_list)\n",
|
671 |
"print(\"TF-IDF Keywords:\", keywords)\n",
|
|
|
675 |
"for i, emb in enumerate(embeddings):\n",
|
676 |
" print(f\"Text {i+1} embedding shape: {emb.shape}\")\n",
|
677 |
"\n",
|
678 |
+
"keywords_str = \" \".join([str(keyword) for keyword in keywords])\n",
|
679 |
+
"\n",
|
680 |
+
"\n",
|
681 |
+
"#metinleri birleştirip tf-ıdf matrisini oluşturma\n",
|
682 |
+
"# TF-IDF vektörleştirici oluşturma\n",
|
683 |
+
"tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
|
684 |
+
"corpus = [text, keywords_str]\n",
|
685 |
+
"tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)\n",
|
686 |
+
"\n",
|
687 |
+
"# Anahtar kelimeler vektörünü ve diğer metin vektörünü ayırma\n",
|
688 |
+
"keywords_vector = tfidf_matrix[1]\n",
|
689 |
+
"document_vector = tfidf_matrix[0]\n",
|
690 |
+
"keywords_vector_dense = keywords_vector.toarray()\n",
|
691 |
+
"document_vector_dense = document_vector.toarray()\n",
|
692 |
+
"\n",
|
693 |
+
"# Kelimeleri ve TF-IDF değerlerini alma\n",
|
694 |
+
"feature_names = tfidf_vectorizer.get_feature_names_out()\n",
|
695 |
+
"tfidf_scores = tfidf_matrix.toarray()\n",
|
696 |
+
"similarity_score = cosine_similarity(keywords_vector_dense, document_vector_dense)\n",
|
697 |
+
"\n",
|
698 |
+
"# TF-IDF matrisini dense formata çevirme\n",
|
699 |
+
"dense_matrix = tfidf_matrix.todense()\n",
|
700 |
+
"# Dense matrisi liste haline getirme\n",
|
701 |
+
"tfidf_list = dense_matrix.tolist()\n",
|
702 |
+
"\n",
|
703 |
+
"\"\"\"# Liste halindeki TF-IDF değerlerini yazdırma\n",
|
704 |
+
"print(\"TF-IDF List:\")\n",
|
705 |
+
"for row in tfidf_list:\n",
|
706 |
+
" print(row)\"\"\"\n",
|
707 |
+
"# Anahtar kelimeler ve metin arasındaki cosine similarity hesaplama\n",
|
708 |
+
"similarity_score = cosine_similarity([keywords_vector], [document_vector])\n",
|
709 |
+
"\n",
|
710 |
+
"# Her bir kelime için TF-IDF değerlerini yazdırma\n",
|
711 |
+
"for doc_idx, doc in enumerate(tfidf_scores):\n",
|
712 |
+
" print(f\"Document {doc_idx + 1}:\")\n",
|
713 |
+
" for word_idx, score in enumerate(doc):\n",
|
714 |
+
" print(f\"Word: {feature_names[word_idx]}, TF-IDF: {score:.4f}\")\n",
|
715 |
+
" print(\"\\n\")\n",
|
716 |
+
"\n",
|
717 |
+
"# Sonucu yazdırma\n",
|
718 |
+
"print(f\"Keywords ile metin arasındaki benzerlik: {similarity_score[0][0]}\")\n",
|
719 |
"\n"
|
720 |
]
|
721 |
},
|
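The ValueError recorded in this cell comes from wrapping single CSR rows in Python lists before calling cosine_similarity; a minimal self-contained sketch of the working pattern (the two-entry corpus is a placeholder standing in for the document and the joined keyword string):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    corpus = ["rus çerkes tarihi hakkında kısa bir metin",   # placeholder document
              "rus çerkes savaşı tarihi"]                    # placeholder keyword string
    tfidf_matrix = TfidfVectorizer().fit_transform(corpus)

    # pass the CSR rows directly (or convert with .toarray()); wrapping them in
    # Python lists is what raises "setting an array element with a sequence"
    similarity = cosine_similarity(tfidf_matrix[1], tfidf_matrix[0])
    print(similarity[0][0])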
722 |
+
{
|
723 |
+
"cell_type": "code",
|
724 |
+
"execution_count": 32,
|
725 |
+
"metadata": {},
|
726 |
+
"outputs": [
|
727 |
+
{
|
728 |
+
"name": "stdout",
|
729 |
+
"output_type": "stream",
|
730 |
+
"text": [
|
731 |
+
"Tokens: ['[CLS]', 'Biy', '##ografi', 'İsim', 'P', '##şı', '##q', '##o', 'ismi', 'Ah', '##ec', '##a', '##q', '##o', 'soy', 'ismi', '##dir', '.', 'Çerkes', '##lerin', '\"', '-', 'q', '##o', '\"', 'son', '##eki', 'ile', 'biten', 'hem', 'soya', '##d', '##ları', 'hem', 'de', 'lak', '##ap', '##ları', 'vardı', '.', 'Bu', 'ek', 'Türkçe', 'ad', '##lardaki', '\"', '-', 'oğlu', '\"', 'eki', '##yle', 'eş', 'anlamlı', '##dır', '.', 'P', '##şı', '##q', '##o', 'Türkçe', '\"', 'Beyoğlu', '\"', 'anlamına', 'gelen', 'bir', 'lak', '##ap', '##tır', '.', 'Erken', 'dönem', 'Çerkes', '##ler', 'tarihleri', '##ni', 'yazma', '##dıkları', 've', 'tüm', 'bilgiler', 'Rus', 'kaynaklarından', 'geldiği', 'için', 'Ah', '##ec', '##a', '##q', '##o', 'hakkında', 'pek', 'bir', 'şey', 'kayded', '##ilme', '##di', '.', '177', '##7', \"'\", 'de', 'Çerkes', '##ya', \"'\", 'nın', 'B', '##je', '##duğ', 'bölgesinde', 'doğdu', '.', 'Asker', '##î', 'eğitim', 'ile', 'büyüt', '##üldü', '.', 'Rus', '-', 'Çerkes', 'Savaşı', '##na', 'Katılım', '##ı', 'Birkaç', 'kaynak', ',', 'Ah', '##ec', '##a', '##q', '##o', \"'\", 'nun', 'tüm', 'Çerkes', '##ya', \"'\", 'da', 'saygı', 'duyulan', 'bir', 'kişi', 'olduğunu', 'belirtir', '.', 'En', 'az', '6', '.', '000', 'at', '##lı', '##dan', 'oluşan', 'kalıcı', 'bir', 'ordusu', 'vardı', 've', 'çatışmalar', 'sırasında', 'müfre', '##ze', '##si', '12', '.', '000', 'at', '##lı', '##ya', 'ulaşıyor', '##du', '.', 'Rus', 'birlikleri', '##ne', 'karşı', 'kazandığı', 'zafer', '##lerle', 'ünlü', '##y', '##dü', '.', 'Askeri', 'becerisi', '##nin', 'yanı', 'sıra', 'yetenekli', 'bir', 'devlet', 'adamı', '##ydı', '.', 'Ölüm', '18', '##37', 'yılında', 'Rus', 'tarafına', 'geçti', 've', 'bir', 'yıl', 'sonra', 'hastalık', '##tan', 'öldü', '.', 'Kaynak', '##ça', 'Çerkes', 'soylu', '##lar', '177', '##7', 'doğumlu', '##lar', '18', '##38', 'yılında', 'ölen', '##ler', 'Kafkas', 'Savaşı', \"'\", 'nda', 'kişiler', '[SEP]']\n",
|
732 |
+
"Positive Average Embedding Shape: torch.Size([353])\n",
|
733 |
+
"Positive Average Embedding: tensor([3.1219e-01, 1.1118e-02, 1.3312e-02, 8.7684e-02, 6.0835e-01, 4.2102e-01,\n",
|
734 |
+
" 3.7467e-01, 2.5975e-01, 6.8351e-02, 1.1375e-01, 6.9892e-02, 3.4909e-01,\n",
|
735 |
+
" 4.2718e-02, 6.9431e-01, 8.2034e-02, 4.9043e-01, 4.0028e-02, 2.4516e-02,\n",
|
736 |
+
" 4.0203e-01, 1.7956e-01, 2.7692e-01, 4.2539e-01, 3.9989e-01, 1.1785e-01,\n",
|
737 |
+
" 1.2440e-01, 1.7583e-01, 4.7179e-01, 3.4876e-01, 4.3870e-01, 3.8414e-01,\n",
|
738 |
+
" 3.6902e-01, 2.5584e-01, 2.0225e-01, 1.4411e-01, 2.9933e-01, 3.6910e-01,\n",
|
739 |
+
" 2.3893e-01, 6.0434e-01, 1.5669e-01, 3.6258e-01, 4.5186e-01, 3.8370e-01,\n",
|
740 |
+
" 4.9858e-01, 1.1362e-02, 2.1302e-01, 3.4201e-01, 7.4201e-01, 7.6336e-02,\n",
|
741 |
+
" 2.6290e-01, 2.3984e-01, 4.8434e-01, 4.7557e-01, 2.2432e-01, 3.5924e-01,\n",
|
742 |
+
" 5.3896e-02, 1.0477e-01, 3.8852e-01, 2.9142e-01, 3.6626e-01, 7.9898e-02,\n",
|
743 |
+
" 2.2686e-01, 2.3253e-02, 3.2550e+00, 1.1168e-01, 4.2853e-01, 7.7213e-02,\n",
|
744 |
+
" 3.1671e-01, 5.6494e-01, 2.0392e-01, 5.1432e-02, 2.2806e-01, 4.0886e-01,\n",
|
745 |
+
" 2.2627e-02, 2.4151e-01, 5.5605e-01, 2.1589e-01, 8.9567e-02, 4.1183e-01,\n",
|
746 |
+
" 3.7691e-01, 6.1995e-02, 5.1504e-01, 4.9226e-01, 1.0083e-01, 1.9789e-01,\n",
|
747 |
+
" 6.5205e-01, 3.4597e-02, 9.5440e-02, 6.5158e-01, 1.9009e-01, 1.1314e-01,\n",
|
748 |
+
" 1.0752e-01, 4.7765e-01, 2.5196e-01, 1.3468e-01, 4.2977e-01, 2.7336e-01,\n",
|
749 |
+
" 4.7672e-02, 2.3097e-01, 1.5998e-01, 3.8424e-01, 2.9264e-02, 7.9061e-02,\n",
|
750 |
+
" 2.8095e-01, 2.0505e-01, 8.8469e-02, 1.6993e-01, 2.5519e-01, 5.7010e-01,\n",
|
751 |
+
" 6.1551e-03, 7.0113e-02, 1.1820e-01, 5.2899e-01, 1.3287e-01, 1.0696e+00,\n",
|
752 |
+
" 3.1219e-01, 1.1373e-01, 2.6080e-01, 6.1457e-03, 5.5064e-02, 5.2089e-01,\n",
|
753 |
+
" 1.3195e-01, 4.0164e-01, 8.4919e-01, 1.8478e-02, 1.6000e-01, 3.3307e-01,\n",
|
754 |
+
" 2.8522e-01, 1.7133e-01, 2.4794e-02, 1.7487e-01, 1.0915e-01, 2.5974e-01,\n",
|
755 |
+
" 1.8174e-02, 8.9919e-02, 1.6508e+00, 4.9391e-01, 7.9321e-02, 3.2023e-02,\n",
|
756 |
+
" 3.1216e-01, 3.5055e-01, 2.4602e-01, 4.0553e-01, 1.3428e-02, 4.7906e-01,\n",
|
757 |
+
" 2.2494e-01, 3.5909e-01, 1.2861e-01, 9.8253e-02, 2.3110e-01, 3.1276e-01,\n",
|
758 |
+
" 6.4092e-02, 2.7386e-01, 6.7687e-02, 3.0518e-02, 3.8880e-01, 2.8110e-01,\n",
|
759 |
+
" 5.7723e-02, 4.2425e-01, 6.5768e-01, 8.4208e-02, 3.2153e-01, 5.6956e-01,\n",
|
760 |
+
" 1.2256e-01, 4.2261e-01, 7.9419e-02, 1.5746e-01, 1.8869e-01, 4.1413e-01,\n",
|
761 |
+
" 3.7192e-01, 5.4023e-02, 1.1605e-01, 4.2643e-01, 1.6004e-01, 2.1577e-01,\n",
|
762 |
+
" 6.6576e-03, 4.4046e-01, 2.4404e-01, 8.1931e-02, 2.2825e-01, 8.8104e-02,\n",
|
763 |
+
" 4.0676e-01, 1.6295e-01, 5.8565e-01, 3.9977e-01, 5.0630e-02, 6.7476e-02,\n",
|
764 |
+
" 3.4367e-01, 1.8640e-01, 3.3172e-01, 2.6630e-02, 1.6500e-02, 2.6911e-01,\n",
|
765 |
+
" 3.4227e-02, 9.7154e-01, 8.5149e-01, 1.0421e-01, 6.2897e-01, 1.8700e-02,\n",
|
766 |
+
" 1.6866e-01, 3.2686e-01, 6.5600e-01, 2.9388e-02, 3.8548e-02, 1.5922e-01,\n",
|
767 |
+
" 5.6203e-01, 3.1285e-01, 3.8763e-01, 1.6276e-01, 1.2610e-01, 3.5952e-01,\n",
|
768 |
+
" 1.3288e-01, 6.0504e-01, 2.9626e-01, 2.7285e-03, 3.7191e-01, 4.7557e-01,\n",
|
769 |
+
" 9.2435e-02, 2.3198e-01, 1.8715e-01, 2.5481e-01, 3.2795e-01, 4.5814e-01,\n",
|
770 |
+
" 1.9183e-01, 2.7146e-01, 1.9477e-01, 5.7984e-03, 3.0490e-01, 9.8830e-03,\n",
|
771 |
+
" 6.9638e-01, 9.4965e-02, 8.8206e-02, 2.3173e-01, 1.2170e-01, 4.5793e-01,\n",
|
772 |
+
" 1.4489e-01, 2.2540e-01, 5.2360e-01, 2.7475e-01, 5.3707e-01, 9.3503e-02,\n",
|
773 |
+
" 1.5903e-01, 3.4478e-01, 3.9456e-01, 1.7182e-01, 5.6727e-03, 2.7554e-01,\n",
|
774 |
+
" 2.0691e-01, 1.6439e-01, 6.4637e-01, 1.3178e-01, 1.9076e-01, 2.2997e-01,\n",
|
775 |
+
" 9.9676e-04, 2.3884e-01, 5.3464e-01, 2.7388e-01, 2.3122e-01, 3.2136e-01,\n",
|
776 |
+
" 6.1094e-02, 1.6784e-01, 5.6459e-01, 4.4070e-01, 3.1866e-01, 4.1410e-01,\n",
|
777 |
+
" 3.0922e-01, 5.3698e-01, 8.8994e-02, 4.1334e-01, 2.5389e-01, 6.0110e-01,\n",
|
778 |
+
" 3.8342e-01, 3.5175e-02, 2.5660e-01, 8.5744e-01, 3.0483e-03, 3.4735e-01,\n",
|
779 |
+
" 3.8450e-01, 3.9665e-01, 2.2100e-01, 6.5109e-02, 1.9003e-01, 7.4262e-02,\n",
|
780 |
+
" 2.9763e-01, 1.4098e-01, 1.1544e-01, 3.2446e-01, 1.4054e-02, 1.6943e-01,\n",
|
781 |
+
" 1.1417e-01, 3.3420e-01, 4.2107e-02, 4.9406e-01, 5.4846e-02, 2.4392e-01,\n",
|
782 |
+
" 2.4391e-01, 2.1046e-01, 3.5563e-01, 1.6479e-01, 3.2559e-01, 4.0702e-01,\n",
|
783 |
+
" 9.6086e-01, 1.3305e-01, 7.5751e-02, 2.7087e-01, 9.1068e-02, 4.7289e-01,\n",
|
784 |
+
" 1.0613e-01, 1.3504e-01, 2.7304e-01, 1.1986e-01, 4.7432e-01, 3.9729e-01,\n",
|
785 |
+
" 1.3385e-02, 1.6185e-01, 5.8601e-01, 5.8034e-01, 6.7479e-03, 2.1235e-01,\n",
|
786 |
+
" 6.9211e-02, 1.1795e-01, 4.8630e-01, 3.5354e-01, 4.4272e-01, 2.5360e-01,\n",
|
787 |
+
" 2.7441e-01, 4.9623e-01, 2.1623e-01, 8.4283e-02, 1.1040e-01, 3.7749e-02,\n",
|
788 |
+
" 3.9097e-01, 2.7157e-02, 3.7090e-01, 3.6961e-01, 2.6829e-01, 1.7171e-01,\n",
|
789 |
+
" 1.7970e-02, 1.2158e-01, 1.8717e-01, 3.5600e-01, 1.5203e-03, 2.1490e-01,\n",
|
790 |
+
" 2.2720e-01, 1.4914e-02, 3.7205e-01, 3.4950e-01, 3.4466e-02, 5.8733e-01,\n",
|
791 |
+
" 1.2950e-01, 2.3771e-01, 3.9440e-01, 1.0506e-01, 5.9232e-01])\n",
|
792 |
+
"TF-IDF Keywords: [array([['rus', 'ahecaqo', 'türkçe', 'pşıqo', '1777', 'çerkes', '000',\n",
|
793 |
+
" 'çerkesya', 'ölenler', 'ünlüydü', 'ölüm', 'yazmadıkları',\n",
|
794 |
+
" 'ulaşıyordu', 'tarihlerini', 'çerkeslerin', 'çerkesler',\n",
|
795 |
+
" 'çatışmalar', 'zaferlerle', 'öldü', 'soneki', 'soy', 'soyadları',\n",
|
796 |
+
" 'soylular', 'sıra', 'savaşı', 'sim', 'saygı', 'ordusu', 'oluşan',\n",
|
797 |
+
" 'olduğunu', 'müfrezesi', 'lakaptır', 'savaşına', 'qo', 'oğlu',\n",
|
798 |
+
" 'kazandığı', 'kaynakça', 'kaynaklarından', 'kaynak',\n",
|
799 |
+
" 'kaydedilmedi', 'katılımı', 'kalıcı', 'kafkas', 'ismidir',\n",
|
800 |
+
" 'ismi', 'hastalıktan', 'hakkında', 'geçti', 'lakapları',\n",
|
801 |
+
" 'kişiler', 'kişi', 'eş', 'geldiği', 'gelen', 'eğitim', 'dönem',\n",
|
802 |
+
" 'erken', 'ekiyle', 'ek', 'devlet', 'büyütüldü', 'bölgesinde',\n",
|
803 |
+
" 'bjeduğ', 'biyografi', 'duyulan', 'doğumlular', 'doğdu',\n",
|
804 |
+
" 'beyoğlu', 'bilgiler', 'birliklerine', 'belirtir', 'askerî',\n",
|
805 |
+
" 'becerisinin', 'atlıya', 'atlıdan', 'anlamlıdır', 'anlamına',\n",
|
806 |
+
" 'askeri', 'adlardaki', '1838', 'adamıydı', '1837', '12']],\n",
|
807 |
+
" dtype=object)]\n"
|
808 |
+
]
|
809 |
+
},
|
810 |
+
{
|
811 |
+
"ename": "TypeError",
|
812 |
+
"evalue": "sequence item 0: expected str instance, numpy.ndarray found",
|
813 |
+
"output_type": "error",
|
814 |
+
"traceback": [
|
815 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
816 |
+
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
817 |
+
"Cell \u001b[1;32mIn[32], line 96\u001b[0m\n\u001b[0;32m 94\u001b[0m \u001b[38;5;66;03m# TF-IDF matrisini oluşturma\u001b[39;00m\n\u001b[0;32m 95\u001b[0m tfidf_vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer(stop_words\u001b[38;5;241m=\u001b[39mstop_words_list)\n\u001b[1;32m---> 96\u001b[0m corpus \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m \u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeywords\u001b[49m\u001b[43m)\u001b[49m] \u001b[38;5;66;03m# Anahtar kelimeleri string olarak birleştir\u001b[39;00m\n\u001b[0;32m 97\u001b[0m tfidf_matrix \u001b[38;5;241m=\u001b[39m tfidf_vectorizer\u001b[38;5;241m.\u001b[39mfit_transform(corpus \u001b[38;5;241m+\u001b[39m texts)\n\u001b[0;32m 99\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeler vektörünü ve diğer metin vektörünü ayırma\u001b[39;00m\n",
|
818 |
+
"\u001b[1;31mTypeError\u001b[0m: sequence item 0: expected str instance, numpy.ndarray found"
|
819 |
+
]
|
820 |
+
}
|
821 |
+
],
|
822 |
+
"source": [
|
823 |
+
"import re\n",
|
824 |
+
"import numpy as np\n",
|
825 |
+
"import pandas as pd\n",
|
826 |
+
"from nltk.stem import WordNetLemmatizer\n",
|
827 |
+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
828 |
+
"from nltk.corpus import stopwords as nltk_stopwords\n",
|
829 |
+
"from transformers import BertTokenizer, BertModel\n",
|
830 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
831 |
+
"import torch\n",
|
832 |
+
"import torch.nn.functional as F\n",
|
833 |
+
"\n",
|
834 |
+
"# BERT Tokenizer ve Model'i yükleyin\n",
|
835 |
+
"tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
|
836 |
+
"model = BertModel.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
|
837 |
+
"\n",
|
838 |
+
"#-------------------------- burada turkish_stop_words'ü alıyoruz\n",
|
839 |
+
"def load_stop_words(file_path):\n",
|
840 |
+
" \"\"\"Stop words'leri dosyadan okuyarak bir liste oluşturur.\"\"\"\n",
|
841 |
+
" with open(file_path, 'r', encoding='utf-8') as file:\n",
|
842 |
+
" stop_words = [line.strip() for line in file if line.strip()]\n",
|
843 |
+
" return stop_words\n",
|
844 |
+
"\n",
|
845 |
+
"# Türkçe stop words dosyasını yükleyin\n",
|
846 |
+
"stop_words_list = load_stop_words('turkish_stop_words.txt')\n",
|
847 |
+
"\n",
|
848 |
+
"# Gömülü kelimeleri k-means ile kümeleyebiliriz , benzerlik oranını hesaplamak için farklı algoritmalardan yararlanabiliriz.\n",
|
849 |
+
"def get_bert_embeddings(text):\n",
|
850 |
+
" inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)\n",
|
851 |
+
" with torch.no_grad():\n",
|
852 |
+
" outputs = model(**inputs)\n",
|
853 |
+
" # Son katmandaki gömme (embedding) çıktısını alın\n",
|
854 |
+
" return inputs['input_ids'], outputs.last_hidden_state\n",
|
855 |
+
"\n",
|
856 |
+
"#------------------------------------ token verilerinin ortalaması (eşik değer için)\n",
|
857 |
+
"def average_embeddings(embeddings):\n",
|
858 |
+
" # Token vektörlerinin ortalamasını alarak metin düzeyinde özet oluştur\n",
|
859 |
+
" return torch.mean(embeddings, dim=1).squeeze()\n",
|
860 |
+
"\n",
|
861 |
+
"# Keywords çıkarma fonksiyonu\n",
|
862 |
+
"def extract_keywords_tfidf(corpus, stop_words_list):\n",
|
863 |
+
" \"\"\"TF-IDF ile anahtar kelimeleri çıkarır, stop words listesi ile birlikte kullanır.\"\"\"\n",
|
864 |
+
" vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
|
865 |
+
" X = vectorizer.fit_transform(corpus)\n",
|
866 |
+
" feature_names = vectorizer.get_feature_names_out()\n",
|
867 |
+
" sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
|
868 |
+
" return sorted_keywords\n",
|
869 |
+
"\n",
|
870 |
+
"# Tokenları kelimelere dönüştürür ve listeler \n",
|
871 |
+
"def decode_tokens(input_ids):\n",
|
872 |
+
" tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())\n",
|
873 |
+
" return tokens\n",
|
874 |
+
"\n",
|
875 |
+
"# Örnek metinler (buranın yerine combined_text kullanılacak)\n",
|
876 |
+
"texts = [\"\"\"Biyografi\n",
|
877 |
+
"İsim \n",
|
878 |
+
"Pşıqo ismi Ahecaqo soy ismidir. Çerkeslerin \"-qo\" soneki ile biten hem soyadları hem de lakapları vardı. Bu ek Türkçe adlardaki \"-oğlu\" ekiyle eş anlamlıdır. Pşıqo Türkçe \"Beyoğlu\" anlamına gelen bir lakaptır.\n",
|
879 |
+
"\n",
|
880 |
+
"Erken dönem \n",
|
881 |
+
"Çerkesler tarihlerini yazmadıkları ve tüm bilgiler Rus kaynaklarından geldiği için Ahecaqo hakkında pek bir şey kaydedilmedi. 1777'de Çerkesya'nın Bjeduğ bölgesinde doğdu. Askerî eğitim ile büyütüldü.\n",
|
882 |
+
"\n",
|
883 |
+
"Rus-Çerkes Savaşına Katılımı \n",
|
884 |
+
"Birkaç kaynak, Ahecaqo'nun tüm Çerkesya'da saygı duyulan bir kişi olduğunu belirtir. En az 6.000 atlıdan oluşan kalıcı bir ordusu vardı ve çatışmalar sırasında müfrezesi 12.000 atlıya ulaşıyordu. Rus birliklerine karşı kazandığı zaferlerle ünlüydü. Askeri becerisinin yanı sıra yetenekli bir devlet adamıydı.\n",
|
885 |
+
"\n",
|
886 |
+
"Ölüm \n",
|
887 |
+
"1837 yılında Rus tarafına geçti ve bir yıl sonra hastalıktan öldü.\n",
|
888 |
+
"\n",
|
889 |
+
"Kaynakça \n",
|
890 |
+
"\n",
|
891 |
+
"Çerkes soylular\n",
|
892 |
+
"1777 doğumlular\n",
|
893 |
+
"1838 yılında ölenler\n",
|
894 |
+
"Kafkas Savaşı'nda kişiler \"\"\"]\n",
|
895 |
+
"\n",
|
896 |
+
"# Token id'leri ve BERT gömme vektörleri\n",
|
897 |
+
"for text in texts:\n",
|
898 |
+
" input_ids, embeddings = get_bert_embeddings(text)\n",
|
899 |
+
" tokens = decode_tokens(input_ids)\n",
|
900 |
+
" avg_embedding = average_embeddings(embeddings)\n",
|
901 |
+
"\n",
|
902 |
+
" # Ortalama embedding değerlerinden sadece 0'dan büyük olanları alma\n",
|
903 |
+
" positive_avg_embedding = avg_embedding[avg_embedding > 0]\n",
|
904 |
+
"\n",
|
905 |
+
" if len(positive_avg_embedding) > 0:\n",
|
906 |
+
" print(f\"Tokens: {tokens}\")\n",
|
907 |
+
" print(f\"Positive Average Embedding Shape: {positive_avg_embedding.shape}\")\n",
|
908 |
+
" print(f\"Positive Average Embedding: {positive_avg_embedding}\")\n",
|
909 |
+
" else:\n",
|
910 |
+
" print(\"No positive embedding values found.\")\n",
|
911 |
+
"\n",
|
912 |
+
"# TF-IDF anahtar kelimelerini çıkar\n",
|
913 |
+
"keywords = extract_keywords_tfidf(texts, stop_words_list)\n",
|
914 |
+
"print(\"TF-IDF Keywords:\", keywords)\n",
|
915 |
+
"\n",
|
916 |
+
"# TF-IDF matrisini oluşturma\n",
|
917 |
+
"tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
|
918 |
+
"corpus = [\" \".join(keywords)] # Anahtar kelimeleri string olarak birleştir\n",
|
919 |
+
"tfidf_matrix = tfidf_vectorizer.fit_transform(corpus + texts)\n",
|
920 |
+
"\n",
|
921 |
+
"# Anahtar kelimeler vektörünü ve diğer metin vektörünü ayırma\n",
|
922 |
+
"keywords_vector = tfidf_matrix[0]\n",
|
923 |
+
"document_vectors = tfidf_matrix[1:]\n",
|
924 |
+
"\n",
|
925 |
+
"# Kelimeleri ve TF-IDF değerlerini alma\n",
|
926 |
+
"feature_names = tfidf_vectorizer.get_feature_names_out()\n",
|
927 |
+
"tfidf_scores = tfidf_matrix.toarray()\n",
|
928 |
+
"\n",
|
929 |
+
"# Cosine similarity hesaplama\n",
|
930 |
+
"similarity_scores = cosine_similarity(keywords_vector, document_vectors)\n",
|
931 |
+
"\n",
|
932 |
+
"# Her bir kelime için TF-IDF değerlerini yazdırma\n",
|
933 |
+
"for doc_idx, doc in enumerate(tfidf_scores[1:], start=1):\n",
|
934 |
+
" print(f\"Document {doc_idx}:\")\n",
|
935 |
+
" for word_idx, score in enumerate(doc):\n",
|
936 |
+
" print(f\"Word: {feature_names[word_idx]}, TF-IDF: {score:.4f}\")\n",
|
937 |
+
" print(\"\\n\")\n",
|
938 |
+
"\n",
|
939 |
+
"# Sonucu yazdırma\n",
|
940 |
+
"print(f\"Keywords ile metin arasındaki benzerlik: {similarity_scores[0][0]}\")\n"
|
941 |
+
]
|
942 |
+
},
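Note on the TypeError above: `extract_keywords_tfidf` indexes `feature_names` with the matrix returned by `X.sum(axis=0).argsort()[0, ::-1]`, so `keywords` ends up holding a nested NumPy array and `" ".join(keywords)` fails. A minimal sketch of one possible repair, reusing the names from the cell (the `top_n` parameter is an added illustrative detail), so the function returns plain strings:

    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer

    def extract_keywords_tfidf(corpus, stop_words_list, top_n=20):
        # Rank features by summed TF-IDF weight and return plain Python strings.
        vectorizer = TfidfVectorizer(stop_words=stop_words_list)
        X = vectorizer.fit_transform(corpus)
        feature_names = vectorizer.get_feature_names_out()
        scores = np.asarray(X.sum(axis=0)).ravel()      # 1-D weight per feature
        order = scores.argsort()[::-1][:top_n]          # highest-weight indices first
        return [feature_names[i] for i in order]        # list of str, safe for " ".join()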
|
943 |
+
{
|
944 |
+
"cell_type": "code",
|
945 |
+
"execution_count": 1,
|
946 |
+
"metadata": {},
|
947 |
+
"outputs": [
|
948 |
+
{
|
949 |
+
"name": "stderr",
|
950 |
+
"output_type": "stream",
|
951 |
+
"text": [
|
952 |
+
"\n",
|
953 |
+
"A module that was compiled using NumPy 1.x cannot be run in\n",
|
954 |
+
"NumPy 2.1.0 as it may crash. To support both 1.x and 2.x\n",
|
955 |
+
"versions of NumPy, modules must be compiled with NumPy 2.0.\n",
|
956 |
+
"Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.\n",
|
957 |
+
"\n",
|
958 |
+
"If you are a user of the module, the easiest solution will be to\n",
|
959 |
+
"downgrade to 'numpy<2' or try to upgrade the affected module.\n",
|
960 |
+
"We expect that some modules will need time to support NumPy 2.\n",
|
961 |
+
"\n",
|
962 |
+
"Traceback (most recent call last): File \"C:\\Users\\info\\AppData\\Local\\Programs\\Python\\Python310\\lib\\runpy.py\", line 196, in _run_module_as_main\n",
|
963 |
+
" return _run_code(code, main_globals, None,\n",
|
964 |
+
" File \"C:\\Users\\info\\AppData\\Local\\Programs\\Python\\Python310\\lib\\runpy.py\", line 86, in _run_code\n",
|
965 |
+
" exec(code, run_globals)\n",
|
966 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel_launcher.py\", line 18, in <module>\n",
|
967 |
+
" app.launch_new_instance()\n",
|
968 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\traitlets\\config\\application.py\", line 1075, in launch_instance\n",
|
969 |
+
" app.start()\n",
|
970 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\kernelapp.py\", line 739, in start\n",
|
971 |
+
" self.io_loop.start()\n",
|
972 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tornado\\platform\\asyncio.py\", line 205, in start\n",
|
973 |
+
" self.asyncio_loop.run_forever()\n",
|
974 |
+
" File \"C:\\Users\\info\\AppData\\Local\\Programs\\Python\\Python310\\lib\\asyncio\\base_events.py\", line 603, in run_forever\n",
|
975 |
+
" self._run_once()\n",
|
976 |
+
" File \"C:\\Users\\info\\AppData\\Local\\Programs\\Python\\Python310\\lib\\asyncio\\base_events.py\", line 1909, in _run_once\n",
|
977 |
+
" handle._run()\n",
|
978 |
+
" File \"C:\\Users\\info\\AppData\\Local\\Programs\\Python\\Python310\\lib\\asyncio\\events.py\", line 80, in _run\n",
|
979 |
+
" self._context.run(self._callback, *self._args)\n",
|
980 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 545, in dispatch_queue\n",
|
981 |
+
" await self.process_one()\n",
|
982 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 534, in process_one\n",
|
983 |
+
" await dispatch(*args)\n",
|
984 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 437, in dispatch_shell\n",
|
985 |
+
" await result\n",
|
986 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\ipkernel.py\", line 362, in execute_request\n",
|
987 |
+
" await super().execute_request(stream, ident, parent)\n",
|
988 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 778, in execute_request\n",
|
989 |
+
" reply_content = await reply_content\n",
|
990 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\ipkernel.py\", line 449, in do_execute\n",
|
991 |
+
" res = shell.run_cell(\n",
|
992 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\zmqshell.py\", line 549, in run_cell\n",
|
993 |
+
" return super().run_cell(*args, **kwargs)\n",
|
994 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3075, in run_cell\n",
|
995 |
+
" result = self._run_cell(\n",
|
996 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3130, in _run_cell\n",
|
997 |
+
" result = runner(coro)\n",
|
998 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\async_helpers.py\", line 128, in _pseudo_sync_runner\n",
|
999 |
+
" coro.send(None)\n",
|
1000 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3334, in run_cell_async\n",
|
1001 |
+
" has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n",
|
1002 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3517, in run_ast_nodes\n",
|
1003 |
+
" if await self.run_code(code, result, async_=asy):\n",
|
1004 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3577, in run_code\n",
|
1005 |
+
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
|
1006 |
+
" File \"C:\\Users\\info\\AppData\\Local\\Temp\\ipykernel_17960\\3105833283.py\", line 7, in <module>\n",
|
1007 |
+
" from transformers import BertTokenizer, BertModel\n",
|
1008 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\__init__.py\", line 26, in <module>\n",
|
1009 |
+
" from . import dependency_versions_check\n",
|
1010 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\dependency_versions_check.py\", line 16, in <module>\n",
|
1011 |
+
" from .utils.versions import require_version, require_version_core\n",
|
1012 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\__init__.py\", line 34, in <module>\n",
|
1013 |
+
" from .generic import (\n",
|
1014 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\generic.py\", line 462, in <module>\n",
|
1015 |
+
" import torch.utils._pytree as _torch_pytree\n",
|
1016 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\__init__.py\", line 2120, in <module>\n",
|
1017 |
+
" from torch._higher_order_ops import cond\n",
|
1018 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\_higher_order_ops\\__init__.py\", line 1, in <module>\n",
|
1019 |
+
" from .cond import cond\n",
|
1020 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\_higher_order_ops\\cond.py\", line 5, in <module>\n",
|
1021 |
+
" import torch._subclasses.functional_tensor\n",
|
1022 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\_subclasses\\functional_tensor.py\", line 42, in <module>\n",
|
1023 |
+
" class FunctionalTensor(torch.Tensor):\n",
|
1024 |
+
" File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\_subclasses\\functional_tensor.py\", line 258, in FunctionalTensor\n",
|
1025 |
+
" cpu = _conversion_method_template(device=torch.device(\"cpu\"))\n",
|
1026 |
+
"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\_subclasses\\functional_tensor.py:258: UserWarning: Failed to initialize NumPy: _ARRAY_API not found (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\torch\\csrc\\utils\\tensor_numpy.cpp:84.)\n",
|
1027 |
+
" cpu = _conversion_method_template(device=torch.device(\"cpu\"))\n"
|
1028 |
+
]
|
1029 |
+
},
|
1030 |
+
{
|
1031 |
+
"ename": "NameError",
|
1032 |
+
"evalue": "name 'texts' is not defined",
|
1033 |
+
"output_type": "error",
|
1034 |
+
"traceback": [
|
1035 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
1036 |
+
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
1037 |
+
"Cell \u001b[1;32mIn[1], line 62\u001b[0m\n\u001b[0;32m 59\u001b[0m tfidf_vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer()\n\u001b[0;32m 61\u001b[0m \u001b[38;5;66;03m# TF-IDF anahtar kelimelerini çıkar\u001b[39;00m\n\u001b[1;32m---> 62\u001b[0m keywords \u001b[38;5;241m=\u001b[39m extract_keywords_tfidf(\u001b[43mtexts\u001b[49m, stop_words_list)\n\u001b[0;32m 63\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTF-IDF Keywords:\u001b[39m\u001b[38;5;124m\"\u001b[39m, keywords)\n\u001b[0;32m 65\u001b[0m \u001b[38;5;66;03m# Transform the text and keywords into TF-IDF representations\u001b[39;00m\n",
|
1038 |
+
"\u001b[1;31mNameError\u001b[0m: name 'texts' is not defined"
|
1039 |
+
]
|
1040 |
+
}
|
1041 |
+
],
|
1042 |
+
"source": [
|
1043 |
+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
1044 |
+
"import numpy as np\n",
|
1045 |
+
"import re\n",
|
1046 |
+
"import pandas as pd\n",
|
1047 |
+
"from nltk.stem import WordNetLemmatizer\n",
|
1048 |
+
"from nltk.corpus import stopwords as nltk_stopwords\n",
|
1049 |
+
"from transformers import BertTokenizer, BertModel\n",
|
1050 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
1051 |
+
"import torch\n",
|
1052 |
+
"import torch.nn.functional as F\n",
|
1053 |
+
"\n",
|
1054 |
+
"# BERT Tokenizer ve Model'i yükleyin\n",
|
1055 |
+
"tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
|
1056 |
+
"model = BertModel.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
|
1057 |
+
"\n",
|
1058 |
+
"#-------------------------- burada turkish_stop_words'ü alıyoruz\n",
|
1059 |
+
"def load_stop_words(file_path):\n",
|
1060 |
+
" \"\"\"Stop words'leri dosyadan okuyarak bir liste oluşturur.\"\"\"\n",
|
1061 |
+
" with open(file_path, 'r', encoding='utf-8') as file:\n",
|
1062 |
+
" stop_words = [line.strip() for line in file if line.strip()]\n",
|
1063 |
+
" return stop_words\n",
|
1064 |
+
"\n",
|
1065 |
+
"# Keywords çıkarma fonksiyonu\n",
|
1066 |
+
"def extract_keywords_tfidf(corpus, stop_words_list):\n",
|
1067 |
+
" \"\"\"TF-IDF ile anahtar kelimeleri çıkarır, stop words listesi ile birlikte kullanır.\"\"\"\n",
|
1068 |
+
" vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
|
1069 |
+
" X = vectorizer.fit_transform(corpus)\n",
|
1070 |
+
" feature_names = vectorizer.get_feature_names_out()\n",
|
1071 |
+
" sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
|
1072 |
+
" return sorted_keywords\n",
|
1073 |
+
"\n",
|
1074 |
+
"# Türkçe stop words dosyasını yükleyin\n",
|
1075 |
+
"stop_words_list = load_stop_words('turkish_stop_words.txt')\n",
|
1076 |
+
"# Define the text\n",
|
1077 |
+
"text = \"\"\"Biyografi\n",
|
1078 |
+
"İsim \n",
|
1079 |
+
"Pşıqo ismi Ahecaqo soy ismidir. Çerkeslerin \"-qo\" soneki ile biten hem soyadları hem de lakapları vardı. Bu ek Türkçe adlardaki \"-oğlu\" ekiyle eş anlamlıdır. Pşıqo Türkçe \"Beyoğlu\" anlamına gelen bir lakaptır.\n",
|
1080 |
+
"\n",
|
1081 |
+
"Erken dönem \n",
|
1082 |
+
"Çerkesler tarihlerini yazmadıkları ve tüm bilgiler Rus kaynaklarından geldiği için Ahecaqo hakkında pek bir şey kaydedilmedi. 1777'de Çerkesya'nın Bjeduğ bölgesinde doğdu. Askerî eğitim ile büyütüldü.\n",
|
1083 |
+
"\n",
|
1084 |
+
"Rus-Çerkes Savaşına Katılımı \n",
|
1085 |
+
"Birkaç kaynak, Ahecaqo'nun tüm Çerkesya'da saygı duyulan bir kişi olduğunu belirtir. En az 6.000 atlıdan oluşan kalıcı bir ordusu vardı ve çatışmalar sırasında müfrezesi 12.000 atlıya ulaşıyordu. Rus birliklerine karşı kazandığı zaferlerle ünlüydü. Askeri becerisinin yanı sıra yetenekli bir devlet adamıydı.\n",
|
1086 |
+
"\n",
|
1087 |
+
"Ölüm \n",
|
1088 |
+
"1837 yılında Rus tarafına geçti ve bir yıl sonra hastalıktan öldü.\n",
|
1089 |
+
"\n",
|
1090 |
+
"Kaynakça \n",
|
1091 |
+
"\n",
|
1092 |
+
"Çerkes soylular\n",
|
1093 |
+
"1777 doğumlular\n",
|
1094 |
+
"1838 yılında ölenler\n",
|
1095 |
+
"Kafkas Savaşı'nda kişiler \"\"\"\n",
|
1096 |
+
"\n",
|
1097 |
+
"# Define the keywords\n",
|
1098 |
+
"#keywords = [\"rus\", \"ahecaqo\", \"türkçe\", \"pşıqo\", \"1777\", \"çerkes\", \"000\", \"çerkesya\", \"ölenler\", \"ünlüydü\"]\n",
|
1099 |
+
"\n",
|
1100 |
+
"# Create a TfidfVectorizer instance\n",
|
1101 |
+
"tfidf_vectorizer = TfidfVectorizer()\n",
|
1102 |
+
"\n",
|
1103 |
+
"# TF-IDF anahtar kelimelerini çıkar\n",
|
1104 |
+
"keywords = extract_keywords_tfidf(texts, stop_words_list)\n",
|
1105 |
+
"print(\"TF-IDF Keywords:\", keywords)\n",
|
1106 |
+
"\n",
|
1107 |
+
"# Transform the text and keywords into TF-IDF representations\n",
|
1108 |
+
"text_tfidf = tfidf_vectorizer.fit_transform([text]) #burada text'i de vetörize ediyoruz.\n",
|
1109 |
+
"keywords_tfidf = tfidf_vectorizer.transform(keywords)\n",
|
1110 |
+
"\n",
|
1111 |
+
"# Calculate the cosine similarity between the text and each keyword\n",
|
1112 |
+
"similarities = []\n",
|
1113 |
+
"for i in range(keywords_tfidf.shape[0]): #keyword_tfidf matrisinin satırları üzerinde dönfü tanımlıyoruz \n",
|
1114 |
+
" keyword_tfidf = keywords_tfidf[i, :] # matrisin i. değerini alıyoruz \n",
|
1115 |
+
" # `text_tfidf` ile `keyword_tfidf` arasındaki kosinüs benzerliğini hesaplıyoruz\n",
|
1116 |
+
" similarity = np.dot(text_tfidf, keyword_tfidf.T).toarray()[0][0]\n",
|
1117 |
+
" similarities.append((keywords[i], similarity))\n",
|
1118 |
+
"\n",
|
1119 |
+
"# Sort the similarities in descending order\n",
|
1120 |
+
"keyword_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)\n",
|
1121 |
+
"\n",
|
1122 |
+
"\n",
|
1123 |
+
"\n",
|
1124 |
+
"# Print the top 10 keywords with their similarities\n",
|
1125 |
+
"print(\"Top 10 Keywords with Similarities:\")\n",
|
1126 |
+
"for keyword, similarity in keyword_similarities[:10]:\n",
|
1127 |
+
" print(f\"{keyword}: {similarity:.4f}\")"
|
1128 |
+
]
|
1129 |
+
},
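Note on the NameError above: this cell defines `text` (singular) but calls `extract_keywords_tfidf(texts, ...)`, and `texts` only exists in the earlier cell. A minimal sketch, assuming the keyword extractor from the previous cells is in scope:

    # This cell defines `text` (singular); wrap it so the extractor gets a list of documents.
    texts = [text]
    keywords = extract_keywords_tfidf(texts, stop_words_list)
    print("TF-IDF Keywords:", keywords[:10])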
|
1130 |
+
{
|
1131 |
+
"cell_type": "code",
|
1132 |
+
"execution_count": 24,
|
1133 |
+
"metadata": {},
|
1134 |
+
"outputs": [
|
1135 |
+
{
|
1136 |
+
"ename": "AttributeError",
|
1137 |
+
"evalue": "'list' object has no attribute 'lower'",
|
1138 |
+
"output_type": "error",
|
1139 |
+
"traceback": [
|
1140 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
1141 |
+
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
1142 |
+
"Cell \u001b[1;32mIn[24], line 14\u001b[0m\n\u001b[0;32m 12\u001b[0m tfidf_vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer(stop_words\u001b[38;5;241m=\u001b[39mstop_words_list)\n\u001b[0;32m 13\u001b[0m corpus \u001b[38;5;241m=\u001b[39m [text, keywords]\n\u001b[1;32m---> 14\u001b[0m tfidf_matrix \u001b[38;5;241m=\u001b[39m \u001b[43mtfidf_vectorizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcorpus\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeler vektörünü ve diğer metin vektörünü ayırma\u001b[39;00m\n\u001b[0;32m 17\u001b[0m keywords_vector \u001b[38;5;241m=\u001b[39m tfidf_matrix[\u001b[38;5;241m1\u001b[39m]\n",
|
1143 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:2091\u001b[0m, in \u001b[0;36mTfidfVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 2084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_params()\n\u001b[0;32m 2085\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf \u001b[38;5;241m=\u001b[39m TfidfTransformer(\n\u001b[0;32m 2086\u001b[0m norm\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnorm,\n\u001b[0;32m 2087\u001b[0m use_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_idf,\n\u001b[0;32m 2088\u001b[0m smooth_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msmooth_idf,\n\u001b[0;32m 2089\u001b[0m sublinear_tf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msublinear_tf,\n\u001b[0;32m 2090\u001b[0m )\n\u001b[1;32m-> 2091\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2092\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf\u001b[38;5;241m.\u001b[39mfit(X)\n\u001b[0;32m 2093\u001b[0m \u001b[38;5;66;03m# X is already a transformed view of raw_documents so\u001b[39;00m\n\u001b[0;32m 2094\u001b[0m \u001b[38;5;66;03m# we set copy to False\u001b[39;00m\n",
|
1144 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fit_method(estimator, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
1145 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1372\u001b[0m, in \u001b[0;36mCountVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 1364\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 1365\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUpper case characters found in\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1366\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m vocabulary while \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlowercase\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1367\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m is True. These entries will not\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1368\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be matched with any documents\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1369\u001b[0m )\n\u001b[0;32m 1370\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m-> 1372\u001b[0m vocabulary, X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_count_vocab\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfixed_vocabulary_\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1374\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbinary:\n\u001b[0;32m 1375\u001b[0m X\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39mfill(\u001b[38;5;241m1\u001b[39m)\n",
|
1146 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1259\u001b[0m, in \u001b[0;36mCountVectorizer._count_vocab\u001b[1;34m(self, raw_documents, fixed_vocab)\u001b[0m\n\u001b[0;32m 1257\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m raw_documents:\n\u001b[0;32m 1258\u001b[0m feature_counter \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m-> 1259\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m feature \u001b[38;5;129;01min\u001b[39;00m \u001b[43manalyze\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[0;32m 1260\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1261\u001b[0m feature_idx \u001b[38;5;241m=\u001b[39m vocabulary[feature]\n",
|
1147 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:108\u001b[0m, in \u001b[0;36m_analyze\u001b[1;34m(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)\u001b[0m\n\u001b[0;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 107\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m preprocessor \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m--> 108\u001b[0m doc \u001b[38;5;241m=\u001b[39m \u001b[43mpreprocessor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tokenizer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 110\u001b[0m doc \u001b[38;5;241m=\u001b[39m tokenizer(doc)\n",
|
1148 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:66\u001b[0m, in \u001b[0;36m_preprocess\u001b[1;34m(doc, accent_function, lower)\u001b[0m\n\u001b[0;32m 47\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Chain together an optional series of text preprocessing steps to\u001b[39;00m\n\u001b[0;32m 48\u001b[0m \u001b[38;5;124;03mapply to a document.\u001b[39;00m\n\u001b[0;32m 49\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 63\u001b[0m \u001b[38;5;124;03m preprocessed string\u001b[39;00m\n\u001b[0;32m 64\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 65\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m lower:\n\u001b[1;32m---> 66\u001b[0m doc \u001b[38;5;241m=\u001b[39m \u001b[43mdoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlower\u001b[49m()\n\u001b[0;32m 67\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m accent_function \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 68\u001b[0m doc \u001b[38;5;241m=\u001b[39m accent_function(doc)\n",
|
1149 |
+
"\u001b[1;31mAttributeError\u001b[0m: 'list' object has no attribute 'lower'"
|
1150 |
+
]
|
1151 |
+
}
|
1152 |
+
],
|
1153 |
+
"source": [
|
1154 |
+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
1155 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
1156 |
+
"import numpy as np\n",
|
1157 |
+
"\n",
|
1158 |
+
"#metin ile keywordslerin benzerlik oranını hesaplama \n",
|
1159 |
+
"text,keywords\n",
|
1160 |
+
"\n",
|
1161 |
+
"# Metinleri birleştirip TF-IDF matrisini oluşturma\n",
|
1162 |
+
"# TF-IDF vektörleştirici oluşturma\n",
|
1163 |
+
"# Türkçe stop words dosyasını yükleyin\n",
|
1164 |
+
"stop_words_list = load_stop_words('turkish_stop_words.txt')\n",
|
1165 |
+
"tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
|
1166 |
+
"corpus = [text, keywords]\n",
|
1167 |
+
"tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)\n",
|
1168 |
+
"\n",
|
1169 |
+
"# Anahtar kelimeler vektörünü ve diğer metin vektörünü ayırma\n",
|
1170 |
+
"keywords_vector = tfidf_matrix[1]\n",
|
1171 |
+
"text_vector = tfidf_matrix[0]\n",
|
1172 |
+
"\n",
|
1173 |
+
"# Anahtar kelimeler ve metin arasındaki cosine similarity hesaplama\n",
|
1174 |
+
"similarity_score = cosine_similarity(keywords_vector, text_vector)\n",
|
1175 |
+
"\n",
|
1176 |
+
"# Sonucu yazdırma\n",
|
1177 |
+
"print(f\"Keywords ile metin arasındaki benzerlik: {similarity_score[0][0]}\")\n"
|
1178 |
+
]
|
1179 |
+
},
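Note on the AttributeError above: `corpus = [text, keywords]` mixes a string with a Python list, and TfidfVectorizer expects every document to be a string. A small sketch of the likely intent (same names as the cell above, assuming `keywords` is a flat list of strings):

    # Every corpus entry must be a string, so collapse the keyword list into one pseudo-document.
    keywords_doc = " ".join(keywords)
    corpus = [text, keywords_doc]
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    similarity_score = cosine_similarity(tfidf_matrix[1], tfidf_matrix[0])
    print(f"Keywords ile metin arasindaki benzerlik: {similarity_score[0][0]}")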
|
1180 |
+
{
|
1181 |
+
"cell_type": "code",
|
1182 |
+
"execution_count": 19,
|
1183 |
+
"metadata": {},
|
1184 |
+
"outputs": [
|
1185 |
+
{
|
1186 |
+
"ename": "TypeError",
|
1187 |
+
"evalue": "'function' object is not subscriptable",
|
1188 |
+
"output_type": "error",
|
1189 |
+
"traceback": [
|
1190 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
1191 |
+
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
1192 |
+
"Cell \u001b[1;32mIn[19], line 18\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m embeddings\n\u001b[0;32m 17\u001b[0m \u001b[38;5;66;03m# Compute BERT embeddings for the top 10 keywords\u001b[39;00m\n\u001b[1;32m---> 18\u001b[0m top_keywords \u001b[38;5;241m=\u001b[39m [keyword \u001b[38;5;28;01mfor\u001b[39;00m keyword, _ \u001b[38;5;129;01min\u001b[39;00m \u001b[43mextract_keywords_tfidf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m]\u001b[49m]\n\u001b[0;32m 19\u001b[0m bert_embeddings \u001b[38;5;241m=\u001b[39m compute_bert_embeddings(top_keywords)\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m# Define a function to compute the similarity between two embeddings\u001b[39;00m\n",
|
1193 |
+
"\u001b[1;31mTypeError\u001b[0m: 'function' object is not subscriptable"
|
1194 |
+
]
|
1195 |
+
}
|
1196 |
+
],
|
1197 |
+
"source": [
|
1198 |
+
"#------------------------------ tf-ıdf ve embedding benzerlik \n",
|
1199 |
+
"# Define a function to compute BERT embeddings for a list of keywords\n",
|
1200 |
+
"\n",
|
1201 |
+
"def compute_bert_embeddings(keywords):\n",
|
1202 |
+
" embeddings = []\n",
|
1203 |
+
" for keyword in keywords:\n",
|
1204 |
+
" inputs = tokenizer.encode_plus(\n",
|
1205 |
+
" keyword,\n",
|
1206 |
+
" add_special_tokens=True,\n",
|
1207 |
+
" max_length=512,\n",
|
1208 |
+
" return_attention_mask=True,\n",
|
1209 |
+
" return_tensors='pt'\n",
|
1210 |
+
" )\n",
|
1211 |
+
" outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])\n",
|
1212 |
+
" embeddings.append(outputs.last_hidden_state[:, 0, :]) # Take the embedding of the [CLS] token\n",
|
1213 |
+
" return embeddings\n",
|
1214 |
+
"\n",
|
1215 |
+
"# Compute BERT embeddings for the top 10 keywords\n",
|
1216 |
+
"top_keywords = [keyword for keyword, score in extract_keywords_tfidf[:10]]\n",
|
1217 |
+
"bert_embeddings = compute_bert_embeddings(top_keywords)\n",
|
1218 |
+
"\n",
|
1219 |
+
"# Define a function to compute the similarity between two embeddings\n",
|
1220 |
+
"def compute_similarity(embedding1, embedding2):\n",
|
1221 |
+
" return F.cosine_similarity(embedding1, embedding2)\n",
|
1222 |
+
"\n",
|
1223 |
+
"# Compute the similarity between the text and each keyword\n",
|
1224 |
+
"similarities = []\n",
|
1225 |
+
"for keyword_embedding in enumerate(bert_embeddings):\n",
|
1226 |
+
"\n",
|
1227 |
+
" keyword= top_keywords[i]\n",
|
1228 |
+
" score = extract_keywords_tfidf[i][1]\n",
|
1229 |
+
" similarity = compute_similarity(positive_avg_embedding, keyword_embedding)\n",
|
1230 |
+
" similarities.append(keyword,similarity.item()*score)\n",
|
1231 |
+
"\n",
|
1232 |
+
"# Combine the top 10 keywords with their similarities\n",
|
1233 |
+
"keyword_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)\n",
|
1234 |
+
"# Combine the top 10 keywords with their similarities\n",
|
1235 |
+
"#keyword_similarities = list(zip(top_keywords, similarities))"
|
1236 |
+
]
|
1237 |
+
},
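Note on the TypeError above: the cell slices the function object (`extract_keywords_tfidf[:10]`) instead of its return value; the loop below it also drops the index from `enumerate` and passes two positional arguments to `append`. A hedged sketch of one way the cell could run, assuming `texts`, `stop_words_list`, `avg_embedding`, `compute_bert_embeddings` and `F` from the earlier cells are in scope (the unfiltered `avg_embedding` is used because the `positive_avg_embedding` filter changes the vector length):

    # Call the keyword extractor, keep the loop index, and append one (keyword, score) tuple.
    top_keywords = extract_keywords_tfidf(texts, stop_words_list)[:10]
    bert_embeddings = compute_bert_embeddings(top_keywords)

    similarities = []
    for i, keyword_embedding in enumerate(bert_embeddings):
        sim = F.cosine_similarity(avg_embedding.unsqueeze(0), keyword_embedding).item()
        similarities.append((top_keywords[i], sim))

    keyword_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)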
|
1238 |
{
|
1239 |
"cell_type": "code",
|
1240 |
"execution_count": 8,
|
|
|
1300 |
"test_stop_words_effectiveness(texts, stop_words_list)\n"
|
1301 |
]
|
1302 |
},
|
1303 |
+
{
|
1304 |
+
"cell_type": "markdown",
|
1305 |
+
"metadata": {},
|
1306 |
+
"source": [
|
1307 |
+
"K-nn ile Cosine Similarity "
|
1308 |
+
]
|
1309 |
+
},
|
1310 |
+
{
|
1311 |
+
"cell_type": "code",
|
1312 |
+
"execution_count": null,
|
1313 |
+
"metadata": {},
|
1314 |
+
"outputs": [],
|
1315 |
+
"source": [
|
1316 |
+
"#tf-ıdf değeleri arasınadki en çok metinde tekrarlanan ve anlam ilşikisi en yüksek olan kelimeleri kıyaslama \n",
|
1317 |
+
"model.most_similar(positive=[\"rus\",])"
|
1318 |
+
]
|
1319 |
+
},
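Note: `most_similar` is a gensim KeyedVectors API, not a `BertModel` method, so the cell above would raise an AttributeError as written. A hedged sketch of the same idea with gensim, assuming gensim and a pretrained Turkish word-vector file are available (the `trmodel.bin` path is purely hypothetical):

    from gensim.models import KeyedVectors

    # Load word vectors and query the most similar words to "rus" (illustrative only).
    wv = KeyedVectors.load_word2vec_format("trmodel.bin", binary=True)
    print(wv.most_similar(positive=["rus"], topn=5))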
|
1320 |
+
{
|
1321 |
+
"cell_type": "code",
|
1322 |
+
"execution_count": null,
|
1323 |
+
"metadata": {},
|
1324 |
+
"outputs": [],
|
1325 |
+
"source": [
|
1326 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
1327 |
+
"\n",
|
1328 |
+
"# TF-IDF ile vektörleri oluştur\n",
|
1329 |
+
"vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
|
1330 |
+
"tfidf_matrix = vectorizer.fit_transform(texts)\n",
|
1331 |
+
"\n",
|
1332 |
+
"# BERT ile elde edilen pozitif embedding'leri TF-IDF vektörlerine dönüştür\n",
|
1333 |
+
"# Bu adımda, her kelimenin veya metnin TF-IDF ağırlıklarıyla karşılaştırılması yapılacak\n",
|
1334 |
+
"\n",
|
1335 |
+
"def get_tfidf_vector_for_query(query, vectorizer):\n",
|
1336 |
+
" \"\"\"Sorgu metni için TF-IDF vektörü alır\"\"\"\n",
|
1337 |
+
" return vectorizer.transform([query])\n",
|
1338 |
+
"\n",
|
1339 |
+
"def calculate_similarity(tfidf_vector, embeddings):\n",
|
1340 |
+
" \"\"\"TF-IDF vektörü ile embeddings arasındaki cosine similarity hesaplar\"\"\"\n",
|
1341 |
+
" return cosine_similarity(tfidf_vector, embeddings)\n",
|
1342 |
+
"\n",
|
1343 |
+
"# Sorgu metnini tanımlayın ve TF-IDF vektörünü alın\n",
|
1344 |
+
"query_text = \"Nasılsın?\"\n",
|
1345 |
+
"query_tfidf_vector = get_tfidf_vector_for_query(query_text, vectorizer)\n",
|
1346 |
+
"\n",
|
1347 |
+
"# Cosine similarity hesaplayın\n",
|
1348 |
+
"similarity_scores = calculate_similarity(query_tfidf_vector, tfidf_matrix)\n",
|
1349 |
+
"\n",
|
1350 |
+
"# Sonuçları yazdırın\n",
|
1351 |
+
"print(\"Cosine Similarity Scores:\", similarity_scores)\n"
|
1352 |
+
]
|
1353 |
+
},
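A short usage sketch for the cell above (same names): `similarity_scores` comes back with shape (1, n_documents), so the closest document can be reported directly.

    # Index of the document most similar to the query (ties broken arbitrarily).
    best_idx = int(similarity_scores.argmax())
    print(f"Most similar document index: {best_idx}, score: {similarity_scores[0][best_idx]:.4f}")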
|
1354 |
+
{
|
1355 |
+
"cell_type": "code",
|
1356 |
+
"execution_count": null,
|
1357 |
+
"metadata": {},
|
1358 |
+
"outputs": [],
|
1359 |
+
"source": [
|
1360 |
+
"from sklearn.neighbors import NearestNeighbors\n",
|
1361 |
+
"\n",
|
1362 |
+
"def fit_knn_model(embeddings,n_neighbors=5):\n",
|
1363 |
+
" knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')\n",
|
1364 |
+
" knn.fit(embeddings)\n",
|
1365 |
+
" return knn\n",
|
1366 |
+
"\n",
|
1367 |
+
"embeddings= np.array([get_bert_embeddings(text) for text in texts])\n",
|
1368 |
+
"#knn\n",
|
1369 |
+
"knn_model=fit_knn_model(embeddings)\n",
|
1370 |
+
"\n",
|
1371 |
+
"\n",
|
1372 |
+
"#tf-ıdf değelriyle bert üzerinden elde ettiğimiz verlerin benzerliğini hesaplayacağız \n",
|
1373 |
+
"keywords"
|
1374 |
+
]
|
1375 |
+
},
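Note on the cell above: `get_bert_embeddings` returns an `(input_ids, last_hidden_state)` tuple, so `np.array([...])` builds a ragged object array that `NearestNeighbors.fit` cannot consume. A sketch of one workable shape, mean-pooling each text into a single fixed-length vector before fitting k-NN (`embed_texts` is an illustrative helper, not part of the committed code):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    def embed_texts(texts):
        # Mean-pool each text's token embeddings into one fixed-length vector.
        vectors = []
        for t in texts:
            _, hidden = get_bert_embeddings(t)               # hidden: [1, seq_len, 768]
            vectors.append(hidden.mean(dim=1).squeeze(0).numpy())
        return np.vstack(vectors)                            # [n_texts, 768]

    doc_vectors = embed_texts(texts)
    knn_model = NearestNeighbors(n_neighbors=min(5, len(texts)), metric="cosine").fit(doc_vectors)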
|
1376 |
{
|
1377 |
"cell_type": "code",
|
1378 |
"execution_count": 20,
|
deneme.ipynb
ADDED
@@ -0,0 +1,69 @@
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 3,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stderr",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
13 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
14 |
+
]
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"ename": "OSError",
|
18 |
+
"evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
|
19 |
+
"output_type": "error",
|
20 |
+
"traceback": [
|
21 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
22 |
+
"\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
|
23 |
+
"Cell \u001b[1;32mIn[3], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m \n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpymongo\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MongoClient\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n",
|
24 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
|
25 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
|
26 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
|
27 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
|
28 |
+
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
|
29 |
+
"\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
|
30 |
+
]
|
31 |
+
}
|
32 |
+
],
|
33 |
+
"source": [
|
34 |
+
"from datasets import load_dataset\n",
|
35 |
+
"import pandas as pd \n",
|
36 |
+
"from pymongo import MongoClient\n",
|
37 |
+
"from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder"
|
38 |
+
]
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"cell_type": "code",
|
42 |
+
"execution_count": null,
|
43 |
+
"metadata": {},
|
44 |
+
"outputs": [],
|
45 |
+
"source": []
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"metadata": {
|
49 |
+
"kernelspec": {
|
50 |
+
"display_name": ".venv",
|
51 |
+
"language": "python",
|
52 |
+
"name": "python3"
|
53 |
+
},
|
54 |
+
"language_info": {
|
55 |
+
"codemirror_mode": {
|
56 |
+
"name": "ipython",
|
57 |
+
"version": 3
|
58 |
+
},
|
59 |
+
"file_extension": ".py",
|
60 |
+
"mimetype": "text/x-python",
|
61 |
+
"name": "python",
|
62 |
+
"nbconvert_exporter": "python",
|
63 |
+
"pygments_lexer": "ipython3",
|
64 |
+
"version": "3.10.11"
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"nbformat": 4,
|
68 |
+
"nbformat_minor": 2
|
69 |
+
}
|
gereksiz_kelimeler.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
kelimeler.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
CHANGED
@@ -1,6 +1,127 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
1 |
+
absl-py==2.1.0
|
2 |
+
aiofiles==23.2.1
|
3 |
+
aiohappyeyeballs==2.3.4
|
4 |
+
aiohttp==3.10.1
|
5 |
+
aiosignal==1.3.1
|
6 |
+
annotated-types==0.7.0
|
7 |
+
anyio==4.4.0
|
8 |
+
asttokens==2.4.1
|
9 |
+
astunparse==1.6.3
|
10 |
+
attrs==24.1.0
|
11 |
+
certifi==2024.7.4
|
12 |
+
charset-normalizer==3.3.2
|
13 |
+
click==8.1.7
|
14 |
+
colorama==0.4.6
|
15 |
+
comm==0.2.2
|
16 |
+
contourpy==1.2.1
|
17 |
+
cycler==0.12.1
|
18 |
+
datasets==2.20.0
|
19 |
+
debugpy==1.8.5
|
20 |
+
decorator==5.1.1
|
21 |
+
dill==0.3.8
|
22 |
+
dnspython==2.6.1
|
23 |
+
executing==2.0.1
|
24 |
+
fastapi==0.112.0
|
25 |
+
ffmpy==0.4.0
|
26 |
+
filelock==3.15.4
|
27 |
+
flatbuffers==24.3.25
|
28 |
+
fonttools==4.53.1
|
29 |
+
frozenlist==1.4.1
|
30 |
+
fsspec==2024.5.0
|
31 |
+
gast==0.6.0
|
32 |
+
google-pasta==0.2.0
|
33 |
+
gradio==4.40.0
|
34 |
+
gradio_client==1.2.0
|
35 |
+
grpcio==1.65.4
|
36 |
+
h11==0.14.0
|
37 |
+
h5py==3.11.0
|
38 |
+
httpcore==1.0.5
|
39 |
+
httpx==0.27.0
|
40 |
+
huggingface-hub==0.24.5
|
41 |
+
idna==3.7
|
42 |
+
importlib_resources==6.4.0
|
43 |
+
ipykernel==6.29.5
|
44 |
+
ipython==8.26.0
|
45 |
+
jedi==0.19.1
|
46 |
+
Jinja2==3.1.4
|
47 |
+
jupyter_client==8.6.2
|
48 |
+
jupyter_core==5.7.2
|
49 |
+
keras==3.4.1
|
50 |
+
kiwisolver==1.4.5
|
51 |
+
libclang==18.1.1
|
52 |
+
Markdown==3.6
|
53 |
+
markdown-it-py==3.0.0
|
54 |
+
MarkupSafe==2.1.5
|
55 |
+
matplotlib==3.9.0
|
56 |
+
matplotlib-inline==0.1.7
|
57 |
+
mdurl==0.1.2
|
58 |
+
ml-dtypes==0.4.0
|
59 |
+
mpmath==1.3.0
|
60 |
+
multidict==6.0.5
|
61 |
+
multiprocess==0.70.16
|
62 |
+
namex==0.0.8
|
63 |
+
nest-asyncio==1.6.0
|
64 |
+
networkx==3.3
|
65 |
+
numpy==1.26.4
|
66 |
+
opt-einsum==3.3.0
|
67 |
+
optree==0.12.1
|
68 |
+
orjson==3.10.6
|
69 |
+
packaging==24.1
|
70 |
+
pandas==2.2.2
|
71 |
+
parso==0.8.4
|
72 |
+
pillow==10.4.0
|
73 |
+
platformdirs==4.2.2
|
74 |
+
prompt_toolkit==3.0.47
|
75 |
+
protobuf==4.25.4
|
76 |
+
psutil==6.0.0
|
77 |
+
pure_eval==0.2.3
|
78 |
+
pyarrow==17.0.0
|
79 |
+
pyarrow-hotfix==0.6
|
80 |
+
pydantic==2.8.2
|
81 |
+
pydantic_core==2.20.1
|
82 |
+
pydub==0.25.1
|
83 |
+
Pygments==2.18.0
|
84 |
+
pymongo==4.8.0
|
85 |
+
pyparsing==3.1.2
|
86 |
+
python-dateutil==2.9.0.post0
|
87 |
+
python-multipart==0.0.9
|
88 |
+
pytz==2024.1
|
89 |
+
pywin32==306
|
90 |
+
PyYAML==6.0.1
|
91 |
+
pyzmq==26.1.0
|
92 |
+
regex==2024.7.24
|
93 |
+
requests==2.32.3
|
94 |
+
rich==13.7.1
|
95 |
+
ruff==0.5.6
|
96 |
+
safetensors==0.4.4
|
97 |
+
semantic-version==2.10.0
|
98 |
+
sentence-transformers==3.0.1
|
99 |
+
shellingham==1.5.4
|
100 |
+
six==1.16.0
|
101 |
+
sniffio==1.3.1
|
102 |
+
stack-data==0.6.3
|
103 |
+
starlette==0.37.2
|
104 |
+
sympy==1.13.1
|
105 |
+
tensorboard==2.17.0
|
106 |
+
tensorboard-data-server==0.7.2
|
107 |
+
tensorflow==2.17.0
|
108 |
+
tensorflow-intel==2.17.0
|
109 |
+
tensorflow-io-gcs-filesystem==0.31.0
|
110 |
+
termcolor==2.4.0
|
111 |
+
tokenizers==0.19.1
|
112 |
+
tomlkit==0.12.0
|
113 |
+
tqdm==4.66.5
|
114 |
+
traitlets==5.14.3
|
115 |
+
transformers==4.43.4
|
116 |
+
typer==0.12.3
|
117 |
+
typing_extensions==4.12.2
|
118 |
+
tzdata==2024.1
|
119 |
+
urllib3==2.2.2
|
120 |
+
uvicorn==0.30.5
|
121 |
+
wcwidth==0.2.13
|
122 |
+
websockets==12.0
|
123 |
+
Werkzeug==3.0.3
|
124 |
+
wrapt==1.16.0
|
125 |
+
xxhash==3.4.1
|
126 |
+
yarl==1.9.4
|
127 |
+
|
turkish_stop_words.txt
ADDED
@@ -0,0 +1,428 @@
|
+ah
+ama
+an
+ancak
+araba
+aralar
+aslında
+az
+başlayan
+bağlı
+bazı
+belirli
+ben
+bence
+birkaç
+birlikte
+bunu
+burada
+biten
+bir
+birkaç
+biz
+bu
+buna
+çünkü
+da
+de
+demek
+den
+derken
+değil
+daha
+dolayı
+edilir
+eğer
+en
+fakat
+genellikle
+gibi
+hem
+her
+herhangi
+hiç
+için
+ile
+ise
+işte
+itibaren
+iyi
+kadar
+karşı
+ki
+kime
+kısaca
+mu
+mü
+nasıl
+ne
+neden
+niye
+nın
+nda
+nun
+o
+olasılıkla
+olabilir
+olarak
+olduğu
+oluşur
+önce
+pek
+peki
+şu
+sadece
+se
+şey
+sırasında
+şimdi
+sonra
+tabi
+tarafına
+tüm
+vardı
+ve
+ya
+ya da
+yanı
+yani
+yıl
+yılında
+yetenekli
+yine
+ama
+amma
+anca
+ancak
+belki
+çünkü
+dahi
+eğer
+emme
+fakat
+gah
+gerek
+hakeza
+halbuki
+hatta
+hele
+hem
+hoş
+ile
+ile
+imdi
+ister
+kah
+keşke
+keza
+kezalik
+kim
+lakin
+madem
+mademki
+mamafih
+meğer
+meğerki
+meğerse
+netekim
+neyse
+nitekim
+oysa
+oysaki
+şayet
+velev
+velhasıl
+velhasılıkelam
+veya
+veyahut
+yahut
+yalnız
+yani
+yok
+yoksa
+zira
+acaba
+acep
+açıkça
+açıkçası
+adamakıllı
+adeta
+bazen
+bazı
+bilcümle
+binaen
+binaenaleyh
+bir
+biraz
+birazdan
+birden
+birden
+birdenbire
+birice
+birlikte
+bitevi
+biteviye
+bittabi
+bizatihi
+bizce
+bizcileyin
+bizden
+bizzat
+boşuna
+böyle
+böylece
+böylecene
+böylelikle
+böylemesine
+böylesine
+buracıkta
+burada
+buradan
+büsbütün
+çabuk
+çabukça
+çeşitli
+çoğu
+çoğun
+çoğunca
+çoğunlukla
+çok
+çokça
+çokluk
+çoklukla
+cuk
+daha
+dahil
+dahilen
+daima
+demin
+demincek
+deminden
+derakap
+derhal
+derken
+diye
+elbet
+elbette
+enikonu
+epey
+epeyce
+epeyi
+esasen
+esnasında
+etraflı
+etraflıca
+evleviyetle
+evvel
+evvela
+evvelce
+evvelden
+evvelemirde
+evveli
+gayet
+gayetle
+gayri
+gayrı
+geçende
+geçenlerde
+gene
+gerçi
+gibi
+gibilerden
+gibisinden
+gine
+halen
+halihazırda
+haliyle
+handiyse
+hani
+hasılı
+hulasaten
+iken
+illa
+illaki
+itibarıyla
+iyice
+iyicene
+kala
+kez
+kısaca
+külliyen
+lütfen
+nasıl
+nasılsa
+nazaran
+neden
+nedeniyle
+nedense
+nerde
+nerden
+nerdeyse
+nerede
+nereden
+neredeyse
+nereye
+neye
+neyi
+nice
+niçin
+nihayet
+nihayetinde
+niye
+oldu
+oldukça
+olur
+onca
+önce
+önceden
+önceleri
+öncelikle
+onculayın
+ondan
+oracık
+oracıkta
+orada
+oradan
+oranca
+oranla
+oraya
+öyle
+öylece
+öylelikle
+öylemesine
+pek
+pekala
+pekçe
+peki
+peyderpey
+sadece
+sahi
+sahiden
+sanki
+sonra
+sonradan
+sonraları
+sonunda
+şöyle
+şuncacık
+şuracıkta
+tabii
+tam
+tamam
+tamamen
+tamamıyla
+tek
+vasıtasıyla
+yakinen
+yakında
+yakından
+yakınlarda
+yalnız
+yalnızca
+yeniden
+yenilerde
+yine
+yok
+yoluyla
+yüzünden
+zaten
+zati
+ait
+bari
+beri
+bile
+değin
+dek
+denli
+doğru
+dolayı
+dolayısıyla
+gelgelelim
+gibi
+gırla
+göre
+hasebiyle
+için
+ila
+ile
+ilen
+indinde
+inen
+kadar
+kaffesi
+karşın
+kelli
+Leh
+maada
+mebni
+naşi
+rağmen
+üzere
+zarfında
+öbür
+bana
+başkası
+ben
+beriki
+birbiri
+birçoğu
+biri
+birileri
+birisi
+birkaçı
+biz
+bizimki
+buna
+bunda
+bundan
+bunlar
+bunu
+bunun
+burası
+çoğu
+çoğu
+çokları
+çoklarınca
+cümlesi
+değil
+diğeri
+filanca
+hangisi
+hepsi
+hiçbiri
+iş
+kaçı
+kaynak
+kendi
+kim
+kimi
+kimisi
+kimse
+kimse
+kimsecik
+kimsecikler
+nere
+neresi
+öbürkü
+öbürü
+ona
+onda
+ondan
+onlar
+onu
+onun
+öteki
+ötekisi
+öz
+sana
+sen
+siz
+şuna
+şunda
+şundan
+şunlar
+şunu
+şunun
+şura
+şuracık
+şurası
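
Not part of the uploaded files, but for context: a one-entry-per-line stop word file such as turkish_stop_words.txt is typically read into a set and used to filter tokens before modeling. The sketch below assumes that usage; `load_stop_words` and `remove_stop_words` are illustrative names, not functions from this repo, and multi-word entries such as "ya da" would need phrase-level matching that this simple token filter does not attempt.

```python
# Minimal sketch (assumed usage, not taken from this commit):
# load a one-word-per-line stop word file and strip those words from text.
from pathlib import Path


def load_stop_words(path: str = "turkish_stop_words.txt") -> set[str]:
    # Read non-empty lines, lowercase them, and collect into a set for fast lookup.
    lines = Path(path).read_text(encoding="utf-8").splitlines()
    return {line.strip().lower() for line in lines if line.strip()}


def remove_stop_words(text: str, stop_words: set[str]) -> str:
    # Keep only whitespace-separated tokens that are not in the stop word set.
    return " ".join(tok for tok in text.split() if tok.lower() not in stop_words)


if __name__ == "__main__":
    stops = load_stop_words()
    print(remove_stop_words("bu bir deneme cümlesi ve sadece örnek", stops))
```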