yonkasoft committed on
Commit
5cd7165
1 Parent(s): dab5364

Upload 9 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ combined_output.csv filter=lfs diff=lfs merge=lfs -text
MLM.ipynb ADDED
@@ -0,0 +1,539 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "ename": "OSError",
10
+ "evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
11
+ "output_type": "error",
12
+ "traceback": [
13
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
14
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
15
+ "Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n",
16
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
17
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
18
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
19
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
20
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
21
+ "\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
22
+ ]
23
+ }
24
+ ],
25
+ "source": [
26
+ "from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n",
27
+ "import torch "
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 2,
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "name": "stderr",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "c:\\gitProjects\\makaleChatUI\\myenv\\Lib\\site-packages\\huggingface_hub\\file_download.py:159: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\info\\.cache\\huggingface\\hub\\models--bert-base-uncased. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
40
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
41
+ " warnings.warn(message)\n",
42
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
43
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
44
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
45
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
46
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
47
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
48
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
49
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
50
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
51
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
52
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
53
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
54
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
55
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
56
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
57
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
58
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
59
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
60
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
61
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
62
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
63
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
64
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
65
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
66
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
67
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
68
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
69
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
70
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
71
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
72
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
73
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
74
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
75
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
76
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
77
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
78
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
79
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
80
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
81
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
82
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
83
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
84
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
85
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
86
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
87
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
88
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
89
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
90
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
91
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
92
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
93
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
94
+ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
95
+ "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
96
+ "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
97
+ ]
98
+ }
99
+ ],
100
+ "source": [
101
+ "tokenizer= BertTokenizer.from_pretrained('bert-base-uncased')\n",
102
+ "model=BertForMaskedLM.from_pretrained('bert-base-uncased')\n",
103
+ "\n",
104
+ "text=(\"After reading these reports,\"\n",
105
+ " \"we start an outline of the application of ML.\"\n",
106
+ " \"It includes the [MASK] process \"\n",
107
+ " \"and various applications (from various software development to hardware development), to [MASK] of IT systems, and various approaches on analytics.\"\n",
108
+ " \"The approach incorporates [MASK] as well as computing and data mining.\"\n",
109
+ " \"For example, software developers and manufacturing engineers used AI \"\n",
110
+ " \"in manufacturing to develop their applications.\"\n",
111
+ " )"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 4,
117
+ "metadata": {},
118
+ "outputs": [
119
+ {
120
+ "data": {
121
+ "text/plain": [
122
+ "dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])"
123
+ ]
124
+ },
125
+ "execution_count": 4,
126
+ "metadata": {},
127
+ "output_type": "execute_result"
128
+ }
129
+ ],
130
+ "source": [
131
+ "#maskeleme yaptıktan sonra tokenlere çeviriyoruz\n",
132
+ "inputs= tokenizer(text,return_tensors='pt')\n",
133
+ "inputs.keys()"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 5,
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "data": {
143
+ "text/plain": [
144
+ "tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
145
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 103,\n",
146
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
147
+ " 8051, 2458, 1007, 1010, 2000, 103, 1997, 2009, 3001, 1010,\n",
148
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 103,\n",
149
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
150
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
151
+ " 2000, 4503, 2037, 5097, 1012, 102]])"
152
+ ]
153
+ },
154
+ "execution_count": 5,
155
+ "metadata": {},
156
+ "output_type": "execute_result"
157
+ }
158
+ ],
159
+ "source": [
160
+ "inputs.input_ids"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": 6,
166
+ "metadata": {},
167
+ "outputs": [],
168
+ "source": [
169
+ "text_normal= (\"After reading these reports,\"\n",
170
+ " \"we start an outline of the application of ML.\"\n",
171
+ " \"It includes the learning process \"\n",
172
+ " \"and various applications (from various software development to hardware development), to analysis of IT systems, and various approaches on analytics.\"\n",
173
+ " \"The approach incorporates AI as well as computing and data mining.\"\n",
174
+ " \"For example, software developers and manufacturing engineers used AI \"\n",
175
+ " \"in manufacturing to develop their applications.\"\n",
176
+ " )"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 8,
182
+ "metadata": {},
183
+ "outputs": [
184
+ {
185
+ "data": {
186
+ "text/plain": [
187
+ "dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])"
188
+ ]
189
+ },
190
+ "execution_count": 8,
191
+ "metadata": {},
192
+ "output_type": "execute_result"
193
+ }
194
+ ],
195
+ "source": [
196
+ "#texti tokenlere çeviriyoruz\n",
197
+ "inputs_2= tokenizer(text_normal,return_tensors='pt')\n",
198
+ "inputs_2.keys()"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 9,
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "data": {
208
+ "text/plain": [
209
+ "tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
210
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
211
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
212
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
213
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
214
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
215
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
216
+ " 2000, 4503, 2037, 5097, 1012, 102]])"
217
+ ]
218
+ },
219
+ "execution_count": 9,
220
+ "metadata": {},
221
+ "output_type": "execute_result"
222
+ }
223
+ ],
224
+ "source": [
225
+ "inputs_2.input_ids"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 10,
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "inputs_2['labels']= inputs_2.input_ids.detach().clone()"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": 11,
240
+ "metadata": {},
241
+ "outputs": [
242
+ {
243
+ "data": {
244
+ "text/plain": [
245
+ "{'input_ids': tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
246
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
247
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
248
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
249
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
250
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
251
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
252
+ " 2000, 4503, 2037, 5097, 1012, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
253
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
254
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
255
+ " 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
256
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
257
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
258
+ " 1, 1, 1, 1]]), 'labels': tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
259
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
260
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
261
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
262
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
263
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
264
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
265
+ " 2000, 4503, 2037, 5097, 1012, 102]])}"
266
+ ]
267
+ },
268
+ "execution_count": 11,
269
+ "metadata": {},
270
+ "output_type": "execute_result"
271
+ }
272
+ ],
273
+ "source": [
274
+ "inputs_2"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": 12,
280
+ "metadata": {},
281
+ "outputs": [
282
+ {
283
+ "data": {
284
+ "text/plain": [
285
+ "torch.Size([1, 76])"
286
+ ]
287
+ },
288
+ "execution_count": 12,
289
+ "metadata": {},
290
+ "output_type": "execute_result"
291
+ }
292
+ ],
293
+ "source": [
294
+ "#random tokenler oluşturacağız labelsiz\n",
295
+ "rand=torch.rand(inputs_2.input_ids.shape)\n",
296
+ "rand.shape"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": 13,
302
+ "metadata": {},
303
+ "outputs": [
304
+ {
305
+ "data": {
306
+ "text/plain": [
307
+ "tensor([[0.9397, 0.1325, 0.1893, 0.8258, 0.7453, 0.1766, 0.9338, 0.0806, 0.0626,\n",
308
+ " 0.6665, 0.4240, 0.3946, 0.5413, 0.3799, 0.4023, 0.8699, 0.8159, 0.1511,\n",
309
+ " 0.6842, 0.0242, 0.7235, 0.0063, 0.1857, 0.9684, 0.8930, 0.8208, 0.5711,\n",
310
+ " 0.0345, 0.9919, 0.1140, 0.7597, 0.4546, 0.6478, 0.2295, 0.2846, 0.6314,\n",
311
+ " 0.3640, 0.9291, 0.3843, 0.3553, 0.1125, 0.0790, 0.4261, 0.4307, 0.6724,\n",
312
+ " 0.8569, 0.4476, 0.8032, 0.0241, 0.0152, 0.4196, 0.5609, 0.0010, 0.7240,\n",
313
+ " 0.4531, 0.5834, 0.5232, 0.3602, 0.6575, 0.9012, 0.1519, 0.2255, 0.0799,\n",
314
+ " 0.5673, 0.7244, 0.4387, 0.2713, 0.4243, 0.8435, 0.1670, 0.8664, 0.6261,\n",
315
+ " 0.4090, 0.2988, 0.3379, 0.7784]])"
316
+ ]
317
+ },
318
+ "execution_count": 13,
319
+ "metadata": {},
320
+ "output_type": "execute_result"
321
+ }
322
+ ],
323
+ "source": [
324
+ "rand"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": 14,
330
+ "metadata": {},
331
+ "outputs": [
332
+ {
333
+ "data": {
334
+ "text/plain": [
335
+ "tensor([[False, True, False, False, False, False, False, True, True, False,\n",
336
+ " False, False, False, False, False, False, False, False, False, True,\n",
337
+ " False, True, False, False, False, False, False, True, False, True,\n",
338
+ " False, False, False, False, False, False, False, False, False, False,\n",
339
+ " True, True, False, False, False, False, False, False, True, True,\n",
340
+ " False, False, True, False, False, False, False, False, False, False,\n",
341
+ " False, False, True, False, False, False, False, False, False, False,\n",
342
+ " False, False, False, False, False, False]])"
343
+ ]
344
+ },
345
+ "execution_count": 14,
346
+ "metadata": {},
347
+ "output_type": "execute_result"
348
+ }
349
+ ],
350
+ "source": [
351
+ "#cümledeki toknelerin yüzde 15 alınır \n",
352
+ "#mask_arr = rand < 0.15 ifadesi, rand fonksiyonunun her bir token için rastgele bir sayı üreteceğini ve bu sayının 0.15'ten küçük olup olmadığına bakarak token'ın maskelenip maskelenmeyeceğini belirler. Eğer sayı 0.15'ten küçükse, token maskelenir; değilse, maskelenmez. \n",
353
+ "mask_arr = rand < 0.15\n",
354
+ "mask_arr"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": 15,
360
+ "metadata": {},
361
+ "outputs": [
362
+ {
363
+ "data": {
364
+ "text/plain": [
365
+ "[1, 7, 8, 19, 21, 27, 29, 40, 41, 48, 49, 52, 62]"
366
+ ]
367
+ },
368
+ "execution_count": 15,
369
+ "metadata": {},
370
+ "output_type": "execute_result"
371
+ }
372
+ ],
373
+ "source": [
374
+ "#burada seçilen değer maskeleme yapılan tokenlarda 0 olmayan karakterlerin yazılmasıdır.\n",
375
+ "#torch flatten özelliği listeden çıkartarak yalnızca bir array olmasını sağladı\n",
376
+ "selection= torch.flatten(mask_arr[0].nonzero()).tolist()\n",
377
+ "selection"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": 16,
383
+ "metadata": {},
384
+ "outputs": [
385
+ {
386
+ "data": {
387
+ "text/plain": [
388
+ "tensor([[ 101, 103, 3752, 2122, 4311, 1010, 2057, 103, 103, 12685,\n",
389
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 103,\n",
390
+ " 2832, 103, 2536, 5097, 1006, 2013, 2536, 103, 2458, 103,\n",
391
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
392
+ " 103, 103, 8107, 2006, 25095, 1012, 1996, 3921, 103, 103,\n",
393
+ " 2004, 2092, 103, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
394
+ " 1010, 4007, 103, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
395
+ " 2000, 4503, 2037, 5097, 1012, 102]])"
396
+ ]
397
+ },
398
+ "execution_count": 16,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "#input_ids değerleri 0 olanlar için 103 değerinim atadık \n",
405
+ "inputs_2.input_ids[0,selection]=103\n",
406
+ "inputs_2.input_ids"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": 17,
412
+ "metadata": {},
413
+ "outputs": [],
414
+ "source": [
415
+ "outputs= model(**inputs_2)"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "code",
420
+ "execution_count": 18,
421
+ "metadata": {},
422
+ "outputs": [
423
+ {
424
+ "data": {
425
+ "text/plain": [
426
+ "odict_keys(['loss', 'logits'])"
427
+ ]
428
+ },
429
+ "execution_count": 18,
430
+ "metadata": {},
431
+ "output_type": "execute_result"
432
+ }
433
+ ],
434
+ "source": [
435
+ "outputs.keys()"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "execution_count": 19,
441
+ "metadata": {},
442
+ "outputs": [
443
+ {
444
+ "data": {
445
+ "text/plain": [
446
+ "tensor(0.8399, grad_fn=<NllLossBackward0>)"
447
+ ]
448
+ },
449
+ "execution_count": 19,
450
+ "metadata": {},
451
+ "output_type": "execute_result"
452
+ }
453
+ ],
454
+ "source": [
455
+ "outputs.loss"
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "code",
460
+ "execution_count": 22,
461
+ "metadata": {},
462
+ "outputs": [
463
+ {
464
+ "name": "stderr",
465
+ "output_type": "stream",
466
+ "text": [
467
+ "c:\\gitProjects\\makaleChatUI\\myenv\\Lib\\site-packages\\huggingface_hub\\file_download.py:159: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\info\\.cache\\huggingface\\hub\\models--facebook--dpr-ctx_encoder-single-nq-base. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
468
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
469
+ " warnings.warn(message)\n",
470
+ "Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.output.LayerNorm.bias', 'bert_model.encoder.layer.0.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.1.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.1.attention.output.dense.bias', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.1.attention.self.key.bias', 'bert_model.encoder.layer.1.attention.self.key.weight', 'bert_model.encoder.layer.1.attention.self.query.bias', 'bert_model.encoder.layer.1.attention.self.query.weight', 'bert_model.encoder.layer.1.attention.self.value.bias', 'bert_model.encoder.layer.1.attention.self.value.weight', 'bert_model.encoder.layer.1.intermediate.dense.bias', 'bert_model.encoder.layer.1.intermediate.dense.weight', 'bert_model.encoder.layer.1.output.LayerNorm.bias', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.1.output.dense.bias', 'bert_model.encoder.layer.1.output.dense.weight', 'bert_model.encoder.layer.10.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.10.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.output.dense.weight', 'bert_model.encoder.layer.10.attention.self.key.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.10.attention.self.query.bias', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.10.attention.self.value.bias', 'bert_model.encoder.layer.10.attention.self.value.weight', 'bert_model.encoder.layer.10.intermediate.dense.bias', 'bert_model.encoder.layer.10.intermediate.dense.weight', 'bert_model.encoder.layer.10.output.LayerNorm.bias', 'bert_model.encoder.layer.10.output.LayerNorm.weight', 'bert_model.encoder.layer.10.output.dense.bias', 'bert_model.encoder.layer.10.output.dense.weight', 'bert_model.encoder.layer.11.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.output.dense.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.11.attention.self.key.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 
'bert_model.encoder.layer.11.attention.self.query.weight', 'bert_model.encoder.layer.11.attention.self.value.bias', 'bert_model.encoder.layer.11.attention.self.value.weight', 'bert_model.encoder.layer.11.intermediate.dense.bias', 'bert_model.encoder.layer.11.intermediate.dense.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.11.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.dense.bias', 'bert_model.encoder.layer.11.output.dense.weight', 'bert_model.encoder.layer.2.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.2.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.2.attention.output.dense.bias', 'bert_model.encoder.layer.2.attention.output.dense.weight', 'bert_model.encoder.layer.2.attention.self.key.bias', 'bert_model.encoder.layer.2.attention.self.key.weight', 'bert_model.encoder.layer.2.attention.self.query.bias', 'bert_model.encoder.layer.2.attention.self.query.weight', 'bert_model.encoder.layer.2.attention.self.value.bias', 'bert_model.encoder.layer.2.attention.self.value.weight', 'bert_model.encoder.layer.2.intermediate.dense.bias', 'bert_model.encoder.layer.2.intermediate.dense.weight', 'bert_model.encoder.layer.2.output.LayerNorm.bias', 'bert_model.encoder.layer.2.output.LayerNorm.weight', 'bert_model.encoder.layer.2.output.dense.bias', 'bert_model.encoder.layer.2.output.dense.weight', 'bert_model.encoder.layer.3.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.3.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.3.attention.output.dense.weight', 'bert_model.encoder.layer.3.attention.self.key.bias', 'bert_model.encoder.layer.3.attention.self.key.weight', 'bert_model.encoder.layer.3.attention.self.query.bias', 'bert_model.encoder.layer.3.attention.self.query.weight', 'bert_model.encoder.layer.3.attention.self.value.bias', 'bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.3.intermediate.dense.bias', 'bert_model.encoder.layer.3.intermediate.dense.weight', 'bert_model.encoder.layer.3.output.LayerNorm.bias', 'bert_model.encoder.layer.3.output.LayerNorm.weight', 'bert_model.encoder.layer.3.output.dense.bias', 'bert_model.encoder.layer.3.output.dense.weight', 'bert_model.encoder.layer.4.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.4.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.4.attention.output.dense.bias', 'bert_model.encoder.layer.4.attention.output.dense.weight', 'bert_model.encoder.layer.4.attention.self.key.bias', 'bert_model.encoder.layer.4.attention.self.key.weight', 'bert_model.encoder.layer.4.attention.self.query.bias', 'bert_model.encoder.layer.4.attention.self.query.weight', 'bert_model.encoder.layer.4.attention.self.value.bias', 'bert_model.encoder.layer.4.attention.self.value.weight', 'bert_model.encoder.layer.4.intermediate.dense.bias', 'bert_model.encoder.layer.4.intermediate.dense.weight', 'bert_model.encoder.layer.4.output.LayerNorm.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.4.output.dense.bias', 'bert_model.encoder.layer.4.output.dense.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.dense.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.5.attention.self.key.bias', 'bert_model.encoder.layer.5.attention.self.key.weight', 
'bert_model.encoder.layer.5.attention.self.query.bias', 'bert_model.encoder.layer.5.attention.self.query.weight', 'bert_model.encoder.layer.5.attention.self.value.bias', 'bert_model.encoder.layer.5.attention.self.value.weight', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_model.encoder.layer.5.output.LayerNorm.bias', 'bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.5.output.dense.bias', 'bert_model.encoder.layer.5.output.dense.weight', 'bert_model.encoder.layer.6.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.6.attention.output.dense.bias', 'bert_model.encoder.layer.6.attention.output.dense.weight', 'bert_model.encoder.layer.6.attention.self.key.bias', 'bert_model.encoder.layer.6.attention.self.key.weight', 'bert_model.encoder.layer.6.attention.self.query.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.6.attention.self.value.bias', 'bert_model.encoder.layer.6.attention.self.value.weight', 'bert_model.encoder.layer.6.intermediate.dense.bias', 'bert_model.encoder.layer.6.intermediate.dense.weight', 'bert_model.encoder.layer.6.output.LayerNorm.bias', 'bert_model.encoder.layer.6.output.LayerNorm.weight', 'bert_model.encoder.layer.6.output.dense.bias', 'bert_model.encoder.layer.6.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.7.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.7.attention.output.dense.bias', 'bert_model.encoder.layer.7.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.self.key.bias', 'bert_model.encoder.layer.7.attention.self.key.weight', 'bert_model.encoder.layer.7.attention.self.query.bias', 'bert_model.encoder.layer.7.attention.self.query.weight', 'bert_model.encoder.layer.7.attention.self.value.bias', 'bert_model.encoder.layer.7.attention.self.value.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.7.intermediate.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.7.output.LayerNorm.weight', 'bert_model.encoder.layer.7.output.dense.bias', 'bert_model.encoder.layer.7.output.dense.weight', 'bert_model.encoder.layer.8.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.8.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.8.attention.output.dense.bias', 'bert_model.encoder.layer.8.attention.output.dense.weight', 'bert_model.encoder.layer.8.attention.self.key.bias', 'bert_model.encoder.layer.8.attention.self.key.weight', 'bert_model.encoder.layer.8.attention.self.query.bias', 'bert_model.encoder.layer.8.attention.self.query.weight', 'bert_model.encoder.layer.8.attention.self.value.bias', 'bert_model.encoder.layer.8.attention.self.value.weight', 'bert_model.encoder.layer.8.intermediate.dense.bias', 'bert_model.encoder.layer.8.intermediate.dense.weight', 'bert_model.encoder.layer.8.output.LayerNorm.bias', 'bert_model.encoder.layer.8.output.LayerNorm.weight', 'bert_model.encoder.layer.8.output.dense.bias', 'bert_model.encoder.layer.8.output.dense.weight', 'bert_model.encoder.layer.9.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.9.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.9.attention.output.dense.bias', 'bert_model.encoder.layer.9.attention.output.dense.weight', 'bert_model.encoder.layer.9.attention.self.key.bias', 
'bert_model.encoder.layer.9.attention.self.key.weight', 'bert_model.encoder.layer.9.attention.self.query.bias', 'bert_model.encoder.layer.9.attention.self.query.weight', 'bert_model.encoder.layer.9.attention.self.value.bias', 'bert_model.encoder.layer.9.attention.self.value.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.intermediate.dense.weight', 'bert_model.encoder.layer.9.output.LayerNorm.bias', 'bert_model.encoder.layer.9.output.LayerNorm.weight', 'bert_model.encoder.layer.9.output.dense.bias', 'bert_model.encoder.layer.9.output.dense.weight']\n",
471
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
472
+ "Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.output.LayerNorm.bias', 'bert_model.encoder.layer.0.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.1.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.1.attention.output.dense.bias', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.1.attention.self.key.bias', 'bert_model.encoder.layer.1.attention.self.key.weight', 'bert_model.encoder.layer.1.attention.self.query.bias', 'bert_model.encoder.layer.1.attention.self.query.weight', 'bert_model.encoder.layer.1.attention.self.value.bias', 'bert_model.encoder.layer.1.attention.self.value.weight', 'bert_model.encoder.layer.1.intermediate.dense.bias', 'bert_model.encoder.layer.1.intermediate.dense.weight', 'bert_model.encoder.layer.1.output.LayerNorm.bias', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.1.output.dense.bias', 'bert_model.encoder.layer.1.output.dense.weight', 'bert_model.encoder.layer.10.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.10.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.output.dense.weight', 'bert_model.encoder.layer.10.attention.self.key.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.10.attention.self.query.bias', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.10.attention.self.value.bias', 'bert_model.encoder.layer.10.attention.self.value.weight', 'bert_model.encoder.layer.10.intermediate.dense.bias', 'bert_model.encoder.layer.10.intermediate.dense.weight', 'bert_model.encoder.layer.10.output.LayerNorm.bias', 'bert_model.encoder.layer.10.output.LayerNorm.weight', 'bert_model.encoder.layer.10.output.dense.bias', 'bert_model.encoder.layer.10.output.dense.weight', 'bert_model.encoder.layer.11.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.output.dense.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.11.attention.self.key.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 
'bert_model.encoder.layer.11.attention.self.query.weight', 'bert_model.encoder.layer.11.attention.self.value.bias', 'bert_model.encoder.layer.11.attention.self.value.weight', 'bert_model.encoder.layer.11.intermediate.dense.bias', 'bert_model.encoder.layer.11.intermediate.dense.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.11.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.dense.bias', 'bert_model.encoder.layer.11.output.dense.weight', 'bert_model.encoder.layer.2.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.2.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.2.attention.output.dense.bias', 'bert_model.encoder.layer.2.attention.output.dense.weight', 'bert_model.encoder.layer.2.attention.self.key.bias', 'bert_model.encoder.layer.2.attention.self.key.weight', 'bert_model.encoder.layer.2.attention.self.query.bias', 'bert_model.encoder.layer.2.attention.self.query.weight', 'bert_model.encoder.layer.2.attention.self.value.bias', 'bert_model.encoder.layer.2.attention.self.value.weight', 'bert_model.encoder.layer.2.intermediate.dense.bias', 'bert_model.encoder.layer.2.intermediate.dense.weight', 'bert_model.encoder.layer.2.output.LayerNorm.bias', 'bert_model.encoder.layer.2.output.LayerNorm.weight', 'bert_model.encoder.layer.2.output.dense.bias', 'bert_model.encoder.layer.2.output.dense.weight', 'bert_model.encoder.layer.3.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.3.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.3.attention.output.dense.weight', 'bert_model.encoder.layer.3.attention.self.key.bias', 'bert_model.encoder.layer.3.attention.self.key.weight', 'bert_model.encoder.layer.3.attention.self.query.bias', 'bert_model.encoder.layer.3.attention.self.query.weight', 'bert_model.encoder.layer.3.attention.self.value.bias', 'bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.3.intermediate.dense.bias', 'bert_model.encoder.layer.3.intermediate.dense.weight', 'bert_model.encoder.layer.3.output.LayerNorm.bias', 'bert_model.encoder.layer.3.output.LayerNorm.weight', 'bert_model.encoder.layer.3.output.dense.bias', 'bert_model.encoder.layer.3.output.dense.weight', 'bert_model.encoder.layer.4.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.4.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.4.attention.output.dense.bias', 'bert_model.encoder.layer.4.attention.output.dense.weight', 'bert_model.encoder.layer.4.attention.self.key.bias', 'bert_model.encoder.layer.4.attention.self.key.weight', 'bert_model.encoder.layer.4.attention.self.query.bias', 'bert_model.encoder.layer.4.attention.self.query.weight', 'bert_model.encoder.layer.4.attention.self.value.bias', 'bert_model.encoder.layer.4.attention.self.value.weight', 'bert_model.encoder.layer.4.intermediate.dense.bias', 'bert_model.encoder.layer.4.intermediate.dense.weight', 'bert_model.encoder.layer.4.output.LayerNorm.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.4.output.dense.bias', 'bert_model.encoder.layer.4.output.dense.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.dense.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.5.attention.self.key.bias', 'bert_model.encoder.layer.5.attention.self.key.weight', 
'bert_model.encoder.layer.5.attention.self.query.bias', 'bert_model.encoder.layer.5.attention.self.query.weight', 'bert_model.encoder.layer.5.attention.self.value.bias', 'bert_model.encoder.layer.5.attention.self.value.weight', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_model.encoder.layer.5.output.LayerNorm.bias', 'bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.5.output.dense.bias', 'bert_model.encoder.layer.5.output.dense.weight', 'bert_model.encoder.layer.6.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.6.attention.output.dense.bias', 'bert_model.encoder.layer.6.attention.output.dense.weight', 'bert_model.encoder.layer.6.attention.self.key.bias', 'bert_model.encoder.layer.6.attention.self.key.weight', 'bert_model.encoder.layer.6.attention.self.query.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.6.attention.self.value.bias', 'bert_model.encoder.layer.6.attention.self.value.weight', 'bert_model.encoder.layer.6.intermediate.dense.bias', 'bert_model.encoder.layer.6.intermediate.dense.weight', 'bert_model.encoder.layer.6.output.LayerNorm.bias', 'bert_model.encoder.layer.6.output.LayerNorm.weight', 'bert_model.encoder.layer.6.output.dense.bias', 'bert_model.encoder.layer.6.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.7.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.7.attention.output.dense.bias', 'bert_model.encoder.layer.7.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.self.key.bias', 'bert_model.encoder.layer.7.attention.self.key.weight', 'bert_model.encoder.layer.7.attention.self.query.bias', 'bert_model.encoder.layer.7.attention.self.query.weight', 'bert_model.encoder.layer.7.attention.self.value.bias', 'bert_model.encoder.layer.7.attention.self.value.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.7.intermediate.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.7.output.LayerNorm.weight', 'bert_model.encoder.layer.7.output.dense.bias', 'bert_model.encoder.layer.7.output.dense.weight', 'bert_model.encoder.layer.8.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.8.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.8.attention.output.dense.bias', 'bert_model.encoder.layer.8.attention.output.dense.weight', 'bert_model.encoder.layer.8.attention.self.key.bias', 'bert_model.encoder.layer.8.attention.self.key.weight', 'bert_model.encoder.layer.8.attention.self.query.bias', 'bert_model.encoder.layer.8.attention.self.query.weight', 'bert_model.encoder.layer.8.attention.self.value.bias', 'bert_model.encoder.layer.8.attention.self.value.weight', 'bert_model.encoder.layer.8.intermediate.dense.bias', 'bert_model.encoder.layer.8.intermediate.dense.weight', 'bert_model.encoder.layer.8.output.LayerNorm.bias', 'bert_model.encoder.layer.8.output.LayerNorm.weight', 'bert_model.encoder.layer.8.output.dense.bias', 'bert_model.encoder.layer.8.output.dense.weight', 'bert_model.encoder.layer.9.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.9.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.9.attention.output.dense.bias', 'bert_model.encoder.layer.9.attention.output.dense.weight', 'bert_model.encoder.layer.9.attention.self.key.bias', 
'bert_model.encoder.layer.9.attention.self.key.weight', 'bert_model.encoder.layer.9.attention.self.query.bias', 'bert_model.encoder.layer.9.attention.self.query.weight', 'bert_model.encoder.layer.9.attention.self.value.bias', 'bert_model.encoder.layer.9.attention.self.value.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.intermediate.dense.weight', 'bert_model.encoder.layer.9.output.LayerNorm.bias', 'bert_model.encoder.layer.9.output.LayerNorm.weight', 'bert_model.encoder.layer.9.output.dense.bias', 'bert_model.encoder.layer.9.output.dense.weight']\n",
473
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
474
+ "Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']\n",
475
+ "- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
476
+ "- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
477
+ ]
478
+ }
479
+ ],
480
+ "source": [
481
+ "from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer\n",
482
+ "\n",
483
+ "ctx_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')\n",
484
+ "ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')\n",
485
+ "\n",
486
+ "question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')\n",
487
+ "question_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')\n",
488
+ "\n"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": null,
494
+ "metadata": {},
495
+ "outputs": [],
496
+ "source": [
497
+ "\"\"\"title = [\"2024 Yılında Mobil Teknoloji Trendleri\"]\n",
498
+ "keywords = [\"mobil teknoloji\", \"2024 trendleri\", \"akıllı telefon yenilikleri\", \"5G teknolojisi\", \"giyilebilir cihazlar\"]\n",
499
+ "subheading = [\n",
500
+ " \"2024'te Akıllı Telefonlardaki Yenilikler\",\n",
501
+ " \"Giyilebilir Teknolojiler: Sağlık ve Fitness Trendleri\",\n",
502
+ " \"5G'nin Mobil Cihazlar Üzerindeki Etkisi\",\n",
503
+ " \"Mobil Güvenlikte Yeni Yaklaşımlar\"\n",
504
+ "]\"\"\"\n"
505
+ ]
506
+ },
507
+ {
508
+ "cell_type": "code",
509
+ "execution_count": null,
510
+ "metadata": {},
511
+ "outputs": [],
512
+ "source": [
513
+ "\n",
514
+ "passages = [\"örnek pasaj 1\", \"örnek pasaj 2\"]  # placeholder passages; the tokenizer needs text input\n",
+ "xb_tokens = ctx_tokenizer(passages, return_tensors='pt', padding=True, truncation=True)\n"
515
+ ]
516
+ }
517
+ ],
518
+ "metadata": {
519
+ "kernelspec": {
520
+ "display_name": "myenv",
521
+ "language": "python",
522
+ "name": "python3"
523
+ },
524
+ "language_info": {
525
+ "codemirror_mode": {
526
+ "name": "ipython",
527
+ "version": 3
528
+ },
529
+ "file_extension": ".py",
530
+ "mimetype": "text/x-python",
531
+ "name": "python",
532
+ "nbconvert_exporter": "python",
533
+ "pygments_lexer": "ipython3",
534
+ "version": "3.12.4"
535
+ }
536
+ },
537
+ "nbformat": 4,
538
+ "nbformat_minor": 2
539
+ }
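For reference, a minimal sketch of how the DPR encoders loaded in the cell above can score candidate passages against a question; the question and passages below are placeholders, not part of the notebook:

import torch
from transformers import (DPRContextEncoder, DPRContextEncoderTokenizer,
                          DPRQuestionEncoder, DPRQuestionEncoderTokenizer)

ctx_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
question_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')

question = "5G mobil cihazları nasıl etkiler?"  # placeholder question
passages = ["5G teknolojisi mobil cihazlarda daha yüksek hız sağlar.",
            "Giyilebilir cihazlar sağlık verilerini takip eder."]  # placeholder passages

with torch.no_grad():
    # DPR encoders expose their dense vectors through pooler_output
    q_emb = question_model(**question_tokenizer(question, return_tensors='pt')).pooler_output
    p_emb = ctx_model(**ctx_tokenizer(passages, return_tensors='pt', padding=True, truncation=True)).pooler_output

# DPR is trained for dot-product retrieval: a higher score means a more relevant passage
scores = (q_emb @ p_emb.T).squeeze(0)
print(scores)

The dot product is the similarity DPR was trained with, so passages can be ranked directly by these scores.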
alternatif.txt ADDED
@@ -0,0 +1,10 @@
1
+ Alternative Approaches
2
+ LDA (Latent Dirichlet Allocation): A topic-modeling method that can be used to identify the main themes in a set of documents.
3
+
4
+ BERT and Transformer Models: The proposed transformer-based models are strong at contextual analysis and can be used to understand texts and extract keywords.
5
+
6
+ LDA Combined with TF-IDF: Combining TF-IDF with LDA can be used for both keyword extraction and topic modeling.
7
+
8
+ Sentence Embeddings and Similarity Modeling: Using models such as Sentence-BERT, you can measure the similarity between texts and combine those similarities with keyword extraction.
9
+
10
+ Pretrained Language Models: Using pretrained models from libraries such as Hugging Face enables more accurate, context-aware keyword extraction and text analysis.
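As a rough illustration of the Sentence-BERT option above, a minimal sketch; it assumes the sentence-transformers package and a multilingual checkpoint are available, and the document and candidate keywords are placeholders:

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # assumed multilingual model

document = "2024 yılında mobil teknoloji ve 5G üzerine örnek bir metin."   # placeholder document
candidates = ["mobil teknoloji", "5G teknolojisi", "giyilebilir cihazlar"]  # placeholder candidates

# Embed the document and the candidates, then rank the candidates by cosine similarity
doc_emb = model.encode(document, convert_to_tensor=True)
cand_embs = model.encode(candidates, convert_to_tensor=True)
scores = util.cos_sim(cand_embs, doc_emb).squeeze(-1)

for keyword, score in sorted(zip(candidates, scores.tolist()), key=lambda x: -x[1]):
    print(f"{keyword}: {score:.3f}")

The candidate list can come from the TF-IDF step, so the frequency-based and embedding-based signals complement each other.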
combined_output.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d132586c17782ba7dabc58b6b8c11b797a65890e95a605b38b208caa08d984e
3
+ size 1476218231
datasets.ipynb CHANGED
@@ -437,9 +437,16 @@
437
  "TF-IDF HESAPLAMA"
438
  ]
439
  },
440
  {
441
  "cell_type": "code",
442
- "execution_count": 13,
443
  "metadata": {},
444
  "outputs": [
445
  {
@@ -447,165 +454,104 @@
447
  "output_type": "stream",
448
  "text": [
449
  "Tokens: ['[CLS]', 'Biy', '##ografi', 'İsim', 'P', '##şı', '##q', '##o', 'ismi', 'Ah', '##ec', '##a', '##q', '##o', 'soy', 'ismi', '##dir', '.', 'Çerkes', '##lerin', '\"', '-', 'q', '##o', '\"', 'son', '##eki', 'ile', 'biten', 'hem', 'soya', '##d', '##ları', 'hem', 'de', 'lak', '##ap', '##ları', 'vardı', '.', 'Bu', 'ek', 'Türkçe', 'ad', '##lardaki', '\"', '-', 'oğlu', '\"', 'eki', '##yle', 'eş', 'anlamlı', '##dır', '.', 'P', '##şı', '##q', '##o', 'Türkçe', '\"', 'Beyoğlu', '\"', 'anlamına', 'gelen', 'bir', 'lak', '##ap', '##tır', '.', 'Erken', 'dönem', 'Çerkes', '##ler', 'tarihleri', '##ni', 'yazma', '##dıkları', 've', 'tüm', 'bilgiler', 'Rus', 'kaynaklarından', 'geldiği', 'için', 'Ah', '##ec', '##a', '##q', '##o', 'hakkında', 'pek', 'bir', 'şey', 'kayded', '##ilme', '##di', '.', '177', '##7', \"'\", 'de', 'Çerkes', '##ya', \"'\", 'nın', 'B', '##je', '##duğ', 'bölgesinde', 'doğdu', '.', 'Asker', '##î', 'eğitim', 'ile', 'büyüt', '##üldü', '.', 'Rus', '-', 'Çerkes', 'Savaşı', '##na', 'Katılım', '##ı', 'Birkaç', 'kaynak', ',', 'Ah', '##ec', '##a', '##q', '##o', \"'\", 'nun', 'tüm', 'Çerkes', '##ya', \"'\", 'da', 'saygı', 'duyulan', 'bir', 'kişi', 'olduğunu', 'belirtir', '.', 'En', 'az', '6', '.', '000', 'at', '##lı', '##dan', 'oluşan', 'kalıcı', 'bir', 'ordusu', 'vardı', 've', 'çatışmalar', 'sırasında', 'müfre', '##ze', '##si', '12', '.', '000', 'at', '##lı', '##ya', 'ulaşıyor', '##du', '.', 'Rus', 'birlikleri', '##ne', 'karşı', 'kazandığı', 'zafer', '##lerle', 'ünlü', '##y', '##dü', '.', 'Askeri', 'becerisi', '##nin', 'yanı', 'sıra', 'yetenekli', 'bir', 'devlet', 'adamı', '##ydı', '.', 'Ölüm', '18', '##37', 'yılında', 'Rus', 'tarafına', 'geçti', 've', 'bir', 'yıl', 'sonra', 'hastalık', '##tan', 'öldü', '.', 'Kaynak', '##ça', 'Çerkes', 'soylu', '##lar', '177', '##7', 'doğumlu', '##lar', '18', '##38', 'yılında', 'ölen', '##ler', 'Kafkas', 'Savaşı', \"'\", 'nda', 'kişiler', '[SEP]']\n",
450
- "Average Embedding Shape: torch.Size([768])\n",
451
- "Average Embedding: tensor([ 3.1219e-01, -3.4488e-02, 1.1118e-02, -3.6194e-02, 1.3312e-02,\n",
452
- " 8.7684e-02, 6.0835e-01, -5.8831e-03, 4.2102e-01, 3.7467e-01,\n",
453
- " -1.9954e-01, 2.5975e-01, -8.9819e-02, 6.8351e-02, -2.3226e-01,\n",
454
- " -6.4409e-02, 1.1375e-01, 6.9892e-02, 3.4909e-01, -2.0660e-01,\n",
455
- " 4.2718e-02, -2.3758e-01, -1.2146e-01, 6.9431e-01, 8.2034e-02,\n",
456
- " -4.4726e-01, -3.9995e-01, 4.9043e-01, -5.3700e-01, 4.0028e-02,\n",
457
- " 2.4516e-02, -1.9234e-01, -5.9901e-02, 4.0203e-01, 1.7956e-01,\n",
458
- " 2.7692e-01, 4.2539e-01, -1.0046e-01, -1.9326e-01, -2.3722e-01,\n",
459
- " 3.9989e-01, 1.1785e-01, -3.7475e-01, -4.7698e-01, 1.2440e-01,\n",
460
- " 1.7583e-01, 4.7179e-01, -6.1670e-01, 3.4876e-01, -1.1977e-01,\n",
461
- " 4.3870e-01, -4.7105e-01, 3.8414e-01, 3.6902e-01, -1.2121e-02,\n",
462
- " -3.3284e-02, 2.5584e-01, 2.0225e-01, 1.4411e-01, 2.9933e-01,\n",
463
- " 3.6910e-01, 2.3893e-01, 6.0434e-01, 1.5669e-01, -8.5170e-01,\n",
464
- " -2.5171e-01, 3.6258e-01, 4.5186e-01, -2.9369e-01, 3.8370e-01,\n",
465
- " 4.9858e-01, -7.5623e-02, 1.1362e-02, -1.3621e-01, -2.7373e-01,\n",
466
- " -3.1269e-01, -6.4951e-01, -6.9747e-02, 2.1302e-01, 3.4201e-01,\n",
467
- " -3.8148e-01, -3.2749e-02, 7.4201e-01, -6.0619e-01, -1.8069e-01,\n",
468
- " -1.5151e-01, 7.6336e-02, -4.0224e-02, -5.9742e-01, -1.7219e-02,\n",
469
- " -5.6787e-01, 2.6290e-01, 2.3984e-01, 4.8434e-01, 4.7557e-01,\n",
470
- " 2.2432e-01, -1.0822e-01, 3.5924e-01, -4.4102e-01, -1.1613e+00,\n",
471
- " 5.3896e-02, -2.8951e-01, -1.0792e+00, -2.2577e-02, -2.9868e-01,\n",
472
- " -2.7837e-01, 1.0477e-01, 3.8852e-01, 2.9142e-01, -4.2427e-01,\n",
473
- " 3.6626e-01, 7.9898e-02, 2.2686e-01, 2.3253e-02, -6.9434e-01,\n",
474
- " 3.2550e+00, -5.6280e-02, 1.1168e-01, 4.2853e-01, 7.7213e-02,\n",
475
- " 3.1671e-01, -2.9387e-01, -2.1341e-01, -7.9131e-02, -1.0102e-01,\n",
476
- " -5.7301e-01, 5.6494e-01, 2.0392e-01, -2.6974e-01, -9.0448e-01,\n",
477
- " -7.6977e-01, 5.1432e-02, -1.3809e-01, 2.2806e-01, -3.8749e-01,\n",
478
- " 4.0886e-01, 2.2627e-02, -2.4360e-02, -1.0032e-01, -8.8879e-03,\n",
479
- " -2.9814e-01, 2.4151e-01, -6.5038e-01, 5.5605e-01, -1.5214e-02,\n",
480
- " -4.4102e-01, 2.1589e-01, 8.9567e-02, -3.3454e-01, 4.1183e-01,\n",
481
- " -2.5177e-02, -4.8496e-01, 3.7691e-01, 6.1995e-02, -2.9426e-01,\n",
482
- " -1.5210e-01, 5.1504e-01, 4.9226e-01, 1.0083e-01, 1.9789e-01,\n",
483
- " 6.5205e-01, -9.7679e-02, 3.4597e-02, 9.5440e-02, 6.5158e-01,\n",
484
- " -5.6019e-01, -1.1912e-01, 1.9009e-01, 1.1314e-01, 1.0752e-01,\n",
485
- " 4.7765e-01, 2.5196e-01, -1.5925e-01, 1.3468e-01, -1.9441e-01,\n",
486
- " -5.0252e-02, 4.2977e-01, 2.7336e-01, 4.7672e-02, 2.3097e-01,\n",
487
- " 1.5998e-01, -1.3434e-01, 3.8424e-01, -3.9759e-01, -2.6207e-02,\n",
488
- " 2.9264e-02, -1.2846e-01, -3.9234e-01, -2.3295e-01, -1.4392e-01,\n",
489
- " 7.9061e-02, 2.8095e-01, -1.6391e-01, 2.0505e-01, -1.2172e-01,\n",
490
- " -2.5179e-01, 8.8469e-02, -1.5946e+00, -6.6211e-01, 1.6993e-01,\n",
491
- " -1.6472e-02, 2.5519e-01, -2.4024e-02, 5.7010e-01, 6.1551e-03,\n",
492
- " 7.0113e-02, -3.9507e-01, -2.2114e-02, -2.0259e-01, -8.9107e-03,\n",
493
- " 1.1820e-01, -1.0522e-02, 5.2899e-01, -3.6007e-01, -5.6266e-01,\n",
494
- " 1.3287e-01, -5.8443e-01, -2.5912e-01, -4.3816e-02, -1.1244e-01,\n",
495
- " 1.0696e+00, 3.1219e-01, -4.1700e-01, 1.1373e-01, -2.2935e-01,\n",
496
- " -1.4058e-02, 2.6080e-01, 6.1457e-03, 5.5064e-02, 5.2089e-01,\n",
497
- " 1.3195e-01, -6.0868e-01, 4.0164e-01, -1.8374e-01, 8.4919e-01,\n",
498
- " -4.2096e-01, -3.7411e-01, 1.8478e-02, -5.6272e-01, -2.5044e-01,\n",
499
- " -1.1385e-01, 1.6000e-01, 3.3307e-01, -5.7846e-02, -4.1887e-02,\n",
500
- " -1.7514e-01, 2.8522e-01, -3.3909e-01, 1.7133e-01, 2.4794e-02,\n",
501
- " -3.0897e-01, 1.7487e-01, -4.8215e-01, -1.0892e-01, 1.0915e-01,\n",
502
- " -2.9227e-02, -6.7439e-02, -3.6022e-01, -8.8648e-02, 2.5974e-01,\n",
503
- " -2.2780e-02, 1.8174e-02, 8.9919e-02, 1.6508e+00, -6.3506e-01,\n",
504
- " 4.9391e-01, 7.9321e-02, 3.2023e-02, 3.1216e-01, -7.8220e-02,\n",
505
- " 3.5055e-01, -2.8349e-01, -4.8787e-01, -5.3590e-01, -4.5163e-01,\n",
506
- " 2.4602e-01, 4.0553e-01, -2.9002e-01, -1.6120e-01, 1.3428e-02,\n",
507
- " 4.7906e-01, 2.2494e-01, 3.5909e-01, 1.2861e-01, -1.7966e-01,\n",
508
- " 9.8253e-02, -9.9344e-02, 2.3110e-01, 3.1276e-01, 6.4092e-02,\n",
509
- " 2.7386e-01, -3.8601e-01, -5.6480e-01, -5.6070e-01, -6.4271e-02,\n",
510
- " -2.8354e-01, 6.7687e-02, -5.7471e-01, 3.0518e-02, -1.3380e-02,\n",
511
- " -3.6718e-01, 3.8880e-01, -1.9569e-01, 2.8110e-01, -2.9406e-01,\n",
512
- " -2.5891e-01, -3.0043e-01, -3.3694e-01, 5.7723e-02, -1.2361e+00,\n",
513
- " -1.1917e-01, -2.6665e-01, -5.6574e-02, -3.8907e-01, 4.2425e-01,\n",
514
- " -6.5229e-02, 6.5768e-01, -1.0842e-01, -7.0508e-01, 8.4208e-02,\n",
515
- " -3.7736e-01, 3.2153e-01, 5.6956e-01, 1.2256e-01, 4.2261e-01,\n",
516
- " -2.7749e-01, 7.9419e-02, -8.1517e-02, -3.0462e-02, 1.5746e-01,\n",
517
- " -8.7179e-02, 1.8869e-01, 4.1413e-01, 3.7192e-01, -1.9835e-01,\n",
518
- " -2.5932e-01, 5.4023e-02, -3.8093e-01, 1.1605e-01, -1.4389e-01,\n",
519
- " -4.5509e-01, -6.0786e-01, 4.2643e-01, 1.6004e-01, -3.4740e-02,\n",
520
- " -4.4579e-01, -5.6887e-01, -1.1662e-01, 2.1577e-01, 6.6576e-03,\n",
521
- " -2.3879e-01, 4.4046e-01, -2.6281e-01, 2.4404e-01, 8.1931e-02,\n",
522
- " 2.2825e-01, -1.5294e-01, -3.7482e-01, 8.8104e-02, 4.0676e-01,\n",
523
- " 1.6295e-01, 5.8565e-01, -8.0144e-02, -4.1792e-01, -4.6798e-01,\n",
524
- " 3.9977e-01, -3.7319e-01, -1.2999e-01, -4.4200e-01, -2.9825e-01,\n",
525
- " -1.2899e-01, -1.8651e-01, -2.0209e-02, -6.6213e-01, 5.0630e-02,\n",
526
- " -4.6655e-01, -4.3796e-01, 6.7476e-02, 3.4367e-01, 1.8640e-01,\n",
527
- " 3.3172e-01, -4.1092e-01, 2.6630e-02, -4.9168e-02, -3.4948e-01,\n",
528
- " 1.6500e-02, -4.3398e-01, 2.6911e-01, 3.4227e-02, -2.1475e-01,\n",
529
- " 9.7154e-01, -2.9554e-01, 8.5149e-01, -6.0231e-01, 1.0421e-01,\n",
530
- " 6.2897e-01, 1.8700e-02, 1.6866e-01, -7.0568e-03, -6.9820e-01,\n",
531
- " -1.3916e-01, 3.2686e-01, -1.5017e-01, 6.5600e-01, 2.9388e-02,\n",
532
- " -6.0431e-01, 3.8548e-02, -1.2187e-01, -4.8818e-01, 1.5922e-01,\n",
533
- " -2.1494e-02, -2.1316e-01, -1.5983e-01, -3.7928e-01, 5.6203e-01,\n",
534
- " 3.1285e-01, -4.0310e-01, 3.8763e-01, -4.1886e-01, 1.6276e-01,\n",
535
- " 1.2610e-01, 3.5952e-01, 1.3288e-01, 6.0504e-01, -3.4769e-01,\n",
536
- " -1.5976e-01, 2.9626e-01, -2.2079e-01, -1.5934e-01, -5.8491e-01,\n",
537
- " -5.7811e-02, -4.7510e-01, 2.7285e-03, 3.7191e-01, 4.7557e-01,\n",
538
- " 9.2435e-02, 2.3198e-01, -5.8704e-01, -1.9506e-01, -5.3740e-01,\n",
539
- " 1.8715e-01, -3.5691e-01, 2.5481e-01, 3.2795e-01, -9.4206e-02,\n",
540
- " -2.2492e-01, -3.1406e-01, 4.5814e-01, -1.7896e-01, -3.9470e-01,\n",
541
- " 1.9183e-01, -4.3177e-01, 2.7146e-01, 1.9477e-01, -1.7568e-02,\n",
542
- " -2.0134e-01, 5.7984e-03, 3.0490e-01, -2.7846e-01, 9.8830e-03,\n",
543
- " -3.0119e-01, -4.1994e-01, -1.0905e-02, 6.9638e-01, 9.4965e-02,\n",
544
- " -2.6103e-01, 8.8206e-02, -1.0292e-01, -1.2342e-01, -2.2317e-03,\n",
545
- " -5.2474e-02, -2.1636e-01, -1.6554e-01, 2.3173e-01, 1.2170e-01,\n",
546
- " 4.5793e-01, -1.1033e-01, 1.4489e-01, 2.2540e-01, 5.2360e-01,\n",
547
- " -3.6468e-01, -1.5081e-01, -2.3761e-02, 2.7475e-01, 5.3707e-01,\n",
548
- " 9.3503e-02, -4.9759e-01, 1.5903e-01, -1.2017e-01, 3.4478e-01,\n",
549
- " -2.1399e-01, 3.9456e-01, -3.2861e-01, 1.7182e-01, -1.1697e-01,\n",
550
- " 5.6727e-03, -1.9770e-01, -2.3682e-01, 2.7554e-01, -3.9236e-01,\n",
551
- " 2.0691e-01, 1.6439e-01, -3.7138e-01, -7.8304e-01, -1.9874e-01,\n",
552
- " 6.4637e-01, -2.4494e-01, -4.1920e-01, -3.7675e-01, 1.3178e-01,\n",
553
- " 1.9076e-01, -1.2906e-01, -6.4864e-04, -9.7821e-03, -1.2172e-01,\n",
554
- " -5.5357e-02, 2.2997e-01, -3.2848e-01, -4.1649e-01, 9.9676e-04,\n",
555
- " -4.5320e-01, -2.2864e-01, -1.6760e-01, -7.9657e-02, -6.0780e-02,\n",
556
- " -1.7627e-01, -4.1947e-02, 2.3884e-01, -4.7784e-03, -3.1593e-01,\n",
557
- " -1.0243e-01, 5.3464e-01, 2.7388e-01, -4.2258e-02, -1.5521e-01,\n",
558
- " -1.0183e-01, -2.9342e-01, -1.0132e+00, 2.3122e-01, -3.3482e-01,\n",
559
- " 3.2136e-01, -2.3603e-01, -1.4938e-01, -2.3986e-01, 6.1094e-02,\n",
560
- " 1.6784e-01, -3.8075e-02, 5.6459e-01, -2.0828e-02, -1.7406e-01,\n",
561
- " -2.9475e-01, -5.0143e-01, -1.6885e-01, 4.4070e-01, 3.1866e-01,\n",
562
- " -2.7534e-01, 4.1410e-01, -7.2704e-02, -2.9659e-01, 3.0922e-01,\n",
563
- " -5.1553e-01, -2.7293e-01, -1.2403e-01, 5.3698e-01, 8.8994e-02,\n",
564
- " 4.1334e-01, 2.5389e-01, 6.0110e-01, -2.3192e-01, -9.9463e+00,\n",
565
- " 3.8342e-01, -3.4833e-01, 3.5175e-02, -3.3336e-01, 2.5660e-01,\n",
566
- " 8.5744e-01, -3.4563e-01, 3.0483e-03, 3.4735e-01, 3.8450e-01,\n",
567
- " 3.9665e-01, 2.2100e-01, 6.5109e-02, -5.5761e-01, -6.2348e-01,\n",
568
- " -1.8679e-01, 1.9003e-01, 7.4262e-02, -5.9655e-02, -3.9839e-01,\n",
569
- " -2.2625e-02, -7.6319e-02, 2.9763e-01, 1.4098e-01, -2.8759e-01,\n",
570
- " -4.0783e-01, 1.1544e-01, 3.2446e-01, -2.9828e-01, 1.4054e-02,\n",
571
- " 1.6943e-01, -2.0345e-01, -2.1174e-02, 1.1417e-01, 3.3420e-01,\n",
572
- " -1.0892e-01, -3.1187e-01, -5.7087e-01, -1.1561e-02, 4.2107e-02,\n",
573
- " 4.9406e-01, -3.7056e-01, -3.2354e-01, 5.4846e-02, 2.4392e-01,\n",
574
- " -1.2840e-01, -4.3743e-01, 2.4391e-01, 2.1046e-01, -6.3811e-01,\n",
575
- " 3.5563e-01, -2.0561e-01, -3.0996e-01, 1.6479e-01, -5.1947e-02,\n",
576
- " 3.2559e-01, -6.3670e-03, -2.7855e-01, -4.2847e-01, -1.2022e-01,\n",
577
- " 4.0702e-01, 9.6086e-01, 1.3305e-01, -2.0369e-01, 7.5751e-02,\n",
578
- " -1.2915e-01, -8.5741e-02, 2.7087e-01, 9.1068e-02, -1.5946e-01,\n",
579
- " 4.7289e-01, 1.0613e-01, 1.3504e-01, 2.7304e-01, -7.9823e-01,\n",
580
- " 1.1986e-01, 4.7432e-01, -1.4133e-01, 3.9729e-01, -1.6949e-01,\n",
581
- " -9.2290e-01, -1.9302e-01, -7.9017e-02, -6.5796e-01, 1.3385e-02,\n",
582
- " 1.6185e-01, -3.4487e-01, 5.8601e-01, -1.5023e-01, 5.8034e-01,\n",
583
- " -2.8326e-01, -1.6494e-01, -2.9796e-01, 6.7479e-03, -6.3622e-01,\n",
584
- " -1.7732e-02, -1.6043e-01, -8.2452e-01, -2.4934e-02, -1.3969e-01,\n",
585
- " -1.2475e-01, 2.1235e-01, 6.9211e-02, 1.1795e-01, -2.5098e-02,\n",
586
- " 4.8630e-01, 3.5354e-01, 4.4272e-01, 2.5360e-01, 2.7441e-01,\n",
587
- " -2.6457e-01, -3.3007e-01, -3.1083e-01, 4.9623e-01, -2.7829e-01,\n",
588
- " -3.0000e-01, -2.5620e-01, 2.1623e-01, -1.0724e-01, -5.0995e-01,\n",
589
- " -4.9460e-01, 8.4283e-02, -3.2844e-01, -6.0080e-01, -1.1809e-01,\n",
590
- " 1.1040e-01, 3.7749e-02, 3.9097e-01, 2.7157e-02, -3.5270e-01,\n",
591
- " -1.0008e-01, -3.1026e-01, -1.9041e-01, 3.7090e-01, -4.5056e-01,\n",
592
- " -8.3087e-02, -3.6450e-01, -1.0154e+00, -1.3134e-01, -5.0261e-02,\n",
593
- " 3.6961e-01, -1.1989e-01, -1.2336e-01, 2.6829e-01, -6.0926e-01,\n",
594
- " -3.0037e-01, -1.0460e+00, -2.1501e-01, 1.7171e-01, 1.7970e-02,\n",
595
- " -2.0708e-01, -1.3656e-01, -3.2854e-01, 1.2158e-01, -3.0438e-01,\n",
596
- " -4.6487e-02, 1.8717e-01, -2.3236e-01, -1.4668e-01, -6.9169e-01,\n",
597
- " -2.1502e-01, -1.2722e-01, 3.5600e-01, 1.5203e-03, -3.7041e-01,\n",
598
- " -6.5877e-01, 2.1490e-01, -5.1359e-02, 2.2720e-01, -1.6363e-01,\n",
599
- " -1.0862e-01, 1.4914e-02, 3.7205e-01, 3.4950e-01, -2.5987e-01,\n",
600
- " -2.0222e-01, 3.4466e-02, 5.8733e-01, -1.6877e-01, -4.8642e-01,\n",
601
- " -7.8254e-03, 1.2950e-01, -5.6791e-01, -6.6342e-01, -1.5021e-01,\n",
602
- " -4.4367e-01, -2.8434e-01, -1.7593e-01, -4.2538e-01, -3.7350e-01,\n",
603
- " -4.0185e-02, -6.1727e-01, 2.3771e-01, -4.1247e-01, 3.9440e-01,\n",
604
- " 1.0506e-01, -4.0222e-01, 5.9232e-01])\n",
605
- "TF-IDF Keywords: [('rus', np.float64(0.33567254331867563)), ('ahecaqo', np.float64(0.25175440748900674)), ('000', np.float64(0.16783627165933782)), ('1777', np.float64(0.16783627165933782)), ('ile', np.float64(0.16783627165933782)), ('pşıqo', np.float64(0.16783627165933782)), ('türkçe', np.float64(0.16783627165933782)), ('vardı', np.float64(0.16783627165933782)), ('çerkes', np.float64(0.16783627165933782)), ('çerkesya', np.float64(0.16783627165933782)), ('12', np.float64(0.08391813582966891)), ('1837', np.float64(0.08391813582966891)), ('1838', np.float64(0.08391813582966891)), ('adamıydı', np.float64(0.08391813582966891)), ('adlardaki', np.float64(0.08391813582966891)), ('anlamlıdır', np.float64(0.08391813582966891)), ('anlamına', np.float64(0.08391813582966891)), ('askeri', np.float64(0.08391813582966891)), ('askerî', np.float64(0.08391813582966891)), ('atlıdan', np.float64(0.08391813582966891)), ('atlıya', np.float64(0.08391813582966891)), ('az', np.float64(0.08391813582966891)), ('becerisinin', np.float64(0.08391813582966891)), ('belirtir', np.float64(0.08391813582966891)), ('beyoğlu', np.float64(0.08391813582966891)), ('bilgiler', np.float64(0.08391813582966891)), ('birliklerine', np.float64(0.08391813582966891)), ('biyografi', np.float64(0.08391813582966891)), ('bjeduğ', np.float64(0.08391813582966891)), ('bölgesinde', np.float64(0.08391813582966891)), ('büyütüldü', np.float64(0.08391813582966891)), ('devlet', np.float64(0.08391813582966891)), ('doğdu', np.float64(0.08391813582966891)), ('doğumlular', np.float64(0.08391813582966891)), ('duyulan', np.float64(0.08391813582966891)), ('dönem', np.float64(0.08391813582966891)), ('ek', np.float64(0.08391813582966891)), ('ekiyle', np.float64(0.08391813582966891)), ('erken', np.float64(0.08391813582966891)), ('eğitim', np.float64(0.08391813582966891)), ('eş', np.float64(0.08391813582966891)), ('geldiği', np.float64(0.08391813582966891)), ('gelen', np.float64(0.08391813582966891)), ('geçti', np.float64(0.08391813582966891)), ('hakkında', np.float64(0.08391813582966891)), ('hastalıktan', np.float64(0.08391813582966891)), ('ismi', np.float64(0.08391813582966891)), ('ismidir', np.float64(0.08391813582966891)), ('için', np.float64(0.08391813582966891)), ('kafkas', np.float64(0.08391813582966891)), ('kalıcı', np.float64(0.08391813582966891)), ('katılımı', np.float64(0.08391813582966891)), ('kaydedilmedi', np.float64(0.08391813582966891)), ('kaynak', np.float64(0.08391813582966891)), ('kaynaklarından', np.float64(0.08391813582966891)), ('kaynakça', np.float64(0.08391813582966891)), ('kazandığı', np.float64(0.08391813582966891)), ('kişi', np.float64(0.08391813582966891)), ('kişiler', np.float64(0.08391813582966891)), ('lakapları', np.float64(0.08391813582966891)), ('lakaptır', np.float64(0.08391813582966891)), ('müfrezesi', np.float64(0.08391813582966891)), ('nda', np.float64(0.08391813582966891)), ('nun', np.float64(0.08391813582966891)), ('nın', np.float64(0.08391813582966891)), ('olduğunu', np.float64(0.08391813582966891)), ('oluşan', np.float64(0.08391813582966891)), ('ordusu', np.float64(0.08391813582966891)), ('oğlu', np.float64(0.08391813582966891)), ('pek', np.float64(0.08391813582966891)), ('qo', np.float64(0.08391813582966891)), ('savaşı', np.float64(0.08391813582966891)), ('savaşına', np.float64(0.08391813582966891)), ('saygı', np.float64(0.08391813582966891)), ('sim', np.float64(0.08391813582966891)), ('soneki', np.float64(0.08391813582966891)), ('sonra', np.float64(0.08391813582966891)), ('soy', np.float64(0.08391813582966891)), 
('soyadları', np.float64(0.08391813582966891)), ('soylular', np.float64(0.08391813582966891)), ('sıra', np.float64(0.08391813582966891)), ('sırasında', np.float64(0.08391813582966891)), ('tarafına', np.float64(0.08391813582966891)), ('tarihlerini', np.float64(0.08391813582966891)), ('ulaşıyordu', np.float64(0.08391813582966891)), ('yazmadıkları', np.float64(0.08391813582966891)), ('yıl', np.float64(0.08391813582966891)), ('zaferlerle', np.float64(0.08391813582966891)), ('çatışmalar', np.float64(0.08391813582966891)), ('çerkesler', np.float64(0.08391813582966891)), ('çerkeslerin', np.float64(0.08391813582966891)), ('öldü', np.float64(0.08391813582966891)), ('ölenler', np.float64(0.08391813582966891)), ('ölüm', np.float64(0.08391813582966891)), ('ünlüydü', np.float64(0.08391813582966891))]\n",
606
  "BERT Embeddings:\n",
607
  "Text 1 embedding shape: torch.Size([233, 768])\n"
608
  ]
609
  }
610
  ],
611
  "source": [
@@ -617,7 +563,9 @@
617
  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
618
  "from nltk.corpus import stopwords as nltk_stopwords\n",
619
  "from transformers import BertTokenizer, BertModel\n",
 
620
  "import torch\n",
 
621
  "\n",
622
  "# BERT Tokenizer ve Model'i yükleyin\n",
623
  "tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
@@ -661,9 +609,10 @@
661
  " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
662
  " X = vectorizer.fit_transform(corpus)\n",
663
  " feature_names = vectorizer.get_feature_names_out()\n",
664
- " scores = np.asarray(X.sum(axis=0)).flatten()\n",
665
- " keywords = {feature_names[i]: scores[i] for i in range(len(feature_names))}\n",
666
- " sorted_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)\n",
 
667
  " return sorted_keywords\n",
668
  "\n",
669
  "#tokenleri kelimelere dönüştürür ve listeler \n",
@@ -695,9 +644,6 @@
695
  " \n",
696
  " \n",
697
  "\n",
698
- " \n",
699
- "\n",
700
- "\n",
701
  "#token ıd leri ve bert gömme vektörleri\n",
702
  "for text in texts:\n",
703
  " input_ids,embeddings= get_bert_embeddings(text)\n",
@@ -708,10 +654,18 @@
708
  " # Tokenları ve ortalama vektörleri al\n",
709
  " tokens = decode_tokens(input_ids)\n",
710
  " avg_embedding = average_embeddings(embeddings)\n",
711
  " print(f\"Tokens: {tokens}\")\n",
712
- " print(f\"Average Embedding Shape: {avg_embedding.shape}\")\n",
713
- " print(f\"Average Embedding: {avg_embedding}\")\n",
 
 
714
  "\n",
 
715
  "# TF-IDF anahtar kelimelerini çıkar\n",
716
  "keywords = extract_keywords_tfidf(texts,stop_words_list)\n",
717
  "print(\"TF-IDF Keywords:\", keywords)\n",
@@ -721,10 +675,566 @@
721
  "for i, emb in enumerate(embeddings):\n",
722
  " print(f\"Text {i+1} embedding shape: {emb.shape}\")\n",
723
  "\n",
724
- "\n",
725
  "\n"
726
  ]
727
  },
728
  {
729
  "cell_type": "code",
730
  "execution_count": 8,
@@ -790,6 +1300,79 @@
790
  "test_stop_words_effectiveness(texts, stop_words_list)\n"
791
  ]
792
  },
793
  {
794
  "cell_type": "code",
795
  "execution_count": 20,
 
437
  "TF-IDF HESAPLAMA"
438
  ]
439
  },
440
+ {
441
+ "cell_type": "markdown",
442
+ "metadata": {},
443
+ "source": [
444
+ "Generate keywords with k-means and TF-IDF, using the average of the token vectors as a text-level summary (a minimal sketch of this idea follows this cell)"
445
+ ]
446
+ },
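A minimal sketch of the plan in the markdown cell above: cluster the BERT token vectors with k-means and take the token closest to each cluster centre as a keyword. The function below is hypothetical (not part of the notebook) and expects the tokens and token embeddings already produced by get_bert_embeddings:

import numpy as np
from sklearn.cluster import KMeans

def cluster_keywords(tokens, token_embeddings, n_clusters=5):
    # tokens: list of strings; token_embeddings: (num_tokens, hidden_size) vectors
    token_embeddings = np.asarray(token_embeddings)
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(token_embeddings)
    keywords = []
    for centre in km.cluster_centers_:
        # take the token whose vector lies closest to this cluster centre
        idx = int(np.argmin(np.linalg.norm(token_embeddings - centre, axis=1)))
        keywords.append(tokens[idx])
    return keywords

These cluster-level keywords can then be merged with the TF-IDF ranking computed in the code cell below.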
447
  {
448
  "cell_type": "code",
449
+ "execution_count": 31,
450
  "metadata": {},
451
  "outputs": [
452
  {
 
454
  "output_type": "stream",
455
  "text": [
456
  "Tokens: ['[CLS]', 'Biy', '##ografi', 'İsim', 'P', '##şı', '##q', '##o', 'ismi', 'Ah', '##ec', '##a', '##q', '##o', 'soy', 'ismi', '##dir', '.', 'Çerkes', '##lerin', '\"', '-', 'q', '##o', '\"', 'son', '##eki', 'ile', 'biten', 'hem', 'soya', '##d', '##ları', 'hem', 'de', 'lak', '##ap', '##ları', 'vardı', '.', 'Bu', 'ek', 'Türkçe', 'ad', '##lardaki', '\"', '-', 'oğlu', '\"', 'eki', '##yle', 'eş', 'anlamlı', '##dır', '.', 'P', '##şı', '##q', '##o', 'Türkçe', '\"', 'Beyoğlu', '\"', 'anlamına', 'gelen', 'bir', 'lak', '##ap', '##tır', '.', 'Erken', 'dönem', 'Çerkes', '##ler', 'tarihleri', '##ni', 'yazma', '##dıkları', 've', 'tüm', 'bilgiler', 'Rus', 'kaynaklarından', 'geldiği', 'için', 'Ah', '##ec', '##a', '##q', '##o', 'hakkında', 'pek', 'bir', 'şey', 'kayded', '##ilme', '##di', '.', '177', '##7', \"'\", 'de', 'Çerkes', '##ya', \"'\", 'nın', 'B', '##je', '##duğ', 'bölgesinde', 'doğdu', '.', 'Asker', '##î', 'eğitim', 'ile', 'büyüt', '##üldü', '.', 'Rus', '-', 'Çerkes', 'Savaşı', '##na', 'Katılım', '##ı', 'Birkaç', 'kaynak', ',', 'Ah', '##ec', '##a', '##q', '##o', \"'\", 'nun', 'tüm', 'Çerkes', '##ya', \"'\", 'da', 'saygı', 'duyulan', 'bir', 'kişi', 'olduğunu', 'belirtir', '.', 'En', 'az', '6', '.', '000', 'at', '##lı', '##dan', 'oluşan', 'kalıcı', 'bir', 'ordusu', 'vardı', 've', 'çatışmalar', 'sırasında', 'müfre', '##ze', '##si', '12', '.', '000', 'at', '##lı', '##ya', 'ulaşıyor', '##du', '.', 'Rus', 'birlikleri', '##ne', 'karşı', 'kazandığı', 'zafer', '##lerle', 'ünlü', '##y', '##dü', '.', 'Askeri', 'becerisi', '##nin', 'yanı', 'sıra', 'yetenekli', 'bir', 'devlet', 'adamı', '##ydı', '.', 'Ölüm', '18', '##37', 'yılında', 'Rus', 'tarafına', 'geçti', 've', 'bir', 'yıl', 'sonra', 'hastalık', '##tan', 'öldü', '.', 'Kaynak', '##ça', 'Çerkes', 'soylu', '##lar', '177', '##7', 'doğumlu', '##lar', '18', '##38', 'yılında', 'ölen', '##ler', 'Kafkas', 'Savaşı', \"'\", 'nda', 'kişiler', '[SEP]']\n",
457
+ "Positive Average Embedding Shape: torch.Size([353])\n",
458
+ "Positive Average Embedding: tensor([3.1219e-01, 1.1118e-02, 1.3312e-02, 8.7684e-02, 6.0835e-01, 4.2102e-01,\n",
459
+ " 3.7467e-01, 2.5975e-01, 6.8351e-02, 1.1375e-01, 6.9892e-02, 3.4909e-01,\n",
460
+ " 4.2718e-02, 6.9431e-01, 8.2034e-02, 4.9043e-01, 4.0028e-02, 2.4516e-02,\n",
461
+ " 4.0203e-01, 1.7956e-01, 2.7692e-01, 4.2539e-01, 3.9989e-01, 1.1785e-01,\n",
462
+ " 1.2440e-01, 1.7583e-01, 4.7179e-01, 3.4876e-01, 4.3870e-01, 3.8414e-01,\n",
463
+ " 3.6902e-01, 2.5584e-01, 2.0225e-01, 1.4411e-01, 2.9933e-01, 3.6910e-01,\n",
464
+ " 2.3893e-01, 6.0434e-01, 1.5669e-01, 3.6258e-01, 4.5186e-01, 3.8370e-01,\n",
465
+ " 4.9858e-01, 1.1362e-02, 2.1302e-01, 3.4201e-01, 7.4201e-01, 7.6336e-02,\n",
466
+ " 2.6290e-01, 2.3984e-01, 4.8434e-01, 4.7557e-01, 2.2432e-01, 3.5924e-01,\n",
467
+ " 5.3896e-02, 1.0477e-01, 3.8852e-01, 2.9142e-01, 3.6626e-01, 7.9898e-02,\n",
468
+ " 2.2686e-01, 2.3253e-02, 3.2550e+00, 1.1168e-01, 4.2853e-01, 7.7213e-02,\n",
469
+ " 3.1671e-01, 5.6494e-01, 2.0392e-01, 5.1432e-02, 2.2806e-01, 4.0886e-01,\n",
470
+ " 2.2627e-02, 2.4151e-01, 5.5605e-01, 2.1589e-01, 8.9567e-02, 4.1183e-01,\n",
471
+ " 3.7691e-01, 6.1995e-02, 5.1504e-01, 4.9226e-01, 1.0083e-01, 1.9789e-01,\n",
472
+ " 6.5205e-01, 3.4597e-02, 9.5440e-02, 6.5158e-01, 1.9009e-01, 1.1314e-01,\n",
473
+ " 1.0752e-01, 4.7765e-01, 2.5196e-01, 1.3468e-01, 4.2977e-01, 2.7336e-01,\n",
474
+ " 4.7672e-02, 2.3097e-01, 1.5998e-01, 3.8424e-01, 2.9264e-02, 7.9061e-02,\n",
475
+ " 2.8095e-01, 2.0505e-01, 8.8469e-02, 1.6993e-01, 2.5519e-01, 5.7010e-01,\n",
476
+ " 6.1551e-03, 7.0113e-02, 1.1820e-01, 5.2899e-01, 1.3287e-01, 1.0696e+00,\n",
477
+ " 3.1219e-01, 1.1373e-01, 2.6080e-01, 6.1457e-03, 5.5064e-02, 5.2089e-01,\n",
478
+ " 1.3195e-01, 4.0164e-01, 8.4919e-01, 1.8478e-02, 1.6000e-01, 3.3307e-01,\n",
479
+ " 2.8522e-01, 1.7133e-01, 2.4794e-02, 1.7487e-01, 1.0915e-01, 2.5974e-01,\n",
480
+ " 1.8174e-02, 8.9919e-02, 1.6508e+00, 4.9391e-01, 7.9321e-02, 3.2023e-02,\n",
481
+ " 3.1216e-01, 3.5055e-01, 2.4602e-01, 4.0553e-01, 1.3428e-02, 4.7906e-01,\n",
482
+ " 2.2494e-01, 3.5909e-01, 1.2861e-01, 9.8253e-02, 2.3110e-01, 3.1276e-01,\n",
483
+ " 6.4092e-02, 2.7386e-01, 6.7687e-02, 3.0518e-02, 3.8880e-01, 2.8110e-01,\n",
484
+ " 5.7723e-02, 4.2425e-01, 6.5768e-01, 8.4208e-02, 3.2153e-01, 5.6956e-01,\n",
485
+ " 1.2256e-01, 4.2261e-01, 7.9419e-02, 1.5746e-01, 1.8869e-01, 4.1413e-01,\n",
486
+ " 3.7192e-01, 5.4023e-02, 1.1605e-01, 4.2643e-01, 1.6004e-01, 2.1577e-01,\n",
487
+ " 6.6576e-03, 4.4046e-01, 2.4404e-01, 8.1931e-02, 2.2825e-01, 8.8104e-02,\n",
488
+ " 4.0676e-01, 1.6295e-01, 5.8565e-01, 3.9977e-01, 5.0630e-02, 6.7476e-02,\n",
489
+ " 3.4367e-01, 1.8640e-01, 3.3172e-01, 2.6630e-02, 1.6500e-02, 2.6911e-01,\n",
490
+ " 3.4227e-02, 9.7154e-01, 8.5149e-01, 1.0421e-01, 6.2897e-01, 1.8700e-02,\n",
491
+ " 1.6866e-01, 3.2686e-01, 6.5600e-01, 2.9388e-02, 3.8548e-02, 1.5922e-01,\n",
492
+ " 5.6203e-01, 3.1285e-01, 3.8763e-01, 1.6276e-01, 1.2610e-01, 3.5952e-01,\n",
493
+ " 1.3288e-01, 6.0504e-01, 2.9626e-01, 2.7285e-03, 3.7191e-01, 4.7557e-01,\n",
494
+ " 9.2435e-02, 2.3198e-01, 1.8715e-01, 2.5481e-01, 3.2795e-01, 4.5814e-01,\n",
495
+ " 1.9183e-01, 2.7146e-01, 1.9477e-01, 5.7984e-03, 3.0490e-01, 9.8830e-03,\n",
496
+ " 6.9638e-01, 9.4965e-02, 8.8206e-02, 2.3173e-01, 1.2170e-01, 4.5793e-01,\n",
497
+ " 1.4489e-01, 2.2540e-01, 5.2360e-01, 2.7475e-01, 5.3707e-01, 9.3503e-02,\n",
498
+ " 1.5903e-01, 3.4478e-01, 3.9456e-01, 1.7182e-01, 5.6727e-03, 2.7554e-01,\n",
499
+ " 2.0691e-01, 1.6439e-01, 6.4637e-01, 1.3178e-01, 1.9076e-01, 2.2997e-01,\n",
500
+ " 9.9676e-04, 2.3884e-01, 5.3464e-01, 2.7388e-01, 2.3122e-01, 3.2136e-01,\n",
501
+ " 6.1094e-02, 1.6784e-01, 5.6459e-01, 4.4070e-01, 3.1866e-01, 4.1410e-01,\n",
502
+ " 3.0922e-01, 5.3698e-01, 8.8994e-02, 4.1334e-01, 2.5389e-01, 6.0110e-01,\n",
503
+ " 3.8342e-01, 3.5175e-02, 2.5660e-01, 8.5744e-01, 3.0483e-03, 3.4735e-01,\n",
504
+ " 3.8450e-01, 3.9665e-01, 2.2100e-01, 6.5109e-02, 1.9003e-01, 7.4262e-02,\n",
505
+ " 2.9763e-01, 1.4098e-01, 1.1544e-01, 3.2446e-01, 1.4054e-02, 1.6943e-01,\n",
506
+ " 1.1417e-01, 3.3420e-01, 4.2107e-02, 4.9406e-01, 5.4846e-02, 2.4392e-01,\n",
507
+ " 2.4391e-01, 2.1046e-01, 3.5563e-01, 1.6479e-01, 3.2559e-01, 4.0702e-01,\n",
508
+ " 9.6086e-01, 1.3305e-01, 7.5751e-02, 2.7087e-01, 9.1068e-02, 4.7289e-01,\n",
509
+ " 1.0613e-01, 1.3504e-01, 2.7304e-01, 1.1986e-01, 4.7432e-01, 3.9729e-01,\n",
510
+ " 1.3385e-02, 1.6185e-01, 5.8601e-01, 5.8034e-01, 6.7479e-03, 2.1235e-01,\n",
511
+ " 6.9211e-02, 1.1795e-01, 4.8630e-01, 3.5354e-01, 4.4272e-01, 2.5360e-01,\n",
512
+ " 2.7441e-01, 4.9623e-01, 2.1623e-01, 8.4283e-02, 1.1040e-01, 3.7749e-02,\n",
513
+ " 3.9097e-01, 2.7157e-02, 3.7090e-01, 3.6961e-01, 2.6829e-01, 1.7171e-01,\n",
514
+ " 1.7970e-02, 1.2158e-01, 1.8717e-01, 3.5600e-01, 1.5203e-03, 2.1490e-01,\n",
515
+ " 2.2720e-01, 1.4914e-02, 3.7205e-01, 3.4950e-01, 3.4466e-02, 5.8733e-01,\n",
516
+ " 1.2950e-01, 2.3771e-01, 3.9440e-01, 1.0506e-01, 5.9232e-01])\n",
517
+ "TF-IDF Keywords: [array([['rus', 'ahecaqo', 'türkçe', 'pşıqo', '1777', 'çerkes', '000',\n",
518
+ " 'çerkesya', 'ölenler', 'ünlüydü', 'ölüm', 'yazmadıkları',\n",
519
+ " 'ulaşıyordu', 'tarihlerini', 'çerkeslerin', 'çerkesler',\n",
520
+ " 'çatışmalar', 'zaferlerle', 'öldü', 'soneki', 'soy', 'soyadları',\n",
521
+ " 'soylular', 'sıra', 'savaşı', 'sim', 'saygı', 'ordusu', 'oluşan',\n",
522
+ " 'olduğunu', 'müfrezesi', 'lakaptır', 'savaşına', 'qo', 'oğlu',\n",
523
+ " 'kazandığı', 'kaynakça', 'kaynaklarından', 'kaynak',\n",
524
+ " 'kaydedilmedi', 'katılımı', 'kalıcı', 'kafkas', 'ismidir',\n",
525
+ " 'ismi', 'hastalıktan', 'hakkında', 'geçti', 'lakapları',\n",
526
+ " 'kişiler', 'kişi', 'eş', 'geldiği', 'gelen', 'eğitim', 'dönem',\n",
527
+ " 'erken', 'ekiyle', 'ek', 'devlet', 'büyütüldü', 'bölgesinde',\n",
528
+ " 'bjeduğ', 'biyografi', 'duyulan', 'doğumlular', 'doğdu',\n",
529
+ " 'beyoğlu', 'bilgiler', 'birliklerine', 'belirtir', 'askerî',\n",
530
+ " 'becerisinin', 'atlıya', 'atlıdan', 'anlamlıdır', 'anlamına',\n",
531
+ " 'askeri', 'adlardaki', '1838', 'adamıydı', '1837', '12']],\n",
532
+ " dtype=object)]\n",
533
  "BERT Embeddings:\n",
534
  "Text 1 embedding shape: torch.Size([233, 768])\n"
535
  ]
536
+ },
537
+ {
538
+ "ename": "ValueError",
539
+ "evalue": "setting an array element with a sequence.",
540
+ "output_type": "error",
541
+ "traceback": [
542
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
543
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
544
+ "\u001b[1;31mTypeError\u001b[0m: float() argument must be a string or a real number, not 'csr_matrix'",
545
+ "\nThe above exception was the direct cause of the following exception:\n",
546
+ "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
547
+ "Cell \u001b[1;32mIn[31], line 151\u001b[0m\n\u001b[0;32m 146\u001b[0m \u001b[38;5;124;03m\"\"\"# Liste halindeki TF-IDF değerlerini yazdırma\u001b[39;00m\n\u001b[0;32m 147\u001b[0m \u001b[38;5;124;03mprint(\"TF-IDF List:\")\u001b[39;00m\n\u001b[0;32m 148\u001b[0m \u001b[38;5;124;03mfor row in tfidf_list:\u001b[39;00m\n\u001b[0;32m 149\u001b[0m \u001b[38;5;124;03m print(row)\"\"\"\u001b[39;00m\n\u001b[0;32m 150\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeler ve metin arasındaki cosine similarity hesaplama\u001b[39;00m\n\u001b[1;32m--> 151\u001b[0m similarity_score \u001b[38;5;241m=\u001b[39m \u001b[43mcosine_similarity\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkeywords_vector\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mdocument_vector\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 153\u001b[0m \u001b[38;5;66;03m# Her bir kelime için TF-IDF değerlerini yazdırma\u001b[39;00m\n\u001b[0;32m 154\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc_idx, doc \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(tfidf_scores):\n",
548
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n",
549
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\metrics\\pairwise.py:1679\u001b[0m, in \u001b[0;36mcosine_similarity\u001b[1;34m(X, Y, dense_output)\u001b[0m\n\u001b[0;32m 1635\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Compute cosine similarity between samples in X and Y.\u001b[39;00m\n\u001b[0;32m 1636\u001b[0m \n\u001b[0;32m 1637\u001b[0m \u001b[38;5;124;03mCosine similarity, or the cosine kernel, computes similarity as the\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1675\u001b[0m \u001b[38;5;124;03m [0.57..., 0.81...]])\u001b[39;00m\n\u001b[0;32m 1676\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1677\u001b[0m \u001b[38;5;66;03m# to avoid recursive import\u001b[39;00m\n\u001b[1;32m-> 1679\u001b[0m X, Y \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_pairwise_arrays\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1681\u001b[0m X_normalized \u001b[38;5;241m=\u001b[39m normalize(X, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 1682\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m X \u001b[38;5;129;01mis\u001b[39;00m Y:\n",
550
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\metrics\\pairwise.py:185\u001b[0m, in \u001b[0;36mcheck_pairwise_arrays\u001b[1;34m(X, Y, precomputed, dtype, accept_sparse, force_all_finite, ensure_2d, copy)\u001b[0m\n\u001b[0;32m 175\u001b[0m X \u001b[38;5;241m=\u001b[39m Y \u001b[38;5;241m=\u001b[39m check_array(\n\u001b[0;32m 176\u001b[0m X,\n\u001b[0;32m 177\u001b[0m accept_sparse\u001b[38;5;241m=\u001b[39maccept_sparse,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 182\u001b[0m ensure_2d\u001b[38;5;241m=\u001b[39mensure_2d,\n\u001b[0;32m 183\u001b[0m )\n\u001b[0;32m 184\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 185\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_array\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 186\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maccept_sparse\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 189\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_all_finite\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_all_finite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mestimator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[43mensure_2d\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mensure_2d\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 193\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 194\u001b[0m Y \u001b[38;5;241m=\u001b[39m check_array(\n\u001b[0;32m 195\u001b[0m Y,\n\u001b[0;32m 196\u001b[0m accept_sparse\u001b[38;5;241m=\u001b[39maccept_sparse,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 201\u001b[0m ensure_2d\u001b[38;5;241m=\u001b[39mensure_2d,\n\u001b[0;32m 202\u001b[0m )\n\u001b[0;32m 204\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m precomputed:\n",
551
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\utils\\validation.py:1012\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 1010\u001b[0m array \u001b[38;5;241m=\u001b[39m xp\u001b[38;5;241m.\u001b[39mastype(array, dtype, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 1011\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1012\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43m_asarray_with_order\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morder\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mxp\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mxp\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1013\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ComplexWarning \u001b[38;5;28;01mas\u001b[39;00m complex_warning:\n\u001b[0;32m 1014\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 1015\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mComplex data not supported\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(array)\n\u001b[0;32m 1016\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcomplex_warning\u001b[39;00m\n",
552
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\utils\\_array_api.py:751\u001b[0m, in \u001b[0;36m_asarray_with_order\u001b[1;34m(array, dtype, order, copy, xp, device)\u001b[0m\n\u001b[0;32m 749\u001b[0m array \u001b[38;5;241m=\u001b[39m numpy\u001b[38;5;241m.\u001b[39marray(array, order\u001b[38;5;241m=\u001b[39morder, dtype\u001b[38;5;241m=\u001b[39mdtype)\n\u001b[0;32m 750\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 751\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mnumpy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43masarray\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morder\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 753\u001b[0m \u001b[38;5;66;03m# At this point array is a NumPy ndarray. We convert it to an array\u001b[39;00m\n\u001b[0;32m 754\u001b[0m \u001b[38;5;66;03m# container that is consistent with the input's namespace.\u001b[39;00m\n\u001b[0;32m 755\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m xp\u001b[38;5;241m.\u001b[39masarray(array)\n",
553
+ "\u001b[1;31mValueError\u001b[0m: setting an array element with a sequence."
554
+ ]
555
  }
556
  ],
557
  "source": [
 
563
  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
564
  "from nltk.corpus import stopwords as nltk_stopwords\n",
565
  "from transformers import BertTokenizer, BertModel\n",
566
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
567
  "import torch\n",
568
+ "import torch.nn.functional as F\n",
569
  "\n",
570
  "# BERT Tokenizer ve Model'i yükleyin\n",
571
  "tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
 
609
  " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
610
  " X = vectorizer.fit_transform(corpus)\n",
611
  " feature_names = vectorizer.get_feature_names_out()\n",
612
+ "    scores = np.asarray(X.sum(axis=0)).flatten()\n",
613
+ "    sorted_keywords = [feature_names[i] for i in scores.argsort()[::-1]]\n",
614
+ " #keywords = {feature_names[i]: scores[i] for i in range(len(feature_names))}\n",
615
+ " #sorted_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)\n",
616
  " return sorted_keywords\n",
617
  "\n",
618
  "#tokenleri kelimelere dönüştürür ve listeler \n",
 
644
  " \n",
645
  " \n",
646
  "\n",
 
 
 
647
  "#token ıd leri ve bert gömme vektörleri\n",
648
  "for text in texts:\n",
649
  " input_ids,embeddings= get_bert_embeddings(text)\n",
 
654
  " # Tokenları ve ortalama vektörleri al\n",
655
  " tokens = decode_tokens(input_ids)\n",
656
  " avg_embedding = average_embeddings(embeddings)\n",
657
+ "    # keep only the average-embedding values that are greater than 0\n",
658
+ " positive_avg_embedding= avg_embedding[avg_embedding>0]\n",
659
+ "    # If there are any positive embedding values, print the output\n",
660
+ "\n",
661
+ "if len(positive_avg_embedding) > 0:\n",
662
  " print(f\"Tokens: {tokens}\")\n",
663
+ " print(f\"Positive Average Embedding Shape: {positive_avg_embedding.shape}\")\n",
664
+ " print(f\"Positive Average Embedding: {positive_avg_embedding}\")\n",
665
+ "else:\n",
666
+ " print(\"No positive embedding values found.\")\n",
667
  "\n",
668
+ " \n",
669
  "# TF-IDF anahtar kelimelerini çıkar\n",
670
  "keywords = extract_keywords_tfidf(texts,stop_words_list)\n",
671
  "print(\"TF-IDF Keywords:\", keywords)\n",
 
675
  "for i, emb in enumerate(embeddings):\n",
676
  " print(f\"Text {i+1} embedding shape: {emb.shape}\")\n",
677
  "\n",
678
+ "keywords_str = \" \".join([str(keyword) for keyword in keywords])\n",
679
+ "\n",
680
+ "\n",
681
+ "# combine the texts and build the TF-IDF matrix\n",
682
+ "# create the TF-IDF vectorizer\n",
683
+ "tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
684
+ "corpus = [text, keywords_str]\n",
685
+ "tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)\n",
686
+ "\n",
687
+ "# separate the keyword vector from the document vector\n",
688
+ "keywords_vector = tfidf_matrix[1]\n",
689
+ "document_vector = tfidf_matrix[0]\n",
690
+ "keywords_vector_dense = keywords_vector.toarray()\n",
691
+ "document_vector_dense = document_vector.toarray()\n",
692
+ "\n",
693
+ "# get the words and their TF-IDF scores\n",
694
+ "feature_names = tfidf_vectorizer.get_feature_names_out()\n",
695
+ "tfidf_scores = tfidf_matrix.toarray()\n",
696
+ "similarity_score = cosine_similarity(keywords_vector_dense, document_vector_dense)\n",
697
+ "\n",
698
+ "# convert the TF-IDF matrix to dense format\n",
699
+ "dense_matrix = tfidf_matrix.todense()\n",
700
+ "# turn the dense matrix into a list\n",
701
+ "tfidf_list = dense_matrix.tolist()\n",
702
+ "\n",
703
+ "\"\"\"# Liste halindeki TF-IDF değerlerini yazdırma\n",
704
+ "print(\"TF-IDF List:\")\n",
705
+ "for row in tfidf_list:\n",
706
+ " print(row)\"\"\"\n",
707
+ "# compute the cosine similarity between the keywords and the text\n",
708
+ "similarity_score = cosine_similarity(keywords_vector_dense, document_vector_dense)\n",
709
+ "\n",
710
+ "# print the TF-IDF value of each word\n",
711
+ "for doc_idx, doc in enumerate(tfidf_scores):\n",
712
+ " print(f\"Document {doc_idx + 1}:\")\n",
713
+ " for word_idx, score in enumerate(doc):\n",
714
+ " print(f\"Word: {feature_names[word_idx]}, TF-IDF: {score:.4f}\")\n",
715
+ " print(\"\\n\")\n",
716
+ "\n",
717
+ "# print the result\n",
718
+ "print(f\"Keywords ile metin arasındaki benzerlik: {similarity_score[0][0]}\")\n",
719
  "\n"
720
  ]
721
  },
722
+ {
723
+ "cell_type": "code",
724
+ "execution_count": 32,
725
+ "metadata": {},
726
+ "outputs": [
727
+ {
728
+ "name": "stdout",
729
+ "output_type": "stream",
730
+ "text": [
731
+ "Tokens: ['[CLS]', 'Biy', '##ografi', 'İsim', 'P', '##şı', '##q', '##o', 'ismi', 'Ah', '##ec', '##a', '##q', '##o', 'soy', 'ismi', '##dir', '.', 'Çerkes', '##lerin', '\"', '-', 'q', '##o', '\"', 'son', '##eki', 'ile', 'biten', 'hem', 'soya', '##d', '##ları', 'hem', 'de', 'lak', '##ap', '##ları', 'vardı', '.', 'Bu', 'ek', 'Türkçe', 'ad', '##lardaki', '\"', '-', 'oğlu', '\"', 'eki', '##yle', 'eş', 'anlamlı', '##dır', '.', 'P', '##şı', '##q', '##o', 'Türkçe', '\"', 'Beyoğlu', '\"', 'anlamına', 'gelen', 'bir', 'lak', '##ap', '##tır', '.', 'Erken', 'dönem', 'Çerkes', '##ler', 'tarihleri', '##ni', 'yazma', '##dıkları', 've', 'tüm', 'bilgiler', 'Rus', 'kaynaklarından', 'geldiği', 'için', 'Ah', '##ec', '##a', '##q', '##o', 'hakkında', 'pek', 'bir', 'şey', 'kayded', '##ilme', '##di', '.', '177', '##7', \"'\", 'de', 'Çerkes', '##ya', \"'\", 'nın', 'B', '##je', '##duğ', 'bölgesinde', 'doğdu', '.', 'Asker', '##î', 'eğitim', 'ile', 'büyüt', '##üldü', '.', 'Rus', '-', 'Çerkes', 'Savaşı', '##na', 'Katılım', '##ı', 'Birkaç', 'kaynak', ',', 'Ah', '##ec', '##a', '##q', '##o', \"'\", 'nun', 'tüm', 'Çerkes', '##ya', \"'\", 'da', 'saygı', 'duyulan', 'bir', 'kişi', 'olduğunu', 'belirtir', '.', 'En', 'az', '6', '.', '000', 'at', '##lı', '##dan', 'oluşan', 'kalıcı', 'bir', 'ordusu', 'vardı', 've', 'çatışmalar', 'sırasında', 'müfre', '##ze', '##si', '12', '.', '000', 'at', '##lı', '##ya', 'ulaşıyor', '##du', '.', 'Rus', 'birlikleri', '##ne', 'karşı', 'kazandığı', 'zafer', '##lerle', 'ünlü', '##y', '##dü', '.', 'Askeri', 'becerisi', '##nin', 'yanı', 'sıra', 'yetenekli', 'bir', 'devlet', 'adamı', '##ydı', '.', 'Ölüm', '18', '##37', 'yılında', 'Rus', 'tarafına', 'geçti', 've', 'bir', 'yıl', 'sonra', 'hastalık', '##tan', 'öldü', '.', 'Kaynak', '##ça', 'Çerkes', 'soylu', '##lar', '177', '##7', 'doğumlu', '##lar', '18', '##38', 'yılında', 'ölen', '##ler', 'Kafkas', 'Savaşı', \"'\", 'nda', 'kişiler', '[SEP]']\n",
732
+ "Positive Average Embedding Shape: torch.Size([353])\n",
733
+ "Positive Average Embedding: tensor([3.1219e-01, 1.1118e-02, 1.3312e-02, 8.7684e-02, 6.0835e-01, 4.2102e-01,\n",
734
+ " 3.7467e-01, 2.5975e-01, 6.8351e-02, 1.1375e-01, 6.9892e-02, 3.4909e-01,\n",
735
+ " 4.2718e-02, 6.9431e-01, 8.2034e-02, 4.9043e-01, 4.0028e-02, 2.4516e-02,\n",
736
+ " 4.0203e-01, 1.7956e-01, 2.7692e-01, 4.2539e-01, 3.9989e-01, 1.1785e-01,\n",
737
+ " 1.2440e-01, 1.7583e-01, 4.7179e-01, 3.4876e-01, 4.3870e-01, 3.8414e-01,\n",
738
+ " 3.6902e-01, 2.5584e-01, 2.0225e-01, 1.4411e-01, 2.9933e-01, 3.6910e-01,\n",
739
+ " 2.3893e-01, 6.0434e-01, 1.5669e-01, 3.6258e-01, 4.5186e-01, 3.8370e-01,\n",
740
+ " 4.9858e-01, 1.1362e-02, 2.1302e-01, 3.4201e-01, 7.4201e-01, 7.6336e-02,\n",
741
+ " 2.6290e-01, 2.3984e-01, 4.8434e-01, 4.7557e-01, 2.2432e-01, 3.5924e-01,\n",
742
+ " 5.3896e-02, 1.0477e-01, 3.8852e-01, 2.9142e-01, 3.6626e-01, 7.9898e-02,\n",
743
+ " 2.2686e-01, 2.3253e-02, 3.2550e+00, 1.1168e-01, 4.2853e-01, 7.7213e-02,\n",
744
+ " 3.1671e-01, 5.6494e-01, 2.0392e-01, 5.1432e-02, 2.2806e-01, 4.0886e-01,\n",
745
+ " 2.2627e-02, 2.4151e-01, 5.5605e-01, 2.1589e-01, 8.9567e-02, 4.1183e-01,\n",
746
+ " 3.7691e-01, 6.1995e-02, 5.1504e-01, 4.9226e-01, 1.0083e-01, 1.9789e-01,\n",
747
+ " 6.5205e-01, 3.4597e-02, 9.5440e-02, 6.5158e-01, 1.9009e-01, 1.1314e-01,\n",
748
+ " 1.0752e-01, 4.7765e-01, 2.5196e-01, 1.3468e-01, 4.2977e-01, 2.7336e-01,\n",
749
+ " 4.7672e-02, 2.3097e-01, 1.5998e-01, 3.8424e-01, 2.9264e-02, 7.9061e-02,\n",
750
+ " 2.8095e-01, 2.0505e-01, 8.8469e-02, 1.6993e-01, 2.5519e-01, 5.7010e-01,\n",
751
+ " 6.1551e-03, 7.0113e-02, 1.1820e-01, 5.2899e-01, 1.3287e-01, 1.0696e+00,\n",
752
+ " 3.1219e-01, 1.1373e-01, 2.6080e-01, 6.1457e-03, 5.5064e-02, 5.2089e-01,\n",
753
+ " 1.3195e-01, 4.0164e-01, 8.4919e-01, 1.8478e-02, 1.6000e-01, 3.3307e-01,\n",
754
+ " 2.8522e-01, 1.7133e-01, 2.4794e-02, 1.7487e-01, 1.0915e-01, 2.5974e-01,\n",
755
+ " 1.8174e-02, 8.9919e-02, 1.6508e+00, 4.9391e-01, 7.9321e-02, 3.2023e-02,\n",
756
+ " 3.1216e-01, 3.5055e-01, 2.4602e-01, 4.0553e-01, 1.3428e-02, 4.7906e-01,\n",
757
+ " 2.2494e-01, 3.5909e-01, 1.2861e-01, 9.8253e-02, 2.3110e-01, 3.1276e-01,\n",
758
+ " 6.4092e-02, 2.7386e-01, 6.7687e-02, 3.0518e-02, 3.8880e-01, 2.8110e-01,\n",
759
+ " 5.7723e-02, 4.2425e-01, 6.5768e-01, 8.4208e-02, 3.2153e-01, 5.6956e-01,\n",
760
+ " 1.2256e-01, 4.2261e-01, 7.9419e-02, 1.5746e-01, 1.8869e-01, 4.1413e-01,\n",
761
+ " 3.7192e-01, 5.4023e-02, 1.1605e-01, 4.2643e-01, 1.6004e-01, 2.1577e-01,\n",
762
+ " 6.6576e-03, 4.4046e-01, 2.4404e-01, 8.1931e-02, 2.2825e-01, 8.8104e-02,\n",
763
+ " 4.0676e-01, 1.6295e-01, 5.8565e-01, 3.9977e-01, 5.0630e-02, 6.7476e-02,\n",
764
+ " 3.4367e-01, 1.8640e-01, 3.3172e-01, 2.6630e-02, 1.6500e-02, 2.6911e-01,\n",
765
+ " 3.4227e-02, 9.7154e-01, 8.5149e-01, 1.0421e-01, 6.2897e-01, 1.8700e-02,\n",
766
+ " 1.6866e-01, 3.2686e-01, 6.5600e-01, 2.9388e-02, 3.8548e-02, 1.5922e-01,\n",
767
+ " 5.6203e-01, 3.1285e-01, 3.8763e-01, 1.6276e-01, 1.2610e-01, 3.5952e-01,\n",
768
+ " 1.3288e-01, 6.0504e-01, 2.9626e-01, 2.7285e-03, 3.7191e-01, 4.7557e-01,\n",
769
+ " 9.2435e-02, 2.3198e-01, 1.8715e-01, 2.5481e-01, 3.2795e-01, 4.5814e-01,\n",
770
+ " 1.9183e-01, 2.7146e-01, 1.9477e-01, 5.7984e-03, 3.0490e-01, 9.8830e-03,\n",
771
+ " 6.9638e-01, 9.4965e-02, 8.8206e-02, 2.3173e-01, 1.2170e-01, 4.5793e-01,\n",
772
+ " 1.4489e-01, 2.2540e-01, 5.2360e-01, 2.7475e-01, 5.3707e-01, 9.3503e-02,\n",
773
+ " 1.5903e-01, 3.4478e-01, 3.9456e-01, 1.7182e-01, 5.6727e-03, 2.7554e-01,\n",
774
+ " 2.0691e-01, 1.6439e-01, 6.4637e-01, 1.3178e-01, 1.9076e-01, 2.2997e-01,\n",
775
+ " 9.9676e-04, 2.3884e-01, 5.3464e-01, 2.7388e-01, 2.3122e-01, 3.2136e-01,\n",
776
+ " 6.1094e-02, 1.6784e-01, 5.6459e-01, 4.4070e-01, 3.1866e-01, 4.1410e-01,\n",
777
+ " 3.0922e-01, 5.3698e-01, 8.8994e-02, 4.1334e-01, 2.5389e-01, 6.0110e-01,\n",
778
+ " 3.8342e-01, 3.5175e-02, 2.5660e-01, 8.5744e-01, 3.0483e-03, 3.4735e-01,\n",
779
+ " 3.8450e-01, 3.9665e-01, 2.2100e-01, 6.5109e-02, 1.9003e-01, 7.4262e-02,\n",
780
+ " 2.9763e-01, 1.4098e-01, 1.1544e-01, 3.2446e-01, 1.4054e-02, 1.6943e-01,\n",
781
+ " 1.1417e-01, 3.3420e-01, 4.2107e-02, 4.9406e-01, 5.4846e-02, 2.4392e-01,\n",
782
+ " 2.4391e-01, 2.1046e-01, 3.5563e-01, 1.6479e-01, 3.2559e-01, 4.0702e-01,\n",
783
+ " 9.6086e-01, 1.3305e-01, 7.5751e-02, 2.7087e-01, 9.1068e-02, 4.7289e-01,\n",
784
+ " 1.0613e-01, 1.3504e-01, 2.7304e-01, 1.1986e-01, 4.7432e-01, 3.9729e-01,\n",
785
+ " 1.3385e-02, 1.6185e-01, 5.8601e-01, 5.8034e-01, 6.7479e-03, 2.1235e-01,\n",
786
+ " 6.9211e-02, 1.1795e-01, 4.8630e-01, 3.5354e-01, 4.4272e-01, 2.5360e-01,\n",
787
+ " 2.7441e-01, 4.9623e-01, 2.1623e-01, 8.4283e-02, 1.1040e-01, 3.7749e-02,\n",
788
+ " 3.9097e-01, 2.7157e-02, 3.7090e-01, 3.6961e-01, 2.6829e-01, 1.7171e-01,\n",
789
+ " 1.7970e-02, 1.2158e-01, 1.8717e-01, 3.5600e-01, 1.5203e-03, 2.1490e-01,\n",
790
+ " 2.2720e-01, 1.4914e-02, 3.7205e-01, 3.4950e-01, 3.4466e-02, 5.8733e-01,\n",
791
+ " 1.2950e-01, 2.3771e-01, 3.9440e-01, 1.0506e-01, 5.9232e-01])\n",
792
+ "TF-IDF Keywords: [array([['rus', 'ahecaqo', 'türkçe', 'pşıqo', '1777', 'çerkes', '000',\n",
793
+ " 'çerkesya', 'ölenler', 'ünlüydü', 'ölüm', 'yazmadıkları',\n",
794
+ " 'ulaşıyordu', 'tarihlerini', 'çerkeslerin', 'çerkesler',\n",
795
+ " 'çatışmalar', 'zaferlerle', 'öldü', 'soneki', 'soy', 'soyadları',\n",
796
+ " 'soylular', 'sıra', 'savaşı', 'sim', 'saygı', 'ordusu', 'oluşan',\n",
797
+ " 'olduğunu', 'müfrezesi', 'lakaptır', 'savaşına', 'qo', 'oğlu',\n",
798
+ " 'kazandığı', 'kaynakça', 'kaynaklarından', 'kaynak',\n",
799
+ " 'kaydedilmedi', 'katılımı', 'kalıcı', 'kafkas', 'ismidir',\n",
800
+ " 'ismi', 'hastalıktan', 'hakkında', 'geçti', 'lakapları',\n",
801
+ " 'kişiler', 'kişi', 'eş', 'geldiği', 'gelen', 'eğitim', 'dönem',\n",
802
+ " 'erken', 'ekiyle', 'ek', 'devlet', 'büyütüldü', 'bölgesinde',\n",
803
+ " 'bjeduğ', 'biyografi', 'duyulan', 'doğumlular', 'doğdu',\n",
804
+ " 'beyoğlu', 'bilgiler', 'birliklerine', 'belirtir', 'askerî',\n",
805
+ " 'becerisinin', 'atlıya', 'atlıdan', 'anlamlıdır', 'anlamına',\n",
806
+ " 'askeri', 'adlardaki', '1838', 'adamıydı', '1837', '12']],\n",
807
+ " dtype=object)]\n"
808
+ ]
809
+ },
810
+ {
811
+ "ename": "TypeError",
812
+ "evalue": "sequence item 0: expected str instance, numpy.ndarray found",
813
+ "output_type": "error",
814
+ "traceback": [
815
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
816
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
817
+ "Cell \u001b[1;32mIn[32], line 96\u001b[0m\n\u001b[0;32m 94\u001b[0m \u001b[38;5;66;03m# TF-IDF matrisini oluşturma\u001b[39;00m\n\u001b[0;32m 95\u001b[0m tfidf_vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer(stop_words\u001b[38;5;241m=\u001b[39mstop_words_list)\n\u001b[1;32m---> 96\u001b[0m corpus \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m \u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeywords\u001b[49m\u001b[43m)\u001b[49m] \u001b[38;5;66;03m# Anahtar kelimeleri string olarak birleştir\u001b[39;00m\n\u001b[0;32m 97\u001b[0m tfidf_matrix \u001b[38;5;241m=\u001b[39m tfidf_vectorizer\u001b[38;5;241m.\u001b[39mfit_transform(corpus \u001b[38;5;241m+\u001b[39m texts)\n\u001b[0;32m 99\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeler vektörünü ve diğer metin vektörünü ayırma\u001b[39;00m\n",
818
+ "\u001b[1;31mTypeError\u001b[0m: sequence item 0: expected str instance, numpy.ndarray found"
819
+ ]
820
+ }
821
+ ],
822
+ "source": [
823
+ "import re\n",
824
+ "import numpy as np\n",
825
+ "import pandas as pd\n",
826
+ "from nltk.stem import WordNetLemmatizer\n",
827
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
828
+ "from nltk.corpus import stopwords as nltk_stopwords\n",
829
+ "from transformers import BertTokenizer, BertModel\n",
830
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
831
+ "import torch\n",
832
+ "import torch.nn.functional as F\n",
833
+ "\n",
834
+ "# BERT Tokenizer ve Model'i yükleyin\n",
835
+ "tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
836
+ "model = BertModel.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
837
+ "\n",
838
+ "#-------------------------- burada turkish_stop_words'ü alıyoruz\n",
839
+ "def load_stop_words(file_path):\n",
840
+ " \"\"\"Stop words'leri dosyadan okuyarak bir liste oluşturur.\"\"\"\n",
841
+ " with open(file_path, 'r', encoding='utf-8') as file:\n",
842
+ " stop_words = [line.strip() for line in file if line.strip()]\n",
843
+ " return stop_words\n",
844
+ "\n",
845
+ "# Türkçe stop words dosyasını yükleyin\n",
846
+ "stop_words_list = load_stop_words('turkish_stop_words.txt')\n",
847
+ "\n",
848
+ "# Gömülü kelimeleri k-means ile kümeleyebiliriz , benzerlik oranını hesaplamak için farklı algoritmalardan yararlanabiliriz.\n",
849
+ "def get_bert_embeddings(text):\n",
850
+ " inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)\n",
851
+ " with torch.no_grad():\n",
852
+ " outputs = model(**inputs)\n",
853
+ " # Son katmandaki gömme (embedding) çıktısını alın\n",
854
+ " return inputs['input_ids'], outputs.last_hidden_state\n",
855
+ "\n",
856
+ "#------------------------------------ token verilerinin ortalaması (eşik değer için)\n",
857
+ "def average_embeddings(embeddings):\n",
858
+ " # Token vektörlerinin ortalamasını alarak metin düzeyinde özet oluştur\n",
859
+ " return torch.mean(embeddings, dim=1).squeeze()\n",
860
+ "\n",
861
+ "# Keywords çıkarma fonksiyonu\n",
862
+ "def extract_keywords_tfidf(corpus, stop_words_list):\n",
863
+ " \"\"\"TF-IDF ile anahtar kelimeleri çıkarır, stop words listesi ile birlikte kullanır.\"\"\"\n",
864
+ " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
865
+ " X = vectorizer.fit_transform(corpus)\n",
866
+ " feature_names = vectorizer.get_feature_names_out()\n",
867
+ " sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
868
+ " return sorted_keywords\n",
869
+ "\n",
870
+ "# Tokenları kelimelere dönüştürür ve listeler \n",
871
+ "def decode_tokens(input_ids):\n",
872
+ " tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())\n",
873
+ " return tokens\n",
874
+ "\n",
875
+ "# Örnek metinler (buranın yerine combined_text kullanılacak)\n",
876
+ "texts = [\"\"\"Biyografi\n",
877
+ "İsim \n",
878
+ "Pşıqo ismi Ahecaqo soy ismidir. Çerkeslerin \"-qo\" soneki ile biten hem soyadları hem de lakapları vardı. Bu ek Türkçe adlardaki \"-oğlu\" ekiyle eş anlamlıdır. Pşıqo Türkçe \"Beyoğlu\" anlamına gelen bir lakaptır.\n",
879
+ "\n",
880
+ "Erken dönem \n",
881
+ "Çerkesler tarihlerini yazmadıkları ve tüm bilgiler Rus kaynaklarından geldiği için Ahecaqo hakkında pek bir şey kaydedilmedi. 1777'de Çerkesya'nın Bjeduğ bölgesinde doğdu. Askerî eğitim ile büyütüldü.\n",
882
+ "\n",
883
+ "Rus-Çerkes Savaşına Katılımı \n",
884
+ "Birkaç kaynak, Ahecaqo'nun tüm Çerkesya'da saygı duyulan bir kişi olduğunu belirtir. En az 6.000 atlıdan oluşan kalıcı bir ordusu vardı ve çatışmalar sırasında müfrezesi 12.000 atlıya ulaşıyordu. Rus birliklerine karşı kazandığı zaferlerle ünlüydü. Askeri becerisinin yanı sıra yetenekli bir devlet adamıydı.\n",
885
+ "\n",
886
+ "Ölüm \n",
887
+ "1837 yılında Rus tarafına geçti ve bir yıl sonra hastalıktan öldü.\n",
888
+ "\n",
889
+ "Kaynakça \n",
890
+ "\n",
891
+ "Çerkes soylular\n",
892
+ "1777 doğumlular\n",
893
+ "1838 yılında ölenler\n",
894
+ "Kafkas Savaşı'nda kişiler \"\"\"]\n",
895
+ "\n",
896
+ "# Token id'leri ve BERT gömme vektörleri\n",
897
+ "for text in texts:\n",
898
+ " input_ids, embeddings = get_bert_embeddings(text)\n",
899
+ " tokens = decode_tokens(input_ids)\n",
900
+ " avg_embedding = average_embeddings(embeddings)\n",
901
+ "\n",
902
+ " # Ortalama embedding değerlerinden sadece 0'dan büyük olanları alma\n",
903
+ " positive_avg_embedding = avg_embedding[avg_embedding > 0]\n",
904
+ "\n",
905
+ " if len(positive_avg_embedding) > 0:\n",
906
+ " print(f\"Tokens: {tokens}\")\n",
907
+ " print(f\"Positive Average Embedding Shape: {positive_avg_embedding.shape}\")\n",
908
+ " print(f\"Positive Average Embedding: {positive_avg_embedding}\")\n",
909
+ " else:\n",
910
+ " print(\"No positive embedding values found.\")\n",
911
+ "\n",
912
+ "# TF-IDF anahtar kelimelerini çıkar\n",
913
+ "keywords = extract_keywords_tfidf(texts, stop_words_list)\n",
914
+ "print(\"TF-IDF Keywords:\", keywords)\n",
915
+ "\n",
916
+ "# TF-IDF matrisini oluşturma\n",
917
+ "tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
918
+ "corpus = [\" \".join(keywords)] # Anahtar kelimeleri string olarak birleştir\n",
919
+ "tfidf_matrix = tfidf_vectorizer.fit_transform(corpus + texts)\n",
920
+ "\n",
921
+ "# Anahtar kelimeler vektörünü ve diğer metin vektörünü ayırma\n",
922
+ "keywords_vector = tfidf_matrix[0]\n",
923
+ "document_vectors = tfidf_matrix[1:]\n",
924
+ "\n",
925
+ "# Kelimeleri ve TF-IDF değerlerini alma\n",
926
+ "feature_names = tfidf_vectorizer.get_feature_names_out()\n",
927
+ "tfidf_scores = tfidf_matrix.toarray()\n",
928
+ "\n",
929
+ "# Cosine similarity hesaplama\n",
930
+ "similarity_scores = cosine_similarity(keywords_vector, document_vectors)\n",
931
+ "\n",
932
+ "# Her bir kelime için TF-IDF değerlerini yazdırma\n",
933
+ "for doc_idx, doc in enumerate(tfidf_scores[1:], start=1):\n",
934
+ " print(f\"Document {doc_idx}:\")\n",
935
+ " for word_idx, score in enumerate(doc):\n",
936
+ " print(f\"Word: {feature_names[word_idx]}, TF-IDF: {score:.4f}\")\n",
937
+ " print(\"\\n\")\n",
938
+ "\n",
939
+ "# Sonucu yazdırma\n",
940
+ "print(f\"Keywords ile metin arasındaki benzerlik: {similarity_scores[0][0]}\")\n"
941
+ ]
942
+ },
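The TypeError captured above originates in extract_keywords_tfidf: X.sum(axis=0).argsort()[0, ::-1] produces index arrays, so the function returns NumPy arrays and the later " ".join(keywords) fails. A minimal corrected sketch, keeping the same vectorizer and stop-word list (extract_keywords_tfidf_fixed and top_n are illustrative names, not part of the committed notebook):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords_tfidf_fixed(corpus, stop_words_list, top_n=20):
    # Fit TF-IDF over the corpus and rank terms by their summed scores.
    vectorizer = TfidfVectorizer(stop_words=stop_words_list)
    X = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()
    scores = np.asarray(X.sum(axis=0)).ravel()       # 1-D array of per-term scores
    order = scores.argsort()[::-1][:top_n]           # highest-scoring term indices first
    return [str(feature_names[i]) for i in order]    # plain strings, so " ".join(...) works downstream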
943
+ {
944
+ "cell_type": "code",
945
+ "execution_count": 1,
946
+ "metadata": {},
947
+ "outputs": [
948
+ {
949
+ "name": "stderr",
950
+ "output_type": "stream",
951
+ "text": [
952
+ "\n",
953
+ "A module that was compiled using NumPy 1.x cannot be run in\n",
954
+ "NumPy 2.1.0 as it may crash. To support both 1.x and 2.x\n",
955
+ "versions of NumPy, modules must be compiled with NumPy 2.0.\n",
956
+ "Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.\n",
957
+ "\n",
958
+ "If you are a user of the module, the easiest solution will be to\n",
959
+ "downgrade to 'numpy<2' or try to upgrade the affected module.\n",
960
+ "We expect that some modules will need time to support NumPy 2.\n",
961
+ "\n",
962
+ "Traceback (most recent call last): File \"C:\\Users\\info\\AppData\\Local\\Programs\\Python\\Python310\\lib\\runpy.py\", line 196, in _run_module_as_main\n",
963
+ " return _run_code(code, main_globals, None,\n",
964
+ " File \"C:\\Users\\info\\AppData\\Local\\Programs\\Python\\Python310\\lib\\runpy.py\", line 86, in _run_code\n",
965
+ " exec(code, run_globals)\n",
966
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel_launcher.py\", line 18, in <module>\n",
967
+ " app.launch_new_instance()\n",
968
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\traitlets\\config\\application.py\", line 1075, in launch_instance\n",
969
+ " app.start()\n",
970
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\kernelapp.py\", line 739, in start\n",
971
+ " self.io_loop.start()\n",
972
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tornado\\platform\\asyncio.py\", line 205, in start\n",
973
+ " self.asyncio_loop.run_forever()\n",
974
+ " File \"C:\\Users\\info\\AppData\\Local\\Programs\\Python\\Python310\\lib\\asyncio\\base_events.py\", line 603, in run_forever\n",
975
+ " self._run_once()\n",
976
+ " File \"C:\\Users\\info\\AppData\\Local\\Programs\\Python\\Python310\\lib\\asyncio\\base_events.py\", line 1909, in _run_once\n",
977
+ " handle._run()\n",
978
+ " File \"C:\\Users\\info\\AppData\\Local\\Programs\\Python\\Python310\\lib\\asyncio\\events.py\", line 80, in _run\n",
979
+ " self._context.run(self._callback, *self._args)\n",
980
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 545, in dispatch_queue\n",
981
+ " await self.process_one()\n",
982
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 534, in process_one\n",
983
+ " await dispatch(*args)\n",
984
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 437, in dispatch_shell\n",
985
+ " await result\n",
986
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\ipkernel.py\", line 362, in execute_request\n",
987
+ " await super().execute_request(stream, ident, parent)\n",
988
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 778, in execute_request\n",
989
+ " reply_content = await reply_content\n",
990
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\ipkernel.py\", line 449, in do_execute\n",
991
+ " res = shell.run_cell(\n",
992
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\ipykernel\\zmqshell.py\", line 549, in run_cell\n",
993
+ " return super().run_cell(*args, **kwargs)\n",
994
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3075, in run_cell\n",
995
+ " result = self._run_cell(\n",
996
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3130, in _run_cell\n",
997
+ " result = runner(coro)\n",
998
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\async_helpers.py\", line 128, in _pseudo_sync_runner\n",
999
+ " coro.send(None)\n",
1000
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3334, in run_cell_async\n",
1001
+ " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n",
1002
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3517, in run_ast_nodes\n",
1003
+ " if await self.run_code(code, result, async_=asy):\n",
1004
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3577, in run_code\n",
1005
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n",
1006
+ " File \"C:\\Users\\info\\AppData\\Local\\Temp\\ipykernel_17960\\3105833283.py\", line 7, in <module>\n",
1007
+ " from transformers import BertTokenizer, BertModel\n",
1008
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\__init__.py\", line 26, in <module>\n",
1009
+ " from . import dependency_versions_check\n",
1010
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\dependency_versions_check.py\", line 16, in <module>\n",
1011
+ " from .utils.versions import require_version, require_version_core\n",
1012
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\__init__.py\", line 34, in <module>\n",
1013
+ " from .generic import (\n",
1014
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\generic.py\", line 462, in <module>\n",
1015
+ " import torch.utils._pytree as _torch_pytree\n",
1016
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\__init__.py\", line 2120, in <module>\n",
1017
+ " from torch._higher_order_ops import cond\n",
1018
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\_higher_order_ops\\__init__.py\", line 1, in <module>\n",
1019
+ " from .cond import cond\n",
1020
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\_higher_order_ops\\cond.py\", line 5, in <module>\n",
1021
+ " import torch._subclasses.functional_tensor\n",
1022
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\_subclasses\\functional_tensor.py\", line 42, in <module>\n",
1023
+ " class FunctionalTensor(torch.Tensor):\n",
1024
+ " File \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\_subclasses\\functional_tensor.py\", line 258, in FunctionalTensor\n",
1025
+ " cpu = _conversion_method_template(device=torch.device(\"cpu\"))\n",
1026
+ "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\_subclasses\\functional_tensor.py:258: UserWarning: Failed to initialize NumPy: _ARRAY_API not found (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\torch\\csrc\\utils\\tensor_numpy.cpp:84.)\n",
1027
+ " cpu = _conversion_method_template(device=torch.device(\"cpu\"))\n"
1028
+ ]
1029
+ },
1030
+ {
1031
+ "ename": "NameError",
1032
+ "evalue": "name 'texts' is not defined",
1033
+ "output_type": "error",
1034
+ "traceback": [
1035
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
1036
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
1037
+ "Cell \u001b[1;32mIn[1], line 62\u001b[0m\n\u001b[0;32m 59\u001b[0m tfidf_vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer()\n\u001b[0;32m 61\u001b[0m \u001b[38;5;66;03m# TF-IDF anahtar kelimelerini çıkar\u001b[39;00m\n\u001b[1;32m---> 62\u001b[0m keywords \u001b[38;5;241m=\u001b[39m extract_keywords_tfidf(\u001b[43mtexts\u001b[49m, stop_words_list)\n\u001b[0;32m 63\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTF-IDF Keywords:\u001b[39m\u001b[38;5;124m\"\u001b[39m, keywords)\n\u001b[0;32m 65\u001b[0m \u001b[38;5;66;03m# Transform the text and keywords into TF-IDF representations\u001b[39;00m\n",
1038
+ "\u001b[1;31mNameError\u001b[0m: name 'texts' is not defined"
1039
+ ]
1040
+ }
1041
+ ],
1042
+ "source": [
1043
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
1044
+ "import numpy as np\n",
1045
+ "import re\n",
1046
+ "import pandas as pd\n",
1047
+ "from nltk.stem import WordNetLemmatizer\n",
1048
+ "from nltk.corpus import stopwords as nltk_stopwords\n",
1049
+ "from transformers import BertTokenizer, BertModel\n",
1050
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
1051
+ "import torch\n",
1052
+ "import torch.nn.functional as F\n",
1053
+ "\n",
1054
+ "# BERT Tokenizer ve Model'i yükleyin\n",
1055
+ "tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
1056
+ "model = BertModel.from_pretrained('dbmdz/bert-base-turkish-cased')\n",
1057
+ "\n",
1058
+ "#-------------------------- burada turkish_stop_words'ü alıyoruz\n",
1059
+ "def load_stop_words(file_path):\n",
1060
+ " \"\"\"Stop words'leri dosyadan okuyarak bir liste oluşturur.\"\"\"\n",
1061
+ " with open(file_path, 'r', encoding='utf-8') as file:\n",
1062
+ " stop_words = [line.strip() for line in file if line.strip()]\n",
1063
+ " return stop_words\n",
1064
+ "\n",
1065
+ "# Keywords çıkarma fonksiyonu\n",
1066
+ "def extract_keywords_tfidf(corpus, stop_words_list):\n",
1067
+ " \"\"\"TF-IDF ile anahtar kelimeleri çıkarır, stop words listesi ile birlikte kullanır.\"\"\"\n",
1068
+ " vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
1069
+ " X = vectorizer.fit_transform(corpus)\n",
1070
+ " feature_names = vectorizer.get_feature_names_out()\n",
1071
+ " sorted_keywords = [feature_names[i] for i in X.sum(axis=0).argsort()[0, ::-1]]\n",
1072
+ " return sorted_keywords\n",
1073
+ "\n",
1074
+ "# Türkçe stop words dosyasını yükleyin\n",
1075
+ "stop_words_list = load_stop_words('turkish_stop_words.txt')\n",
1076
+ "# Define the text\n",
1077
+ "text = \"\"\"Biyografi\n",
1078
+ "İsim \n",
1079
+ "Pşıqo ismi Ahecaqo soy ismidir. Çerkeslerin \"-qo\" soneki ile biten hem soyadları hem de lakapları vardı. Bu ek Türkçe adlardaki \"-oğlu\" ekiyle eş anlamlıdır. Pşıqo Türkçe \"Beyoğlu\" anlamına gelen bir lakaptır.\n",
1080
+ "\n",
1081
+ "Erken dönem \n",
1082
+ "Çerkesler tarihlerini yazmadıkları ve tüm bilgiler Rus kaynaklarından geldiği için Ahecaqo hakkında pek bir şey kaydedilmedi. 1777'de Çerkesya'nın Bjeduğ bölgesinde doğdu. Askerî eğitim ile büyütüldü.\n",
1083
+ "\n",
1084
+ "Rus-Çerkes Savaşına Katılımı \n",
1085
+ "Birkaç kaynak, Ahecaqo'nun tüm Çerkesya'da saygı duyulan bir kişi olduğunu belirtir. En az 6.000 atlıdan oluşan kalıcı bir ordusu vardı ve çatışmalar sırasında müfrezesi 12.000 atlıya ulaşıyordu. Rus birliklerine karşı kazandığı zaferlerle ünlüydü. Askeri becerisinin yanı sıra yetenekli bir devlet adamıydı.\n",
1086
+ "\n",
1087
+ "Ölüm \n",
1088
+ "1837 yılında Rus tarafına geçti ve bir yıl sonra hastalıktan öldü.\n",
1089
+ "\n",
1090
+ "Kaynakça \n",
1091
+ "\n",
1092
+ "Çerkes soylular\n",
1093
+ "1777 doğumlular\n",
1094
+ "1838 yılında ölenler\n",
1095
+ "Kafkas Savaşı'nda kişiler \"\"\"\n",
1096
+ "\n",
1097
+ "# Define the keywords\n",
1098
+ "#keywords = [\"rus\", \"ahecaqo\", \"türkçe\", \"pşıqo\", \"1777\", \"çerkes\", \"000\", \"çerkesya\", \"ölenler\", \"ünlüydü\"]\n",
1099
+ "\n",
1100
+ "# Create a TfidfVectorizer instance\n",
1101
+ "tfidf_vectorizer = TfidfVectorizer()\n",
1102
+ "\n",
1103
+ "# TF-IDF anahtar kelimelerini çıkar\n",
1104
+ "keywords = extract_keywords_tfidf(texts, stop_words_list)\n",
1105
+ "print(\"TF-IDF Keywords:\", keywords)\n",
1106
+ "\n",
1107
+ "# Transform the text and keywords into TF-IDF representations\n",
1108
+ "text_tfidf = tfidf_vectorizer.fit_transform([text]) #burada text'i de vetörize ediyoruz.\n",
1109
+ "keywords_tfidf = tfidf_vectorizer.transform(keywords)\n",
1110
+ "\n",
1111
+ "# Calculate the cosine similarity between the text and each keyword\n",
1112
+ "similarities = []\n",
1113
+ "for i in range(keywords_tfidf.shape[0]): #keyword_tfidf matrisinin satırları üzerinde dönfü tanımlıyoruz \n",
1114
+ " keyword_tfidf = keywords_tfidf[i, :] # matrisin i. değerini alıyoruz \n",
1115
+ " # `text_tfidf` ile `keyword_tfidf` arasındaki kosinüs benzerliğini hesaplıyoruz\n",
1116
+ " similarity = np.dot(text_tfidf, keyword_tfidf.T).toarray()[0][0]\n",
1117
+ " similarities.append((keywords[i], similarity))\n",
1118
+ "\n",
1119
+ "# Sort the similarities in descending order\n",
1120
+ "keyword_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)\n",
1121
+ "\n",
1122
+ "\n",
1123
+ "\n",
1124
+ "# Print the top 10 keywords with their similarities\n",
1125
+ "print(\"Top 10 Keywords with Similarities:\")\n",
1126
+ "for keyword, similarity in keyword_similarities[:10]:\n",
1127
+ " print(f\"{keyword}: {similarity:.4f}\")"
1128
+ ]
1129
+ },
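The NameError above comes from calling extract_keywords_tfidf(texts, ...) in a cell that only defines text. Assuming keywords is a list of plain strings, a sketch of the keyword-versus-document ranking this cell is after (rank_keywords is an illustrative name):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def rank_keywords(text, keywords, stop_words_list):
    # Fit on the document, map the keywords into the same vector space,
    # then rank them by cosine similarity to the document vector.
    vectorizer = TfidfVectorizer(stop_words=stop_words_list)
    text_vec = vectorizer.fit_transform([text])       # 1 x V document vector
    keyword_vecs = vectorizer.transform(keywords)     # one row per keyword
    sims = cosine_similarity(keyword_vecs, text_vec).ravel()
    return sorted(zip(keywords, sims), key=lambda p: p[1], reverse=True)

# e.g. rank_keywords(text, ["rus", "ahecaqo", "çerkes"], stop_words_list)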
1130
+ {
1131
+ "cell_type": "code",
1132
+ "execution_count": 24,
1133
+ "metadata": {},
1134
+ "outputs": [
1135
+ {
1136
+ "ename": "AttributeError",
1137
+ "evalue": "'list' object has no attribute 'lower'",
1138
+ "output_type": "error",
1139
+ "traceback": [
1140
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
1141
+ "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
1142
+ "Cell \u001b[1;32mIn[24], line 14\u001b[0m\n\u001b[0;32m 12\u001b[0m tfidf_vectorizer \u001b[38;5;241m=\u001b[39m TfidfVectorizer(stop_words\u001b[38;5;241m=\u001b[39mstop_words_list)\n\u001b[0;32m 13\u001b[0m corpus \u001b[38;5;241m=\u001b[39m [text, keywords]\n\u001b[1;32m---> 14\u001b[0m tfidf_matrix \u001b[38;5;241m=\u001b[39m \u001b[43mtfidf_vectorizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcorpus\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 16\u001b[0m \u001b[38;5;66;03m# Anahtar kelimeler vektörünü ve diğer metin vektörünü ayırma\u001b[39;00m\n\u001b[0;32m 17\u001b[0m keywords_vector \u001b[38;5;241m=\u001b[39m tfidf_matrix[\u001b[38;5;241m1\u001b[39m]\n",
1143
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:2091\u001b[0m, in \u001b[0;36mTfidfVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 2084\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_params()\n\u001b[0;32m 2085\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf \u001b[38;5;241m=\u001b[39m TfidfTransformer(\n\u001b[0;32m 2086\u001b[0m norm\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnorm,\n\u001b[0;32m 2087\u001b[0m use_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_idf,\n\u001b[0;32m 2088\u001b[0m smooth_idf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msmooth_idf,\n\u001b[0;32m 2089\u001b[0m sublinear_tf\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msublinear_tf,\n\u001b[0;32m 2090\u001b[0m )\n\u001b[1;32m-> 2091\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2092\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_tfidf\u001b[38;5;241m.\u001b[39mfit(X)\n\u001b[0;32m 2093\u001b[0m \u001b[38;5;66;03m# X is already a transformed view of raw_documents so\u001b[39;00m\n\u001b[0;32m 2094\u001b[0m \u001b[38;5;66;03m# we set copy to False\u001b[39;00m\n",
1144
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fit_method(estimator, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
1145
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1372\u001b[0m, in \u001b[0;36mCountVectorizer.fit_transform\u001b[1;34m(self, raw_documents, y)\u001b[0m\n\u001b[0;32m 1364\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m 1365\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUpper case characters found in\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1366\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m vocabulary while \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlowercase\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1367\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m is True. These entries will not\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1368\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be matched with any documents\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1369\u001b[0m )\n\u001b[0;32m 1370\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m-> 1372\u001b[0m vocabulary, X \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_count_vocab\u001b[49m\u001b[43m(\u001b[49m\u001b[43mraw_documents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfixed_vocabulary_\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1374\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbinary:\n\u001b[0;32m 1375\u001b[0m X\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39mfill(\u001b[38;5;241m1\u001b[39m)\n",
1146
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1259\u001b[0m, in \u001b[0;36mCountVectorizer._count_vocab\u001b[1;34m(self, raw_documents, fixed_vocab)\u001b[0m\n\u001b[0;32m 1257\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m raw_documents:\n\u001b[0;32m 1258\u001b[0m feature_counter \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m-> 1259\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m feature \u001b[38;5;129;01min\u001b[39;00m \u001b[43manalyze\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[0;32m 1260\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1261\u001b[0m feature_idx \u001b[38;5;241m=\u001b[39m vocabulary[feature]\n",
1147
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:108\u001b[0m, in \u001b[0;36m_analyze\u001b[1;34m(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)\u001b[0m\n\u001b[0;32m 106\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 107\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m preprocessor \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m--> 108\u001b[0m doc \u001b[38;5;241m=\u001b[39m \u001b[43mpreprocessor\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdoc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tokenizer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 110\u001b[0m doc \u001b[38;5;241m=\u001b[39m tokenizer(doc)\n",
1148
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:66\u001b[0m, in \u001b[0;36m_preprocess\u001b[1;34m(doc, accent_function, lower)\u001b[0m\n\u001b[0;32m 47\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Chain together an optional series of text preprocessing steps to\u001b[39;00m\n\u001b[0;32m 48\u001b[0m \u001b[38;5;124;03mapply to a document.\u001b[39;00m\n\u001b[0;32m 49\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 63\u001b[0m \u001b[38;5;124;03m preprocessed string\u001b[39;00m\n\u001b[0;32m 64\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 65\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m lower:\n\u001b[1;32m---> 66\u001b[0m doc \u001b[38;5;241m=\u001b[39m \u001b[43mdoc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlower\u001b[49m()\n\u001b[0;32m 67\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m accent_function \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 68\u001b[0m doc \u001b[38;5;241m=\u001b[39m accent_function(doc)\n",
1149
+ "\u001b[1;31mAttributeError\u001b[0m: 'list' object has no attribute 'lower'"
1150
+ ]
1151
+ }
1152
+ ],
1153
+ "source": [
1154
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
1155
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
1156
+ "import numpy as np\n",
1157
+ "\n",
1158
+ "#metin ile keywordslerin benzerlik oranını hesaplama \n",
1159
+ "text,keywords\n",
1160
+ "\n",
1161
+ "# Metinleri birleştirip TF-IDF matrisini oluşturma\n",
1162
+ "# TF-IDF vektörleştirici oluşturma\n",
1163
+ "# Türkçe stop words dosyasını yükleyin\n",
1164
+ "stop_words_list = load_stop_words('turkish_stop_words.txt')\n",
1165
+ "tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
1166
+ "corpus = [text, keywords]\n",
1167
+ "tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)\n",
1168
+ "\n",
1169
+ "# Anahtar kelimeler vektörünü ve diğer metin vektörünü ayırma\n",
1170
+ "keywords_vector = tfidf_matrix[1]\n",
1171
+ "text_vector = tfidf_matrix[0]\n",
1172
+ "\n",
1173
+ "# Anahtar kelimeler ve metin arasındaki cosine similarity hesaplama\n",
1174
+ "similarity_score = cosine_similarity(keywords_vector, text_vector)\n",
1175
+ "\n",
1176
+ "# Sonucu yazdırma\n",
1177
+ "print(f\"Keywords ile metin arasındaki benzerlik: {similarity_score[0][0]}\")\n"
1178
+ ]
1179
+ },
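The AttributeError above occurs because corpus = [text, keywords] puts a Python list into the corpus while TfidfVectorizer expects every entry to be a string. A minimal sketch of the intended text-versus-keywords similarity (keywords_text_similarity is an illustrative name):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def keywords_text_similarity(text, keywords, stop_words_list):
    # Join the keyword list into a single pseudo-document so both corpus entries are strings.
    corpus = [text, " ".join(keywords)]
    tfidf = TfidfVectorizer(stop_words=stop_words_list).fit_transform(corpus)
    return cosine_similarity(tfidf[1], tfidf[0])[0][0]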
1180
+ {
1181
+ "cell_type": "code",
1182
+ "execution_count": 19,
1183
+ "metadata": {},
1184
+ "outputs": [
1185
+ {
1186
+ "ename": "TypeError",
1187
+ "evalue": "'function' object is not subscriptable",
1188
+ "output_type": "error",
1189
+ "traceback": [
1190
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
1191
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
1192
+ "Cell \u001b[1;32mIn[19], line 18\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m embeddings\n\u001b[0;32m 17\u001b[0m \u001b[38;5;66;03m# Compute BERT embeddings for the top 10 keywords\u001b[39;00m\n\u001b[1;32m---> 18\u001b[0m top_keywords \u001b[38;5;241m=\u001b[39m [keyword \u001b[38;5;28;01mfor\u001b[39;00m keyword, _ \u001b[38;5;129;01min\u001b[39;00m \u001b[43mextract_keywords_tfidf\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m]\u001b[49m]\n\u001b[0;32m 19\u001b[0m bert_embeddings \u001b[38;5;241m=\u001b[39m compute_bert_embeddings(top_keywords)\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m# Define a function to compute the similarity between two embeddings\u001b[39;00m\n",
1193
+ "\u001b[1;31mTypeError\u001b[0m: 'function' object is not subscriptable"
1194
+ ]
1195
+ }
1196
+ ],
1197
+ "source": [
1198
+ "#------------------------------ tf-ıdf ve embedding benzerlik \n",
1199
+ "# Define a function to compute BERT embeddings for a list of keywords\n",
1200
+ "\n",
1201
+ "def compute_bert_embeddings(keywords):\n",
1202
+ " embeddings = []\n",
1203
+ " for keyword in keywords:\n",
1204
+ " inputs = tokenizer.encode_plus(\n",
1205
+ " keyword,\n",
1206
+ " add_special_tokens=True,\n",
1207
+ " max_length=512,\n",
1208
+ " return_attention_mask=True,\n",
1209
+ " return_tensors='pt'\n",
1210
+ " )\n",
1211
+ " outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])\n",
1212
+ " embeddings.append(outputs.last_hidden_state[:, 0, :]) # Take the embedding of the [CLS] token\n",
1213
+ " return embeddings\n",
1214
+ "\n",
1215
+ "# Compute BERT embeddings for the top 10 keywords\n",
1216
+ "top_keywords = [keyword for keyword, score in extract_keywords_tfidf[:10]]\n",
1217
+ "bert_embeddings = compute_bert_embeddings(top_keywords)\n",
1218
+ "\n",
1219
+ "# Define a function to compute the similarity between two embeddings\n",
1220
+ "def compute_similarity(embedding1, embedding2):\n",
1221
+ " return F.cosine_similarity(embedding1, embedding2)\n",
1222
+ "\n",
1223
+ "# Compute the similarity between the text and each keyword\n",
1224
+ "similarities = []\n",
1225
+ "for keyword_embedding in enumerate(bert_embeddings):\n",
1226
+ "\n",
1227
+ " keyword= top_keywords[i]\n",
1228
+ " score = extract_keywords_tfidf[i][1]\n",
1229
+ " similarity = compute_similarity(positive_avg_embedding, keyword_embedding)\n",
1230
+ " similarities.append(keyword,similarity.item()*score)\n",
1231
+ "\n",
1232
+ "# Combine the top 10 keywords with their similarities\n",
1233
+ "keyword_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)\n",
1234
+ "# Combine the top 10 keywords with their similarities\n",
1235
+ "#keyword_similarities = list(zip(top_keywords, similarities))"
1236
+ ]
1237
+ },
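The loop above cannot run: extract_keywords_tfidf (a function) is subscripted, the index i is never bound, and similarities.append is given two arguments. A hedged sketch of the TF-IDF-weighted embedding comparison it aims at, assuming doc_embedding is the full mean-pooled vector from average_embeddings (the filtered positive_avg_embedding has a different length than the keyword embeddings) and keyword_embeddings holds the [CLS] vectors from compute_bert_embeddings:

import torch.nn.functional as F

def rank_keywords_by_embedding(doc_embedding, keywords, keyword_embeddings, tfidf_scores):
    # Score each keyword by cosine similarity to the document embedding, weighted by its TF-IDF score.
    results = []
    for keyword, emb, score in zip(keywords, keyword_embeddings, tfidf_scores):
        sim = F.cosine_similarity(doc_embedding.unsqueeze(0), emb).item()
        results.append((keyword, sim * score))
    return sorted(results, key=lambda p: p[1], reverse=True)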
1238
  {
1239
  "cell_type": "code",
1240
  "execution_count": 8,
 
1300
  "test_stop_words_effectiveness(texts, stop_words_list)\n"
1301
  ]
1302
  },
1303
+ {
1304
+ "cell_type": "markdown",
1305
+ "metadata": {},
1306
+ "source": [
1307
+ "K-nn ile Cosine Similarity "
1308
+ ]
1309
+ },
1310
+ {
1311
+ "cell_type": "code",
1312
+ "execution_count": null,
1313
+ "metadata": {},
1314
+ "outputs": [],
1315
+ "source": [
1316
+ "#tf-ıdf değeleri arasınadki en çok metinde tekrarlanan ve anlam ilşikisi en yüksek olan kelimeleri kıyaslama \n",
1317
+ "model.most_similar(positive=[\"rus\",])"
1318
+ ]
1319
+ },
1320
+ {
1321
+ "cell_type": "code",
1322
+ "execution_count": null,
1323
+ "metadata": {},
1324
+ "outputs": [],
1325
+ "source": [
1326
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
1327
+ "\n",
1328
+ "# TF-IDF ile vektörleri oluştur\n",
1329
+ "vectorizer = TfidfVectorizer(stop_words=stop_words_list)\n",
1330
+ "tfidf_matrix = vectorizer.fit_transform(texts)\n",
1331
+ "\n",
1332
+ "# BERT ile elde edilen pozitif embedding'leri TF-IDF vektörlerine dönüştür\n",
1333
+ "# Bu adımda, her kelimenin veya metnin TF-IDF ağırlıklarıyla karşılaştırılması yapılacak\n",
1334
+ "\n",
1335
+ "def get_tfidf_vector_for_query(query, vectorizer):\n",
1336
+ " \"\"\"Sorgu metni için TF-IDF vektörü alır\"\"\"\n",
1337
+ " return vectorizer.transform([query])\n",
1338
+ "\n",
1339
+ "def calculate_similarity(tfidf_vector, embeddings):\n",
1340
+ " \"\"\"TF-IDF vektörü ile embeddings arasındaki cosine similarity hesaplar\"\"\"\n",
1341
+ " return cosine_similarity(tfidf_vector, embeddings)\n",
1342
+ "\n",
1343
+ "# Sorgu metnini tanımlayın ve TF-IDF vektörünü alın\n",
1344
+ "query_text = \"Nasılsın?\"\n",
1345
+ "query_tfidf_vector = get_tfidf_vector_for_query(query_text, vectorizer)\n",
1346
+ "\n",
1347
+ "# Cosine similarity hesaplayın\n",
1348
+ "similarity_scores = calculate_similarity(query_tfidf_vector, tfidf_matrix)\n",
1349
+ "\n",
1350
+ "# Sonuçları yazdırın\n",
1351
+ "print(\"Cosine Similarity Scores:\", similarity_scores)\n"
1352
+ ]
1353
+ },
1354
+ {
1355
+ "cell_type": "code",
1356
+ "execution_count": null,
1357
+ "metadata": {},
1358
+ "outputs": [],
1359
+ "source": [
1360
+ "from sklearn.neighbors import NearestNeighbors\n",
1361
+ "\n",
1362
+ "def fit_knn_model(embeddings,n_neighbors=5):\n",
1363
+ " knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')\n",
1364
+ " knn.fit(embeddings)\n",
1365
+ " return knn\n",
1366
+ "\n",
1367
+ "embeddings= np.array([get_bert_embeddings(text) for text in texts])\n",
1368
+ "#knn\n",
1369
+ "knn_model=fit_knn_model(embeddings)\n",
1370
+ "\n",
1371
+ "\n",
1372
+ "#tf-ıdf değelriyle bert üzerinden elde ettiğimiz verlerin benzerliğini hesaplayacağız \n",
1373
+ "keywords"
1374
+ ]
1375
+ },
1376
  {
1377
  "cell_type": "code",
1378
  "execution_count": 20,
deneme.ipynb ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ },
16
+ {
17
+ "ename": "OSError",
18
+ "evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
19
+ "output_type": "error",
20
+ "traceback": [
21
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
22
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
23
+ "Cell \u001b[1;32mIn[3], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m \n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpymongo\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MongoClient\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n",
24
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
25
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
26
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
27
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
28
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
29
+ "\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
30
+ ]
31
+ }
32
+ ],
33
+ "source": [
34
+ "from datasets import load_dataset\n",
35
+ "import pandas as pd \n",
36
+ "from pymongo import MongoClient\n",
37
+ "from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": []
46
+ }
47
+ ],
48
+ "metadata": {
49
+ "kernelspec": {
50
+ "display_name": ".venv",
51
+ "language": "python",
52
+ "name": "python3"
53
+ },
54
+ "language_info": {
55
+ "codemirror_mode": {
56
+ "name": "ipython",
57
+ "version": 3
58
+ },
59
+ "file_extension": ".py",
60
+ "mimetype": "text/x-python",
61
+ "name": "python",
62
+ "nbconvert_exporter": "python",
63
+ "pygments_lexer": "ipython3",
64
+ "version": "3.10.11"
65
+ }
66
+ },
67
+ "nbformat": 4,
68
+ "nbformat_minor": 2
69
+ }
gereksiz_kelimeler.txt ADDED
The diff for this file is too large to render. See raw diff
 
kelimeler.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,6 +1,127 @@
1
- gradio==4.40.0.*
2
- pymongo==4.8.0.*
3
- pandas==2.2.2.*
4
- datasets==2.20.0.*
5
- torch
6
- transformers==4.43.4.*
1
+ absl-py==2.1.0
2
+ aiofiles==23.2.1
3
+ aiohappyeyeballs==2.3.4
4
+ aiohttp==3.10.1
5
+ aiosignal==1.3.1
6
+ annotated-types==0.7.0
7
+ anyio==4.4.0
8
+ asttokens==2.4.1
9
+ astunparse==1.6.3
10
+ attrs==24.1.0
11
+ certifi==2024.7.4
12
+ charset-normalizer==3.3.2
13
+ click==8.1.7
14
+ colorama==0.4.6
15
+ comm==0.2.2
16
+ contourpy==1.2.1
17
+ cycler==0.12.1
18
+ datasets==2.20.0
19
+ debugpy==1.8.5
20
+ decorator==5.1.1
21
+ dill==0.3.8
22
+ dnspython==2.6.1
23
+ executing==2.0.1
24
+ fastapi==0.112.0
25
+ ffmpy==0.4.0
26
+ filelock==3.15.4
27
+ flatbuffers==24.3.25
28
+ fonttools==4.53.1
29
+ frozenlist==1.4.1
30
+ fsspec==2024.5.0
31
+ gast==0.6.0
32
+ google-pasta==0.2.0
33
+ gradio==4.40.0
34
+ gradio_client==1.2.0
35
+ grpcio==1.65.4
36
+ h11==0.14.0
37
+ h5py==3.11.0
38
+ httpcore==1.0.5
39
+ httpx==0.27.0
40
+ huggingface-hub==0.24.5
41
+ idna==3.7
42
+ importlib_resources==6.4.0
43
+ ipykernel==6.29.5
44
+ ipython==8.26.0
45
+ jedi==0.19.1
46
+ Jinja2==3.1.4
47
+ jupyter_client==8.6.2
48
+ jupyter_core==5.7.2
49
+ keras==3.4.1
50
+ kiwisolver==1.4.5
51
+ libclang==18.1.1
52
+ Markdown==3.6
53
+ markdown-it-py==3.0.0
54
+ MarkupSafe==2.1.5
55
+ matplotlib==3.9.0
56
+ matplotlib-inline==0.1.7
57
+ mdurl==0.1.2
58
+ ml-dtypes==0.4.0
59
+ mpmath==1.3.0
60
+ multidict==6.0.5
61
+ multiprocess==0.70.16
62
+ namex==0.0.8
63
+ nest-asyncio==1.6.0
64
+ networkx==3.3
65
+ numpy==1.26.4
66
+ opt-einsum==3.3.0
67
+ optree==0.12.1
68
+ orjson==3.10.6
69
+ packaging==24.1
70
+ pandas==2.2.2
71
+ parso==0.8.4
72
+ pillow==10.4.0
73
+ platformdirs==4.2.2
74
+ prompt_toolkit==3.0.47
75
+ protobuf==4.25.4
76
+ psutil==6.0.0
77
+ pure_eval==0.2.3
78
+ pyarrow==17.0.0
79
+ pyarrow-hotfix==0.6
80
+ pydantic==2.8.2
81
+ pydantic_core==2.20.1
82
+ pydub==0.25.1
83
+ Pygments==2.18.0
84
+ pymongo==4.8.0
85
+ pyparsing==3.1.2
86
+ python-dateutil==2.9.0.post0
87
+ python-multipart==0.0.9
88
+ pytz==2024.1
89
+ pywin32==306
90
+ PyYAML==6.0.1
91
+ pyzmq==26.1.0
92
+ regex==2024.7.24
93
+ requests==2.32.3
94
+ rich==13.7.1
95
+ ruff==0.5.6
96
+ safetensors==0.4.4
97
+ semantic-version==2.10.0
98
+ sentence-transformers==3.0.1
99
+ shellingham==1.5.4
100
+ six==1.16.0
101
+ sniffio==1.3.1
102
+ stack-data==0.6.3
103
+ starlette==0.37.2
104
+ sympy==1.13.1
105
+ tensorboard==2.17.0
106
+ tensorboard-data-server==0.7.2
107
+ tensorflow==2.17.0
108
+ tensorflow-intel==2.17.0
109
+ tensorflow-io-gcs-filesystem==0.31.0
110
+ termcolor==2.4.0
111
+ tokenizers==0.19.1
112
+ tomlkit==0.12.0
113
+ tqdm==4.66.5
114
+ traitlets==5.14.3
115
+ transformers==4.43.4
116
+ typer==0.12.3
117
+ typing_extensions==4.12.2
118
+ tzdata==2024.1
119
+ urllib3==2.2.2
120
+ uvicorn==0.30.5
121
+ wcwidth==0.2.13
122
+ websockets==12.0
123
+ Werkzeug==3.0.3
124
+ wrapt==1.16.0
125
+ xxhash==3.4.1
126
+ yarl==1.9.4
127
+
turkish_stop_words.txt ADDED
@@ -0,0 +1,428 @@
1
+ ah
2
+ ama
3
+ an
4
+ ancak
5
+ araba
6
+ aralar
7
+ aslında
8
+ az
9
+ başlayan
10
+ bağlı
11
+ bazı
12
+ belirli
13
+ ben
14
+ bence
15
+ birkaç
16
+ birlikte
17
+ bunu
18
+ burada
19
+ biten
20
+ bir
21
+ birkaç
22
+ biz
23
+ bu
24
+ buna
25
+ çünkü
26
+ da
27
+ de
28
+ demek
29
+ den
30
+ derken
31
+ değil
32
+ daha
33
+ dolayı
34
+ edilir
35
+ eğer
36
+ en
37
+ fakat
38
+ genellikle
39
+ gibi
40
+ hem
41
+ her
42
+ herhangi
43
+ hiç
44
+ için
45
+ ile
46
+ ise
47
+ işte
48
+ itibaren
49
+ iyi
50
+ kadar
51
+ karşı
52
+ ki
53
+ kime
54
+ kısaca
55
+ mu
56
+
57
+ nasıl
58
+ ne
59
+ neden
60
+ niye
61
+ nın
62
+ nda
63
+ nun
64
+ o
65
+ olasılıkla
66
+ olabilir
67
+ olarak
68
+ olduğu
69
+ oluşur
70
+ önce
71
+ pek
72
+ peki
73
+ şu
74
+ sadece
75
+ se
76
+ şey
77
+ sırasında
78
+ şimdi
79
+ sonra
80
+ tabi
81
+ tarafına
82
+ tüm
83
+ vardı
84
+ ve
85
+ ya
86
+ ya da
87
+ yanı
88
+ yani
89
+ yıl
90
+ yılında
91
+ yetenekli
92
+ yine
93
+ ama
94
+ amma
95
+ anca
96
+ ancak
97
+ belki
98
+ çünkü
99
+ dahi
100
+ eğer
101
+ emme
102
+ fakat
103
+ gah
104
+ gerek
105
+ hakeza
106
+ halbuki
107
+ hatta
108
+ hele
109
+ hem
110
+ hoş
111
+ ile
112
+ ile
113
+ imdi
114
+ ister
115
+ kah
116
+ keşke
117
+ keza
118
+ kezalik
119
+ kim
120
+ lakin
121
+ madem
122
+ mademki
123
+ mamafih
124
+ meğer
125
+ meğerki
126
+ meğerse
127
+ netekim
128
+ neyse
129
+ nitekim
130
+ oysa
131
+ oysaki
132
+ şayet
133
+ velev
134
+ velhasıl
135
+ velhasılıkelam
136
+ veya
137
+ veyahut
138
+ yahut
139
+ yalnız
140
+ yani
141
+ yok
142
+ yoksa
143
+ zira
144
+ acaba
145
+ acep
146
+ açıkça
147
+ açıkçası
148
+ adamakıllı
149
+ adeta
150
+ bazen
151
+ bazı
152
+ bilcümle
153
+ binaen
154
+ binaenaleyh
155
+ bir
156
+ biraz
157
+ birazdan
158
+ birden
159
+ birden
160
+ birdenbire
161
+ birice
162
+ birlikte
163
+ bitevi
164
+ biteviye
165
+ bittabi
166
+ bizatihi
167
+ bizce
168
+ bizcileyin
169
+ bizden
170
+ bizzat
171
+ boşuna
172
+ böyle
173
+ böylece
174
+ böylecene
175
+ böylelikle
176
+ böylemesine
177
+ böylesine
178
+ buracıkta
179
+ burada
180
+ buradan
181
+ büsbütün
182
+ çabuk
183
+ çabukça
184
+ çeşitli
185
+ çoğu
186
+ çoğun
187
+ çoğunca
188
+ çoğunlukla
189
+ çok
190
+ çokça
191
+ çokluk
192
+ çoklukla
193
+ cuk
194
+ daha
195
+ dahil
196
+ dahilen
197
+ daima
198
+ demin
199
+ demincek
200
+ deminden
201
+ derakap
202
+ derhal
203
+ derken
204
+ diye
205
+ elbet
206
+ elbette
207
+ enikonu
208
+ epey
209
+ epeyce
210
+ epeyi
211
+ esasen
212
+ esnasında
213
+ etraflı
214
+ etraflıca
215
+ evleviyetle
216
+ evvel
217
+ evvela
218
+ evvelce
219
+ evvelden
220
+ evvelemirde
221
+ evveli
222
+ gayet
223
+ gayetle
224
+ gayri
225
+ gayrı
226
+ geçende
227
+ geçenlerde
228
+ gene
229
+ gerçi
230
+ gibi
231
+ gibilerden
232
+ gibisinden
233
+ gine
234
+ halen
235
+ halihazırda
236
+ haliyle
237
+ handiyse
238
+ hani
239
+ hasılı
240
+ hulasaten
241
+ iken
242
+ illa
243
+ illaki
244
+ itibarıyla
245
+ iyice
246
+ iyicene
247
+ kala
248
+ kez
249
+ kısaca
250
+ külliyen
251
+ lütfen
252
+ nasıl
253
+ nasılsa
254
+ nazaran
255
+ neden
256
+ nedeniyle
257
+ nedense
258
+ nerde
259
+ nerden
260
+ nerdeyse
261
+ nerede
262
+ nereden
263
+ neredeyse
264
+ nereye
265
+ neye
266
+ neyi
267
+ nice
268
+ niçin
269
+ nihayet
270
+ nihayetinde
271
+ niye
272
+ oldu
273
+ oldukça
274
+ olur
275
+ onca
276
+ önce
277
+ önceden
278
+ önceleri
279
+ öncelikle
280
+ onculayın
281
+ ondan
282
+ oracık
283
+ oracıkta
284
+ orada
285
+ oradan
286
+ oranca
287
+ oranla
288
+ oraya
289
+ öyle
290
+ öylece
291
+ öylelikle
292
+ öylemesine
293
+ pek
294
+ pekala
295
+ pekçe
296
+ peki
297
+ peyderpey
298
+ sadece
299
+ sahi
300
+ sahiden
301
+ sanki
302
+ sonra
303
+ sonradan
304
+ sonraları
305
+ sonunda
306
+ şöyle
307
+ şuncacık
308
+ şuracıkta
309
+ tabii
310
+ tam
311
+ tamam
312
+ tamamen
313
+ tamamıyla
314
+ tek
315
+ vasıtasıyla
316
+ yakinen
317
+ yakında
318
+ yakından
319
+ yakınlarda
320
+ yalnız
321
+ yalnızca
322
+ yeniden
323
+ yenilerde
324
+ yine
325
+ yok
326
+ yoluyla
327
+ yüzünden
328
+ zaten
329
+ zati
330
+ ait
331
+ bari
332
+ beri
333
+ bile
334
+ değin
335
+ dek
336
+ denli
337
+ doğru
338
+ dolayı
339
+ dolayısıyla
340
+ gelgelelim
341
+ gibi
342
+ gırla
343
+ göre
344
+ hasebiyle
345
+ için
346
+ ila
347
+ ile
348
+ ilen
349
+ indinde
350
+ inen
351
+ kadar
352
+ kaffesi
353
+ karşın
354
+ kelli
355
+ Leh
356
+ maada
357
+ mebni
358
+ naşi
359
+ rağmen
360
+ üzere
361
+ zarfında
362
+ öbür
363
+ bana
364
+ başkası
365
+ ben
366
+ beriki
367
+ birbiri
368
+ birçoğu
369
+ biri
370
+ birileri
371
+ birisi
372
+ birkaçı
373
+ biz
374
+ bizimki
375
+ buna
376
+ bunda
377
+ bundan
378
+ bunlar
379
+ bunu
380
+ bunun
381
+ burası
382
+ çoğu
383
+ çoğu
384
+ çokları
385
+ çoklarınca
386
+ cümlesi
387
+ değil
388
+ diğeri
389
+ filanca
390
+ hangisi
391
+ hepsi
392
+ hiçbiri
393
+
394
+ kaçı
395
+ kaynak
396
+ kendi
397
+ kim
398
+ kimi
399
+ kimisi
400
+ kimse
401
+ kimse
402
+ kimsecik
403
+ kimsecikler
404
+ nere
405
+ neresi
406
+ öbürkü
407
+ öbürü
408
+ ona
409
+ onda
410
+ ondan
411
+ onlar
412
+ onu
413
+ onun
414
+ öteki
415
+ ötekisi
416
+ öz
417
+ sana
418
+ sen
419
+ siz
420
+ şuna
421
+ şunda
422
+ şundan
423
+ şunlar
424
+ şunu
425
+ şunun
426
+ şura
427
+ şuracık
428
+ şurası