yonkasoft committed on
Commit
b9ab29b
1 Parent(s): 02cf58a

Upload 11 files

Files changed (11)
  1. Dockerfile +14 -0
  2. bart.txt +5 -0
  3. load.ipynb +694 -0
  4. load2.ipynb +378 -0
  5. merged_train.parquet +3 -0
  6. model.ipynb +371 -0
  7. mongoDb.py +39 -0
  8. mongoDb_2.py +24 -0
  9. mongodb_egitim.py +1 -0
  10. pyvenv.cfg +5 -0
  11. requirements.txt +6 -2
Dockerfile ADDED
@@ -0,0 +1,14 @@
1
+ # Base image
2
+ FROM python:3.11.4
3
+
4
+ # Set the working directory
5
+ WORKDIR /deneme
6
+
7
+ # Copy the requirements file
8
+ COPY requirements.txt /deneme/requirements.txt
9
+
10
+ # Install the requirements
11
+ RUN pip install --no-cache-dir --upgrade -r /deneme/requirements.txt
12
+
13
+ # Command to start the application
14
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
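Note: the CMD above assumes an importable module named app that exposes an ASGI application called app (hence "app:app"); that module is not part of this upload. A minimal hypothetical sketch of such a module, assuming FastAPI is among the packages in requirements.txt, could look like:

# app.py -- hypothetical minimal ASGI app matching the CMD above (not included in this commit)
from fastapi import FastAPI

app = FastAPI()

@app.get("/")
def health():
    # simple liveness endpoint so `uvicorn app:app --host 0.0.0.0 --port 7860` has something to serve
    return {"status": "ok"}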
bart.txt ADDED
@@ -0,0 +1,5 @@
1
+ MNLI: a bitext classification task to predict whether one sentence entails another. The fine-tuned model concatenates the two sentences with an appended EOS token and passes them to both the BART encoder and decoder. In contrast to BERT, the representation of the EOS token is used to classify the relation between the sentences.
2
+
3
+ ELI5: a long-form abstractive question answering dataset. Models generate answers conditioned on the concatenation of a question and supporting documents.
4
+ ConvAI2: a dialogue response generation task, conditioned on context and a persona.
5
+ CNN/DM: a news summarization dataset. Summaries here are typically closely related to source sentences.
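For illustration, the MNLI setup described above can be exercised with the transformers library; the sketch below uses the public facebook/bart-large-mnli checkpoint as an assumed example (it is not part of this commit) and feeds the premise and hypothesis to the model as a single concatenated pair:

# Minimal MNLI-style entailment check with a BART sequence-classification head.
# Assumes the public facebook/bart-large-mnli checkpoint; illustrative only.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

premise = "A soccer game with multiple males playing."
hypothesis = "Some men are playing a sport."

# The two sentences are encoded together and passed through both encoder and decoder.
inputs = tokenizer(premise, hypothesis, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# For this checkpoint the labels are [contradiction, neutral, entailment].
labels = ["contradiction", "neutral", "entailment"]
print(labels[logits.softmax(dim=-1).argmax(dim=-1).item()])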
load.ipynb ADDED
@@ -0,0 +1,694 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 10,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "#birleştirilcek dosyaların listesi \n",
10
+ "train_files=['C:\\\\gitProjects\\\\oak\\\\data\\\\train-00000-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00001-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00002-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00003-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00004-of-00007.parquet']\n",
11
+ "test_files=['C:\\\\gitProjects\\\\oak\\\\data\\\\train-00005-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00006-of-00007.parquet']\n",
12
+ "\n"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 11,
18
+ "metadata": {},
19
+ "outputs": [
20
+ {
21
+ "ename": "ImportError",
22
+ "evalue": "cannot import name 'Automodel' from 'transformers' (c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\__init__.py)",
23
+ "output_type": "error",
24
+ "traceback": [
25
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
26
+ "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)",
27
+ "Cell \u001b[1;32mIn[11], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Automodel \n",
28
+ "\u001b[1;31mImportError\u001b[0m: cannot import name 'Automodel' from 'transformers' (c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\__init__.py)"
29
+ ]
30
+ }
31
+ ],
32
+ "source": [
33
+ "import datasets\n",
34
+ "import transformers\n",
35
+ "from datasets import Dataset\n",
36
+ "from transformers import Automodel "
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 7,
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stdout",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "Package Version\n",
49
+ "----------------- -----------\n",
50
+ "asttokens 2.4.1\n",
51
+ "colorama 0.4.6\n",
52
+ "comm 0.2.2\n",
53
+ "debugpy 1.8.2\n",
54
+ "decorator 5.1.1\n",
55
+ "executing 2.0.1\n",
56
+ "ipykernel 6.29.5\n",
57
+ "ipython 8.26.0\n",
58
+ "jedi 0.19.1\n",
59
+ "jupyter_client 8.6.2\n",
60
+ "jupyter_core 5.7.2\n",
61
+ "matplotlib-inline 0.1.7\n",
62
+ "nest-asyncio 1.6.0\n",
63
+ "packaging 24.1\n",
64
+ "parso 0.8.4\n",
65
+ "pip 24.2\n",
66
+ "platformdirs 4.2.2\n",
67
+ "prompt_toolkit 3.0.47\n",
68
+ "psutil 6.0.0\n",
69
+ "pure_eval 0.2.3\n",
70
+ "Pygments 2.18.0\n",
71
+ "python-dateutil 2.9.0.post0\n",
72
+ "pywin32 306\n",
73
+ "pyzmq 26.0.3\n",
74
+ "setuptools 65.5.0\n",
75
+ "six 1.16.0\n",
76
+ "stack-data 0.6.3\n",
77
+ "tornado 6.4.1\n",
78
+ "traitlets 5.14.3\n",
79
+ "typing_extensions 4.12.2\n",
80
+ "wcwidth 0.2.13\n",
81
+ "Collecting transformers\n",
82
+ " Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)\n",
83
+ "Collecting filelock (from transformers)\n",
84
+ " Using cached filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)\n",
85
+ "Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)\n",
86
+ " Using cached huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)\n",
87
+ "Collecting numpy>=1.17 (from transformers)\n",
88
+ " Using cached numpy-2.0.1-cp311-cp311-win_amd64.whl.metadata (60 kB)\n",
89
+ "Requirement already satisfied: packaging>=20.0 in c:\\gitprojects\\deneme\\.venv\\lib\\site-packages (from transformers) (24.1)\n",
90
+ "Collecting pyyaml>=5.1 (from transformers)\n",
91
+ " Using cached PyYAML-6.0.1-cp311-cp311-win_amd64.whl.metadata (2.1 kB)\n",
92
+ "Collecting regex!=2019.12.17 (from transformers)\n",
93
+ " Downloading regex-2024.7.24-cp311-cp311-win_amd64.whl.metadata (41 kB)\n",
94
+ "Collecting requests (from transformers)\n",
95
+ " Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)\n",
96
+ "Collecting safetensors>=0.4.1 (from transformers)\n",
97
+ " Downloading safetensors-0.4.3-cp311-none-win_amd64.whl.metadata (3.9 kB)\n",
98
+ "Collecting tokenizers<0.20,>=0.19 (from transformers)\n",
99
+ " Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl.metadata (6.9 kB)\n",
100
+ "Collecting tqdm>=4.27 (from transformers)\n",
101
+ " Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)\n",
102
+ "Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.23.2->transformers)\n",
103
+ " Using cached fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)\n",
104
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\gitprojects\\deneme\\.venv\\lib\\site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
105
+ "Requirement already satisfied: colorama in c:\\gitprojects\\deneme\\.venv\\lib\\site-packages (from tqdm>=4.27->transformers) (0.4.6)\n",
106
+ "Collecting charset-normalizer<4,>=2 (from requests->transformers)\n",
107
+ " Using cached charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl.metadata (34 kB)\n",
108
+ "Collecting idna<4,>=2.5 (from requests->transformers)\n",
109
+ " Using cached idna-3.7-py3-none-any.whl.metadata (9.9 kB)\n",
110
+ "Collecting urllib3<3,>=1.21.1 (from requests->transformers)\n",
111
+ " Using cached urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)\n",
112
+ "Collecting certifi>=2017.4.17 (from requests->transformers)\n",
113
+ " Using cached certifi-2024.7.4-py3-none-any.whl.metadata (2.2 kB)\n",
114
+ "Downloading transformers-4.43.3-py3-none-any.whl (9.4 MB)\n",
115
+ " ---------------------------------------- 0.0/9.4 MB ? eta -:--:--\n",
116
+ " ---------------------------------------- 0.0/9.4 MB ? eta -:--:--\n",
117
+ " - -------------------------------------- 0.3/9.4 MB ? eta -:--:--\n",
118
+ " -- ------------------------------------- 0.5/9.4 MB 932.9 kB/s eta 0:00:10\n",
119
+ " -- ------------------------------------- 0.5/9.4 MB 932.9 kB/s eta 0:00:10\n",
120
+ " --- ------------------------------------ 0.8/9.4 MB 838.9 kB/s eta 0:00:11\n",
121
+ " ---- ----------------------------------- 1.0/9.4 MB 825.2 kB/s eta 0:00:11\n",
122
+ " ---- ----------------------------------- 1.0/9.4 MB 825.2 kB/s eta 0:00:11\n",
123
+ " ----- ---------------------------------- 1.3/9.4 MB 818.6 kB/s eta 0:00:10\n",
124
+ " ------ --------------------------------- 1.6/9.4 MB 822.8 kB/s eta 0:00:10\n",
125
+ " ------ --------------------------------- 1.6/9.4 MB 822.8 kB/s eta 0:00:10\n",
126
+ " ------- -------------------------------- 1.8/9.4 MB 838.9 kB/s eta 0:00:10\n",
127
+ " -------- ------------------------------- 2.1/9.4 MB 851.1 kB/s eta 0:00:09\n",
128
+ " -------- ------------------------------- 2.1/9.4 MB 851.1 kB/s eta 0:00:09\n",
129
+ " ---------- ----------------------------- 2.4/9.4 MB 860.5 kB/s eta 0:00:09\n",
130
+ " ----------- ---------------------------- 2.6/9.4 MB 878.0 kB/s eta 0:00:08\n",
131
+ " ------------ --------------------------- 2.9/9.4 MB 897.4 kB/s eta 0:00:08\n",
132
+ " ------------- -------------------------- 3.1/9.4 MB 913.7 kB/s eta 0:00:07\n",
133
+ " -------------- ------------------------- 3.4/9.4 MB 911.0 kB/s eta 0:00:07\n",
134
+ " -------------- ------------------------- 3.4/9.4 MB 911.0 kB/s eta 0:00:07\n",
135
+ " --------------- ------------------------ 3.7/9.4 MB 908.8 kB/s eta 0:00:07\n",
136
+ " ---------------- ----------------------- 3.9/9.4 MB 910.4 kB/s eta 0:00:07\n",
137
+ " ----------------- ---------------------- 4.2/9.4 MB 918.5 kB/s eta 0:00:06\n",
138
+ " ----------------- ---------------------- 4.2/9.4 MB 918.5 kB/s eta 0:00:06\n",
139
+ " ------------------ --------------------- 4.5/9.4 MB 916.2 kB/s eta 0:00:06\n",
140
+ " -------------------- ------------------- 4.7/9.4 MB 926.1 kB/s eta 0:00:06\n",
141
+ " --------------------- ------------------ 5.0/9.4 MB 935.1 kB/s eta 0:00:05\n",
142
+ " ---------------------- ----------------- 5.2/9.4 MB 940.5 kB/s eta 0:00:05\n",
143
+ " ----------------------- ---------------- 5.5/9.4 MB 950.7 kB/s eta 0:00:05\n",
144
+ " ------------------------ --------------- 5.8/9.4 MB 957.4 kB/s eta 0:00:04\n",
145
+ " ------------------------- -------------- 6.0/9.4 MB 966.3 kB/s eta 0:00:04\n",
146
+ " -------------------------- ------------- 6.3/9.4 MB 974.5 kB/s eta 0:00:04\n",
147
+ " --------------------------- ------------ 6.6/9.4 MB 984.6 kB/s eta 0:00:03\n",
148
+ " ---------------------------- ----------- 6.8/9.4 MB 991.6 kB/s eta 0:00:03\n",
149
+ " ------------------------------ --------- 7.1/9.4 MB 1.0 MB/s eta 0:00:03\n",
150
+ " ------------------------------- -------- 7.3/9.4 MB 1.0 MB/s eta 0:00:03\n",
151
+ " -------------------------------- ------- 7.6/9.4 MB 1.0 MB/s eta 0:00:02\n",
152
+ " --------------------------------- ------ 7.9/9.4 MB 1.0 MB/s eta 0:00:02\n",
153
+ " ---------------------------------- ----- 8.1/9.4 MB 1.0 MB/s eta 0:00:02\n",
154
+ " ----------------------------------- ---- 8.4/9.4 MB 1.0 MB/s eta 0:00:01\n",
155
+ " ------------------------------------ --- 8.7/9.4 MB 1.1 MB/s eta 0:00:01\n",
156
+ " ------------------------------------- -- 8.9/9.4 MB 1.1 MB/s eta 0:00:01\n",
157
+ " ---------------------------------------- 9.4/9.4 MB 1.1 MB/s eta 0:00:00\n",
158
+ "Using cached huggingface_hub-0.24.5-py3-none-any.whl (417 kB)\n",
159
+ "Using cached numpy-2.0.1-cp311-cp311-win_amd64.whl (16.6 MB)\n",
160
+ "Using cached PyYAML-6.0.1-cp311-cp311-win_amd64.whl (144 kB)\n",
161
+ "Downloading regex-2024.7.24-cp311-cp311-win_amd64.whl (269 kB)\n",
162
+ "Downloading safetensors-0.4.3-cp311-none-win_amd64.whl (287 kB)\n",
163
+ "Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl (2.2 MB)\n",
164
+ " ---------------------------------------- 0.0/2.2 MB ? eta -:--:--\n",
165
+ " ---------------------------------------- 0.0/2.2 MB ? eta -:--:--\n",
166
+ " --------- ------------------------------ 0.5/2.2 MB 1.4 MB/s eta 0:00:02\n",
167
+ " -------------- ------------------------- 0.8/2.2 MB 1.3 MB/s eta 0:00:02\n",
168
+ " ------------------ --------------------- 1.0/2.2 MB 1.3 MB/s eta 0:00:01\n",
169
+ " ----------------------- ---------------- 1.3/2.2 MB 1.4 MB/s eta 0:00:01\n",
170
+ " ---------------------------- ----------- 1.6/2.2 MB 1.4 MB/s eta 0:00:01\n",
171
+ " --------------------------------- ------ 1.8/2.2 MB 1.4 MB/s eta 0:00:01\n",
172
+ " ---------------------------------------- 2.2/2.2 MB 1.4 MB/s eta 0:00:00\n",
173
+ "Using cached tqdm-4.66.4-py3-none-any.whl (78 kB)\n",
174
+ "Using cached filelock-3.15.4-py3-none-any.whl (16 kB)\n",
175
+ "Using cached requests-2.32.3-py3-none-any.whl (64 kB)\n",
176
+ "Using cached certifi-2024.7.4-py3-none-any.whl (162 kB)\n",
177
+ "Using cached charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl (99 kB)\n",
178
+ "Using cached fsspec-2024.6.1-py3-none-any.whl (177 kB)\n",
179
+ "Using cached idna-3.7-py3-none-any.whl (66 kB)\n",
180
+ "Using cached urllib3-2.2.2-py3-none-any.whl (121 kB)\n",
181
+ "Installing collected packages: urllib3, tqdm, safetensors, regex, pyyaml, numpy, idna, fsspec, filelock, charset-normalizer, certifi, requests, huggingface-hub, tokenizers, transformers\n",
182
+ "Successfully installed certifi-2024.7.4 charset-normalizer-3.3.2 filelock-3.15.4 fsspec-2024.6.1 huggingface-hub-0.24.5 idna-3.7 numpy-2.0.1 pyyaml-6.0.1 regex-2024.7.24 requests-2.32.3 safetensors-0.4.3 tokenizers-0.19.1 tqdm-4.66.4 transformers-4.43.3 urllib3-2.2.2\n"
183
+ ]
184
+ }
185
+ ],
186
+ "source": [
187
+ "!pip list dataset\n",
188
+ "!pip install transformers"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "#dosyaları yükleyin ve birleştirin\n",
198
+ "train_dfs=[pd.read_parquet(file) for file in train_files]\n",
199
+ "test_dfs=[pd.read_parquet(file) for file in test_files]"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": null,
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": [
208
+ "#parque dosyalarının birleştirilmesi\n",
209
+ "train_df=pd.concat(train_dfs,ignore_index=True)\n",
210
+ "test_df=pd.concat(test_dfs,ignore_index=True)\n",
211
+ "\n",
212
+ "print(train_df.head())\n",
213
+ "print(train_df.head())\n"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "metadata": {},
220
+ "outputs": [],
221
+ "source": [
222
+ "#train ve test dosyaları oluşturma \n",
223
+ "train_df.to_parquet('C:\\\\gitProjects\\\\train_Egitim\\\\merged_train.parquet')\n",
224
+ "test_df.to_parquet('C:\\\\gitProjects\\\\test_Egitim\\\\merged_train.parquet')"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
+ "#test ve train yollarını belirleme ve test, traindeki önemli sütunları alma\n",
234
+ "train_file_path=('C:\\\\gitProjects\\\\train_Egitim\\\\merged_train.parquet')\n",
235
+ "test_file_path=('C:\\\\gitProjects\\\\test_Egitim\\\\merged_train.parquet')\n",
236
+ "\n",
237
+ "train_df=pd.read_parquet(train_file_path,columns=['Prompt_ID','Prompt','Response','Category','Subcategory','Prompt_token_length'])\n",
238
+ "test_df=pd.read_parquet(test_file_path,columns=['Prompt_ID','Prompt','Response','Category','Subcategory','Prompt_token_length'])\n",
239
+ "\n",
240
+ "print(train_df.head())\n",
241
+ "print(test_df.head())"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": null,
247
+ "metadata": {},
248
+ "outputs": [],
249
+ "source": [
250
+ "#verileri bart ile eğitme burada koleksiyon içerisindeki veriler tanımlanmalı \n",
251
+ "# Load model directly\n",
252
+ "from transformers import AutoModel,AutoTokenizer\n",
253
+ "from transformers import (WEIGHTS_NAME, BertConfig,\n",
254
+ " BertForQuestionAnswering, BertTokenizer)\n",
255
+ "from torch.utils.data import DataLoader, SequentialSampler, TensorDataset\n",
256
+ "\n",
257
+ "#from utils import (get_answer, input_to_squad_example,squad_examples_to_features, to_list)\n",
258
+ "import collections\n",
259
+ "# Load model directly\n",
260
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
261
+ "\n",
262
+ "tokenizer = AutoTokenizer.from_pretrained(\"philschmid/bart-large-cnn-samsum\")\n",
263
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\"philschmid/bart-large-cnn-samsum\")\n"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": null,
269
+ "metadata": {},
270
+ "outputs": [],
271
+ "source": [
272
+ "from pymongo import MongoClient\n",
273
+ "import pandas as pd\n",
274
+ "\n",
275
+ "# MongoDB connection settings\n",
276
+ "\n",
277
+ "def get_mongodb(database_name='yeniDatabase', collection_name='train', host='localhost', port=27017):\n",
278
+ " \"\"\"\n",
279
+ " MongoDB connection and collection selection\n",
280
+ " \"\"\"\n",
281
+ " client = MongoClient(f'mongodb://{host}:{port}/')\n",
282
+ " db = client[database_name]\n",
283
+ " collection = db[collection_name]\n",
284
+ " return collection\n",
285
+ "\n",
286
+ "# Function to load dataset into MongoDB\n",
287
+ "def dataset_read():\n",
288
+ " train_file_path = ('C:\\\\gitProjects\\\\train_Egitim\\\\merged_train.parquet')\n",
289
+ " data = pd.read_parquet(train_file_path, columns=['Prompt_ID', 'Prompt', 'Response', 'Category', 'Subcategory', 'Prompt_token_length'])\n",
290
+ " data_dict = data.to_dict(\"records\")\n",
291
+ "\n",
292
+ " # Get the MongoDB collection\n",
293
+ " source_collection = get_mongodb(database_name='yeniDatabase', collection_name='train') # Collection for translation\n",
294
+ "\n",
295
+ " # Insert data into MongoDB\n",
296
+ " source_collection.insert_many(data_dict)\n",
297
+ "\n",
298
+ " print(\"Data successfully loaded into MongoDB.\")\n",
299
+ " return source_collection\n",
300
+ "\n",
301
+ "# Call the function to load the dataset into MongoDB\n",
302
+ "source_collection = dataset_read()"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "markdown",
307
+ "metadata": {},
308
+ "source": [
309
+ "Test ve train verilerini mongodb ye yükleme"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": null,
315
+ "metadata": {},
316
+ "outputs": [],
317
+ "source": [
318
+ "def get_mongodb(database_name='yeniDatabase', collection_name='test', mongo_url='mongodb://localhost:27017/'):\n",
319
+ " \"\"\"\n",
320
+ " MongoDB connection and collection selection\n",
321
+ " \"\"\"\n",
322
+ " client = MongoClient(mongo_url)\n",
323
+ " db = client[database_name]\n",
324
+ " collection = db[collection_name]\n",
325
+ " return collection\n",
326
+ "\n",
327
+ "# Function to load dataset into MongoDB\n",
328
+ "def dataset_read():\n",
329
+ " train_file_path = ('C:\\\\gitProjects\\\\test_Egitim\\\\merged_train.parquet')\n",
330
+ " data = pd.read_parquet(train_file_path, columns=['Prompt_ID', 'Prompt', 'Response', 'Category', 'Subcategory', 'Prompt_token_length'])\n",
331
+ " data_dict = data.to_dict(\"records\")\n",
332
+ "\n",
333
+ " # Get the MongoDB collection\n",
334
+ " source_collection = get_mongodb(database_name='yeniDatabase', collection_name='test') # Collection for translation\n",
335
+ "\n",
336
+ " # Insert data into MongoDB\n",
337
+ " source_collection.insert_many(data_dict)\n",
338
+ "\n",
339
+ " print(\"Data successfully loaded into MongoDB.\")\n",
340
+ " return source_collection\n",
341
+ "\n",
342
+ "# Call the function to load the dataset into MongoDB\n",
343
+ "source_collection = dataset_read()"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "markdown",
348
+ "metadata": {},
349
+ "source": [
350
+ "Model eğitimi \n"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "# uygulama için kullanılcak olan özelliklerin tanımlanması\n",
360
+ "from transformers import BertTokenizer,BertForQuestionAnswering,BertConfig\n",
361
+ "class QA:\n",
362
+ " def __init__(self,model_path: str):\n",
363
+ " self.max_seq_length = 384 #max seq\n",
364
+ " self.doc_stride = 128 #stride \n",
365
+ " self.do_lower_case = False\n",
366
+ " self.max_query_length = 30\n",
367
+ " self.n_best_size = 3\n",
368
+ " self.max_answer_length = 30\n",
369
+ " self.version_2_with_negative = False\n",
370
+ " #modelin yüklenmesi\n",
371
+ " self.model, self.tokenizer = self.load_model(model_path)\n",
372
+ " #hangi işlmecinin kullanıldığının belirlenmesi\n",
373
+ " if torch.cuda.is_available():\n",
374
+ " self.device = 'cuda'\n",
375
+ " else:\n",
376
+ " self.device = 'cpu'\n",
377
+ " self.model.to(self.device)\n",
378
+ " self.model.eval()\n",
379
+ " \n",
380
+ " # This function is used to load the model\n",
381
+ " def load_model(self,model_path: str,do_lower_case=False):\n",
382
+ " config = BertConfig.from_pretrained(model_path + \"C:\\\\gitProjects\\\\train_Egitim\")\n",
383
+ " tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=do_lower_case)\n",
384
+ " model = BertForQuestionAnswering.from_pretrained(model_path, from_tf=False, config=config)\n",
385
+ " return model, tokenizer\n"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": null,
391
+ "metadata": {},
392
+ "outputs": [],
393
+ "source": [
394
+ "from pymongo import MongoClient\n",
395
+ "\n",
396
+ "def get_mongodb():\n",
397
+ " # MongoDB bağlantı bilgilerini döndürecek şekilde tanımlanmalıdır.\n",
398
+ " return 'mongodb://localhost:27017/', 'yeniDatabase', 'test'\n",
399
+ "\n",
400
+ "def get_average_prompt_token_length():\n",
401
+ " # MongoDB bağlantı bilgilerini alma\n",
402
+ " mongo_url, db_name, collection_name = get_mongodb()\n",
403
+ "\n",
404
+ " # MongoDB'ye bağlanma\n",
405
+ " client = MongoClient(mongo_url)\n",
406
+ " db = client[db_name]\n",
407
+ " collection = db[collection_name]\n",
408
+ "\n",
409
+ " # Tüm dökümanları çekme ve 'prompt_token_length' alanını alma\n",
410
+ " docs = collection.find({}, {'Prompt_token_length': 1})\n",
411
+ "\n",
412
+ " # 'prompt_token_length' değerlerini toplama ve sayma\n",
413
+ " total_length = 0\n",
414
+ " count = 0\n",
415
+ "\n",
416
+ " for doc in docs:\n",
417
+ " if 'Prompt_token_length' in doc:\n",
418
+ " total_length += doc['Prompt_token_length']\n",
419
+ " count += 1\n",
420
+ " \n",
421
+ " # Ortalama hesaplama\n",
422
+ " if count > 0:\n",
423
+ " average_length = total_length / count\n",
424
+ " else:\n",
425
+ " average_length = 0 # Eğer 'prompt_token_length' alanı olan döküman yoksa\n",
426
+ "\n",
427
+ " return int(average_length)\n",
428
+ "\n",
429
+ "# Ortalama prompt token uzunluğunu al ve yazdır\n",
430
+ "average_length = get_average_prompt_token_length()\n",
431
+ "print(f\"Ortalama prompt token uzunluğu: {average_length}\")\n"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "execution_count": null,
437
+ "metadata": {},
438
+ "outputs": [],
439
+ "source": [
440
+ "from pymongo import MongoClient\n",
441
+ "from transformers import BertTokenizer\n",
442
+ "\n",
443
+ "#getmongodb oluştumak yerine içeriği değiştirilmeli \n",
444
+ "def get_mongodb():\n",
445
+ " # MongoDB bağlantı bilgilerini döndürecek şekilde tanımlanmalıdır.\n",
446
+ " return 'mongodb://localhost:27017/', 'yeniDatabase', 'train'\n",
447
+ "\n",
448
+ "def get_input_texts():\n",
449
+ " # MongoDB bağlantı bilgilerini alma\n",
450
+ " mongo_url, db_name, collection_name = get_mongodb()\n",
451
+ "\n",
452
+ " # MongoDB'ye bağlanma\n",
453
+ " client = MongoClient(mongo_url)\n",
454
+ " db = client[db_name]\n",
455
+ " collection = db[collection_name]\n",
456
+ " \n",
457
+ " #input texleri mongodb üzerinde 'Prompt' lara denk gelir.\n",
458
+ "\n",
459
+ " # Sorguyu tanımlama\n",
460
+ " query = {\"Prompt\": {\"$exists\": True}}\n",
461
+ "\n",
462
+ " # Sorguyu çalıştırma ve dökümanları çekme\n",
463
+ " cursor = collection.find(query, {\"Prompt\": 1, \"_id\": 0}) # 'input_text' alanını almak için \"_id\": 0 ekleyin\n",
464
+ "\n",
465
+ " # Cursor'ı döküman listesine dönüştürme\n",
466
+ " input_texts_from_db = list(cursor)\n",
467
+ "\n",
468
+ " # Input text'leri döndürme\n",
469
+ " return input_texts_from_db\n",
470
+ "\n",
471
+ "input_texts_from_db= get_input_texts()\n",
472
+ "# Input text'leri al ve yazdır\n",
473
+ "\n",
474
+ "#tokenizer ı yükle\n",
475
+ "tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')\n",
476
+ " \n",
477
+ "#encode etmek için gerekli olan bilgiler \n",
478
+ "input_texts=[doc[\"Prompt\"] for doc in input_texts_from_db ]\n",
479
+ "\n",
480
+ "#encoding işleminde inputlar \n",
481
+ "\n",
482
+ "# Tokenize the input texts\n",
483
+ "encoded_inputs = tokenizer.batch_encode_plus(\n",
484
+ " input_texts,\n",
485
+ " padding=True,\n",
486
+ " truncation=True,\n",
487
+ " max_length=100,\n",
488
+ " return_attention_mask=True,\n",
489
+ " return_tensors='pt'\n",
490
+ ")"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "code",
495
+ "execution_count": null,
496
+ "metadata": {},
497
+ "outputs": [],
498
+ "source": [
499
+ "print(f\"encoded_inputs:{encoded_inputs}\")"
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "code",
504
+ "execution_count": null,
505
+ "metadata": {},
506
+ "outputs": [],
507
+ "source": [
508
+ "\n",
509
+ "#maskeleme yönetmiyle eğitim\n",
510
+ "# Define the number of epochs and learning rate\n",
511
+ "num_epochs = 3\n",
512
+ "learning_rate = 1e-4\n",
513
+ "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n",
514
+ "\n",
515
+ "#Iterate over the epochs\n",
516
+ "for epoch in range(num_epochs):\n",
517
+ " total_loss = 0\n",
518
+ " for input_ids, attention_mask, labels in encoded_inputs:\n",
519
+ " #reset gradients\n",
520
+ " optimizer.zero_grad()\n",
521
+ " #forward pass \n",
522
+ " outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)\n",
523
+ " loss = outputs.loss\n",
524
+ " #backward pass \n",
525
+ " loss.backward()\n",
526
+ " #update optimizer \n",
527
+ " optimizer.step()\n",
528
+ " #accumulate total loss\n",
529
+ " total_loss += loss.item()\n",
530
+ " #calculate average loss\n",
531
+ " average_loss = total_loss / len(encoded_inputs)\n",
532
+ " #print the loss for current epoch\n",
533
+ " print(f\"Epoch {epoch+1} - Loss: {average_loss:.4f}\")\n",
534
+ "\n",
535
+ " #tüm bu verileri tutan bir \"batch_of_attention_masks\" verisini tanımlamam gerek"
536
+ ]
537
+ },
538
+ {
539
+ "cell_type": "code",
540
+ "execution_count": null,
541
+ "metadata": {},
542
+ "outputs": [],
543
+ "source": [
544
+ "from torch.utils.data import DataLoader,TensorDataset\n",
545
+ "import torch\n",
546
+ "from transformers import BertTokenizer\n",
547
+ "\n",
548
+ "#hdef değerlerle karşılaştırma yapabilmek için ve doğruluğu ölçmek için\n",
549
+ "\n",
550
+ "# Assuming you have tokenized input texts and labels\n",
551
+ "#attetion mask bert dilinde modelin sadece gerçek tokenler üzerinde çalışmasını sağlar.\n",
552
+ "input_ids = encoded_inputs['input_ids'] # Replace with your tokenized input texts\n",
553
+ "attention_masks = encoded_inputs['attention_mask']\n",
554
+ "\n",
555
+ "\n",
556
+ "labels = torch.tensor([1]*len(input_ids))\n",
557
+ "\n",
558
+ "# Create a TensorDataset\n",
559
+ "dataset = TensorDataset(input_ids, attention_masks, labels)\n",
560
+ "\n",
561
+ "batch_size=10000\n",
562
+ "# Create a data loader\n",
563
+ "data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)\n",
564
+ "\n",
565
+ "for batch in data_loader:\n",
566
+ " input_ids,attention_masks,labels\n",
567
+ " print(f\"ınput ıds :{input_texts}\")\n",
568
+ " print(f\"attetion masks: {attention_masks}\")\n",
569
+ " print(f\"labels:{labels}\")\n",
570
+ " break"
571
+ ]
572
+ },
573
+ {
574
+ "cell_type": "code",
575
+ "execution_count": null,
576
+ "metadata": {},
577
+ "outputs": [],
578
+ "source": [
579
+ " # This function performs the prediction and return the reponse to the flask app\n",
580
+ " # This function performs the prediction and return the reponse to the flask app\n",
581
+ "RawResult = collection.namedtuple(\"RawResult\",[\"unique_id\", \"start_logits\", \"end_logits\"])\n",
582
+ "\n",
583
+ "def predict(self,passage :str,question :str): \n",
584
+ " example = input_to_squad_example(passage,question) \n",
585
+ " features = squad_examples_to_features(example,self.tokenizer,self.max_seq_length,self.doc_stride,self.max_query_length) \n",
586
+ " all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n",
587
+ " all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n",
588
+ " all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)\n",
589
+ " all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
590
+ " dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,\n",
591
+ " all_example_index)\n",
592
+ " eval_sampler = SequentialSampler(dataset)\n",
593
+ " eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=1)\n",
594
+ " \n",
595
+ " all_results = []\n",
596
+ " for batch in eval_dataloader:\n",
597
+ " batch = tuple(t.to(self.device) for t in batch)\n",
598
+ " with torch.no_grad():\n",
599
+ " inputs = {'input_ids': batch[0],\n",
600
+ " 'attention_mask': batch[1],\n",
601
+ " 'token_type_ids': batch[2] \n",
602
+ " } \n",
603
+ " example_indices = batch[3] \n",
604
+ " outputs = self.model(**inputs)\n",
605
+ " \n",
606
+ " for i, example_index in enumerate(example_indices):\n",
607
+ " eval_feature = features[example_index.item()]\n",
608
+ " unique_id = int(eval_feature.unique_id)\n",
609
+ " result = RawResult(unique_id = unique_id,\n",
610
+ " start_logits = to_list(outputs[0][i]),\n",
611
+ " end_logits = to_list(outputs[1][i]))\n",
612
+ " all_results.append(result)\n",
613
+ " \n",
614
+ " answer = get_answer(example,features,all_results,self.n_best_size,self.max_answer_length,self.do_lower_case)\n",
615
+ " \n",
616
+ " return answer"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": null,
622
+ "metadata": {},
623
+ "outputs": [],
624
+ "source": [
625
+ "tokenizer.batch_encode_plus()\n",
626
+ "torch.utils.data.DataLoader\n",
627
+ "input_ids = torch.tensor(batch_of_tokenized_input_texts)\n",
628
+ "attention_mask = torch.tensor(batch_of_attention_masks)\n",
629
+ "labels = torch.tensor(batch_of_labels)"
630
+ ]
631
+ },
632
+ {
633
+ "cell_type": "code",
634
+ "execution_count": null,
635
+ "metadata": {},
636
+ "outputs": [],
637
+ "source": [
638
+ "model.save_pretrained(output_model_path)\n",
639
+ "tokenizer.save_pretrained(output_model_path)"
640
+ ]
641
+ },
642
+ {
643
+ "cell_type": "code",
644
+ "execution_count": null,
645
+ "metadata": {},
646
+ "outputs": [],
647
+ "source": [
648
+ "from app import train_model_route\n",
649
+ "\n",
650
+ "#ön yüzle ilişkilendirme\n",
651
+ "\n",
652
+ "train_model_route\n",
653
+ "\n",
654
+ "#title category ile ilişkilendirlecek\n",
655
+ "\n",
656
+ "\n",
657
+ "#subheadingler subcategroy ile ilişkilendirieck\n",
658
+ "\n",
659
+ "#prompt token uzunlukları kontrol edilerek bütün tokenlerin aynı uzunlukta olması sağlanmalıdır.\n",
660
+ "\n"
661
+ ]
662
+ },
663
+ {
664
+ "cell_type": "code",
665
+ "execution_count": null,
666
+ "metadata": {},
667
+ "outputs": [],
668
+ "source": [
669
+ "\n"
670
+ ]
671
+ }
672
+ ],
673
+ "metadata": {
674
+ "kernelspec": {
675
+ "display_name": "myenv",
676
+ "language": "python",
677
+ "name": "python3"
678
+ },
679
+ "language_info": {
680
+ "codemirror_mode": {
681
+ "name": "ipython",
682
+ "version": 3
683
+ },
684
+ "file_extension": ".py",
685
+ "mimetype": "text/x-python",
686
+ "name": "python",
687
+ "nbconvert_exporter": "python",
688
+ "pygments_lexer": "ipython3",
689
+ "version": "3.11.9"
690
+ }
691
+ },
692
+ "nbformat": 4,
693
+ "nbformat_minor": 2
694
+ }
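The training-related cells above are still work in progress: the epoch loop iterates directly over the tokenizer output, and several names (model, labels, batch_of_attention_masks, output_model_path) are undefined when the cells run. A hedged sketch of the pattern those cells appear to be aiming for, with placeholder texts, labels, and a bert-base-uncased classification head chosen purely for illustration, might look like:

# Illustrative sketch only: batch the encoded prompts and run a simple training loop.
# The texts, labels and model choice here are placeholders, not the committed code.
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

texts = ["example prompt one", "example prompt two"]  # stand-in for the 'Prompt' fields from MongoDB
encoded = tokenizer.batch_encode_plus(
    texts, padding=True, truncation=True, max_length=100,
    return_attention_mask=True, return_tensors="pt",
)
labels = torch.ones(len(texts), dtype=torch.long)  # placeholder labels

dataset = TensorDataset(encoded["input_ids"], encoded["attention_mask"], labels)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
model.train()
for epoch in range(3):
    total_loss = 0.0
    for input_ids, attention_mask, batch_labels in loader:
        optimizer.zero_grad()
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        out.loss.backward()
        optimizer.step()
        total_loss += out.loss.item()
    print(f"Epoch {epoch + 1} - loss: {total_loss / len(loader):.4f}")

model.save_pretrained("output_model")      # corresponds to the save_pretrained cell above
tokenizer.save_pretrained("output_model")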
load2.ipynb ADDED
@@ -0,0 +1,378 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "Kütüphanelerin Yüklenmesi"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "ename": "ModuleNotFoundError",
17
+ "evalue": "No module named 'datasets'",
18
+ "output_type": "error",
19
+ "traceback": [
20
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
21
+ "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
22
+ "Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m \n",
23
+ "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'datasets'"
24
+ ]
25
+ }
26
+ ],
27
+ "source": [
28
+ "import datasets\n",
29
+ "from datasets import load_dataset\n",
30
+ "import pandas as pd \n"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 7,
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "ename": "OSError",
40
+ "evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
41
+ "output_type": "error",
42
+ "traceback": [
43
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
44
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
45
+ "Cell \u001b[1;32mIn[7], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#verileri bart ile eğitme burada koleksiyon içerisindeki veriler tanımlanmalı \u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# Load model directly\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AutoModel,AutoTokenizer\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (WEIGHTS_NAME, BertConfig,\n\u001b[0;32m 5\u001b[0m BertForQuestionAnswering, BertTokenizer)\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataLoader, SequentialSampler, TensorDataset\n",
46
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
47
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
48
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
49
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
50
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
51
+ "\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
52
+ ]
53
+ }
54
+ ],
55
+ "source": [
56
+ "#verileri bart ile eğitme burada koleksiyon içerisindeki veriler tanımlanmalı \n",
57
+ "# Load model directly\n",
58
+ "from transformers import AutoModel,AutoTokenizer\n",
59
+ "from transformers import (WEIGHTS_NAME, BertConfig,\n",
60
+ " BertForQuestionAnswering, BertTokenizer)\n",
61
+ "from torch.utils.data import DataLoader, SequentialSampler, TensorDataset\n",
62
+ "\n",
63
+ "#from utils import (get_answer, input_to_squad_example,squad_examples_to_features, to_list)\n",
64
+ "import collections\n",
65
+ "# Load model directly\n",
66
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "markdown",
71
+ "metadata": {},
72
+ "source": [
73
+ "Train ve Test Verilerine İlişkin Databaselerin İçerisindeki Bilgilerin Alınması "
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 8,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "#birleştirilcek dosyaların listesi \n",
83
+ "train_files=['C:\\\\gitProjects\\\\oak\\\\data\\\\train-00000-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00001-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00002-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00003-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00004-of-00007.parquet']\n",
84
+ "test_files=['C:\\\\gitProjects\\\\oak\\\\data\\\\train-00005-of-00007.parquet','C:\\\\gitProjects\\\\oak\\\\data\\\\train-00006-of-00007.parquet']"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "#dosyaları yükleyin ve birleştirin\n",
94
+ "train_dfs=[pd.read_parquet(file) for file in train_files]\n",
95
+ "test_dfs=[pd.read_parquet(file) for file in test_files]"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "#parque dosyalarının birleştirilmesi\n",
105
+ "train_df=pd.concat(train_dfs,ignore_index=True)\n",
106
+ "test_df=pd.concat(test_dfs,ignore_index=True)\n",
107
+ "\n",
108
+ "print(train_df.head())\n",
109
+ "print(train_df.head())"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 9,
115
+ "metadata": {},
116
+ "outputs": [
117
+ {
118
+ "ename": "NameError",
119
+ "evalue": "name 'train_df' is not defined",
120
+ "output_type": "error",
121
+ "traceback": [
122
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
123
+ "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
124
+ "Cell \u001b[1;32mIn[9], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#train ve test dosyaları oluşturma \u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43mtrain_df\u001b[49m\u001b[38;5;241m.\u001b[39mto_parquet(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mC:\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mgitProjects\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mdeneme\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124megitim\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mtrain_Egitim\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mmerged_train.parquet\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 3\u001b[0m test_df\u001b[38;5;241m.\u001b[39mto_parquet(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mC:\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mgitProjects\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mdeneme\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mtest_Egitim\u001b[39m\u001b[38;5;130;01m\\\\\u001b[39;00m\u001b[38;5;124mmerged_train.parquet\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
125
+ "\u001b[1;31mNameError\u001b[0m: name 'train_df' is not defined"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
130
+ "#train ve test dosyaları oluşturma \n",
131
+ "train_df.to_parquet('C:\\\\gitProjects\\\\deneme\\\\egitim\\\\train_Egitim\\\\merged_train.parquet')\n",
132
+ "test_df.to_parquet('C:\\\\gitProjects\\\\deneme\\\\test_Egitim\\\\merged_train.parquet')"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 10,
138
+ "metadata": {},
139
+ "outputs": [
140
+ {
141
+ "name": "stdout",
142
+ "output_type": "stream",
143
+ "text": [
144
+ " Prompt_ID \\\n",
145
+ "0 bb26c95639b18fd88857bf0964cd1fb5 \n",
146
+ "1 56743c1870327184e058292a34ce12a8 \n",
147
+ "2 88aa2f72d37cb8671ff68a6f481e382b \n",
148
+ "3 703c086f7ffd9d8cc0497e82732860c7 \n",
149
+ "4 a310cb6ed3f48e721473ec0525239e4e \n",
150
+ "\n",
151
+ " Prompt \\\n",
152
+ "0 What were the crucial factors that contributed... \n",
153
+ "1 Create a comprehensive guide to understanding ... \n",
154
+ "2 Explore the historical significance and impact... \n",
155
+ "3 How can advanced data analytics be leveraged t... \n",
156
+ "4 Design a comprehensive diversity training prog... \n",
157
+ "\n",
158
+ " Response \\\n",
159
+ "0 **Crucial Factors Contributing to the Success ... \n",
160
+ "1 ## Comprehensive Guide to Weather Front Types:... \n",
161
+ "2 ## The Fall of the Berlin Wall: Historical Sig... \n",
162
+ "3 **1. Real-Time Sentiment Analysis:**\\n\\n* Anal... \n",
163
+ "4 **Phase 1: Awareness and Self-Reflection**\\n\\n... \n",
164
+ "\n",
165
+ " Category Subcategory \\\n",
166
+ "0 Voskhod program Voskhod 1 mission \n",
167
+ "1 Science mnemonics Weather front types \n",
168
+ "2 Political history The Fall of the Berlin Wall \n",
169
+ "3 Test matches Data analytics \n",
170
+ "4 Majority–minority relations Diversity training \n",
171
+ "\n",
172
+ " Prompt_token_length \n",
173
+ "0 34 \n",
174
+ "1 48 \n",
175
+ "2 67 \n",
176
+ "3 78 \n",
177
+ "4 55 \n",
178
+ " Prompt_ID \\\n",
179
+ "0 e75b977d9abe55f0d4b33d7ee6a77e43 \n",
180
+ "1 da7b42506d0c24c5f1d2371e0f53b8fe \n",
181
+ "2 dc1e302eb77f44f32623f958bdf5b1f5 \n",
182
+ "3 3e276bb9e578d719809b9654d710d6f5 \n",
183
+ "4 3efc98322cc67bcf32abcf25576d6ba1 \n",
184
+ "\n",
185
+ " Prompt \\\n",
186
+ "0 In the grand arena of intellectual discourse, ... \n",
187
+ "1 Amidst the tapestry of human knowledge, we inv... \n",
188
+ "2 In a world teeming with ideas and viewpoints, ... \n",
189
+ "3 Amidst the tapestry of human knowledge, we inv... \n",
190
+ "4 In the grand odyssey of intellectual discourse... \n",
191
+ "\n",
192
+ " Response Category Subcategory \\\n",
193
+ "0 In the spirit of the renowned English physicia... None None \n",
194
+ "1 Title: The Interplay of Politics and Psycholog... None None \n",
195
+ "2 Energy conservation has become a critical topi... None None \n",
196
+ "3 Title: Workplace Bullying: A Silent Epidemic\\n... None None \n",
197
+ "4 Title: The Grand Odyssey of Grito: A Historica... None None \n",
198
+ "\n",
199
+ " Prompt_token_length \n",
200
+ "0 134 \n",
201
+ "1 121 \n",
202
+ "2 191 \n",
203
+ "3 128 \n",
204
+ "4 190 \n"
205
+ ]
206
+ }
207
+ ],
208
+ "source": [
209
+ "#test ve train yollarını belirleme ve test, traindeki önemli sütunları alma\n",
210
+ "train_file_path=('C:\\\\gitProjects\\\\deneme\\\\egitim\\\\train_Egitim\\\\merged_train.parquet')\n",
211
+ "test_file_path=('C:\\\\gitProjects\\\\deneme\\\\egitim\\\\test_Egitim\\\\merged_train.parquet')\n",
212
+ "\n",
213
+ "train_df=pd.read_parquet(train_file_path,columns=['Prompt_ID','Prompt','Response','Category','Subcategory','Prompt_token_length'])\n",
214
+ "test_df=pd.read_parquet(test_file_path,columns=['Prompt_ID','Prompt','Response','Category','Subcategory','Prompt_token_length'])\n",
215
+ "\n",
216
+ "print(train_df.head())\n",
217
+ "print(test_df.head())"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "markdown",
222
+ "metadata": {},
223
+ "source": [
224
+ "Modelin Tokenizer ve İsminin Girilmesi"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 13,
230
+ "metadata": {},
231
+ "outputs": [
232
+ {
233
+ "ename": "OSError",
234
+ "evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
235
+ "output_type": "error",
236
+ "traceback": [
237
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
238
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
239
+ "Cell \u001b[1;32mIn[13], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AutoModel,AutoTokenizer,AutoModelForSeq2SeqLM\n\u001b[0;32m 2\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mphilschmid/bart-large-cnn-samsum\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 3\u001b[0m model \u001b[38;5;241m=\u001b[39m AutoModelForSeq2SeqLM\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mphilschmid/bart-large-cnn-samsum\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
240
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
241
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
242
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
243
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
244
+ "File \u001b[1;32mc:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
245
+ "\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\deneme\\.venv\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
246
+ ]
247
+ }
248
+ ],
249
+ "source": [
250
+ "from transformers import AutoModel,AutoTokenizer,AutoModelForSeq2SeqLM\n",
251
+ "tokenizer = AutoTokenizer.from_pretrained(\"philschmid/bart-large-cnn-samsum\")\n",
252
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\"philschmid/bart-large-cnn-samsum\")\n"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "markdown",
257
+ "metadata": {},
258
+ "source": [
259
+ "MongoDb üzerinden önemli sütunların çekilmesi"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 17,
265
+ "metadata": {},
266
+ "outputs": [
267
+ {
268
+ "name": "stdout",
269
+ "output_type": "stream",
270
+ "text": [
271
+ "Data successfully loaded into MongoDB.\n"
272
+ ]
273
+ }
274
+ ],
275
+ "source": [
276
+ "from pymongo import MongoClient\n",
277
+ "import pandas as pd\n",
278
+ "\n",
279
+ "# MongoDB connection settings\n",
280
+ "\n",
281
+ "def get_mongodb(database_name='yeniDatabase', collection_name='train', host='localhost', port=27017):\n",
282
+ " \"\"\"\n",
283
+ " MongoDB connection and collection selection\n",
284
+ " \"\"\"\n",
285
+ " client = MongoClient(f'mongodb://{host}:{port}/')\n",
286
+ " db = client[database_name]\n",
287
+ " collection = db[collection_name]\n",
288
+ " return collection\n",
289
+ "\n",
290
+ "# Function to load dataset into MongoDB\n",
291
+ "def dataset_read():\n",
292
+ " train_file_path = ('C:\\\\gitProjects\\\\deneme\\\\egitim\\\\train_Egitim\\\\merged_train.parquet')\n",
293
+ " data = pd.read_parquet(train_file_path, columns=['Prompt_ID', 'Prompt', 'Response', 'Category', 'Subcategory', 'Prompt_token_length'])\n",
294
+ " data_dict = data.to_dict(\"records\")\n",
295
+ "\n",
296
+ " # Get the MongoDB collection\n",
297
+ " source_collection = get_mongodb(database_name='yeniDatabase', collection_name='train') # Collection for translation\n",
298
+ "\n",
299
+ " # Insert data into MongoDB\n",
300
+ " source_collection.insert_many(data_dict)\n",
301
+ "\n",
302
+ " print(\"Data successfully loaded into MongoDB.\")\n",
303
+ " return source_collection\n",
304
+ "\n",
305
+ "# Call the function to load the dataset into MongoDB\n",
306
+ "source_collection = dataset_read()"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": 18,
312
+ "metadata": {},
313
+ "outputs": [
314
+ {
315
+ "name": "stdout",
316
+ "output_type": "stream",
317
+ "text": [
318
+ "Data successfully loaded into MongoDB.\n"
319
+ ]
320
+ }
321
+ ],
322
+ "source": [
323
+ "from pymongo import MongoClient\n",
324
+ "import pandas as pd\n",
325
+ "\n",
326
+ "# MongoDB connection settings\n",
327
+ "\n",
328
+ "def get_mongodb(database_name='yeniDatabase', collection_name='test', host='localhost', port=27017):\n",
329
+ " \"\"\"\n",
330
+ " MongoDB connection and collection selection\n",
331
+ " \"\"\"\n",
332
+ " client = MongoClient(f'mongodb://{host}:{port}/')\n",
333
+ " db = client[database_name]\n",
334
+ " collection = db[collection_name]\n",
335
+ " return collection\n",
336
+ "\n",
337
+ "# Function to load dataset into MongoDB\n",
338
+ "def dataset_read():\n",
339
+ " train_file_path = ('C:\\\\gitProjects\\\\deneme\\\\egitim\\\\test_Egitim\\\\merged_train.parquet')\n",
340
+ " data = pd.read_parquet(train_file_path, columns=['Prompt_ID', 'Prompt', 'Response', 'Category', 'Subcategory', 'Prompt_token_length'])\n",
341
+ " data_dict = data.to_dict(\"records\")\n",
342
+ "\n",
343
+ " # Get the MongoDB collection\n",
344
+ " source_collection = get_mongodb(database_name='yeniDatabase', collection_name='test') # Collection for translation\n",
345
+ "\n",
346
+ " # Insert data into MongoDB\n",
347
+ " source_collection.insert_many(data_dict)\n",
348
+ "\n",
349
+ " print(\"Data successfully loaded into MongoDB.\")\n",
350
+ " return source_collection\n",
351
+ "\n",
352
+ "# Call the function to load the dataset into MongoDB\n",
353
+ "source_collection = dataset_read()"
354
+ ]
355
+ }
356
+ ],
357
+ "metadata": {
358
+ "kernelspec": {
359
+ "display_name": ".venv",
360
+ "language": "python",
361
+ "name": "python3"
362
+ },
363
+ "language_info": {
364
+ "codemirror_mode": {
365
+ "name": "ipython",
366
+ "version": 3
367
+ },
368
+ "file_extension": ".py",
369
+ "mimetype": "text/x-python",
370
+ "name": "python",
371
+ "nbconvert_exporter": "python",
372
+ "pygments_lexer": "ipython3",
373
+ "version": "3.11.9"
374
+ }
375
+ },
376
+ "nbformat": 4,
377
+ "nbformat_minor": 2
378
+ }
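Note: the two notebook cells above only cover the write direction (parquet → MongoDB). As a minimal sketch of the step the markdown cell announces — pulling the important columns back out of MongoDB — the collection could be read into a DataFrame as below (assuming the same local instance, database and field names; the projection is illustrative):

from pymongo import MongoClient
import pandas as pd

def fetch_columns(database_name='yeniDatabase', collection_name='train',
                  host='localhost', port=27017):
    # Connect to the same local MongoDB instance the notebook writes to.
    client = MongoClient(f'mongodb://{host}:{port}/')
    collection = client[database_name][collection_name]
    # Project only the fields of interest and drop MongoDB's internal _id.
    cursor = collection.find({}, {'_id': 0, 'Prompt': 1, 'Response': 1,
                                  'Category': 1, 'Subcategory': 1})
    return pd.DataFrame(list(cursor))

train_df = fetch_columns()  # use collection_name='test' for the test split
print(train_df.head())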
merged_train.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f808996f2ad6145efb8d94e05c4cace910ed6b0f64ac205c87d876ff43673b7
3
+ size 1271727822
model.ipynb ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "ename": "ModuleNotFoundError",
10
+ "evalue": "No module named 'bs4'",
11
+ "output_type": "error",
12
+ "traceback": [
13
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
14
+ "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
15
+ "Cell \u001b[1;32mIn[2], line 6\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpymongo\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MongoClient\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrequests\u001b[39;00m\n\u001b[1;32m----> 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mbs4\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BeautifulSoup\n",
16
+ "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'bs4'"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "import csv\n",
22
+ "import pandas as pd \n",
23
+ "from pymongo import MongoClient\n",
24
+ "\n",
25
+ "import requests\n",
26
+ "from bs4 import BeautifulSoup\n"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 2,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "# Connect to MongoDB\n",
36
+ "client = MongoClient(\"mongodb://localhost:27017/\")\n",
37
+ "db = client[\"myDatabase\"]\n",
38
+ "source_collection = db[\"data\"]"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 9,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "# Export translated data to a CSV file #bu dosyayı json olarak indirdim\n",
48
+ "\"\"\"yeni_data = list(source_collection.find())\n",
49
+ "print(yeni_data)\"\"\""
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 3,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "name": "stdout",
59
+ "output_type": "stream",
60
+ "text": [
61
+ " _id title \\\n",
62
+ "0 {'$oid': '66a1020f29abc84d21689044'} Mental Note Vol. 24 \n",
63
+ "1 {'$oid': '66a1020f29abc84d21689045'} Your Brain On Coronavirus \n",
64
+ "2 {'$oid': '66a1020f29abc84d21689046'} Mind Your Nose \n",
65
+ "3 {'$oid': '66a1020f29abc84d21689047'} The 4 Purposes of Dreams \n",
66
+ "4 {'$oid': '66a1020f29abc84d21689048'} Surviving a Rod Through the Head \n",
67
+ "\n",
68
+ " url authors \\\n",
69
+ "0 https://medium.com/invisible-illness/mental-no... ['Ryan Fan'] \n",
70
+ "1 https://medium.com/age-of-awareness/how-the-pa... ['Simon Spichak'] \n",
71
+ "2 https://medium.com/neodotlife/mind-your-nose-f... [] \n",
72
+ "3 https://medium.com/science-for-real/the-4-purp... ['Eshan Samaranayake'] \n",
73
+ "4 https://medium.com/live-your-life-on-purpose/s... ['Rishav Sinha'] \n",
74
+ "\n",
75
+ " timestamp \\\n",
76
+ "0 2020-12-26 03:38:10.479000+00:00 \n",
77
+ "1 2020-09-23 22:10:17.126000+00:00 \n",
78
+ "2 2020-10-10 20:17:37.132000+00:00 \n",
79
+ "3 2020-12-21 16:05:19.524000+00:00 \n",
80
+ "4 2020-02-26 00:01:01.576000+00:00 \n",
81
+ "\n",
82
+ " tags \n",
83
+ "0 ['Mental Health', 'Health', 'Psychology', 'Sci... \n",
84
+ "1 ['Mental Health', 'Coronavirus', 'Science', 'P... \n",
85
+ "2 ['Biotechnology', 'Neuroscience', 'Brain', 'We... \n",
86
+ "3 ['Health', 'Neuroscience', 'Mental Health', 'P... \n",
87
+ "4 ['Brain', 'Health', 'Development', 'Psychology... \n"
88
+ ]
89
+ }
90
+ ],
91
+ "source": [
92
+ "#csv dosyası olarak yüklenmesi\n",
93
+ "df=pd.read_json('myDatabase.data.json')\n",
94
+ "print(df.head())"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 4,
100
+ "metadata": {},
101
+ "outputs": [
102
+ {
103
+ "data": {
104
+ "text/html": [
105
+ "<div>\n",
106
+ "<style scoped>\n",
107
+ " .dataframe tbody tr th:only-of-type {\n",
108
+ " vertical-align: middle;\n",
109
+ " }\n",
110
+ "\n",
111
+ " .dataframe tbody tr th {\n",
112
+ " vertical-align: top;\n",
113
+ " }\n",
114
+ "\n",
115
+ " .dataframe thead th {\n",
116
+ " text-align: right;\n",
117
+ " }\n",
118
+ "</style>\n",
119
+ "<table border=\"1\" class=\"dataframe\">\n",
120
+ " <thead>\n",
121
+ " <tr style=\"text-align: right;\">\n",
122
+ " <th></th>\n",
123
+ " <th>_id</th>\n",
124
+ " <th>title</th>\n",
125
+ " <th>url</th>\n",
126
+ " <th>authors</th>\n",
127
+ " <th>timestamp</th>\n",
128
+ " <th>tags</th>\n",
129
+ " </tr>\n",
130
+ " </thead>\n",
131
+ " <tbody>\n",
132
+ " <tr>\n",
133
+ " <th>0</th>\n",
134
+ " <td>{'$oid': '66a1020f29abc84d21689044'}</td>\n",
135
+ " <td>Mental Note Vol. 24</td>\n",
136
+ " <td>https://medium.com/invisible-illness/mental-no...</td>\n",
137
+ " <td>['Ryan Fan']</td>\n",
138
+ " <td>2020-12-26 03:38:10.479000+00:00</td>\n",
139
+ " <td>['Mental Health', 'Health', 'Psychology', 'Sci...</td>\n",
140
+ " </tr>\n",
141
+ " <tr>\n",
142
+ " <th>1</th>\n",
143
+ " <td>{'$oid': '66a1020f29abc84d21689045'}</td>\n",
144
+ " <td>Your Brain On Coronavirus</td>\n",
145
+ " <td>https://medium.com/age-of-awareness/how-the-pa...</td>\n",
146
+ " <td>['Simon Spichak']</td>\n",
147
+ " <td>2020-09-23 22:10:17.126000+00:00</td>\n",
148
+ " <td>['Mental Health', 'Coronavirus', 'Science', 'P...</td>\n",
149
+ " </tr>\n",
150
+ " <tr>\n",
151
+ " <th>2</th>\n",
152
+ " <td>{'$oid': '66a1020f29abc84d21689046'}</td>\n",
153
+ " <td>Mind Your Nose</td>\n",
154
+ " <td>https://medium.com/neodotlife/mind-your-nose-f...</td>\n",
155
+ " <td>[]</td>\n",
156
+ " <td>2020-10-10 20:17:37.132000+00:00</td>\n",
157
+ " <td>['Biotechnology', 'Neuroscience', 'Brain', 'We...</td>\n",
158
+ " </tr>\n",
159
+ " <tr>\n",
160
+ " <th>3</th>\n",
161
+ " <td>{'$oid': '66a1020f29abc84d21689047'}</td>\n",
162
+ " <td>The 4 Purposes of Dreams</td>\n",
163
+ " <td>https://medium.com/science-for-real/the-4-purp...</td>\n",
164
+ " <td>['Eshan Samaranayake']</td>\n",
165
+ " <td>2020-12-21 16:05:19.524000+00:00</td>\n",
166
+ " <td>['Health', 'Neuroscience', 'Mental Health', 'P...</td>\n",
167
+ " </tr>\n",
168
+ " <tr>\n",
169
+ " <th>4</th>\n",
170
+ " <td>{'$oid': '66a1020f29abc84d21689048'}</td>\n",
171
+ " <td>Surviving a Rod Through the Head</td>\n",
172
+ " <td>https://medium.com/live-your-life-on-purpose/s...</td>\n",
173
+ " <td>['Rishav Sinha']</td>\n",
174
+ " <td>2020-02-26 00:01:01.576000+00:00</td>\n",
175
+ " <td>['Brain', 'Health', 'Development', 'Psychology...</td>\n",
176
+ " </tr>\n",
177
+ " </tbody>\n",
178
+ "</table>\n",
179
+ "</div>"
180
+ ],
181
+ "text/plain": [
182
+ " _id title \\\n",
183
+ "0 {'$oid': '66a1020f29abc84d21689044'} Mental Note Vol. 24 \n",
184
+ "1 {'$oid': '66a1020f29abc84d21689045'} Your Brain On Coronavirus \n",
185
+ "2 {'$oid': '66a1020f29abc84d21689046'} Mind Your Nose \n",
186
+ "3 {'$oid': '66a1020f29abc84d21689047'} The 4 Purposes of Dreams \n",
187
+ "4 {'$oid': '66a1020f29abc84d21689048'} Surviving a Rod Through the Head \n",
188
+ "\n",
189
+ " url authors \\\n",
190
+ "0 https://medium.com/invisible-illness/mental-no... ['Ryan Fan'] \n",
191
+ "1 https://medium.com/age-of-awareness/how-the-pa... ['Simon Spichak'] \n",
192
+ "2 https://medium.com/neodotlife/mind-your-nose-f... [] \n",
193
+ "3 https://medium.com/science-for-real/the-4-purp... ['Eshan Samaranayake'] \n",
194
+ "4 https://medium.com/live-your-life-on-purpose/s... ['Rishav Sinha'] \n",
195
+ "\n",
196
+ " timestamp \\\n",
197
+ "0 2020-12-26 03:38:10.479000+00:00 \n",
198
+ "1 2020-09-23 22:10:17.126000+00:00 \n",
199
+ "2 2020-10-10 20:17:37.132000+00:00 \n",
200
+ "3 2020-12-21 16:05:19.524000+00:00 \n",
201
+ "4 2020-02-26 00:01:01.576000+00:00 \n",
202
+ "\n",
203
+ " tags \n",
204
+ "0 ['Mental Health', 'Health', 'Psychology', 'Sci... \n",
205
+ "1 ['Mental Health', 'Coronavirus', 'Science', 'P... \n",
206
+ "2 ['Biotechnology', 'Neuroscience', 'Brain', 'We... \n",
207
+ "3 ['Health', 'Neuroscience', 'Mental Health', 'P... \n",
208
+ "4 ['Brain', 'Health', 'Development', 'Psychology... "
209
+ ]
210
+ },
211
+ "execution_count": 4,
212
+ "metadata": {},
213
+ "output_type": "execute_result"
214
+ }
215
+ ],
216
+ "source": [
217
+ "df.head()"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 5,
223
+ "metadata": {},
224
+ "outputs": [
225
+ {
226
+ "data": {
227
+ "text/plain": [
228
+ "_id object\n",
229
+ "title object\n",
230
+ "url object\n",
231
+ "authors object\n",
232
+ "timestamp object\n",
233
+ "tags object\n",
234
+ "dtype: object"
235
+ ]
236
+ },
237
+ "execution_count": 5,
238
+ "metadata": {},
239
+ "output_type": "execute_result"
240
+ }
241
+ ],
242
+ "source": [
243
+ "df.dtypes"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 8,
249
+ "metadata": {},
250
+ "outputs": [
251
+ {
252
+ "name": "stdout",
253
+ "output_type": "stream",
254
+ "text": [
255
+ "0 <class 'dict'>\n",
256
+ "Name: _id, dtype: object\n",
257
+ "0 <class 'str'>\n",
258
+ "Name: title, dtype: object\n",
259
+ "0 <class 'str'>\n",
260
+ "Name: url, dtype: object\n",
261
+ "0 <class 'str'>\n",
262
+ "Name: authors, dtype: object\n",
263
+ "0 <class 'str'>\n",
264
+ "Name: timestamp, dtype: object\n",
265
+ "0 <class 'str'>\n",
266
+ "Name: tags, dtype: object\n"
267
+ ]
268
+ }
269
+ ],
270
+ "source": [
271
+ "for i in df.columns:\n",
272
+ " print(df[i].apply(lambda x:type(x)).head(1))"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": 9,
278
+ "metadata": {},
279
+ "outputs": [],
280
+ "source": [
281
+ "#içeriklerin saklanacağı bir liste oluştrun\n",
282
+ "contents=[]\n",
283
+ "#her url için içeriği çekin \n",
284
+ "\n",
285
+ "for url in df['url']:\n",
286
+ " try:\n",
287
+ " response=requests.get(url)\n",
288
+ " soup=BeautifulSoup(response.content,'html.parser')\n",
289
+ "\n",
290
+ " #medium içeriğini çekmek için uygun seçiciyi kullanın\n",
291
+ " article_content=soup.find('articles')\n",
292
+ " content=article_content.get_text(separator='') if article_content else 'content not found'\n",
293
+ "\n",
294
+ " contents.append(content)\n",
295
+ " except Exception as e:\n",
296
+ " contents.append(f'error retrieving content: {e}')\n",
297
+ "\n",
298
+ "#içerikleri veri çerçevesine ekleyin.\n",
299
+ "df['content']= contents\n",
300
+ "\n",
301
+ "#yeni veri kümesini kontrol edin\n",
302
+ "print(df.head())"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": null,
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": [
311
+ "\n",
312
+ "#modeleğitimi için test valid değerleriğ oluşturma \n",
313
+ "\n",
314
+ "from sklearn.model_selection import train_test_split\n",
315
+ "\n",
316
+ "X_train, X_val, y_train, y_val = train_test_split(translated_data, translated_data, test_size=0.2, random_state=42)"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": null,
322
+ "metadata": {},
323
+ "outputs": [],
324
+ "source": [
325
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
326
+ "from sklearn.svm import SVC\n",
327
+ "\n",
328
+ "vectorizer = TfidfVectorizer()\n",
329
+ "X_train_transformed = vectorizer.fit_transform(X_train)\n",
330
+ "X_val_transformed = vectorizer.transform(X_val)\n",
331
+ "\n",
332
+ "model = SVC()\n",
333
+ "model.fit(X_train_transformed, y_train)"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "metadata": {},
340
+ "outputs": [],
341
+ "source": [
342
+ "from sklearn.metrics import accuracy_score\n",
343
+ "\n",
344
+ "y_pred = model.predict(X_val_transformed)\n",
345
+ "accuracy = accuracy_score(y_val, y_pred)\n",
346
+ "print(f\"Accuracy: {accuracy:.2f}\")"
347
+ ]
348
+ }
349
+ ],
350
+ "metadata": {
351
+ "kernelspec": {
352
+ "display_name": "myenv",
353
+ "language": "python",
354
+ "name": "python3"
355
+ },
356
+ "language_info": {
357
+ "codemirror_mode": {
358
+ "name": "ipython",
359
+ "version": 3
360
+ },
361
+ "file_extension": ".py",
362
+ "mimetype": "text/x-python",
363
+ "name": "python",
364
+ "nbconvert_exporter": "python",
365
+ "pygments_lexer": "ipython3",
366
+ "version": "3.11.9"
367
+ }
368
+ },
369
+ "nbformat": 4,
370
+ "nbformat_minor": 2
371
+ }
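Note: the final cells of model.ipynb split and train on `translated_data`, which is never defined in the notebook, and the same array is passed as both features and labels. A self-contained sketch of the intended TF-IDF + SVC flow could look like the following; the `content` and `label` columns are hypothetical stand-ins for whatever text/target pair is actually meant:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Hypothetical inputs: one text string and one categorical label per row.
texts = df['content'].fillna('').tolist()
labels = df['label'].tolist()

X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)  # fit the vocabulary on training text only
X_val_tfidf = vectorizer.transform(X_val)          # reuse the fitted vocabulary

model = SVC()
model.fit(X_train_tfidf, y_train)
print(f"Accuracy: {accuracy_score(y_val, model.predict(X_val_tfidf)):.2f}")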
mongoDb.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo import MongoClient
2
+ import pandas as pd
3
+
4
+
5
+
6
+ # MongoDB connection settings
7
+
8
+ def get_mongodb(database_name='myDatabase',collection_name='new',host='localhost',port=27017):
9
+ "MongoDB connection and collection selection"
10
+ client=MongoClient(f'mongodb://{host}:{port}/')
11
+ db=client[database_name]
12
+ collection=db[collection_name]
13
+ return collection
14
+
15
+
16
+
17
+ """#koleksiyonun varlığını kontrol eder.
18
+ def get_collection(self, collection_name):
19
+ #Get a collection if it exists, otherwise return None.
20
+ if self.check_collection_exists(collection_name):
21
+ return self.db[collection_name]
22
+ else:
23
+ print(f"Collection '{collection_name}' does not exist.")
24
+ return None"""
25
+
26
+ # Load the dataset into MongoDB
27
+ def dataset_read():
28
+ data = pd.read_csv(r'C:\gitProjects\medium-articles\medium_articles_no_text.csv')
29
+ data_dict=data.to_dict("records")
30
+ source_collection = get_mongodb(database_name='myDatabase', collection_name='data') # source collection used for translation
31
+ source_collection.insert_many(data_dict)
32
+ print("Records loaded into MongoDB.")
33
+ return source_collection
34
+
35
+ if __name__=='__main__':
36
+ dataset_read()
37
+
38
+
39
+
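Note: dataset_read() in mongoDb.py calls insert_many unconditionally, so every rerun of the script duplicates the whole CSV in the collection. A small guard — a sketch assuming the same database and collection names as above — keeps reruns idempotent:

from pymongo import MongoClient
import pandas as pd

def load_once(csv_path=r'C:\gitProjects\medium-articles\medium_articles_no_text.csv',
              database_name='myDatabase', collection_name='data',
              host='localhost', port=27017):
    client = MongoClient(f'mongodb://{host}:{port}/')
    collection = client[database_name][collection_name]
    # Insert only when the collection is still empty, so reruns do not duplicate records.
    if collection.estimated_document_count() == 0:
        data = pd.read_csv(csv_path)
        collection.insert_many(data.to_dict('records'))
        print('Records loaded into MongoDB.')
    else:
        print('Collection already populated; skipping load.')
    return collection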
mongoDb_2.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo import MongoClient
2
+ import pandas as pd
3
+
4
+
5
+ # MongoDB connection settings
6
+
7
+ def get_mongodb(database_name='yedekDatabase',collection_name='yeni',host='localhost',port=27017):
8
+ "MongoDB connection and collection selection"
9
+ client=MongoClient(f'mongodb://{host}:{port}/')
10
+ db=client[database_name]
11
+ collection=db[collection_name]
12
+ return collection
13
+
14
+ # Load the dataset into MongoDB
15
+ def dataset_read():
16
+ data = pd.read_csv(r'C:\gitProjects\medium-articles\medium_articles_no_text.csv')
17
+ data_dict=data.to_dict("records")
18
+ source_collection = get_mongodb(database_name='yedekDatabase', collection_name='yeni') # source collection used for translation
19
+ source_collection.insert_many(data_dict)
20
+ print("Records loaded into MongoDB.")
21
+ return source_collection
22
+
23
+ if __name__=='__main__':
24
+ dataset_read()
mongodb_egitim.py ADDED
@@ -0,0 +1 @@
 
 
1
+
pyvenv.cfg ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ home = C:\Users\info\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0
2
+ include-system-site-packages = false
3
+ version = 3.11.9
4
+ executable = C:\Users\info\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe
5
+ command = C:\Users\info\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m venv c:\gitProjects\deneme\.venv
requirements.txt CHANGED
@@ -1,2 +1,6 @@
1
- huggingface_hub==0.22.2
2
- gradio
 
 
 
 
 
1
+ gradio==4.40.0.*
2
+ pymongo==4.8.0.*
3
+ pandas==2.2.2.*
4
+ datasets==2.20.0.*
5
+ torch==2.4.0.*
6
+ transformers==4.43.4.*