yonkasoft committed on
Commit
7d245b9
1 Parent(s): df88f48

Upload 11 files

MLM.ipynb ADDED
@@ -0,0 +1,539 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "OSError",
+ "evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
+ "\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
+ ]
+ }
+ ],
+ "source": [
+ "from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer, DPRContextEncoder\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\gitProjects\\makaleChatUI\\myenv\\Lib\\site-packages\\huggingface_hub\\file_download.py:159: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\info\\.cache\\huggingface\\hub\\models--bert-base-uncased. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+ " warnings.warn(message)\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
+ "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+ ]
+ }
+ ],
+ "source": [
+ "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
+ "model = BertForMaskedLM.from_pretrained('bert-base-uncased')\n",
+ "\n",
+ "text = (\"After reading these reports,\"\n",
+ " \"we start an outline of the application of ML.\"\n",
+ " \"It includes the [MASK] process \"\n",
+ " \"and various applications (from various software development to hardware development), to [MASK] of IT systems, and various approaches on analytics.\"\n",
+ " \"The approach incorporates [MASK] as well as computing and data mining.\"\n",
+ " \"For example, software developers and manufacturing engineers used AI \"\n",
+ " \"in manufacturing to develop their applications.\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# after inserting the [MASK] placeholders, we convert the text into tokens\n",
+ "inputs = tokenizer(text, return_tensors='pt')\n",
+ "inputs.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 103,\n",
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
+ " 8051, 2458, 1007, 1010, 2000, 103, 1997, 2009, 3001, 1010,\n",
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 103,\n",
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
+ " 2000, 4503, 2037, 5097, 1012, 102]])"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inputs.input_ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text_normal = (\"After reading these reports,\"\n",
+ " \"we start an outline of the application of ML.\"\n",
+ " \"It includes the learning process \"\n",
+ " \"and various applications (from various software development to hardware development), to analysis of IT systems, and various approaches on analytics.\"\n",
+ " \"The approach incorporates AI as well as computing and data mining.\"\n",
+ " \"For example, software developers and manufacturing engineers used AI \"\n",
+ " \"in manufacturing to develop their applications.\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# we convert the unmasked text into tokens\n",
+ "inputs_2 = tokenizer(text_normal, return_tensors='pt')\n",
+ "inputs_2.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
+ " 2000, 4503, 2037, 5097, 1012, 102]])"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inputs_2.input_ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inputs_2['labels'] = inputs_2.input_ids.detach().clone()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'input_ids': tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
+ " 2000, 4503, 2037, 5097, 1012, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+ " 1, 1, 1, 1]]), 'labels': tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
+ " 2000, 4503, 2037, 5097, 1012, 102]])}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inputs_2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 76])"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# we draw a random value for every token position (the labels are left untouched)\n",
+ "rand = torch.rand(inputs_2.input_ids.shape)\n",
+ "rand.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[0.9397, 0.1325, 0.1893, 0.8258, 0.7453, 0.1766, 0.9338, 0.0806, 0.0626,\n",
+ " 0.6665, 0.4240, 0.3946, 0.5413, 0.3799, 0.4023, 0.8699, 0.8159, 0.1511,\n",
+ " 0.6842, 0.0242, 0.7235, 0.0063, 0.1857, 0.9684, 0.8930, 0.8208, 0.5711,\n",
+ " 0.0345, 0.9919, 0.1140, 0.7597, 0.4546, 0.6478, 0.2295, 0.2846, 0.6314,\n",
+ " 0.3640, 0.9291, 0.3843, 0.3553, 0.1125, 0.0790, 0.4261, 0.4307, 0.6724,\n",
+ " 0.8569, 0.4476, 0.8032, 0.0241, 0.0152, 0.4196, 0.5609, 0.0010, 0.7240,\n",
+ " 0.4531, 0.5834, 0.5232, 0.3602, 0.6575, 0.9012, 0.1519, 0.2255, 0.0799,\n",
+ " 0.5673, 0.7244, 0.4387, 0.2713, 0.4243, 0.8435, 0.1670, 0.8664, 0.6261,\n",
+ " 0.4090, 0.2988, 0.3379, 0.7784]])"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rand"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[False, True, False, False, False, False, False, True, True, False,\n",
+ " False, False, False, False, False, False, False, False, False, True,\n",
+ " False, True, False, False, False, False, False, True, False, True,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " True, True, False, False, False, False, False, False, True, True,\n",
+ " False, False, True, False, False, False, False, False, False, False,\n",
+ " False, False, True, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False]])"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# roughly 15 percent of the tokens in the sentence get selected\n",
+ "# mask_arr = rand < 0.15 checks, for each token's random number, whether it falls below 0.15: if it does, that token will be masked; otherwise it stays as it is\n",
+ "mask_arr = rand < 0.15\n",
+ "mask_arr"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[1, 7, 8, 19, 21, 27, 29, 40, 41, 48, 49, 52, 62]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# nonzero() gives the positions of the tokens that were selected for masking\n",
+ "# torch.flatten strips the extra nesting so we end up with a single flat list\n",
+ "selection = torch.flatten(mask_arr[0].nonzero()).tolist()\n",
+ "selection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[ 101, 103, 3752, 2122, 4311, 1010, 2057, 103, 103, 12685,\n",
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 103,\n",
+ " 2832, 103, 2536, 5097, 1006, 2013, 2536, 103, 2458, 103,\n",
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
+ " 103, 103, 8107, 2006, 25095, 1012, 1996, 3921, 103, 103,\n",
+ " 2004, 2092, 103, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
+ " 1010, 4007, 103, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
+ " 2000, 4503, 2037, 5097, 1012, 102]])"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# write 103, the id of the [MASK] token, into the selected positions of input_ids\n",
+ "inputs_2.input_ids[0, selection] = 103\n",
+ "inputs_2.input_ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "outputs = model(**inputs_2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "odict_keys(['loss', 'logits'])"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "outputs.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor(0.8399, grad_fn=<NllLossBackward0>)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "outputs.loss"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\gitProjects\\makaleChatUI\\myenv\\Lib\\site-packages\\huggingface_hub\\file_download.py:159: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\info\\.cache\\huggingface\\hub\\models--facebook--dpr-ctx_encoder-single-nq-base. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+ " warnings.warn(message)\n",
+ "Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.output.LayerNorm.bias', 'bert_model.encoder.layer.0.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.1.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.1.attention.output.dense.bias', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.1.attention.self.key.bias', 'bert_model.encoder.layer.1.attention.self.key.weight', 'bert_model.encoder.layer.1.attention.self.query.bias', 'bert_model.encoder.layer.1.attention.self.query.weight', 'bert_model.encoder.layer.1.attention.self.value.bias', 'bert_model.encoder.layer.1.attention.self.value.weight', 'bert_model.encoder.layer.1.intermediate.dense.bias', 'bert_model.encoder.layer.1.intermediate.dense.weight', 'bert_model.encoder.layer.1.output.LayerNorm.bias', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.1.output.dense.bias', 'bert_model.encoder.layer.1.output.dense.weight', 'bert_model.encoder.layer.10.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.10.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.output.dense.weight', 'bert_model.encoder.layer.10.attention.self.key.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.10.attention.self.query.bias', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.10.attention.self.value.bias', 'bert_model.encoder.layer.10.attention.self.value.weight', 'bert_model.encoder.layer.10.intermediate.dense.bias', 'bert_model.encoder.layer.10.intermediate.dense.weight', 'bert_model.encoder.layer.10.output.LayerNorm.bias', 'bert_model.encoder.layer.10.output.LayerNorm.weight', 'bert_model.encoder.layer.10.output.dense.bias', 'bert_model.encoder.layer.10.output.dense.weight', 'bert_model.encoder.layer.11.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.output.dense.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.11.attention.self.key.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 
'bert_model.encoder.layer.11.attention.self.query.weight', 'bert_model.encoder.layer.11.attention.self.value.bias', 'bert_model.encoder.layer.11.attention.self.value.weight', 'bert_model.encoder.layer.11.intermediate.dense.bias', 'bert_model.encoder.layer.11.intermediate.dense.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.11.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.dense.bias', 'bert_model.encoder.layer.11.output.dense.weight', 'bert_model.encoder.layer.2.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.2.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.2.attention.output.dense.bias', 'bert_model.encoder.layer.2.attention.output.dense.weight', 'bert_model.encoder.layer.2.attention.self.key.bias', 'bert_model.encoder.layer.2.attention.self.key.weight', 'bert_model.encoder.layer.2.attention.self.query.bias', 'bert_model.encoder.layer.2.attention.self.query.weight', 'bert_model.encoder.layer.2.attention.self.value.bias', 'bert_model.encoder.layer.2.attention.self.value.weight', 'bert_model.encoder.layer.2.intermediate.dense.bias', 'bert_model.encoder.layer.2.intermediate.dense.weight', 'bert_model.encoder.layer.2.output.LayerNorm.bias', 'bert_model.encoder.layer.2.output.LayerNorm.weight', 'bert_model.encoder.layer.2.output.dense.bias', 'bert_model.encoder.layer.2.output.dense.weight', 'bert_model.encoder.layer.3.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.3.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.3.attention.output.dense.weight', 'bert_model.encoder.layer.3.attention.self.key.bias', 'bert_model.encoder.layer.3.attention.self.key.weight', 'bert_model.encoder.layer.3.attention.self.query.bias', 'bert_model.encoder.layer.3.attention.self.query.weight', 'bert_model.encoder.layer.3.attention.self.value.bias', 'bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.3.intermediate.dense.bias', 'bert_model.encoder.layer.3.intermediate.dense.weight', 'bert_model.encoder.layer.3.output.LayerNorm.bias', 'bert_model.encoder.layer.3.output.LayerNorm.weight', 'bert_model.encoder.layer.3.output.dense.bias', 'bert_model.encoder.layer.3.output.dense.weight', 'bert_model.encoder.layer.4.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.4.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.4.attention.output.dense.bias', 'bert_model.encoder.layer.4.attention.output.dense.weight', 'bert_model.encoder.layer.4.attention.self.key.bias', 'bert_model.encoder.layer.4.attention.self.key.weight', 'bert_model.encoder.layer.4.attention.self.query.bias', 'bert_model.encoder.layer.4.attention.self.query.weight', 'bert_model.encoder.layer.4.attention.self.value.bias', 'bert_model.encoder.layer.4.attention.self.value.weight', 'bert_model.encoder.layer.4.intermediate.dense.bias', 'bert_model.encoder.layer.4.intermediate.dense.weight', 'bert_model.encoder.layer.4.output.LayerNorm.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.4.output.dense.bias', 'bert_model.encoder.layer.4.output.dense.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.dense.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.5.attention.self.key.bias', 'bert_model.encoder.layer.5.attention.self.key.weight', 
'bert_model.encoder.layer.5.attention.self.query.bias', 'bert_model.encoder.layer.5.attention.self.query.weight', 'bert_model.encoder.layer.5.attention.self.value.bias', 'bert_model.encoder.layer.5.attention.self.value.weight', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_model.encoder.layer.5.output.LayerNorm.bias', 'bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.5.output.dense.bias', 'bert_model.encoder.layer.5.output.dense.weight', 'bert_model.encoder.layer.6.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.6.attention.output.dense.bias', 'bert_model.encoder.layer.6.attention.output.dense.weight', 'bert_model.encoder.layer.6.attention.self.key.bias', 'bert_model.encoder.layer.6.attention.self.key.weight', 'bert_model.encoder.layer.6.attention.self.query.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.6.attention.self.value.bias', 'bert_model.encoder.layer.6.attention.self.value.weight', 'bert_model.encoder.layer.6.intermediate.dense.bias', 'bert_model.encoder.layer.6.intermediate.dense.weight', 'bert_model.encoder.layer.6.output.LayerNorm.bias', 'bert_model.encoder.layer.6.output.LayerNorm.weight', 'bert_model.encoder.layer.6.output.dense.bias', 'bert_model.encoder.layer.6.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.7.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.7.attention.output.dense.bias', 'bert_model.encoder.layer.7.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.self.key.bias', 'bert_model.encoder.layer.7.attention.self.key.weight', 'bert_model.encoder.layer.7.attention.self.query.bias', 'bert_model.encoder.layer.7.attention.self.query.weight', 'bert_model.encoder.layer.7.attention.self.value.bias', 'bert_model.encoder.layer.7.attention.self.value.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.7.intermediate.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.7.output.LayerNorm.weight', 'bert_model.encoder.layer.7.output.dense.bias', 'bert_model.encoder.layer.7.output.dense.weight', 'bert_model.encoder.layer.8.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.8.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.8.attention.output.dense.bias', 'bert_model.encoder.layer.8.attention.output.dense.weight', 'bert_model.encoder.layer.8.attention.self.key.bias', 'bert_model.encoder.layer.8.attention.self.key.weight', 'bert_model.encoder.layer.8.attention.self.query.bias', 'bert_model.encoder.layer.8.attention.self.query.weight', 'bert_model.encoder.layer.8.attention.self.value.bias', 'bert_model.encoder.layer.8.attention.self.value.weight', 'bert_model.encoder.layer.8.intermediate.dense.bias', 'bert_model.encoder.layer.8.intermediate.dense.weight', 'bert_model.encoder.layer.8.output.LayerNorm.bias', 'bert_model.encoder.layer.8.output.LayerNorm.weight', 'bert_model.encoder.layer.8.output.dense.bias', 'bert_model.encoder.layer.8.output.dense.weight', 'bert_model.encoder.layer.9.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.9.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.9.attention.output.dense.bias', 'bert_model.encoder.layer.9.attention.output.dense.weight', 'bert_model.encoder.layer.9.attention.self.key.bias', 
'bert_model.encoder.layer.9.attention.self.key.weight', 'bert_model.encoder.layer.9.attention.self.query.bias', 'bert_model.encoder.layer.9.attention.self.query.weight', 'bert_model.encoder.layer.9.attention.self.value.bias', 'bert_model.encoder.layer.9.attention.self.value.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.intermediate.dense.weight', 'bert_model.encoder.layer.9.output.LayerNorm.bias', 'bert_model.encoder.layer.9.output.LayerNorm.weight', 'bert_model.encoder.layer.9.output.dense.bias', 'bert_model.encoder.layer.9.output.dense.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.output.LayerNorm.bias', 'bert_model.encoder.layer.0.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.1.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.1.attention.output.dense.bias', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.1.attention.self.key.bias', 'bert_model.encoder.layer.1.attention.self.key.weight', 'bert_model.encoder.layer.1.attention.self.query.bias', 'bert_model.encoder.layer.1.attention.self.query.weight', 'bert_model.encoder.layer.1.attention.self.value.bias', 'bert_model.encoder.layer.1.attention.self.value.weight', 'bert_model.encoder.layer.1.intermediate.dense.bias', 'bert_model.encoder.layer.1.intermediate.dense.weight', 'bert_model.encoder.layer.1.output.LayerNorm.bias', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.1.output.dense.bias', 'bert_model.encoder.layer.1.output.dense.weight', 'bert_model.encoder.layer.10.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.10.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.output.dense.weight', 'bert_model.encoder.layer.10.attention.self.key.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.10.attention.self.query.bias', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.10.attention.self.value.bias', 'bert_model.encoder.layer.10.attention.self.value.weight', 'bert_model.encoder.layer.10.intermediate.dense.bias', 'bert_model.encoder.layer.10.intermediate.dense.weight', 'bert_model.encoder.layer.10.output.LayerNorm.bias', 'bert_model.encoder.layer.10.output.LayerNorm.weight', 'bert_model.encoder.layer.10.output.dense.bias', 'bert_model.encoder.layer.10.output.dense.weight', 'bert_model.encoder.layer.11.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.output.dense.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.11.attention.self.key.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 
'bert_model.encoder.layer.11.attention.self.query.weight', 'bert_model.encoder.layer.11.attention.self.value.bias', 'bert_model.encoder.layer.11.attention.self.value.weight', 'bert_model.encoder.layer.11.intermediate.dense.bias', 'bert_model.encoder.layer.11.intermediate.dense.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.11.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.dense.bias', 'bert_model.encoder.layer.11.output.dense.weight', 'bert_model.encoder.layer.2.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.2.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.2.attention.output.dense.bias', 'bert_model.encoder.layer.2.attention.output.dense.weight', 'bert_model.encoder.layer.2.attention.self.key.bias', 'bert_model.encoder.layer.2.attention.self.key.weight', 'bert_model.encoder.layer.2.attention.self.query.bias', 'bert_model.encoder.layer.2.attention.self.query.weight', 'bert_model.encoder.layer.2.attention.self.value.bias', 'bert_model.encoder.layer.2.attention.self.value.weight', 'bert_model.encoder.layer.2.intermediate.dense.bias', 'bert_model.encoder.layer.2.intermediate.dense.weight', 'bert_model.encoder.layer.2.output.LayerNorm.bias', 'bert_model.encoder.layer.2.output.LayerNorm.weight', 'bert_model.encoder.layer.2.output.dense.bias', 'bert_model.encoder.layer.2.output.dense.weight', 'bert_model.encoder.layer.3.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.3.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.3.attention.output.dense.weight', 'bert_model.encoder.layer.3.attention.self.key.bias', 'bert_model.encoder.layer.3.attention.self.key.weight', 'bert_model.encoder.layer.3.attention.self.query.bias', 'bert_model.encoder.layer.3.attention.self.query.weight', 'bert_model.encoder.layer.3.attention.self.value.bias', 'bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.3.intermediate.dense.bias', 'bert_model.encoder.layer.3.intermediate.dense.weight', 'bert_model.encoder.layer.3.output.LayerNorm.bias', 'bert_model.encoder.layer.3.output.LayerNorm.weight', 'bert_model.encoder.layer.3.output.dense.bias', 'bert_model.encoder.layer.3.output.dense.weight', 'bert_model.encoder.layer.4.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.4.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.4.attention.output.dense.bias', 'bert_model.encoder.layer.4.attention.output.dense.weight', 'bert_model.encoder.layer.4.attention.self.key.bias', 'bert_model.encoder.layer.4.attention.self.key.weight', 'bert_model.encoder.layer.4.attention.self.query.bias', 'bert_model.encoder.layer.4.attention.self.query.weight', 'bert_model.encoder.layer.4.attention.self.value.bias', 'bert_model.encoder.layer.4.attention.self.value.weight', 'bert_model.encoder.layer.4.intermediate.dense.bias', 'bert_model.encoder.layer.4.intermediate.dense.weight', 'bert_model.encoder.layer.4.output.LayerNorm.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.4.output.dense.bias', 'bert_model.encoder.layer.4.output.dense.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.dense.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.5.attention.self.key.bias', 'bert_model.encoder.layer.5.attention.self.key.weight', 
'bert_model.encoder.layer.5.attention.self.query.bias', 'bert_model.encoder.layer.5.attention.self.query.weight', 'bert_model.encoder.layer.5.attention.self.value.bias', 'bert_model.encoder.layer.5.attention.self.value.weight', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_model.encoder.layer.5.output.LayerNorm.bias', 'bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.5.output.dense.bias', 'bert_model.encoder.layer.5.output.dense.weight', 'bert_model.encoder.layer.6.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.6.attention.output.dense.bias', 'bert_model.encoder.layer.6.attention.output.dense.weight', 'bert_model.encoder.layer.6.attention.self.key.bias', 'bert_model.encoder.layer.6.attention.self.key.weight', 'bert_model.encoder.layer.6.attention.self.query.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.6.attention.self.value.bias', 'bert_model.encoder.layer.6.attention.self.value.weight', 'bert_model.encoder.layer.6.intermediate.dense.bias', 'bert_model.encoder.layer.6.intermediate.dense.weight', 'bert_model.encoder.layer.6.output.LayerNorm.bias', 'bert_model.encoder.layer.6.output.LayerNorm.weight', 'bert_model.encoder.layer.6.output.dense.bias', 'bert_model.encoder.layer.6.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.7.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.7.attention.output.dense.bias', 'bert_model.encoder.layer.7.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.self.key.bias', 'bert_model.encoder.layer.7.attention.self.key.weight', 'bert_model.encoder.layer.7.attention.self.query.bias', 'bert_model.encoder.layer.7.attention.self.query.weight', 'bert_model.encoder.layer.7.attention.self.value.bias', 'bert_model.encoder.layer.7.attention.self.value.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.7.intermediate.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.7.output.LayerNorm.weight', 'bert_model.encoder.layer.7.output.dense.bias', 'bert_model.encoder.layer.7.output.dense.weight', 'bert_model.encoder.layer.8.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.8.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.8.attention.output.dense.bias', 'bert_model.encoder.layer.8.attention.output.dense.weight', 'bert_model.encoder.layer.8.attention.self.key.bias', 'bert_model.encoder.layer.8.attention.self.key.weight', 'bert_model.encoder.layer.8.attention.self.query.bias', 'bert_model.encoder.layer.8.attention.self.query.weight', 'bert_model.encoder.layer.8.attention.self.value.bias', 'bert_model.encoder.layer.8.attention.self.value.weight', 'bert_model.encoder.layer.8.intermediate.dense.bias', 'bert_model.encoder.layer.8.intermediate.dense.weight', 'bert_model.encoder.layer.8.output.LayerNorm.bias', 'bert_model.encoder.layer.8.output.LayerNorm.weight', 'bert_model.encoder.layer.8.output.dense.bias', 'bert_model.encoder.layer.8.output.dense.weight', 'bert_model.encoder.layer.9.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.9.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.9.attention.output.dense.bias', 'bert_model.encoder.layer.9.attention.output.dense.weight', 'bert_model.encoder.layer.9.attention.self.key.bias', 
'bert_model.encoder.layer.9.attention.self.key.weight', 'bert_model.encoder.layer.9.attention.self.query.bias', 'bert_model.encoder.layer.9.attention.self.query.weight', 'bert_model.encoder.layer.9.attention.self.value.bias', 'bert_model.encoder.layer.9.attention.self.value.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.intermediate.dense.weight', 'bert_model.encoder.layer.9.output.LayerNorm.bias', 'bert_model.encoder.layer.9.output.LayerNorm.weight', 'bert_model.encoder.layer.9.output.dense.bias', 'bert_model.encoder.layer.9.output.dense.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']\n",
+ "- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
477
+ ]
478
+ }
479
+ ],
480
+ "source": [
+ "from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer\n",
+ "\n",
+ "# Context (passage) encoder and its tokenizer\n",
+ "ctx_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')\n",
+ "ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')\n",
+ "\n",
+ "# Question encoder and its tokenizer\n",
+ "question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')\n",
+ "question_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\"\"\"title = [\"2024 Yılında Mobil Teknoloji Trendleri\"]\n",
+ "keywords = [\"mobil teknoloji\", \"2024 trendleri\", \"akıllı telefon yenilikleri\", \"5G teknolojisi\", \"giyilebilir cihazlar\"]\n",
+ "subheading = [\n",
+ "    \"2024'te Akıllı Telefonlardaki Yenilikler\",\n",
+ "    \"Giyilebilir Teknolojiler: Sağlık ve Fitness Trendleri\",\n",
+ "    \"5G'nin Mobil Cihazlar Üzerindeki Etkisi\",\n",
+ "    \"Mobil Güvenlikte Yeni Yaklaşımlar\"\n",
+ "]\"\"\"\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tokenize the passages for the context encoder; 'passages' is a placeholder list\n",
+ "passages = [\"example passage\"]\n",
+ "xb_tokens = ctx_tokenizer(passages, padding=True, truncation=True, return_tensors=\"pt\")\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "myenv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
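The MLM.ipynb cells above load the DPR question and context encoders but stop before encoding anything end to end. Below is a minimal, self-contained sketch of how the two encoders are typically combined for retrieval; it is an illustration, not part of the uploaded notebook, and the `passages` and `question` values are placeholders.

```python
import torch
from transformers import (
    DPRContextEncoder, DPRContextEncoderTokenizer,
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
)

# Same checkpoints as in the notebook
ctx_model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

passages = ["5G speeds up mobile networks.", "Wearables track health and fitness."]  # placeholders
question = "What does 5G change for mobile devices?"                                 # placeholder

with torch.no_grad():
    # Encode the passages with the context encoder
    ctx_inputs = ctx_tokenizer(passages, padding=True, truncation=True, return_tensors="pt")
    passage_emb = ctx_model(**ctx_inputs).pooler_output        # shape: (num_passages, 768)

    # Encode the question with the question encoder
    q_inputs = question_tokenizer(question, return_tensors="pt")
    question_emb = question_model(**q_inputs).pooler_output    # shape: (1, 768)

# DPR ranks passages by the dot product of question and passage embeddings
scores = question_emb @ passage_emb.T
print(passages[scores.argmax(dim=1).item()])
```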
datasets.ipynb ADDED
@@ -0,0 +1,460 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Importing the libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset\n",
+ "import pandas as pd\n",
+ "from pymongo import MongoClient\n",
+ "from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer, DPRContextEncoder\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Loading the Parquet files as DataFrames (so they can be read)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the Parquet files as DataFrames\n",
+ "train_df1 = pd.read_parquet('C:\\\\gitProjects\\\\yeni\\\\wikipedia-tr\\\\data\\\\train-00000-of-00002-ed6b025df7a1f653.parquet')\n",
+ "train_df2 = pd.read_parquet('C:\\\\gitProjects\\\\yeni\\\\wikipedia-tr\\\\data\\\\train-00001-of-00002-0aa63953f8b51c17.parquet')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Merge the two DataFrames\n",
+ "merged_train = pd.concat([train_df1, train_df2], ignore_index=True)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split into, for example, 80% train and 20% test\n",
+ "train_data = merged_train.sample(frac=0.8, random_state=42)\n",
+ "test_data = merged_train.drop(train_data.index)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# File paths\n",
+ "train_dir = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\train_Egitim'\n",
+ "test_dir = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\test_Egitim'\n",
+ "train_file_path = os.path.join(train_dir, 'merged_train.parquet')\n",
+ "test_file_path = os.path.join(test_dir, 'merged_test.parquet')\n",
+ "\n",
+ "# Check whether the directories exist and create them if needed\n",
+ "os.makedirs(train_dir, exist_ok=True)\n",
+ "os.makedirs(test_dir, exist_ok=True)\n",
+ "\n",
+ "# Save the data in .parquet format\n",
+ "train_data.to_parquet(train_file_path)\n",
+ "test_data.to_parquet(test_file_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Inspecting the DataFrames"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " id url \\\n",
+ "515773 3525037 https://tr.wikipedia.org/wiki/P%C5%9F%C4%B1qo%... \n",
+ "517811 3532700 https://tr.wikipedia.org/wiki/Craterolophinae \n",
+ "436350 3203545 https://tr.wikipedia.org/wiki/Notocrabro \n",
+ "223281 1765445 https://tr.wikipedia.org/wiki/Ibrahim%20Sissoko \n",
+ "100272 575462 https://tr.wikipedia.org/wiki/Salah%20Cedid \n",
+ "\n",
+ " title text \n",
+ "515773 Pşıqo Ahecaqo Pşıqo Ahecaqo (), Çerkes siyasetçi, askeri kom... \n",
+ "517811 Craterolophinae Craterolophinae, Depastridae familyasına bağlı... \n",
+ "436350 Notocrabro Notocrabro Crabronina oymağına bağlı bir cinst... \n",
+ "223281 Ibrahim Sissoko İbrahim Sissoko (d. 30 Kasım 1991), Fildişi Sa... \n",
+ "100272 Salah Cedid Salah Cedid (1926-1993) (Arapça: صلاح جديد) Su... \n",
+ " id url title \\\n",
+ "5 35 https://tr.wikipedia.org/wiki/Karl%20Marx Karl Marx \n",
+ "13 48 https://tr.wikipedia.org/wiki/Ruhi%20Su Ruhi Su \n",
+ "15 53 https://tr.wikipedia.org/wiki/Bilgisayar Bilgisayar \n",
+ "18 59 https://tr.wikipedia.org/wiki/Edebiyat Edebiyat \n",
+ "19 64 https://tr.wikipedia.org/wiki/M%C3%BChendislik Mühendislik \n",
+ "\n",
+ " text \n",
+ "5 Karl Marx (; 5 Mayıs 1818, Trier – 14 Mart 188... \n",
+ "13 Mehmet Ruhi Su (1 Ocak 1912, Van - 20 Eylül 19... \n",
+ "15 Bilgisayar, aritmetik veya mantıksal işlem diz... \n",
+ "18 Edebiyat, yazın veya literatür; olay, düşünce,... \n",
+ "19 Mühendis, insanların her türlü ihtiyacını karş... \n"
+ ]
+ }
+ ],
+ "source": [
+ "print(train_data.head())\n",
+ "print(test_data.head())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Connecting to MongoDB and pulling data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Data was successfully loaded into the Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'EgitimDatabase'), 'train') MongoDB collection.\n",
+ "Data was successfully loaded into the Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'EgitimDatabase'), 'test') MongoDB collection.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from pymongo import MongoClient\n",
+ "\n",
+ "def get_mongodb(database_name='EgitimDatabase', train_collection_name='train', test_collection_name='test', host='localhost', port=27017):\n",
+ "    \"\"\"\n",
+ "    MongoDB connection and collection selection for the train and test collections.\n",
+ "    \"\"\"\n",
+ "    client = MongoClient(f'mongodb://{host}:{port}/')\n",
+ "\n",
+ "    # Select the database\n",
+ "    db = client[database_name]\n",
+ "\n",
+ "    # Select the train and test collections\n",
+ "    train_collection = db[train_collection_name]\n",
+ "    test_collection = db[test_collection_name]\n",
+ "\n",
+ "    return train_collection, test_collection\n",
+ "\n",
+ "# Function to load the datasets into MongoDB\n",
+ "def dataset_read(train_file_path, test_file_path):\n",
+ "    data_train = pd.read_parquet(train_file_path, columns=['id', 'url', 'title', 'text'])\n",
+ "    data_test = pd.read_parquet(test_file_path, columns=['id', 'url', 'title', 'text'])\n",
+ "    data_dict_train = data_train.to_dict(\"records\")\n",
+ "    data_dict_test = data_test.to_dict(\"records\")\n",
+ "\n",
+ "    # Get the MongoDB collections\n",
+ "    train_collection, test_collection = get_mongodb(database_name='EgitimDatabase')\n",
+ "\n",
+ "    # Insert data into MongoDB\n",
+ "    train_collection.insert_many(data_dict_train)\n",
+ "    test_collection.insert_many(data_dict_test)\n",
+ "\n",
+ "    print(f\"Data was successfully loaded into the {train_collection} MongoDB collection.\")\n",
+ "    print(f\"Data was successfully loaded into the {test_collection} MongoDB collection.\")\n",
+ "    return train_collection, test_collection\n",
+ "\n",
+ "# Call the function to load the train and test datasets into MongoDB\n",
+ "train_file_path = 'C:\\\\gitProjects\\\\bert\\\\datasets\\\\train_Egitim\\\\merged_train.parquet'\n",
+ "test_file_path = 'C:\\\\gitProjects\\\\bert\\\\datasets\\\\test_Egitim\\\\merged_test.parquet'\n",
+ "\n",
+ "train_collection, test_collection = dataset_read(train_file_path, test_file_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Sentence similarity"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'torch.amp'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[8], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#datasete similarity sentence yardımıyla keywords ve subheadings tanımlama \u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtext\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TfidfVectorizer\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SentenceTransformer\n\u001b[0;32m 6\u001b[0m model \u001b[38;5;241m=\u001b[39m SentenceTransformer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124memrecan/bert-base-turkish-cased-mean-nli-stsb-tr\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m#text dosyasını koleksiyon üzerinden çekme \u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sentence_transformers\\__init__.py:7\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mimportlib\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcross_encoder\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mCrossEncoder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CrossEncoder\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ParallelSentencesDataset, SentencesDataset\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mLoggingHandler\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LoggingHandler\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sentence_transformers\\cross_encoder\\__init__.py:1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mCrossEncoder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CrossEncoder\n\u001b[0;32m 3\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCrossEncoder\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:7\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Callable, Dict, List, Literal, Optional, Tuple, Type, Union\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Tensor, nn\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01moptim\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Optimizer\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\__init__.py:1686\u001b[0m\n\u001b[0;32m 1683\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to find torch_shm_manager at \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m path)\n\u001b[0;32m 1684\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m path\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m-> 1686\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mamp\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m autocast, GradScaler\n\u001b[0;32m 1688\u001b[0m \u001b[38;5;66;03m# Initializing the extension shadows the built-in python float / int classes;\u001b[39;00m\n\u001b[0;32m 1689\u001b[0m \u001b[38;5;66;03m# store them for later use by SymInt / SymFloat.\u001b[39;00m\n\u001b[0;32m 1690\u001b[0m py_float \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mfloat\u001b[39m\n",
+ "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'torch.amp'"
+ ]
+ }
+ ],
+ "source": [
+ "# Define keywords and subheadings for the dataset with the help of sentence similarity\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sentence_transformers import SentenceTransformer\n",
+ "\n",
+ "\n",
+ "model = SentenceTransformer(\"emrecan/bert-base-turkish-cased-mean-nli-stsb-tr\")\n",
+ "# Fetch the text field through the collection\n",
+ "# Database class: database connections and data-fetching helpers\n",
+ "class Database:\n",
+ "    @staticmethod\n",
+ "    def get_mongodb():\n",
+ "        # Returns the MongoDB connection details.\n",
+ "        return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def get_input_titles():\n",
+ "        mongo_url, db_name, collection_name = Database.get_mongodb()\n",
+ "        client = MongoClient(mongo_url)\n",
+ "        db = client[db_name]\n",
+ "        collection = db[collection_name]\n",
+ "        query = {\"title\": {\"$exists\": True}}\n",
+ "        cursor = collection.find(query, {\"title\": 1, \"_id\": 0})\n",
+ "        title_from_db = [doc['title'] for doc in cursor]\n",
+ "        return title_from_db\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def get_input_texts():\n",
+ "        mongo_url, db_name, collection_name = Database.get_mongodb()\n",
+ "        client = MongoClient(mongo_url)\n",
+ "        db = client[db_name]\n",
+ "        collection = db[collection_name]\n",
+ "        query = {\"text\": {\"$exists\": True}}\n",
+ "        cursor = collection.find(query, {\"text\": 1, \"_id\": 0})\n",
+ "        text_from_db = [doc['text'] for doc in cursor]\n",
+ "        return text_from_db\n",
+ "\n",
+ "\n",
+ "# TF-IDF calculation (for the keywords)\n",
+ "\n",
+ "# IDF = log(number of documents / number of documents containing the term)\n",
+ "\n",
+ "# Determine keywords from the text and the title\n",
+ "\n",
+ "#------------------------------------------------------------------------------\n",
+ "\n",
+ "# Create the subheadings with SBERT\n",
+ "\n",
+ "# Determine the subheadings via clustering\n",
+ "\n",
+ "#-------------------------------------------------------------------------------\n",
+ "\n",
+ "# Add the keywords and subheadings to the database (see the sketch after this notebook)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Preparation for generation by building a prompt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Assigning a tokenizer with the BERT model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
+ "model = BertForMaskedLM.from_pretrained('bert-base-uncased')\n",
+ "\n",
+ "\"\"\"CONFIGURING THE BERT MODEL\n",
+ "\n",
+ "input_file: Path of the input file the model will process. This file must contain the text data.\n",
+ "output_file: Path of the file where the model's outputs will be saved.\n",
+ "layers: Determines which BERT layers are used. For example, \"-1,-2,-3,-4\" means the last four layers.\n",
+ "bert_config_file: Path of the pre-trained BERT model's configuration file. This file defines the model architecture.\n",
+ "max_seq_length: Maximum length of the input sequences. Longer sequences are truncated; shorter ones are zero-padded.\n",
+ "init_checkpoint: Initial weights, usually loaded from a pre-trained BERT model.\n",
+ "vocab_file: Path of the vocabulary file the BERT model was trained with. Required for the model to recognise word pieces.\n",
+ "do_lower_case: Whether the input texts are lower-cased. Should be True for an uncased model and False for a cased model.\n",
+ "batch_size: Batch size used during prediction.\n",
+ "use_tpu: Whether a TPU (Tensor Processing Unit) is used. If True a TPU is used, otherwise GPU/CPU.\n",
+ "master: If a TPU is used, the address of the TPU master machine.\n",
+ "num_tpu_cores: If a TPU is used, the total number of TPU cores.\n",
+ "use_one_hot_embeddings: Usually set to True on TPUs because it speeds up embedding lookups via tf.one_hot; on GPU/CPU, False is preferred.\"\"\"\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The T5 model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from dotenv import load_dotenv\n",
+ "import os\n",
+ "# Load model directly\n",
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
+ "\n",
+ "# Load the tokenizer and the model\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n",
+ "\n",
+ "# Load the .env file first, then read the API key from the environment\n",
+ "load_dotenv()\n",
+ "api_key = os.getenv('HUGGINGFACE_API_KEY')\n",
+ "\n",
+ "if api_key is None:\n",
+ "    raise ValueError(\"API key not found in the .env file\")\n",
+ "\n",
+ "# Build the request headers\n",
+ "headers = {\"Authorization\": f\"Bearer {api_key}\"}\n",
+ "\n",
+ "input_sequence = \"[CLS] Machine Learning in Healthcare [SEP] Introduction to ML [SEP] Applications in Healthcare [SEP] machine learning, healthcare, AI [SEP]\"\n",
+ "# Sample data snippet\n",
+ "data = {\n",
+ "    \"title\": \"Machine Learning in Healthcare\",\n",
+ "    \"sub_headings\": [\"Introduction to ML\", \"Applications in Healthcare\"],\n",
+ "    \"keywords\": [\"machine learning\", \"healthcare\", \"AI\"]\n",
+ "}\n",
+ "\n",
+ "# Build the input prompt\n",
+ "prompt = (\n",
+ "    f\"Title: {data['title']}\\n\"\n",
+ "    f\"Sub-headings: {', '.join(data['sub_headings'])}\\n\"\n",
+ "    f\"Keywords: {', '.join(data['keywords'])}\\n\"\n",
+ "    f\"Content: {input_sequence}\\n\"\n",
+ "    \"Please generate a detailed article based on the above information.\"\n",
+ ")\n",
+ "\n",
+ "# Tokenize the prompt after it has been built\n",
+ "inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
+ "\n",
+ "# Text generation\n",
+ "output_sequences = model.generate(\n",
+ "    inputs['input_ids'],\n",
+ "    max_length=300,  # Maximum length of the generated text\n",
+ "    min_length=150,  # Minimum length of the generated text\n",
+ "    num_return_sequences=1,  # Number of sequences to return\n",
+ "    do_sample=True,  # Allow sampling\n",
+ "    top_k=50,  # Use top-k sampling\n",
+ "    top_p=0.95,  # Use top-p sampling\n",
+ "    repetition_penalty=1.2,  # Penalty against meaningless repetition\n",
+ "    eos_token_id=tokenizer.eos_token_id  # Encourage complete sentences\n",
+ ")\n",
+ "\n",
+ "# Decode the generated tokens back into a string\n",
+ "generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)\n",
+ "\n",
+ "print(generated_text)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
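The last planning cell in datasets.ipynb sketches the intended pipeline in comments: TF-IDF for keywords, then SBERT embeddings plus clustering for subheadings. Below is a self-contained sketch of that plan; it is an illustration, not the project's final implementation. The `documents` list is a placeholder for what `Database.get_input_texts()` would return, and the SBERT checkpoint is the one already used in the notebook.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Placeholder documents; in the notebook they would come from Database.get_input_texts()
documents = [
    "Bilgisayar, aritmetik veya mantıksal işlem dizilerini çalıştırabilen bir makinedir.",
    "Edebiyat, duygu ve düşünceleri dil aracılığıyla anlatma sanatıdır.",
    "Mühendislik, bilimsel bilgiyi insan ihtiyaçlarına uygulayan disiplindir.",
    "Makine öğrenmesi, verilerden örüntü çıkaran algoritmaları inceler.",
]

# Keywords via TF-IDF, where IDF = log(number of documents / number of documents containing the term)
vectorizer = TfidfVectorizer(max_features=5000)
tfidf = vectorizer.fit_transform(documents)
terms = np.array(vectorizer.get_feature_names_out())
for row in tfidf.toarray():
    print("keywords:", list(terms[row.argsort()[::-1][:3]]))

# Subheading candidates via SBERT + clustering: embed the documents, cluster them,
# and take the document closest to each cluster centre as a candidate subheading.
model = SentenceTransformer("emrecan/bert-base-turkish-cased-mean-nli-stsb-tr")
embeddings = model.encode(documents)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(embeddings)
for c in range(kmeans.n_clusters):
    idx = np.where(kmeans.labels_ == c)[0]
    centre = embeddings[idx].mean(axis=0)
    nearest = idx[np.argmin(np.linalg.norm(embeddings[idx] - centre, axis=1))]
    print("subheading candidate:", documents[nearest])
```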
deneme.ipynb ADDED
@@ -0,0 +1,69 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ },
+ {
+ "ename": "OSError",
+ "evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[3], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m \n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpymongo\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MongoClient\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
+ "\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset\n",
+ "import pandas as pd \n",
+ "from pymongo import MongoClient\n",
+ "from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
pyvenv.cfg ADDED
@@ -0,0 +1,3 @@
+ home = C:\Users\info\AppData\Local\Programs\Python\Python310
+ include-system-site-packages = false
+ version = 3.10.11
requirements.txt CHANGED
@@ -1 +1,127 @@
- huggingface_hub==0.22.2
+ absl-py==2.1.0
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.3.4
+ aiohttp==3.10.1
+ aiosignal==1.3.1
+ annotated-types==0.7.0
+ anyio==4.4.0
+ asttokens==2.4.1
+ astunparse==1.6.3
+ attrs==24.1.0
+ certifi==2024.7.4
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ comm==0.2.2
+ contourpy==1.2.1
+ cycler==0.12.1
+ datasets==2.20.0
+ debugpy==1.8.5
+ decorator==5.1.1
+ dill==0.3.8
+ dnspython==2.6.1
+ executing==2.0.1
+ fastapi==0.112.0
+ ffmpy==0.4.0
+ filelock==3.15.4
+ flatbuffers==24.3.25
+ fonttools==4.53.1
+ frozenlist==1.4.1
+ fsspec==2024.5.0
+ gast==0.6.0
+ google-pasta==0.2.0
+ gradio==4.40.0
+ gradio_client==1.2.0
+ grpcio==1.65.4
+ h11==0.14.0
+ h5py==3.11.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.24.5
+ idna==3.7
+ importlib_resources==6.4.0
+ ipykernel==6.29.5
+ ipython==8.26.0
+ jedi==0.19.1
+ Jinja2==3.1.4
+ jupyter_client==8.6.2
+ jupyter_core==5.7.2
+ keras==3.4.1
+ kiwisolver==1.4.5
+ libclang==18.1.1
+ Markdown==3.6
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.0
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ ml-dtypes==0.4.0
+ mpmath==1.3.0
+ multidict==6.0.5
+ multiprocess==0.70.16
+ namex==0.0.8
+ nest-asyncio==1.6.0
+ networkx==3.3
+ numpy==1.26.4
+ opt-einsum==3.3.0
+ optree==0.12.1
+ orjson==3.10.6
+ packaging==24.1
+ pandas==2.2.2
+ parso==0.8.4
+ pillow==10.4.0
+ platformdirs==4.2.2
+ prompt_toolkit==3.0.47
+ protobuf==4.25.4
+ psutil==6.0.0
+ pure_eval==0.2.3
+ pyarrow==17.0.0
+ pyarrow-hotfix==0.6
+ pydantic==2.8.2
+ pydantic_core==2.20.1
+ pydub==0.25.1
+ Pygments==2.18.0
+ pymongo==4.8.0
+ pyparsing==3.1.2
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.9
+ pytz==2024.1
+ pywin32==306
+ PyYAML==6.0.1
+ pyzmq==26.1.0
+ regex==2024.7.24
+ requests==2.32.3
+ rich==13.7.1
+ ruff==0.5.6
+ safetensors==0.4.4
+ semantic-version==2.10.0
+ sentence-transformers==3.0.1
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ stack-data==0.6.3
+ starlette==0.37.2
+ sympy==1.13.1
+ tensorboard==2.17.0
+ tensorboard-data-server==0.7.2
+ tensorflow==2.17.0
+ tensorflow-intel==2.17.0
+ tensorflow-io-gcs-filesystem==0.31.0
+ termcolor==2.4.0
+ tokenizers==0.19.1
+ tomlkit==0.12.0
+ tqdm==4.66.5
+ traitlets==5.14.3
+ transformers==4.43.4
+ typer==0.12.3
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ urllib3==2.2.2
+ uvicorn==0.30.5
+ wcwidth==0.2.13
+ websockets==12.0
+ Werkzeug==3.0.3
+ wrapt==1.16.0
+ xxhash==3.4.1
+ yarl==1.9.4
+
test_Egitim/merged_test.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e1d337fc9fef9ba455b318d88c240308e260ae31a80552abfa690ecd897a05c
+ size 107383375
train_Egitim/merged_train.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8de4c861ff88766cd74eefde2e1eb501f782bfeca81779f4a29645735d01f2f
+ size 447530173
wikipedia-tr/.gitattributes ADDED
@@ -0,0 +1,54 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
wikipedia-tr/README.md ADDED
@@ -0,0 +1,86 @@
+ ---
+ annotations_creators:
+ - no-annotation
+ language:
+ - tr
+ language_creators:
+ - crowdsourced
+ license:
+ - cc-by-sa-3.0
+ - gfdl
+ multilinguality: []
+ pretty_name: Turkish Wikipedia 2023
+ size_categories:
+ - 100K<n<1M
+ source_datasets:
+ - original
+ tags:
+ - wikipedia
+ - wiki
+ task_categories:
+ - fill-mask
+ - text-generation
+ task_ids:
+ - masked-language-modeling
+ dataset_info:
+   features:
+   - name: id
+     dtype: string
+   - name: url
+     dtype: string
+   - name: title
+     dtype: string
+   - name: text
+     dtype: string
+   splits:
+   - name: train
+     num_bytes: 956353353
+     num_examples: 520542
+   download_size: 529875169
+   dataset_size: 956353353
+ ---
+
+ # 📖 Türkçe Vikipedi Mayıs 2023
+ Bu veri kümesi, Türkçe Vikipedi'den alınan makalelerin bir derlemesi olup, maskeleme dil modelleme ve metin oluşturma görevleri için tasarlanmıştır.
+
+
+ ## 🗣️ Etiketlemeler
+ Bu veri kümesindeki makaleler, özellikle belirli bir görev için etiketlenmemiş olup, veri kümesi etiketsizdir.
+
+ ## 🌐 Dil
+ Bu veri kümesi Türkçe yazılmış olup, gönüllülerden oluşan bir ekip tarafından topluluk katılımı yöntemleri ile oluşturulmuştur.
+
+ ## 📜 Lisans
+ CC-BY-SA 3.0 ve GFDL
+
+ ## 💻 Kaynak Veri Kümeleri
+ Bu veri kümesi, Türkçe Vikipedi'den oluşturulan orijinal bir veri kümesidir.
+
+
+ Türkçe Vikipedi veri kümesini kullandığınız için teşekkürler! Dil modelleme ve metin oluşturma görevleriniz için faydalı olmasını umuyoruz.
+
+ ---
+
+ # 📖 Wikipedia Turkish 2023
+
+ This dataset is a collection of articles from the Turkish Wikipedia and is designed to be used for masked language modeling and text generation tasks.
+
+ ## 📚 Dataset Info
+
+ Processed and cleaned using Huggingface wikipedia cleaner.
+
+ ## 🗣️ Annotations
+
+ The articles in this dataset were not specifically annotated for any particular task, meaning that the dataset is unlabeled.
+
+ ## 🌐 Language
+
+ This dataset is written in Turkish and was created using crowdsourcing methods by a team of volunteers.
+
+ ## 📜 License
+
+ CC-BY-SA 3.0 and GFDL
+
+ ## 💻 Source Datasets
+
+ This dataset is an original dataset created from the Turkish Wikipedia.
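For reference, the Parquet shards uploaded under `wikipedia-tr/data/` can be loaded directly with the `datasets` library. A minimal sketch follows; the relative path is an assumption about where the repository is checked out.

```python
from datasets import load_dataset

# Load both train shards in this repo as a single split (path is an assumption)
dataset = load_dataset(
    "parquet",
    data_files={"train": "wikipedia-tr/data/train-*.parquet"},
)
print(dataset["train"].features)     # id, url, title, text
print(dataset["train"][0]["title"])
```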
wikipedia-tr/data/train-00000-of-00002-ed6b025df7a1f653.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf5ad76fe61cebe5b5a165c9c74a510cd9ee8013dc23c04fca44c58d7ffe13cb
+ size 328791077
wikipedia-tr/data/train-00001-of-00002-0aa63953f8b51c17.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2165870d3d8504c01cd1783ac654efe9144f9f1094aed360deb64a0eaa211246
+ size 201084092