yonkasoft committed on
Commit
7d245b9
1 Parent(s): df88f48

Upload 11 files

MLM.ipynb ADDED
@@ -0,0 +1,539 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "OSError",
+ "evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
+ "File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
+ "\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
+ ]
+ }
+ ],
+ "source": [
+ "from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer, DPRContextEncoder\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\gitProjects\\makaleChatUI\\myenv\\Lib\\site-packages\\huggingface_hub\\file_download.py:159: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\info\\.cache\\huggingface\\hub\\models--bert-base-uncased. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+ " warnings.warn(message)\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
+ "A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
+ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
+ "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+ ]
+ }
+ ],
+ "source": [
+ "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
+ "model = BertForMaskedLM.from_pretrained('bert-base-uncased')\n",
+ "\n",
+ "text = (\"After reading these reports,\"\n",
+ " \"we start an outline of the application of ML.\"\n",
+ " \"It includes the [MASK] process \"\n",
+ " \"and various applications (from various software development to hardware development), to [MASK] of IT systems, and various approaches on analytics.\"\n",
+ " \"The approach incorporates [MASK] as well as computing and data mining.\"\n",
+ " \"For example, software developers and manufacturing engineers used AI \"\n",
+ " \"in manufacturing to develop their applications.\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# after inserting the [MASK] placeholders, we convert the text into tokens\n",
+ "inputs = tokenizer(text, return_tensors='pt')\n",
+ "inputs.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 103,\n",
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
+ " 8051, 2458, 1007, 1010, 2000, 103, 1997, 2009, 3001, 1010,\n",
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 103,\n",
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
+ " 2000, 4503, 2037, 5097, 1012, 102]])"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inputs.input_ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text_normal = (\"After reading these reports,\"\n",
+ " \"we start an outline of the application of ML.\"\n",
+ " \"It includes the learning process \"\n",
+ " \"and various applications (from various software development to hardware development), to analysis of IT systems, and various approaches on analytics.\"\n",
+ " \"The approach incorporates AI as well as computing and data mining.\"\n",
+ " \"For example, software developers and manufacturing engineers used AI \"\n",
+ " \"in manufacturing to develop their applications.\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# we convert the unmasked text into tokens\n",
+ "inputs_2 = tokenizer(text_normal, return_tensors='pt')\n",
+ "inputs_2.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
+ " 2000, 4503, 2037, 5097, 1012, 102]])"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inputs_2.input_ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inputs_2['labels'] = inputs_2.input_ids.detach().clone()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'input_ids': tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
+ " 2000, 4503, 2037, 5097, 1012, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+ " 1, 1, 1, 1]]), 'labels': tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
+ " 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
+ " 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
+ " 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
+ " 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
+ " 2000, 4503, 2037, 5097, 1012, 102]])}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inputs_2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 76])"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# we draw a random value for every token position (the labels are left untouched)\n",
+ "rand = torch.rand(inputs_2.input_ids.shape)\n",
+ "rand.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[0.9397, 0.1325, 0.1893, 0.8258, 0.7453, 0.1766, 0.9338, 0.0806, 0.0626,\n",
+ " 0.6665, 0.4240, 0.3946, 0.5413, 0.3799, 0.4023, 0.8699, 0.8159, 0.1511,\n",
+ " 0.6842, 0.0242, 0.7235, 0.0063, 0.1857, 0.9684, 0.8930, 0.8208, 0.5711,\n",
+ " 0.0345, 0.9919, 0.1140, 0.7597, 0.4546, 0.6478, 0.2295, 0.2846, 0.6314,\n",
+ " 0.3640, 0.9291, 0.3843, 0.3553, 0.1125, 0.0790, 0.4261, 0.4307, 0.6724,\n",
+ " 0.8569, 0.4476, 0.8032, 0.0241, 0.0152, 0.4196, 0.5609, 0.0010, 0.7240,\n",
+ " 0.4531, 0.5834, 0.5232, 0.3602, 0.6575, 0.9012, 0.1519, 0.2255, 0.0799,\n",
+ " 0.5673, 0.7244, 0.4387, 0.2713, 0.4243, 0.8435, 0.1670, 0.8664, 0.6261,\n",
+ " 0.4090, 0.2988, 0.3379, 0.7784]])"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rand"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[False, True, False, False, False, False, False, True, True, False,\n",
+ " False, False, False, False, False, False, False, False, False, True,\n",
+ " False, True, False, False, False, False, False, True, False, True,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " True, True, False, False, False, False, False, False, True, True,\n",
+ " False, False, True, False, False, False, False, False, False, False,\n",
+ " False, False, True, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False]])"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# roughly 15 percent of the tokens in the sentence get selected\n",
+ "# mask_arr = rand < 0.15 checks, for each token's random number, whether it falls below 0.15: if it does, that token will be masked; otherwise it stays as it is\n",
+ "mask_arr = rand < 0.15\n",
+ "mask_arr"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[1, 7, 8, 19, 21, 27, 29, 40, 41, 48, 49, 52, 62]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# nonzero() gives the positions of the tokens that were selected for masking\n",
+ "# torch.flatten strips the extra nesting so we end up with a single flat list\n",
+ "selection = torch.flatten(mask_arr[0].nonzero()).tolist()\n",
+ "selection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor([[ 101, 103, 3752, 2122, 4311, 1010, 2057, 103, 103, 12685,\n",
+ " 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 103,\n",
+ " 2832, 103, 2536, 5097, 1006, 2013, 2536, 103, 2458, 103,\n",
+ " 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
+ " 103, 103, 8107, 2006, 25095, 1012, 1996, 3921, 103, 103,\n",
+ " 2004, 2092, 103, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
+ " 1010, 4007, 103, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
+ " 2000, 4503, 2037, 5097, 1012, 102]])"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# write 103, the id of the [MASK] token, into the selected positions of input_ids\n",
+ "inputs_2.input_ids[0, selection] = 103\n",
+ "inputs_2.input_ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "outputs = model(**inputs_2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "odict_keys(['loss', 'logits'])"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "outputs.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor(0.8399, grad_fn=<NllLossBackward0>)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "outputs.loss"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\gitProjects\\makaleChatUI\\myenv\\Lib\\site-packages\\huggingface_hub\\file_download.py:159: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\info\\.cache\\huggingface\\hub\\models--facebook--dpr-ctx_encoder-single-nq-base. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+ " warnings.warn(message)\n",
+ "Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.output.LayerNorm.bias', 'bert_model.encoder.layer.0.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.1.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.1.attention.output.dense.bias', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.1.attention.self.key.bias', 'bert_model.encoder.layer.1.attention.self.key.weight', 'bert_model.encoder.layer.1.attention.self.query.bias', 'bert_model.encoder.layer.1.attention.self.query.weight', 'bert_model.encoder.layer.1.attention.self.value.bias', 'bert_model.encoder.layer.1.attention.self.value.weight', 'bert_model.encoder.layer.1.intermediate.dense.bias', 'bert_model.encoder.layer.1.intermediate.dense.weight', 'bert_model.encoder.layer.1.output.LayerNorm.bias', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.1.output.dense.bias', 'bert_model.encoder.layer.1.output.dense.weight', 'bert_model.encoder.layer.10.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.10.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.output.dense.weight', 'bert_model.encoder.layer.10.attention.self.key.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.10.attention.self.query.bias', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.10.attention.self.value.bias', 'bert_model.encoder.layer.10.attention.self.value.weight', 'bert_model.encoder.layer.10.intermediate.dense.bias', 'bert_model.encoder.layer.10.intermediate.dense.weight', 'bert_model.encoder.layer.10.output.LayerNorm.bias', 'bert_model.encoder.layer.10.output.LayerNorm.weight', 'bert_model.encoder.layer.10.output.dense.bias', 'bert_model.encoder.layer.10.output.dense.weight', 'bert_model.encoder.layer.11.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.output.dense.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.11.attention.self.key.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 
'bert_model.encoder.layer.11.attention.self.query.weight', 'bert_model.encoder.layer.11.attention.self.value.bias', 'bert_model.encoder.layer.11.attention.self.value.weight', 'bert_model.encoder.layer.11.intermediate.dense.bias', 'bert_model.encoder.layer.11.intermediate.dense.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.11.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.dense.bias', 'bert_model.encoder.layer.11.output.dense.weight', 'bert_model.encoder.layer.2.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.2.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.2.attention.output.dense.bias', 'bert_model.encoder.layer.2.attention.output.dense.weight', 'bert_model.encoder.layer.2.attention.self.key.bias', 'bert_model.encoder.layer.2.attention.self.key.weight', 'bert_model.encoder.layer.2.attention.self.query.bias', 'bert_model.encoder.layer.2.attention.self.query.weight', 'bert_model.encoder.layer.2.attention.self.value.bias', 'bert_model.encoder.layer.2.attention.self.value.weight', 'bert_model.encoder.layer.2.intermediate.dense.bias', 'bert_model.encoder.layer.2.intermediate.dense.weight', 'bert_model.encoder.layer.2.output.LayerNorm.bias', 'bert_model.encoder.layer.2.output.LayerNorm.weight', 'bert_model.encoder.layer.2.output.dense.bias', 'bert_model.encoder.layer.2.output.dense.weight', 'bert_model.encoder.layer.3.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.3.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.3.attention.output.dense.weight', 'bert_model.encoder.layer.3.attention.self.key.bias', 'bert_model.encoder.layer.3.attention.self.key.weight', 'bert_model.encoder.layer.3.attention.self.query.bias', 'bert_model.encoder.layer.3.attention.self.query.weight', 'bert_model.encoder.layer.3.attention.self.value.bias', 'bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.3.intermediate.dense.bias', 'bert_model.encoder.layer.3.intermediate.dense.weight', 'bert_model.encoder.layer.3.output.LayerNorm.bias', 'bert_model.encoder.layer.3.output.LayerNorm.weight', 'bert_model.encoder.layer.3.output.dense.bias', 'bert_model.encoder.layer.3.output.dense.weight', 'bert_model.encoder.layer.4.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.4.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.4.attention.output.dense.bias', 'bert_model.encoder.layer.4.attention.output.dense.weight', 'bert_model.encoder.layer.4.attention.self.key.bias', 'bert_model.encoder.layer.4.attention.self.key.weight', 'bert_model.encoder.layer.4.attention.self.query.bias', 'bert_model.encoder.layer.4.attention.self.query.weight', 'bert_model.encoder.layer.4.attention.self.value.bias', 'bert_model.encoder.layer.4.attention.self.value.weight', 'bert_model.encoder.layer.4.intermediate.dense.bias', 'bert_model.encoder.layer.4.intermediate.dense.weight', 'bert_model.encoder.layer.4.output.LayerNorm.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.4.output.dense.bias', 'bert_model.encoder.layer.4.output.dense.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.dense.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.5.attention.self.key.bias', 'bert_model.encoder.layer.5.attention.self.key.weight', 
'bert_model.encoder.layer.5.attention.self.query.bias', 'bert_model.encoder.layer.5.attention.self.query.weight', 'bert_model.encoder.layer.5.attention.self.value.bias', 'bert_model.encoder.layer.5.attention.self.value.weight', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_model.encoder.layer.5.output.LayerNorm.bias', 'bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.5.output.dense.bias', 'bert_model.encoder.layer.5.output.dense.weight', 'bert_model.encoder.layer.6.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.6.attention.output.dense.bias', 'bert_model.encoder.layer.6.attention.output.dense.weight', 'bert_model.encoder.layer.6.attention.self.key.bias', 'bert_model.encoder.layer.6.attention.self.key.weight', 'bert_model.encoder.layer.6.attention.self.query.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.6.attention.self.value.bias', 'bert_model.encoder.layer.6.attention.self.value.weight', 'bert_model.encoder.layer.6.intermediate.dense.bias', 'bert_model.encoder.layer.6.intermediate.dense.weight', 'bert_model.encoder.layer.6.output.LayerNorm.bias', 'bert_model.encoder.layer.6.output.LayerNorm.weight', 'bert_model.encoder.layer.6.output.dense.bias', 'bert_model.encoder.layer.6.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.7.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.7.attention.output.dense.bias', 'bert_model.encoder.layer.7.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.self.key.bias', 'bert_model.encoder.layer.7.attention.self.key.weight', 'bert_model.encoder.layer.7.attention.self.query.bias', 'bert_model.encoder.layer.7.attention.self.query.weight', 'bert_model.encoder.layer.7.attention.self.value.bias', 'bert_model.encoder.layer.7.attention.self.value.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.7.intermediate.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.7.output.LayerNorm.weight', 'bert_model.encoder.layer.7.output.dense.bias', 'bert_model.encoder.layer.7.output.dense.weight', 'bert_model.encoder.layer.8.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.8.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.8.attention.output.dense.bias', 'bert_model.encoder.layer.8.attention.output.dense.weight', 'bert_model.encoder.layer.8.attention.self.key.bias', 'bert_model.encoder.layer.8.attention.self.key.weight', 'bert_model.encoder.layer.8.attention.self.query.bias', 'bert_model.encoder.layer.8.attention.self.query.weight', 'bert_model.encoder.layer.8.attention.self.value.bias', 'bert_model.encoder.layer.8.attention.self.value.weight', 'bert_model.encoder.layer.8.intermediate.dense.bias', 'bert_model.encoder.layer.8.intermediate.dense.weight', 'bert_model.encoder.layer.8.output.LayerNorm.bias', 'bert_model.encoder.layer.8.output.LayerNorm.weight', 'bert_model.encoder.layer.8.output.dense.bias', 'bert_model.encoder.layer.8.output.dense.weight', 'bert_model.encoder.layer.9.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.9.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.9.attention.output.dense.bias', 'bert_model.encoder.layer.9.attention.output.dense.weight', 'bert_model.encoder.layer.9.attention.self.key.bias', 
'bert_model.encoder.layer.9.attention.self.key.weight', 'bert_model.encoder.layer.9.attention.self.query.bias', 'bert_model.encoder.layer.9.attention.self.query.weight', 'bert_model.encoder.layer.9.attention.self.value.bias', 'bert_model.encoder.layer.9.attention.self.value.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.intermediate.dense.weight', 'bert_model.encoder.layer.9.output.LayerNorm.bias', 'bert_model.encoder.layer.9.output.LayerNorm.weight', 'bert_model.encoder.layer.9.output.dense.bias', 'bert_model.encoder.layer.9.output.dense.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.output.LayerNorm.bias', 'bert_model.encoder.layer.0.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.1.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.1.attention.output.dense.bias', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.1.attention.self.key.bias', 'bert_model.encoder.layer.1.attention.self.key.weight', 'bert_model.encoder.layer.1.attention.self.query.bias', 'bert_model.encoder.layer.1.attention.self.query.weight', 'bert_model.encoder.layer.1.attention.self.value.bias', 'bert_model.encoder.layer.1.attention.self.value.weight', 'bert_model.encoder.layer.1.intermediate.dense.bias', 'bert_model.encoder.layer.1.intermediate.dense.weight', 'bert_model.encoder.layer.1.output.LayerNorm.bias', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.1.output.dense.bias', 'bert_model.encoder.layer.1.output.dense.weight', 'bert_model.encoder.layer.10.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.10.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.output.dense.weight', 'bert_model.encoder.layer.10.attention.self.key.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.10.attention.self.query.bias', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.10.attention.self.value.bias', 'bert_model.encoder.layer.10.attention.self.value.weight', 'bert_model.encoder.layer.10.intermediate.dense.bias', 'bert_model.encoder.layer.10.intermediate.dense.weight', 'bert_model.encoder.layer.10.output.LayerNorm.bias', 'bert_model.encoder.layer.10.output.LayerNorm.weight', 'bert_model.encoder.layer.10.output.dense.bias', 'bert_model.encoder.layer.10.output.dense.weight', 'bert_model.encoder.layer.11.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.output.dense.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.11.attention.self.key.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 
'bert_model.encoder.layer.11.attention.self.query.weight', 'bert_model.encoder.layer.11.attention.self.value.bias', 'bert_model.encoder.layer.11.attention.self.value.weight', 'bert_model.encoder.layer.11.intermediate.dense.bias', 'bert_model.encoder.layer.11.intermediate.dense.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.11.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.dense.bias', 'bert_model.encoder.layer.11.output.dense.weight', 'bert_model.encoder.layer.2.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.2.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.2.attention.output.dense.bias', 'bert_model.encoder.layer.2.attention.output.dense.weight', 'bert_model.encoder.layer.2.attention.self.key.bias', 'bert_model.encoder.layer.2.attention.self.key.weight', 'bert_model.encoder.layer.2.attention.self.query.bias', 'bert_model.encoder.layer.2.attention.self.query.weight', 'bert_model.encoder.layer.2.attention.self.value.bias', 'bert_model.encoder.layer.2.attention.self.value.weight', 'bert_model.encoder.layer.2.intermediate.dense.bias', 'bert_model.encoder.layer.2.intermediate.dense.weight', 'bert_model.encoder.layer.2.output.LayerNorm.bias', 'bert_model.encoder.layer.2.output.LayerNorm.weight', 'bert_model.encoder.layer.2.output.dense.bias', 'bert_model.encoder.layer.2.output.dense.weight', 'bert_model.encoder.layer.3.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.3.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.3.attention.output.dense.weight', 'bert_model.encoder.layer.3.attention.self.key.bias', 'bert_model.encoder.layer.3.attention.self.key.weight', 'bert_model.encoder.layer.3.attention.self.query.bias', 'bert_model.encoder.layer.3.attention.self.query.weight', 'bert_model.encoder.layer.3.attention.self.value.bias', 'bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.3.intermediate.dense.bias', 'bert_model.encoder.layer.3.intermediate.dense.weight', 'bert_model.encoder.layer.3.output.LayerNorm.bias', 'bert_model.encoder.layer.3.output.LayerNorm.weight', 'bert_model.encoder.layer.3.output.dense.bias', 'bert_model.encoder.layer.3.output.dense.weight', 'bert_model.encoder.layer.4.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.4.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.4.attention.output.dense.bias', 'bert_model.encoder.layer.4.attention.output.dense.weight', 'bert_model.encoder.layer.4.attention.self.key.bias', 'bert_model.encoder.layer.4.attention.self.key.weight', 'bert_model.encoder.layer.4.attention.self.query.bias', 'bert_model.encoder.layer.4.attention.self.query.weight', 'bert_model.encoder.layer.4.attention.self.value.bias', 'bert_model.encoder.layer.4.attention.self.value.weight', 'bert_model.encoder.layer.4.intermediate.dense.bias', 'bert_model.encoder.layer.4.intermediate.dense.weight', 'bert_model.encoder.layer.4.output.LayerNorm.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.4.output.dense.bias', 'bert_model.encoder.layer.4.output.dense.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.dense.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.5.attention.self.key.bias', 'bert_model.encoder.layer.5.attention.self.key.weight', 
'bert_model.encoder.layer.5.attention.self.query.bias', 'bert_model.encoder.layer.5.attention.self.query.weight', 'bert_model.encoder.layer.5.attention.self.value.bias', 'bert_model.encoder.layer.5.attention.self.value.weight', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_model.encoder.layer.5.output.LayerNorm.bias', 'bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.5.output.dense.bias', 'bert_model.encoder.layer.5.output.dense.weight', 'bert_model.encoder.layer.6.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.6.attention.output.dense.bias', 'bert_model.encoder.layer.6.attention.output.dense.weight', 'bert_model.encoder.layer.6.attention.self.key.bias', 'bert_model.encoder.layer.6.attention.self.key.weight', 'bert_model.encoder.layer.6.attention.self.query.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.6.attention.self.value.bias', 'bert_model.encoder.layer.6.attention.self.value.weight', 'bert_model.encoder.layer.6.intermediate.dense.bias', 'bert_model.encoder.layer.6.intermediate.dense.weight', 'bert_model.encoder.layer.6.output.LayerNorm.bias', 'bert_model.encoder.layer.6.output.LayerNorm.weight', 'bert_model.encoder.layer.6.output.dense.bias', 'bert_model.encoder.layer.6.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.7.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.7.attention.output.dense.bias', 'bert_model.encoder.layer.7.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.self.key.bias', 'bert_model.encoder.layer.7.attention.self.key.weight', 'bert_model.encoder.layer.7.attention.self.query.bias', 'bert_model.encoder.layer.7.attention.self.query.weight', 'bert_model.encoder.layer.7.attention.self.value.bias', 'bert_model.encoder.layer.7.attention.self.value.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.7.intermediate.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.7.output.LayerNorm.weight', 'bert_model.encoder.layer.7.output.dense.bias', 'bert_model.encoder.layer.7.output.dense.weight', 'bert_model.encoder.layer.8.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.8.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.8.attention.output.dense.bias', 'bert_model.encoder.layer.8.attention.output.dense.weight', 'bert_model.encoder.layer.8.attention.self.key.bias', 'bert_model.encoder.layer.8.attention.self.key.weight', 'bert_model.encoder.layer.8.attention.self.query.bias', 'bert_model.encoder.layer.8.attention.self.query.weight', 'bert_model.encoder.layer.8.attention.self.value.bias', 'bert_model.encoder.layer.8.attention.self.value.weight', 'bert_model.encoder.layer.8.intermediate.dense.bias', 'bert_model.encoder.layer.8.intermediate.dense.weight', 'bert_model.encoder.layer.8.output.LayerNorm.bias', 'bert_model.encoder.layer.8.output.LayerNorm.weight', 'bert_model.encoder.layer.8.output.dense.bias', 'bert_model.encoder.layer.8.output.dense.weight', 'bert_model.encoder.layer.9.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.9.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.9.attention.output.dense.bias', 'bert_model.encoder.layer.9.attention.output.dense.weight', 'bert_model.encoder.layer.9.attention.self.key.bias', 
'bert_model.encoder.layer.9.attention.self.key.weight', 'bert_model.encoder.layer.9.attention.self.query.bias', 'bert_model.encoder.layer.9.attention.self.query.weight', 'bert_model.encoder.layer.9.attention.self.value.bias', 'bert_model.encoder.layer.9.attention.self.value.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.intermediate.dense.weight', 'bert_model.encoder.layer.9.output.LayerNorm.bias', 'bert_model.encoder.layer.9.output.LayerNorm.weight', 'bert_model.encoder.layer.9.output.dense.bias', 'bert_model.encoder.layer.9.output.dense.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']\n",
+ "- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
477
+ ]
478
+ }
479
+ ],
480
+ "source": [
+ "from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer\n",
+ "\n",
+ "# Context (passage) encoder and its tokenizer\n",
+ "ctx_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')\n",
+ "ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')\n",
+ "\n",
+ "# Question encoder and its tokenizer\n",
+ "question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')\n",
+ "question_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\"\"\"title = [\"2024 Yılında Mobil Teknoloji Trendleri\"]\n",
+ "keywords = [\"mobil teknoloji\", \"2024 trendleri\", \"akıllı telefon yenilikleri\", \"5G teknolojisi\", \"giyilebilir cihazlar\"]\n",
+ "subheading = [\n",
+ "    \"2024'te Akıllı Telefonlardaki Yenilikler\",\n",
+ "    \"Giyilebilir Teknolojiler: Sağlık ve Fitness Trendleri\",\n",
+ "    \"5G'nin Mobil Cihazlar Üzerindeki Etkisi\",\n",
+ "    \"Mobil Güvenlikte Yeni Yaklaşımlar\"\n",
+ "]\"\"\"\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tokenize the passages for the context encoder; 'passages' is a placeholder list\n",
+ "passages = [\"example passage\"]\n",
+ "xb_tokens = ctx_tokenizer(passages, padding=True, truncation=True, return_tensors=\"pt\")\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "myenv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
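The MLM.ipynb cells above load the DPR question and context encoders but stop before encoding anything end to end. Below is a minimal, self-contained sketch of how the two encoders are typically combined for retrieval; it is an illustration, not part of the uploaded notebook, and the `passages` and `question` values are placeholders.

```python
import torch
from transformers import (
    DPRContextEncoder, DPRContextEncoderTokenizer,
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
)

# Same checkpoints as in the notebook
ctx_model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

passages = ["5G speeds up mobile networks.", "Wearables track health and fitness."]  # placeholders
question = "What does 5G change for mobile devices?"                                 # placeholder

with torch.no_grad():
    # Encode the passages with the context encoder
    ctx_inputs = ctx_tokenizer(passages, padding=True, truncation=True, return_tensors="pt")
    passage_emb = ctx_model(**ctx_inputs).pooler_output        # shape: (num_passages, 768)

    # Encode the question with the question encoder
    q_inputs = question_tokenizer(question, return_tensors="pt")
    question_emb = question_model(**q_inputs).pooler_output    # shape: (1, 768)

# DPR ranks passages by the dot product of question and passage embeddings
scores = question_emb @ passage_emb.T
print(passages[scores.argmax(dim=1).item()])
```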
datasets.ipynb ADDED
@@ -0,0 +1,460 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Importing the libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset\n",
+ "import pandas as pd\n",
+ "from pymongo import MongoClient\n",
+ "from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer, DPRContextEncoder\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Loading the Parquet files as DataFrames (so they can be read)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the Parquet files as DataFrames\n",
+ "train_df1 = pd.read_parquet('C:\\\\gitProjects\\\\yeni\\\\wikipedia-tr\\\\data\\\\train-00000-of-00002-ed6b025df7a1f653.parquet')\n",
+ "train_df2 = pd.read_parquet('C:\\\\gitProjects\\\\yeni\\\\wikipedia-tr\\\\data\\\\train-00001-of-00002-0aa63953f8b51c17.parquet')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Merge the two DataFrames\n",
+ "merged_train = pd.concat([train_df1, train_df2], ignore_index=True)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split into, for example, 80% train and 20% test\n",
+ "train_data = merged_train.sample(frac=0.8, random_state=42)\n",
+ "test_data = merged_train.drop(train_data.index)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# File paths\n",
+ "train_dir = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\train_Egitim'\n",
+ "test_dir = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\test_Egitim'\n",
+ "train_file_path = os.path.join(train_dir, 'merged_train.parquet')\n",
+ "test_file_path = os.path.join(test_dir, 'merged_test.parquet')\n",
+ "\n",
+ "# Check whether the directories exist and create them if needed\n",
+ "os.makedirs(train_dir, exist_ok=True)\n",
+ "os.makedirs(test_dir, exist_ok=True)\n",
+ "\n",
+ "# Save the data in .parquet format\n",
+ "train_data.to_parquet(train_file_path)\n",
+ "test_data.to_parquet(test_file_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Inspecting the DataFrames"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " id url \\\n",
+ "515773 3525037 https://tr.wikipedia.org/wiki/P%C5%9F%C4%B1qo%... \n",
+ "517811 3532700 https://tr.wikipedia.org/wiki/Craterolophinae \n",
+ "436350 3203545 https://tr.wikipedia.org/wiki/Notocrabro \n",
+ "223281 1765445 https://tr.wikipedia.org/wiki/Ibrahim%20Sissoko \n",
+ "100272 575462 https://tr.wikipedia.org/wiki/Salah%20Cedid \n",
+ "\n",
+ " title text \n",
+ "515773 Pşıqo Ahecaqo Pşıqo Ahecaqo (), Çerkes siyasetçi, askeri kom... \n",
+ "517811 Craterolophinae Craterolophinae, Depastridae familyasına bağlı... \n",
+ "436350 Notocrabro Notocrabro Crabronina oymağına bağlı bir cinst... \n",
+ "223281 Ibrahim Sissoko İbrahim Sissoko (d. 30 Kasım 1991), Fildişi Sa... \n",
+ "100272 Salah Cedid Salah Cedid (1926-1993) (Arapça: صلاح جديد) Su... \n",
+ " id url title \\\n",
+ "5 35 https://tr.wikipedia.org/wiki/Karl%20Marx Karl Marx \n",
+ "13 48 https://tr.wikipedia.org/wiki/Ruhi%20Su Ruhi Su \n",
+ "15 53 https://tr.wikipedia.org/wiki/Bilgisayar Bilgisayar \n",
+ "18 59 https://tr.wikipedia.org/wiki/Edebiyat Edebiyat \n",
+ "19 64 https://tr.wikipedia.org/wiki/M%C3%BChendislik Mühendislik \n",
+ "\n",
+ " text \n",
+ "5 Karl Marx (; 5 Mayıs 1818, Trier – 14 Mart 188... \n",
+ "13 Mehmet Ruhi Su (1 Ocak 1912, Van - 20 Eylül 19... \n",
+ "15 Bilgisayar, aritmetik veya mantıksal işlem diz... \n",
+ "18 Edebiyat, yazın veya literatür; olay, düşünce,... \n",
+ "19 Mühendis, insanların her türlü ihtiyacını karş... \n"
+ ]
+ }
+ ],
+ "source": [
+ "print(train_data.head())\n",
+ "print(test_data.head())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Connecting to MongoDB and pulling data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Data was successfully loaded into the Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'EgitimDatabase'), 'train') MongoDB collection.\n",
+ "Data was successfully loaded into the Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'EgitimDatabase'), 'test') MongoDB collection.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from pymongo import MongoClient\n",
+ "\n",
+ "def get_mongodb(database_name='EgitimDatabase', train_collection_name='train', test_collection_name='test', host='localhost', port=27017):\n",
+ "    \"\"\"\n",
+ "    MongoDB connection and collection selection for the train and test collections.\n",
+ "    \"\"\"\n",
+ "    client = MongoClient(f'mongodb://{host}:{port}/')\n",
+ "\n",
+ "    # Select the database\n",
+ "    db = client[database_name]\n",
+ "\n",
+ "    # Select the train and test collections\n",
+ "    train_collection = db[train_collection_name]\n",
+ "    test_collection = db[test_collection_name]\n",
+ "\n",
+ "    return train_collection, test_collection\n",
+ "\n",
+ "# Function to load the datasets into MongoDB\n",
+ "def dataset_read(train_file_path, test_file_path):\n",
+ "    data_train = pd.read_parquet(train_file_path, columns=['id', 'url', 'title', 'text'])\n",
+ "    data_test = pd.read_parquet(test_file_path, columns=['id', 'url', 'title', 'text'])\n",
+ "    data_dict_train = data_train.to_dict(\"records\")\n",
+ "    data_dict_test = data_test.to_dict(\"records\")\n",
+ "\n",
+ "    # Get the MongoDB collections\n",
+ "    train_collection, test_collection = get_mongodb(database_name='EgitimDatabase')\n",
+ "\n",
+ "    # Insert data into MongoDB\n",
+ "    train_collection.insert_many(data_dict_train)\n",
+ "    test_collection.insert_many(data_dict_test)\n",
+ "\n",
+ "    print(f\"Data was successfully loaded into the {train_collection} MongoDB collection.\")\n",
+ "    print(f\"Data was successfully loaded into the {test_collection} MongoDB collection.\")\n",
+ "    return train_collection, test_collection\n",
+ "\n",
+ "# Call the function to load the train and test datasets into MongoDB\n",
+ "train_file_path = 'C:\\\\gitProjects\\\\bert\\\\datasets\\\\train_Egitim\\\\merged_train.parquet'\n",
+ "test_file_path = 'C:\\\\gitProjects\\\\bert\\\\datasets\\\\test_Egitim\\\\merged_test.parquet'\n",
+ "\n",
+ "train_collection, test_collection = dataset_read(train_file_path, test_file_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Sentence similarity"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'torch.amp'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[8], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#datasete similarity sentence yardımıyla keywords ve subheadings tanımlama \u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtext\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TfidfVectorizer\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SentenceTransformer\n\u001b[0;32m 6\u001b[0m model \u001b[38;5;241m=\u001b[39m SentenceTransformer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124memrecan/bert-base-turkish-cased-mean-nli-stsb-tr\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m#text dosyasını koleksiyon üzerinden çekme \u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sentence_transformers\\__init__.py:7\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mimportlib\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcross_encoder\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mCrossEncoder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CrossEncoder\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ParallelSentencesDataset, SentencesDataset\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mLoggingHandler\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LoggingHandler\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sentence_transformers\\cross_encoder\\__init__.py:1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mCrossEncoder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CrossEncoder\n\u001b[0;32m 3\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCrossEncoder\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:7\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Callable, Dict, List, Literal, Optional, Tuple, Type, Union\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Tensor, nn\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01moptim\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Optimizer\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\__init__.py:1686\u001b[0m\n\u001b[0;32m 1683\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to find torch_shm_manager at \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m path)\n\u001b[0;32m 1684\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m path\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m-> 1686\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mamp\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m autocast, GradScaler\n\u001b[0;32m 1688\u001b[0m \u001b[38;5;66;03m# Initializing the extension shadows the built-in python float / int classes;\u001b[39;00m\n\u001b[0;32m 1689\u001b[0m \u001b[38;5;66;03m# store them for later use by SymInt / SymFloat.\u001b[39;00m\n\u001b[0;32m 1690\u001b[0m py_float \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mfloat\u001b[39m\n",
+ "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'torch.amp'"
+ ]
+ }
+ ],
+ "source": [
+ "# Define keywords and subheadings for the dataset with the help of sentence similarity\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sentence_transformers import SentenceTransformer\n",
+ "\n",
+ "\n",
+ "model = SentenceTransformer(\"emrecan/bert-base-turkish-cased-mean-nli-stsb-tr\")\n",
+ "# Fetch the text field through the collection\n",
+ "# Database class: database connections and data-fetching helpers\n",
+ "class Database:\n",
+ "    @staticmethod\n",
+ "    def get_mongodb():\n",
+ "        # Returns the MongoDB connection details.\n",
+ "        return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def get_input_titles():\n",
+ "        mongo_url, db_name, collection_name = Database.get_mongodb()\n",
+ "        client = MongoClient(mongo_url)\n",
+ "        db = client[db_name]\n",
+ "        collection = db[collection_name]\n",
+ "        query = {\"title\": {\"$exists\": True}}\n",
+ "        cursor = collection.find(query, {\"title\": 1, \"_id\": 0})\n",
+ "        title_from_db = [doc['title'] for doc in cursor]\n",
+ "        return title_from_db\n",
+ "\n",
+ "    @staticmethod\n",
+ "    def get_input_texts():\n",
+ "        mongo_url, db_name, collection_name = Database.get_mongodb()\n",
+ "        client = MongoClient(mongo_url)\n",
+ "        db = client[db_name]\n",
+ "        collection = db[collection_name]\n",
+ "        query = {\"text\": {\"$exists\": True}}\n",
+ "        cursor = collection.find(query, {\"text\": 1, \"_id\": 0})\n",
+ "        text_from_db = [doc['text'] for doc in cursor]\n",
+ "        return text_from_db\n",
+ "\n",
+ "\n",
+ "# TF-IDF calculation (for the keywords)\n",
+ "\n",
+ "# IDF = log(number of documents / number of documents containing the term)\n",
+ "\n",
+ "# Determine keywords from the text and the title\n",
+ "\n",
+ "#------------------------------------------------------------------------------\n",
+ "\n",
+ "# Create the subheadings with SBERT\n",
+ "\n",
+ "# Determine the subheadings via clustering\n",
+ "\n",
+ "#-------------------------------------------------------------------------------\n",
+ "\n",
+ "# Add the keywords and subheadings to the database (see the sketch after this notebook)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Preparation for generation by building a prompt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Assigning a tokenizer with the BERT model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
+ "model = BertForMaskedLM.from_pretrained('bert-base-uncased')\n",
+ "\n",
+ "\"\"\"CONFIGURING THE BERT MODEL\n",
+ "\n",
+ "input_file: Path of the input file the model will process. This file must contain the text data.\n",
+ "output_file: Path of the file where the model's outputs will be saved.\n",
+ "layers: Determines which BERT layers are used. For example, \"-1,-2,-3,-4\" means the last four layers.\n",
+ "bert_config_file: Path of the pre-trained BERT model's configuration file. This file defines the model architecture.\n",
+ "max_seq_length: Maximum length of the input sequences. Longer sequences are truncated; shorter ones are zero-padded.\n",
+ "init_checkpoint: Initial weights, usually loaded from a pre-trained BERT model.\n",
+ "vocab_file: Path of the vocabulary file the BERT model was trained with. Required for the model to recognise word pieces.\n",
+ "do_lower_case: Whether the input texts are lower-cased. Should be True for an uncased model and False for a cased model.\n",
+ "batch_size: Batch size used during prediction.\n",
+ "use_tpu: Whether a TPU (Tensor Processing Unit) is used. If True a TPU is used, otherwise GPU/CPU.\n",
+ "master: If a TPU is used, the address of the TPU master machine.\n",
+ "num_tpu_cores: If a TPU is used, the total number of TPU cores.\n",
+ "use_one_hot_embeddings: Usually set to True on TPUs because it speeds up embedding lookups via tf.one_hot; on GPU/CPU, False is preferred.\"\"\"\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The T5 model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from dotenv import load_dotenv\n",
+ "import os\n",
+ "# Load model directly\n",
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
+ "\n",
+ "# Load the tokenizer and the model\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n",
+ "\n",
+ "# Load the .env file first, then read the API key from the environment\n",
+ "load_dotenv()\n",
+ "api_key = os.getenv('HUGGINGFACE_API_KEY')\n",
+ "\n",
+ "if api_key is None:\n",
+ "    raise ValueError(\"API key not found in the .env file\")\n",
+ "\n",
+ "# Build the request headers\n",
+ "headers = {\"Authorization\": f\"Bearer {api_key}\"}\n",
+ "\n",
+ "input_sequence = \"[CLS] Machine Learning in Healthcare [SEP] Introduction to ML [SEP] Applications in Healthcare [SEP] machine learning, healthcare, AI [SEP]\"\n",
+ "# Sample data snippet\n",
+ "data = {\n",
+ "    \"title\": \"Machine Learning in Healthcare\",\n",
+ "    \"sub_headings\": [\"Introduction to ML\", \"Applications in Healthcare\"],\n",
+ "    \"keywords\": [\"machine learning\", \"healthcare\", \"AI\"]\n",
+ "}\n",
+ "\n",
+ "# Build the input prompt\n",
+ "prompt = (\n",
+ "    f\"Title: {data['title']}\\n\"\n",
+ "    f\"Sub-headings: {', '.join(data['sub_headings'])}\\n\"\n",
+ "    f\"Keywords: {', '.join(data['keywords'])}\\n\"\n",
+ "    f\"Content: {input_sequence}\\n\"\n",
+ "    \"Please generate a detailed article based on the above information.\"\n",
+ ")\n",
+ "\n",
+ "# Tokenize the prompt after it has been built\n",
+ "inputs = tokenizer(prompt, return_tensors=\"pt\")\n",
+ "\n",
+ "# Text generation\n",
+ "output_sequences = model.generate(\n",
+ "    inputs['input_ids'],\n",
+ "    max_length=300,  # Maximum length of the generated text\n",
+ "    min_length=150,  # Minimum length of the generated text\n",
+ "    num_return_sequences=1,  # Number of sequences to return\n",
+ "    do_sample=True,  # Allow sampling\n",
+ "    top_k=50,  # Use top-k sampling\n",
+ "    top_p=0.95,  # Use top-p sampling\n",
+ "    repetition_penalty=1.2,  # Penalty against meaningless repetition\n",
+ "    eos_token_id=tokenizer.eos_token_id  # Encourage complete sentences\n",
+ ")\n",
+ "\n",
+ "# Decode the generated tokens back into a string\n",
+ "generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)\n",
+ "\n",
+ "print(generated_text)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
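The last planning cell in datasets.ipynb sketches the intended pipeline in comments: TF-IDF for keywords, then SBERT embeddings plus clustering for subheadings. Below is a self-contained sketch of that plan; it is an illustration, not the project's final implementation. The `documents` list is a placeholder for what `Database.get_input_texts()` would return, and the SBERT checkpoint is the one already used in the notebook.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Placeholder documents; in the notebook they would come from Database.get_input_texts()
documents = [
    "Bilgisayar, aritmetik veya mantıksal işlem dizilerini çalıştırabilen bir makinedir.",
    "Edebiyat, duygu ve düşünceleri dil aracılığıyla anlatma sanatıdır.",
    "Mühendislik, bilimsel bilgiyi insan ihtiyaçlarına uygulayan disiplindir.",
    "Makine öğrenmesi, verilerden örüntü çıkaran algoritmaları inceler.",
]

# Keywords via TF-IDF, where IDF = log(number of documents / number of documents containing the term)
vectorizer = TfidfVectorizer(max_features=5000)
tfidf = vectorizer.fit_transform(documents)
terms = np.array(vectorizer.get_feature_names_out())
for row in tfidf.toarray():
    print("keywords:", list(terms[row.argsort()[::-1][:3]]))

# Subheading candidates via SBERT + clustering: embed the documents, cluster them,
# and take the document closest to each cluster centre as a candidate subheading.
model = SentenceTransformer("emrecan/bert-base-turkish-cased-mean-nli-stsb-tr")
embeddings = model.encode(documents)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(embeddings)
for c in range(kmeans.n_clusters):
    idx = np.where(kmeans.labels_ == c)[0]
    centre = embeddings[idx].mean(axis=0)
    nearest = idx[np.argmin(np.linalg.norm(embeddings[idx] - centre, axis=1))]
    print("subheading candidate:", documents[nearest])
```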
deneme.ipynb ADDED
@@ -0,0 +1,69 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ },
+ {
+ "ename": "OSError",
+ "evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[3], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m \n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpymongo\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MongoClient\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\__init__.py:26\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\dependency_versions_check.py:16\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in setup.py)\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\__init__.py:34\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\generic.py:462\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
+ "File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\__init__.py:148\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
+ "\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset\n",
+ "import pandas as pd \n",
+ "from pymongo import MongoClient\n",
+ "from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
pyvenv.cfg ADDED
@@ -0,0 +1,3 @@
+ home = C:\Users\info\AppData\Local\Programs\Python\Python310
+ include-system-site-packages = false
+ version = 3.10.11
requirements.txt CHANGED
@@ -1 +1,127 @@
- huggingface_hub==0.22.2
+ absl-py==2.1.0
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.3.4
+ aiohttp==3.10.1
+ aiosignal==1.3.1
+ annotated-types==0.7.0
+ anyio==4.4.0
+ asttokens==2.4.1
+ astunparse==1.6.3
+ attrs==24.1.0
+ certifi==2024.7.4
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ comm==0.2.2
+ contourpy==1.2.1
+ cycler==0.12.1
+ datasets==2.20.0
+ debugpy==1.8.5
+ decorator==5.1.1
+ dill==0.3.8
+ dnspython==2.6.1
+ executing==2.0.1
+ fastapi==0.112.0
+ ffmpy==0.4.0
+ filelock==3.15.4
+ flatbuffers==24.3.25
+ fonttools==4.53.1
+ frozenlist==1.4.1
+ fsspec==2024.5.0
+ gast==0.6.0
+ google-pasta==0.2.0
+ gradio==4.40.0
+ gradio_client==1.2.0
+ grpcio==1.65.4
+ h11==0.14.0
+ h5py==3.11.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.24.5
+ idna==3.7
+ importlib_resources==6.4.0
+ ipykernel==6.29.5
+ ipython==8.26.0
+ jedi==0.19.1
+ Jinja2==3.1.4
+ jupyter_client==8.6.2
+ jupyter_core==5.7.2
+ keras==3.4.1
+ kiwisolver==1.4.5
+ libclang==18.1.1
+ Markdown==3.6
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.0
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ ml-dtypes==0.4.0
+ mpmath==1.3.0
+ multidict==6.0.5
+ multiprocess==0.70.16
+ namex==0.0.8
+ nest-asyncio==1.6.0
+ networkx==3.3
+ numpy==1.26.4
+ opt-einsum==3.3.0
+ optree==0.12.1
+ orjson==3.10.6
+ packaging==24.1
+ pandas==2.2.2
+ parso==0.8.4
+ pillow==10.4.0
+ platformdirs==4.2.2
+ prompt_toolkit==3.0.47
+ protobuf==4.25.4
+ psutil==6.0.0
+ pure_eval==0.2.3
+ pyarrow==17.0.0
+ pyarrow-hotfix==0.6
+ pydantic==2.8.2
+ pydantic_core==2.20.1
+ pydub==0.25.1
+ Pygments==2.18.0
+ pymongo==4.8.0
+ pyparsing==3.1.2
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.9
+ pytz==2024.1
+ pywin32==306
+ PyYAML==6.0.1
+ pyzmq==26.1.0
+ regex==2024.7.24
+ requests==2.32.3
+ rich==13.7.1
+ ruff==0.5.6
+ safetensors==0.4.4
+ semantic-version==2.10.0
+ sentence-transformers==3.0.1
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ stack-data==0.6.3
+ starlette==0.37.2
+ sympy==1.13.1
+ tensorboard==2.17.0
+ tensorboard-data-server==0.7.2
+ tensorflow==2.17.0
+ tensorflow-intel==2.17.0
+ tensorflow-io-gcs-filesystem==0.31.0
+ termcolor==2.4.0
+ tokenizers==0.19.1
+ tomlkit==0.12.0
+ tqdm==4.66.5
+ traitlets==5.14.3
+ transformers==4.43.4
+ typer==0.12.3
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ urllib3==2.2.2
+ uvicorn==0.30.5
+ wcwidth==0.2.13
+ websockets==12.0
+ Werkzeug==3.0.3
+ wrapt==1.16.0
+ xxhash==3.4.1
+ yarl==1.9.4
+
test_Egitim/merged_test.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e1d337fc9fef9ba455b318d88c240308e260ae31a80552abfa690ecd897a05c
+ size 107383375
train_Egitim/merged_train.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8de4c861ff88766cd74eefde2e1eb501f782bfeca81779f4a29645735d01f2f
+ size 447530173
wikipedia-tr/.gitattributes ADDED
@@ -0,0 +1,54 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
wikipedia-tr/README.md ADDED
@@ -0,0 +1,86 @@
+ ---
+ annotations_creators:
+ - no-annotation
+ language:
+ - tr
+ language_creators:
+ - crowdsourced
+ license:
+ - cc-by-sa-3.0
+ - gfdl
+ multilinguality: []
+ pretty_name: Turkish Wikipedia 2023
+ size_categories:
+ - 100K<n<1M
+ source_datasets:
+ - original
+ tags:
+ - wikipedia
+ - wiki
+ task_categories:
+ - fill-mask
+ - text-generation
+ task_ids:
+ - masked-language-modeling
+ dataset_info:
+   features:
+   - name: id
+     dtype: string
+   - name: url
+     dtype: string
+   - name: title
+     dtype: string
+   - name: text
+     dtype: string
+   splits:
+   - name: train
+     num_bytes: 956353353
+     num_examples: 520542
+   download_size: 529875169
+   dataset_size: 956353353
+ ---
+
+ # 📖 Türkçe Vikipedi Mayıs 2023
+ Bu veri kümesi, Türkçe Vikipedi'den alınan makalelerin bir derlemesi olup, maskeleme dil modelleme ve metin oluşturma görevleri için tasarlanmıştır.
+
+
+ ## 🗣️ Etiketlemeler
+ Bu veri kümesindeki makaleler, özellikle belirli bir görev için etiketlenmemiş olup, veri kümesi etiketsizdir.
+
+ ## 🌐 Dil
+ Bu veri kümesi Türkçe yazılmış olup, gönüllülerden oluşan bir ekip tarafından topluluk katılımı yöntemleri ile oluşturulmuştur.
+
+ ## 📜 Lisans
+ CC-BY-SA 3.0 ve GFDL
+
+ ## 💻 Kaynak Veri Kümeleri
+ Bu veri kümesi, Türkçe Vikipedi'den oluşturulan orijinal bir veri kümesidir.
+
+
+ Türkçe Vikipedi veri kümesini kullandığınız için teşekkürler! Dil modelleme ve metin oluşturma görevleriniz için faydalı olmasını umuyoruz.
+
+ ---
+
+ # 📖 Wikipedia Turkish 2023
+
+ This dataset is a collection of articles from the Turkish Wikipedia and is designed to be used for masked language modeling and text generation tasks.
+
+ ## 📚 Dataset Info
+
+ Processed and cleaned using Huggingface wikipedia cleaner.
+
+ ## 🗣️ Annotations
+
+ The articles in this dataset were not specifically annotated for any particular task, meaning that the dataset is unlabeled.
+
+ ## 🌐 Language
+
+ This dataset is written in Turkish and was created using crowdsourcing methods by a team of volunteers.
+
+ ## 📜 License
+
+ CC-BY-SA 3.0 and GFDL
+
+ ## 💻 Source Datasets
+
+ This dataset is an original dataset created from the Turkish Wikipedia.
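For reference, the Parquet shards uploaded under `wikipedia-tr/data/` can be loaded directly with the `datasets` library. A minimal sketch follows; the relative path is an assumption about where the repository is checked out.

```python
from datasets import load_dataset

# Load both train shards in this repo as a single split (path is an assumption)
dataset = load_dataset(
    "parquet",
    data_files={"train": "wikipedia-tr/data/train-*.parquet"},
)
print(dataset["train"].features)     # id, url, title, text
print(dataset["train"][0]["title"])
```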
wikipedia-tr/data/train-00000-of-00002-ed6b025df7a1f653.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf5ad76fe61cebe5b5a165c9c74a510cd9ee8013dc23c04fca44c58d7ffe13cb
+ size 328791077
wikipedia-tr/data/train-00001-of-00002-0aa63953f8b51c17.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2165870d3d8504c01cd1783ac654efe9144f9f1094aed360deb64a0eaa211246
+ size 201084092