Build error
Build error
Upload 11 files
Browse files- MLM.ipynb +539 -0
- datasets.ipynb +460 -0
- deneme.ipynb +69 -0
- pyvenv.cfg +3 -0
- requirements.txt +127 -1
- test_Egitim/merged_test.parquet +3 -0
- train_Egitim/merged_train.parquet +3 -0
- wikipedia-tr/.gitattributes +54 -0
- wikipedia-tr/ +85 -0
- wikipedia-tr/data/train-00000-of-00002-ed6b025df7a1f653.parquet +3 -0
- wikipedia-tr/data/train-00001-of-00002-0aa63953f8b51c17.parquet +3 -0
@@ -0,0 +1,539 @@
1 |
2 |
"cells": [
3 |
4 |
"cell_type": "code",
5 |
"execution_count": 1,
6 |
"metadata": {},
7 |
"outputs": [
8 |
9 |
"ename": "OSError",
10 |
"evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
11 |
"output_type": "error",
12 |
"traceback": [
13 |
14 |
"\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
15 |
"Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n",
16 |
"File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
17 |
"File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
18 |
"File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\utils\\\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m strtobool,\n\u001b[0;32m 
60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
19 |
"File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\transformers\\utils\\\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
20 |
"File \u001b[1;32mc:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
21 |
"\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\Users\\info\\anaconda3\\Lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
22 |
23 |
24 |
25 |
"source": [
26 |
"from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n",
27 |
"import torch "
28 |
29 |
30 |
31 |
"cell_type": "code",
32 |
"execution_count": 2,
33 |
"metadata": {},
34 |
"outputs": [
35 |
36 |
"name": "stderr",
37 |
"output_type": "stream",
38 |
"text": [
39 |
"c:\\gitProjects\\makaleChatUI\\myenv\\Lib\\site-packages\\huggingface_hub\\ UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\info\\.cache\\huggingface\\hub\\models--bert-base-uncased. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see\n",
40 |
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article:\n",
41 |
" warnings.warn(message)\n",
42 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
43 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
44 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
45 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
46 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
47 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
48 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
49 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
50 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
51 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
52 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
53 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
54 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
55 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
56 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
57 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
58 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
59 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
60 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
61 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
62 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
63 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
64 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
65 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
66 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
67 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
68 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
69 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
70 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
71 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
72 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
73 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
74 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
75 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
76 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
77 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
78 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
79 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
80 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
81 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
82 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
83 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
84 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
85 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
86 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
87 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
88 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
89 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
90 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
91 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
92 |
"A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning.\n",
93 |
"A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.\n",
94 |
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
95 |
"- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
96 |
"- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
97 |
98 |
99 |
100 |
"source": [
101 |
"tokenizer= BertTokenizer.from_pretrained('bert-base-uncased')\n",
102 |
103 |
104 |
"text=(\"After reading these reports,\"\n",
105 |
" \"we start an outline of the application of ML.\"\n",
106 |
" \"It includes the [MASK] process \"\n",
107 |
" \"and various applications (from various software development to hardware development), to [MASK] of IT systems, and various approaches on analytics.\"\n",
108 |
" \"The approach incorporates [MASK] as well as computing and data mining.\"\n",
109 |
" \"For example, software developers and manufacturing engineers used AI \"\n",
110 |
" \"in manufacturing to develop their applications.\"\n",
111 |
" )"
112 |
113 |
114 |
115 |
"cell_type": "code",
116 |
"execution_count": 4,
117 |
"metadata": {},
118 |
"outputs": [
119 |
120 |
"data": {
121 |
"text/plain": [
122 |
"dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])"
123 |
124 |
125 |
"execution_count": 4,
126 |
"metadata": {},
127 |
"output_type": "execute_result"
128 |
129 |
130 |
"source": [
131 |
"# After inserting the [MASK] placeholders, tokenize the text into PyTorch tensors\n",
132 |
"inputs= tokenizer(text,return_tensors='pt')\n",
133 |
134 |
135 |
136 |
137 |
"cell_type": "code",
138 |
"execution_count": 5,
139 |
"metadata": {},
140 |
"outputs": [
141 |
142 |
"data": {
143 |
"text/plain": [
144 |
"tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
145 |
" 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 103,\n",
146 |
" 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
147 |
" 8051, 2458, 1007, 1010, 2000, 103, 1997, 2009, 3001, 1010,\n",
148 |
" 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 103,\n",
149 |
" 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
150 |
" 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
151 |
" 2000, 4503, 2037, 5097, 1012, 102]])"
152 |
153 |
154 |
"execution_count": 5,
155 |
"metadata": {},
156 |
"output_type": "execute_result"
157 |
158 |
159 |
"source": [
160 |
161 |
162 |
163 |
164 |
"cell_type": "code",
165 |
"execution_count": 6,
166 |
"metadata": {},
167 |
"outputs": [],
168 |
"source": [
169 |
"text_normal= (\"After reading these reports,\"\n",
170 |
" \"we start an outline of the application of ML.\"\n",
171 |
" \"It includes the learning process \"\n",
172 |
" \"and various applications (from various software development to hardware development), to analysis of IT systems, and various approaches on analytics.\"\n",
173 |
" \"The approach incorporates AI as well as computing and data mining.\"\n",
174 |
" \"For example, software developers and manufacturing engineers used AI \"\n",
175 |
" \"in manufacturing to develop their applications.\"\n",
176 |
" )"
177 |
178 |
179 |
180 |
"cell_type": "code",
181 |
"execution_count": 8,
182 |
"metadata": {},
183 |
"outputs": [
184 |
185 |
"data": {
186 |
"text/plain": [
187 |
"dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])"
188 |
189 |
190 |
"execution_count": 8,
191 |
"metadata": {},
192 |
"output_type": "execute_result"
193 |
194 |
195 |
"source": [
196 |
"# Tokenize the unmasked reference text into PyTorch tensors\n",
197 |
"inputs_2= tokenizer(text_normal,return_tensors='pt')\n",
198 |
199 |
200 |
201 |
202 |
"cell_type": "code",
203 |
"execution_count": 9,
204 |
"metadata": {},
205 |
"outputs": [
206 |
207 |
"data": {
208 |
"text/plain": [
209 |
"tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
210 |
" 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
211 |
" 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
212 |
" 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
213 |
" 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
214 |
" 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
215 |
" 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
216 |
" 2000, 4503, 2037, 5097, 1012, 102]])"
217 |
218 |
219 |
"execution_count": 9,
220 |
"metadata": {},
221 |
"output_type": "execute_result"
222 |
223 |
224 |
"source": [
225 |
226 |
227 |
228 |
229 |
"cell_type": "code",
230 |
"execution_count": 10,
231 |
"metadata": {},
232 |
"outputs": [],
233 |
"source": [
234 |
"inputs_2['labels']= inputs_2.input_ids.detach().clone()"
235 |
236 |
237 |
238 |
"cell_type": "code",
239 |
"execution_count": 11,
240 |
"metadata": {},
241 |
"outputs": [
242 |
243 |
"data": {
244 |
"text/plain": [
245 |
"{'input_ids': tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
246 |
" 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
247 |
" 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
248 |
" 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
249 |
" 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
250 |
" 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
251 |
" 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
252 |
" 2000, 4503, 2037, 5097, 1012, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
253 |
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
254 |
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
255 |
" 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
256 |
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
257 |
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
258 |
" 1, 1, 1, 1]]), 'labels': tensor([[ 101, 2044, 3752, 2122, 4311, 1010, 2057, 2707, 2019, 12685,\n",
259 |
" 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 4083,\n",
260 |
" 2832, 1998, 2536, 5097, 1006, 2013, 2536, 4007, 2458, 2000,\n",
261 |
" 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
262 |
" 1998, 2536, 8107, 2006, 25095, 1012, 1996, 3921, 12374, 9932,\n",
263 |
" 2004, 2092, 2004, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
264 |
" 1010, 4007, 9797, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
265 |
" 2000, 4503, 2037, 5097, 1012, 102]])}"
266 |
267 |
268 |
"execution_count": 11,
269 |
"metadata": {},
270 |
"output_type": "execute_result"
271 |
272 |
273 |
"source": [
274 |
275 |
276 |
277 |
278 |
"cell_type": "code",
279 |
"execution_count": 12,
280 |
"metadata": {},
281 |
"outputs": [
282 |
283 |
"data": {
284 |
"text/plain": [
285 |
"torch.Size([1, 76])"
286 |
287 |
288 |
"execution_count": 12,
289 |
"metadata": {},
290 |
"output_type": "execute_result"
291 |
292 |
293 |
"source": [
294 |
"# Draw random values for choosing which tokens to mask; the labels are left untouched\n",
295 |
296 |
297 |
298 |
299 |
300 |
"cell_type": "code",
301 |
"execution_count": 13,
302 |
"metadata": {},
303 |
"outputs": [
304 |
305 |
"data": {
306 |
"text/plain": [
307 |
"tensor([[0.9397, 0.1325, 0.1893, 0.8258, 0.7453, 0.1766, 0.9338, 0.0806, 0.0626,\n",
308 |
" 0.6665, 0.4240, 0.3946, 0.5413, 0.3799, 0.4023, 0.8699, 0.8159, 0.1511,\n",
309 |
" 0.6842, 0.0242, 0.7235, 0.0063, 0.1857, 0.9684, 0.8930, 0.8208, 0.5711,\n",
310 |
" 0.0345, 0.9919, 0.1140, 0.7597, 0.4546, 0.6478, 0.2295, 0.2846, 0.6314,\n",
311 |
" 0.3640, 0.9291, 0.3843, 0.3553, 0.1125, 0.0790, 0.4261, 0.4307, 0.6724,\n",
312 |
" 0.8569, 0.4476, 0.8032, 0.0241, 0.0152, 0.4196, 0.5609, 0.0010, 0.7240,\n",
313 |
" 0.4531, 0.5834, 0.5232, 0.3602, 0.6575, 0.9012, 0.1519, 0.2255, 0.0799,\n",
314 |
" 0.5673, 0.7244, 0.4387, 0.2713, 0.4243, 0.8435, 0.1670, 0.8664, 0.6261,\n",
315 |
" 0.4090, 0.2988, 0.3379, 0.7784]])"
316 |
317 |
318 |
"execution_count": 13,
319 |
"metadata": {},
320 |
"output_type": "execute_result"
321 |
322 |
323 |
"source": [
324 |
325 |
326 |
327 |
328 |
"cell_type": "code",
329 |
"execution_count": 14,
330 |
"metadata": {},
331 |
"outputs": [
332 |
333 |
"data": {
334 |
"text/plain": [
335 |
"tensor([[False, True, False, False, False, False, False, True, True, False,\n",
336 |
" False, False, False, False, False, False, False, False, False, True,\n",
337 |
" False, True, False, False, False, False, False, True, False, True,\n",
338 |
" False, False, False, False, False, False, False, False, False, False,\n",
339 |
" True, True, False, False, False, False, False, False, True, True,\n",
340 |
" False, False, True, False, False, False, False, False, False, False,\n",
341 |
" False, False, True, False, False, False, False, False, False, False,\n",
342 |
" False, False, False, False, False, False]])"
343 |
344 |
345 |
"execution_count": 14,
346 |
"metadata": {},
347 |
"output_type": "execute_result"
348 |
349 |
350 |
"source": [
351 |
"# Select roughly 15% of the tokens in the sentence for masking\n",
352 |
"# `rand` holds one random number per token; `rand < 0.15` is True where that number is below 0.15. Tokens at True positions will be masked, all others are left unchanged.\n",
353 |
"mask_arr = rand < 0.15\n",
354 |
355 |
356 |
357 |
358 |
"cell_type": "code",
359 |
"execution_count": 15,
360 |
"metadata": {},
361 |
"outputs": [
362 |
363 |
"data": {
364 |
"text/plain": [
365 |
"[1, 7, 8, 19, 21, 27, 29, 40, 41, 48, 49, 52, 62]"
366 |
367 |
368 |
"execution_count": 15,
369 |
"metadata": {},
370 |
"output_type": "execute_result"
371 |
372 |
373 |
"source": [
374 |
"# Collect the indices where mask_arr is True (non-zero), i.e. the positions chosen for masking\n",
375 |
"# torch.flatten collapses the nested result of nonzero() into a 1-D tensor, which tolist() turns into a plain Python list\n",
376 |
"selection= torch.flatten(mask_arr[0].nonzero()).tolist()\n",
377 |
378 |
379 |
380 |
381 |
"cell_type": "code",
382 |
"execution_count": 16,
383 |
"metadata": {},
384 |
"outputs": [
385 |
386 |
"data": {
387 |
"text/plain": [
388 |
"tensor([[ 101, 103, 3752, 2122, 4311, 1010, 2057, 103, 103, 12685,\n",
389 |
" 1997, 1996, 4646, 1997, 19875, 1012, 2009, 2950, 1996, 103,\n",
390 |
" 2832, 103, 2536, 5097, 1006, 2013, 2536, 103, 2458, 103,\n",
391 |
" 8051, 2458, 1007, 1010, 2000, 4106, 1997, 2009, 3001, 1010,\n",
392 |
" 103, 103, 8107, 2006, 25095, 1012, 1996, 3921, 103, 103,\n",
393 |
" 2004, 2092, 103, 9798, 1998, 2951, 5471, 1012, 2005, 2742,\n",
394 |
" 1010, 4007, 103, 1998, 5814, 6145, 2109, 9932, 1999, 5814,\n",
395 |
" 2000, 4503, 2037, 5097, 1012, 102]])"
396 |
397 |
398 |
"execution_count": 16,
399 |
"metadata": {},
400 |
"output_type": "execute_result"
401 |
402 |
403 |
"source": [
404 |
"# Overwrite input_ids at the selected positions with 103, the [MASK] token id\n",
405 |
406 |
407 |
408 |
409 |
410 |
"cell_type": "code",
411 |
"execution_count": 17,
412 |
"metadata": {},
413 |
"outputs": [],
414 |
"source": [
415 |
"outputs= model(**inputs_2)"
416 |
417 |
418 |
419 |
"cell_type": "code",
420 |
"execution_count": 18,
421 |
"metadata": {},
422 |
"outputs": [
423 |
424 |
"data": {
425 |
"text/plain": [
426 |
"odict_keys(['loss', 'logits'])"
427 |
428 |
429 |
"execution_count": 18,
430 |
"metadata": {},
431 |
"output_type": "execute_result"
432 |
433 |
434 |
"source": [
435 |
436 |
437 |
438 |
439 |
"cell_type": "code",
440 |
"execution_count": 19,
441 |
"metadata": {},
442 |
"outputs": [
443 |
444 |
"data": {
445 |
"text/plain": [
446 |
"tensor(0.8399, grad_fn=<NllLossBackward0>)"
447 |
448 |
449 |
"execution_count": 19,
450 |
"metadata": {},
451 |
"output_type": "execute_result"
452 |
453 |
454 |
"source": [
455 |
456 |
457 |
458 |
459 |
"cell_type": "code",
460 |
"execution_count": 22,
461 |
"metadata": {},
462 |
"outputs": [
463 |
464 |
"name": "stderr",
465 |
"output_type": "stream",
466 |
"text": [
467 |
"c:\\gitProjects\\makaleChatUI\\myenv\\Lib\\site-packages\\huggingface_hub\\ UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\info\\.cache\\huggingface\\hub\\models--facebook--dpr-ctx_encoder-single-nq-base. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see\n",
468 |
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article:\n",
469 |
" warnings.warn(message)\n",
470 |
"Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.output.LayerNorm.bias', 'bert_model.encoder.layer.0.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.1.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.1.attention.output.dense.bias', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.1.attention.self.key.bias', 'bert_model.encoder.layer.1.attention.self.key.weight', 'bert_model.encoder.layer.1.attention.self.query.bias', 'bert_model.encoder.layer.1.attention.self.query.weight', 'bert_model.encoder.layer.1.attention.self.value.bias', 'bert_model.encoder.layer.1.attention.self.value.weight', 'bert_model.encoder.layer.1.intermediate.dense.bias', 'bert_model.encoder.layer.1.intermediate.dense.weight', 
'bert_model.encoder.layer.1.output.LayerNorm.bias', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.1.output.dense.bias', 'bert_model.encoder.layer.1.output.dense.weight', 'bert_model.encoder.layer.10.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.10.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.output.dense.weight', 'bert_model.encoder.layer.10.attention.self.key.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.10.attention.self.query.bias', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.10.attention.self.value.bias', 'bert_model.encoder.layer.10.attention.self.value.weight', 'bert_model.encoder.layer.10.intermediate.dense.bias', 'bert_model.encoder.layer.10.intermediate.dense.weight', 'bert_model.encoder.layer.10.output.LayerNorm.bias', 'bert_model.encoder.layer.10.output.LayerNorm.weight', 'bert_model.encoder.layer.10.output.dense.bias', 'bert_model.encoder.layer.10.output.dense.weight', 'bert_model.encoder.layer.11.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.output.dense.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.11.attention.self.key.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 'bert_model.encoder.layer.11.attention.self.query.weight', 'bert_model.encoder.layer.11.attention.self.value.bias', 'bert_model.encoder.layer.11.attention.self.value.weight', 'bert_model.encoder.layer.11.intermediate.dense.bias', 'bert_model.encoder.layer.11.intermediate.dense.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.11.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.dense.bias', 
'bert_model.encoder.layer.11.output.dense.weight', 'bert_model.encoder.layer.2.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.2.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.2.attention.output.dense.bias', 'bert_model.encoder.layer.2.attention.output.dense.weight', 'bert_model.encoder.layer.2.attention.self.key.bias', 'bert_model.encoder.layer.2.attention.self.key.weight', 'bert_model.encoder.layer.2.attention.self.query.bias', 'bert_model.encoder.layer.2.attention.self.query.weight', 'bert_model.encoder.layer.2.attention.self.value.bias', 'bert_model.encoder.layer.2.attention.self.value.weight', 'bert_model.encoder.layer.2.intermediate.dense.bias', 'bert_model.encoder.layer.2.intermediate.dense.weight', 'bert_model.encoder.layer.2.output.LayerNorm.bias', 'bert_model.encoder.layer.2.output.LayerNorm.weight', 'bert_model.encoder.layer.2.output.dense.bias', 'bert_model.encoder.layer.2.output.dense.weight', 'bert_model.encoder.layer.3.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.3.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.3.attention.output.dense.weight', 'bert_model.encoder.layer.3.attention.self.key.bias', 'bert_model.encoder.layer.3.attention.self.key.weight', 'bert_model.encoder.layer.3.attention.self.query.bias', 'bert_model.encoder.layer.3.attention.self.query.weight', 'bert_model.encoder.layer.3.attention.self.value.bias', 'bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.3.intermediate.dense.bias', 'bert_model.encoder.layer.3.intermediate.dense.weight', 'bert_model.encoder.layer.3.output.LayerNorm.bias', 'bert_model.encoder.layer.3.output.LayerNorm.weight', 'bert_model.encoder.layer.3.output.dense.bias', 'bert_model.encoder.layer.3.output.dense.weight', 'bert_model.encoder.layer.4.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.4.attention.output.LayerNorm.weight', 
'bert_model.encoder.layer.4.attention.output.dense.bias', 'bert_model.encoder.layer.4.attention.output.dense.weight', 'bert_model.encoder.layer.4.attention.self.key.bias', 'bert_model.encoder.layer.4.attention.self.key.weight', 'bert_model.encoder.layer.4.attention.self.query.bias', 'bert_model.encoder.layer.4.attention.self.query.weight', 'bert_model.encoder.layer.4.attention.self.value.bias', 'bert_model.encoder.layer.4.attention.self.value.weight', 'bert_model.encoder.layer.4.intermediate.dense.bias', 'bert_model.encoder.layer.4.intermediate.dense.weight', 'bert_model.encoder.layer.4.output.LayerNorm.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.4.output.dense.bias', 'bert_model.encoder.layer.4.output.dense.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.dense.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.5.attention.self.key.bias', 'bert_model.encoder.layer.5.attention.self.key.weight', 'bert_model.encoder.layer.5.attention.self.query.bias', 'bert_model.encoder.layer.5.attention.self.query.weight', 'bert_model.encoder.layer.5.attention.self.value.bias', 'bert_model.encoder.layer.5.attention.self.value.weight', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_model.encoder.layer.5.output.LayerNorm.bias', 'bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.5.output.dense.bias', 'bert_model.encoder.layer.5.output.dense.weight', 'bert_model.encoder.layer.6.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.6.attention.output.dense.bias', 'bert_model.encoder.layer.6.attention.output.dense.weight', 'bert_model.encoder.layer.6.attention.self.key.bias', 
'bert_model.encoder.layer.6.attention.self.key.weight', 'bert_model.encoder.layer.6.attention.self.query.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.6.attention.self.value.bias', 'bert_model.encoder.layer.6.attention.self.value.weight', 'bert_model.encoder.layer.6.intermediate.dense.bias', 'bert_model.encoder.layer.6.intermediate.dense.weight', 'bert_model.encoder.layer.6.output.LayerNorm.bias', 'bert_model.encoder.layer.6.output.LayerNorm.weight', 'bert_model.encoder.layer.6.output.dense.bias', 'bert_model.encoder.layer.6.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.7.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.7.attention.output.dense.bias', 'bert_model.encoder.layer.7.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.self.key.bias', 'bert_model.encoder.layer.7.attention.self.key.weight', 'bert_model.encoder.layer.7.attention.self.query.bias', 'bert_model.encoder.layer.7.attention.self.query.weight', 'bert_model.encoder.layer.7.attention.self.value.bias', 'bert_model.encoder.layer.7.attention.self.value.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.7.intermediate.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.7.output.LayerNorm.weight', 'bert_model.encoder.layer.7.output.dense.bias', 'bert_model.encoder.layer.7.output.dense.weight', 'bert_model.encoder.layer.8.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.8.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.8.attention.output.dense.bias', 'bert_model.encoder.layer.8.attention.output.dense.weight', 'bert_model.encoder.layer.8.attention.self.key.bias', 'bert_model.encoder.layer.8.attention.self.key.weight', 'bert_model.encoder.layer.8.attention.self.query.bias', 'bert_model.encoder.layer.8.attention.self.query.weight', 
'bert_model.encoder.layer.8.attention.self.value.bias', 'bert_model.encoder.layer.8.attention.self.value.weight', 'bert_model.encoder.layer.8.intermediate.dense.bias', 'bert_model.encoder.layer.8.intermediate.dense.weight', 'bert_model.encoder.layer.8.output.LayerNorm.bias', 'bert_model.encoder.layer.8.output.LayerNorm.weight', 'bert_model.encoder.layer.8.output.dense.bias', 'bert_model.encoder.layer.8.output.dense.weight', 'bert_model.encoder.layer.9.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.9.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.9.attention.output.dense.bias', 'bert_model.encoder.layer.9.attention.output.dense.weight', 'bert_model.encoder.layer.9.attention.self.key.bias', 'bert_model.encoder.layer.9.attention.self.key.weight', 'bert_model.encoder.layer.9.attention.self.query.bias', 'bert_model.encoder.layer.9.attention.self.query.weight', 'bert_model.encoder.layer.9.attention.self.value.bias', 'bert_model.encoder.layer.9.attention.self.value.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.intermediate.dense.weight', 'bert_model.encoder.layer.9.output.LayerNorm.bias', 'bert_model.encoder.layer.9.output.LayerNorm.weight', 'bert_model.encoder.layer.9.output.dense.bias', 'bert_model.encoder.layer.9.output.dense.weight']\n",
471 |
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
472 |
"Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.intermediate.dense.bias', 'bert_model.encoder.layer.0.intermediate.dense.weight', 'bert_model.encoder.layer.0.output.LayerNorm.bias', 'bert_model.encoder.layer.0.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.bias', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.1.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.1.attention.output.dense.bias', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.1.attention.self.key.bias', 'bert_model.encoder.layer.1.attention.self.key.weight', 'bert_model.encoder.layer.1.attention.self.query.bias', 'bert_model.encoder.layer.1.attention.self.query.weight', 'bert_model.encoder.layer.1.attention.self.value.bias', 'bert_model.encoder.layer.1.attention.self.value.weight', 'bert_model.encoder.layer.1.intermediate.dense.bias', 'bert_model.encoder.layer.1.intermediate.dense.weight', 
'bert_model.encoder.layer.1.output.LayerNorm.bias', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.1.output.dense.bias', 'bert_model.encoder.layer.1.output.dense.weight', 'bert_model.encoder.layer.10.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.10.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.10.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.output.dense.weight', 'bert_model.encoder.layer.10.attention.self.key.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.10.attention.self.query.bias', 'bert_model.encoder.layer.10.attention.self.query.weight', 'bert_model.encoder.layer.10.attention.self.value.bias', 'bert_model.encoder.layer.10.attention.self.value.weight', 'bert_model.encoder.layer.10.intermediate.dense.bias', 'bert_model.encoder.layer.10.intermediate.dense.weight', 'bert_model.encoder.layer.10.output.LayerNorm.bias', 'bert_model.encoder.layer.10.output.LayerNorm.weight', 'bert_model.encoder.layer.10.output.dense.bias', 'bert_model.encoder.layer.10.output.dense.weight', 'bert_model.encoder.layer.11.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.11.attention.output.dense.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.11.attention.self.key.weight', 'bert_model.encoder.layer.11.attention.self.query.bias', 'bert_model.encoder.layer.11.attention.self.query.weight', 'bert_model.encoder.layer.11.attention.self.value.bias', 'bert_model.encoder.layer.11.attention.self.value.weight', 'bert_model.encoder.layer.11.intermediate.dense.bias', 'bert_model.encoder.layer.11.intermediate.dense.weight', 'bert_model.encoder.layer.11.output.LayerNorm.bias', 'bert_model.encoder.layer.11.output.LayerNorm.weight', 'bert_model.encoder.layer.11.output.dense.bias', 
'bert_model.encoder.layer.11.output.dense.weight', 'bert_model.encoder.layer.2.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.2.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.2.attention.output.dense.bias', 'bert_model.encoder.layer.2.attention.output.dense.weight', 'bert_model.encoder.layer.2.attention.self.key.bias', 'bert_model.encoder.layer.2.attention.self.key.weight', 'bert_model.encoder.layer.2.attention.self.query.bias', 'bert_model.encoder.layer.2.attention.self.query.weight', 'bert_model.encoder.layer.2.attention.self.value.bias', 'bert_model.encoder.layer.2.attention.self.value.weight', 'bert_model.encoder.layer.2.intermediate.dense.bias', 'bert_model.encoder.layer.2.intermediate.dense.weight', 'bert_model.encoder.layer.2.output.LayerNorm.bias', 'bert_model.encoder.layer.2.output.LayerNorm.weight', 'bert_model.encoder.layer.2.output.dense.bias', 'bert_model.encoder.layer.2.output.dense.weight', 'bert_model.encoder.layer.3.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.3.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.3.attention.output.dense.weight', 'bert_model.encoder.layer.3.attention.self.key.bias', 'bert_model.encoder.layer.3.attention.self.key.weight', 'bert_model.encoder.layer.3.attention.self.query.bias', 'bert_model.encoder.layer.3.attention.self.query.weight', 'bert_model.encoder.layer.3.attention.self.value.bias', 'bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.3.intermediate.dense.bias', 'bert_model.encoder.layer.3.intermediate.dense.weight', 'bert_model.encoder.layer.3.output.LayerNorm.bias', 'bert_model.encoder.layer.3.output.LayerNorm.weight', 'bert_model.encoder.layer.3.output.dense.bias', 'bert_model.encoder.layer.3.output.dense.weight', 'bert_model.encoder.layer.4.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.4.attention.output.LayerNorm.weight', 
'bert_model.encoder.layer.4.attention.output.dense.bias', 'bert_model.encoder.layer.4.attention.output.dense.weight', 'bert_model.encoder.layer.4.attention.self.key.bias', 'bert_model.encoder.layer.4.attention.self.key.weight', 'bert_model.encoder.layer.4.attention.self.query.bias', 'bert_model.encoder.layer.4.attention.self.query.weight', 'bert_model.encoder.layer.4.attention.self.value.bias', 'bert_model.encoder.layer.4.attention.self.value.weight', 'bert_model.encoder.layer.4.intermediate.dense.bias', 'bert_model.encoder.layer.4.intermediate.dense.weight', 'bert_model.encoder.layer.4.output.LayerNorm.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.4.output.dense.bias', 'bert_model.encoder.layer.4.output.dense.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.dense.bias', 'bert_model.encoder.layer.5.attention.output.dense.weight', 'bert_model.encoder.layer.5.attention.self.key.bias', 'bert_model.encoder.layer.5.attention.self.key.weight', 'bert_model.encoder.layer.5.attention.self.query.bias', 'bert_model.encoder.layer.5.attention.self.query.weight', 'bert_model.encoder.layer.5.attention.self.value.bias', 'bert_model.encoder.layer.5.attention.self.value.weight', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_model.encoder.layer.5.output.LayerNorm.bias', 'bert_model.encoder.layer.5.output.LayerNorm.weight', 'bert_model.encoder.layer.5.output.dense.bias', 'bert_model.encoder.layer.5.output.dense.weight', 'bert_model.encoder.layer.6.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.6.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.6.attention.output.dense.bias', 'bert_model.encoder.layer.6.attention.output.dense.weight', 'bert_model.encoder.layer.6.attention.self.key.bias', 
'bert_model.encoder.layer.6.attention.self.key.weight', 'bert_model.encoder.layer.6.attention.self.query.bias', 'bert_model.encoder.layer.6.attention.self.query.weight', 'bert_model.encoder.layer.6.attention.self.value.bias', 'bert_model.encoder.layer.6.attention.self.value.weight', 'bert_model.encoder.layer.6.intermediate.dense.bias', 'bert_model.encoder.layer.6.intermediate.dense.weight', 'bert_model.encoder.layer.6.output.LayerNorm.bias', 'bert_model.encoder.layer.6.output.LayerNorm.weight', 'bert_model.encoder.layer.6.output.dense.bias', 'bert_model.encoder.layer.6.output.dense.weight', 'bert_model.encoder.layer.7.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.7.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.7.attention.output.dense.bias', 'bert_model.encoder.layer.7.attention.output.dense.weight', 'bert_model.encoder.layer.7.attention.self.key.bias', 'bert_model.encoder.layer.7.attention.self.key.weight', 'bert_model.encoder.layer.7.attention.self.query.bias', 'bert_model.encoder.layer.7.attention.self.query.weight', 'bert_model.encoder.layer.7.attention.self.value.bias', 'bert_model.encoder.layer.7.attention.self.value.weight', 'bert_model.encoder.layer.7.intermediate.dense.bias', 'bert_model.encoder.layer.7.intermediate.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.7.output.LayerNorm.weight', 'bert_model.encoder.layer.7.output.dense.bias', 'bert_model.encoder.layer.7.output.dense.weight', 'bert_model.encoder.layer.8.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.8.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.8.attention.output.dense.bias', 'bert_model.encoder.layer.8.attention.output.dense.weight', 'bert_model.encoder.layer.8.attention.self.key.bias', 'bert_model.encoder.layer.8.attention.self.key.weight', 'bert_model.encoder.layer.8.attention.self.query.bias', 'bert_model.encoder.layer.8.attention.self.query.weight', 
'bert_model.encoder.layer.8.attention.self.value.bias', 'bert_model.encoder.layer.8.attention.self.value.weight', 'bert_model.encoder.layer.8.intermediate.dense.bias', 'bert_model.encoder.layer.8.intermediate.dense.weight', 'bert_model.encoder.layer.8.output.LayerNorm.bias', 'bert_model.encoder.layer.8.output.LayerNorm.weight', 'bert_model.encoder.layer.8.output.dense.bias', 'bert_model.encoder.layer.8.output.dense.weight', 'bert_model.encoder.layer.9.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.9.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.9.attention.output.dense.bias', 'bert_model.encoder.layer.9.attention.output.dense.weight', 'bert_model.encoder.layer.9.attention.self.key.bias', 'bert_model.encoder.layer.9.attention.self.key.weight', 'bert_model.encoder.layer.9.attention.self.query.bias', 'bert_model.encoder.layer.9.attention.self.query.weight', 'bert_model.encoder.layer.9.attention.self.value.bias', 'bert_model.encoder.layer.9.attention.self.value.weight', 'bert_model.encoder.layer.9.intermediate.dense.bias', 'bert_model.encoder.layer.9.intermediate.dense.weight', 'bert_model.encoder.layer.9.output.LayerNorm.bias', 'bert_model.encoder.layer.9.output.LayerNorm.weight', 'bert_model.encoder.layer.9.output.dense.bias', 'bert_model.encoder.layer.9.output.dense.weight']\n",
473 |
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
474 |
"Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']\n",
475 |
"- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
476 |
"- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
477 |
478 |
479 |
480 |
"source": [
481 |
"from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer\n",
482 |
483 |
484 |
485 |
486 |
"question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')\n",
487 |
"question_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')\n",
488 |
489 |
490 |
491 |
492 |
"cell_type": "code",
493 |
"execution_count": null,
494 |
"metadata": {},
495 |
"outputs": [],
496 |
"source": [
497 |
"\"\"\"title = [\"2024 Yılında Mobil Teknoloji Trendleri\"]\n",
498 |
"keywords = [\"mobil teknoloji\", \"2024 trendleri\", \"akıllı telefon yenilikleri\", \"5G teknolojisi\", \"giyilebilir cihazlar\"]\n",
499 |
"subheading = [\n",
500 |
" \"2024'te Akıllı Telefonlardaki Yenilikler\",\n",
501 |
" \"Giyilebilir Teknolojiler: Sağlık ve Fitness Trendleri\",\n",
502 |
" \"5G'nin Mobil Cihazlar Üzerindeki Etkisi\",\n",
503 |
" \"Mobil Güvenlikte Yeni Yaklaşımlar\"\n",
504 |
505 |
506 |
507 |
508 |
"cell_type": "code",
509 |
"execution_count": null,
510 |
"metadata": {},
511 |
"outputs": [],
512 |
"source": [
513 |
514 |
515 |
516 |
517 |
518 |
"metadata": {
519 |
"kernelspec": {
520 |
"display_name": "myenv",
521 |
"language": "python",
522 |
"name": "python3"
523 |
524 |
"language_info": {
525 |
"codemirror_mode": {
526 |
"name": "ipython",
527 |
"version": 3
528 |
529 |
"file_extension": ".py",
530 |
"mimetype": "text/x-python",
531 |
"name": "python",
532 |
"nbconvert_exporter": "python",
533 |
"pygments_lexer": "ipython3",
534 |
"version": "3.12.4"
535 |
536 |
537 |
"nbformat": 4,
538 |
"nbformat_minor": 2
539 |
@@ -0,0 +1,460 @@
1 |
2 |
"cells": [
3 |
4 |
"cell_type": "markdown",
5 |
"metadata": {},
6 |
"source": [
7 |
"Kütüphaneler eklenmesi"
8 |
9 |
10 |
11 |
"cell_type": "code",
12 |
"execution_count": 1,
13 |
"metadata": {},
14 |
"outputs": [
15 |
16 |
"name": "stderr",
17 |
"output_type": "stream",
18 |
"text": [
19 |
"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\ TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See\n",
20 |
" from .autonotebook import tqdm as notebook_tqdm\n"
21 |
22 |
23 |
24 |
"source": [
25 |
"from datasets import load_dataset\n",
26 |
"import pandas as pd \n",
27 |
"from pymongo import MongoClient\n",
28 |
"from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder;\n",
29 |
30 |
31 |
32 |
33 |
34 |
"cell_type": "markdown",
35 |
"metadata": {},
36 |
"source": [
37 |
"Parquet dosyalarının dataframe olarak yüklenmesi(okuma yapabilmek için)"
38 |
39 |
40 |
41 |
"cell_type": "code",
42 |
"execution_count": 3,
43 |
"metadata": {},
44 |
"outputs": [],
45 |
"source": [
46 |
"# Parquet dosyalarını DataFrame olarak yükleyin\n",
47 |
"train_df1 = pd.read_parquet('C:\\\\gitProjects\\\\yeni\\\\wikipedia-tr\\\\data\\\\train-00000-of-00002-ed6b025df7a1f653.parquet')\n",
48 |
"train_df2 = pd.read_parquet('C:\\\\gitProjects\\\\yeni\\\\wikipedia-tr\\\\data\\\\train-00001-of-00002-0aa63953f8b51c17.parquet')\n"
49 |
50 |
51 |
52 |
"cell_type": "code",
53 |
"execution_count": 4,
54 |
"metadata": {},
55 |
"outputs": [],
56 |
"source": [
57 |
"# İki DataFrame'i birleştirin\n",
58 |
"merged_train = pd.concat([train_df1, train_df2], ignore_index=True)\n"
59 |
60 |
61 |
62 |
"cell_type": "code",
63 |
"execution_count": 5,
64 |
"metadata": {},
65 |
"outputs": [],
66 |
"source": [
67 |
"# Örneğin %80 train, %20 test olarak ayırın\n",
68 |
"train_data = merged_train.sample(frac=0.8, random_state=42)\n",
69 |
"test_data = merged_train.drop(train_data.index)\n"
70 |
71 |
72 |
73 |
"cell_type": "code",
74 |
"execution_count": 6,
75 |
"metadata": {},
76 |
"outputs": [],
77 |
"source": [
78 |
"import os\n",
79 |
80 |
"# Dosya yolları\n",
81 |
"train_dir = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\train_Egitim'\n",
82 |
"test_dir = 'C:\\\\gitProjects\\\\yeni\\\\datasets\\\\test_Egitim'\n",
83 |
"train_file_path = os.path.join(train_dir, 'merged_train.parquet')\n",
84 |
"test_file_path = os.path.join(test_dir, 'merged_test.parquet')\n",
85 |
86 |
"# Dizinlerin var olup olmadığını kontrol etme, gerekirse oluşturma\n",
87 |
"os.makedirs(train_dir, exist_ok=True)\n",
88 |
"os.makedirs(test_dir, exist_ok=True)\n",
89 |
90 |
"# Veriyi .parquet formatında kaydetme\n",
91 |
92 |
93 |
94 |
95 |
96 |
"cell_type": "markdown",
97 |
"metadata": {},
98 |
"source": [
99 |
"Dataframe deki bilgileri görme "
100 |
101 |
102 |
103 |
"cell_type": "code",
104 |
"execution_count": 7,
105 |
"metadata": {},
106 |
"outputs": [
107 |
108 |
"name": "stdout",
109 |
"output_type": "stream",
110 |
"text": [
111 |
" id url \\\n",
112 |
"515773 3525037 \n",
113 |
"517811 3532700 \n",
114 |
"436350 3203545 \n",
115 |
"223281 1765445 \n",
116 |
"100272 575462 \n",
117 |
118 |
" title text \n",
119 |
"515773 Pşıqo Ahecaqo Pşıqo Ahecaqo (), Çerkes siyasetçi, askeri kom... \n",
120 |
"517811 Craterolophinae Craterolophinae, Depastridae familyasına bağlı... \n",
121 |
"436350 Notocrabro Notocrabro Crabronina oymağına bağlı bir cinst... \n",
122 |
"223281 Ibrahim Sissoko İbrahim Sissoko (d. 30 Kasım 1991), Fildişi Sa... \n",
123 |
"100272 Salah Cedid Salah Cedid (1926-1993) (Arapça: صلاح جديد) Su... \n",
124 |
" id url title \\\n",
125 |
"5 35 Karl Marx \n",
126 |
"13 48 Ruhi Su \n",
127 |
"15 53 Bilgisayar \n",
128 |
"18 59 Edebiyat \n",
129 |
"19 64 Mühendislik \n",
130 |
131 |
" text \n",
132 |
"5 Karl Marx (; 5 Mayıs 1818, Trier – 14 Mart 188... \n",
133 |
"13 Mehmet Ruhi Su (1 Ocak 1912, Van - 20 Eylül 19... \n",
134 |
"15 Bilgisayar, aritmetik veya mantıksal işlem diz... \n",
135 |
"18 Edebiyat, yazın veya literatür; olay, düşünce,... \n",
136 |
"19 Mühendis, insanların her türlü ihtiyacını karş... \n"
137 |
138 |
139 |
140 |
"source": [
141 |
142 |
143 |
144 |
145 |
146 |
"cell_type": "markdown",
147 |
"metadata": {},
148 |
"source": [
149 |
"MongoDb'ye bağlama ve bilgi çekme "
150 |
151 |
152 |
153 |
"cell_type": "code",
154 |
"execution_count": 7,
155 |
"metadata": {},
156 |
"outputs": [
157 |
158 |
"name": "stdout",
159 |
"output_type": "stream",
160 |
"text": [
161 |
" Veriler başarıyla Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'EgitimDatabase'), 'train') MongoDb koleksiyonuna indirildi.\n",
162 |
" Veriler başarıyla Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'EgitimDatabase'), 'test') MongoDb koleksiyonuna indirildi.\n"
163 |
164 |
165 |
166 |
"source": [
167 |
"import pandas as pd\n",
168 |
"from pymongo import MongoClient\n",
169 |
170 |
"def get_mongodb(database_name='EgitimDatabase', train_collection_name='train', test_collection_name='test', host='localhost', port=27017):\n",
171 |
" \"\"\"\n",
172 |
" MongoDB connection and collection selection for train and test collections.\n",
173 |
" \"\"\"\n",
174 |
" client = MongoClient(f'mongodb://{host}:{port}/')\n",
175 |
" \n",
176 |
" # Veritabanını seçin\n",
177 |
" db = client[database_name]\n",
178 |
" \n",
179 |
" # Train ve test koleksiyonlarını seçin\n",
180 |
" train_collection = db[train_collection_name]\n",
181 |
" test_collection = db[test_collection_name]\n",
182 |
" \n",
183 |
" return train_collection, test_collection\n",
184 |
185 |
"# Function to load dataset into MongoDB\n",
186 |
"def dataset_read(train_file_path,test_file_path):\n",
187 |
" data_train = pd.read_parquet(train_file_path, columns=['id', 'url', 'title', 'text'])\n",
188 |
" data_test = pd.read_parquet(test_file_path, columns=['id', 'url', 'title', 'text'])\n",
189 |
" data_dict_train = data_train.to_dict(\"records\")\n",
190 |
" data_dict_test = data_test.to_dict(\"records\")\n",
191 |
192 |
193 |
194 |
" # Get the MongoDB collections\n",
195 |
" train_collection, test_collection = get_mongodb(database_name='EgitimDatabase')\n",
196 |
197 |
" \n",
198 |
199 |
" # Insert data into MongoDB\n",
200 |
" train_collection.insert_many(data_dict_train)\n",
201 |
" test_collection.insert_many(data_dict_test)\n",
202 |
203 |
204 |
" print(f\" Veriler başarıyla {train_collection} MongoDb koleksiyonuna indirildi.\")\n",
205 |
" print(f\" Veriler başarıyla {test_collection} MongoDb koleksiyonuna indirildi.\")\n",
206 |
" return train_collection,test_collection\n",
207 |
208 |
"# Train ve test datasetlerini MongoDB'ye yüklemek için fonksiyonu çağır\n",
209 |
"train_file_path = 'C:\\\\gitProjects\\\\bert\\\\datasets\\\\train_Egitim\\\\merged_train.parquet'\n",
210 |
"test_file_path = 'C:\\\\gitProjects\\\\bert\\\\datasets\\\\test_Egitim\\\\merged_test.parquet'\n",
211 |
212 |
"train_collection, test_collection = dataset_read(train_file_path, test_file_path)"
213 |
214 |
215 |
216 |
"cell_type": "markdown",
217 |
"metadata": {},
218 |
"source": [
219 |
"Similarity Sentences "
220 |
221 |
222 |
223 |
"cell_type": "code",
224 |
"execution_count": 8,
225 |
"metadata": {},
226 |
"outputs": [
227 |
228 |
"ename": "ModuleNotFoundError",
229 |
"evalue": "No module named 'torch.amp'",
230 |
"output_type": "error",
231 |
"traceback": [
232 |
233 |
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
234 |
"Cell \u001b[1;32mIn[8], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m#datasete similarity sentence yardımıyla keywords ve subheadings tanımlama \u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtext\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TfidfVectorizer\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SentenceTransformer\n\u001b[0;32m 6\u001b[0m model \u001b[38;5;241m=\u001b[39m SentenceTransformer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124memrecan/bert-base-turkish-cased-mean-nli-stsb-tr\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 7\u001b[0m \u001b[38;5;66;03m#text dosyasını koleksiyon üzerinden çekme \u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\u001b[39;00m\n",
235 |
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sentence_transformers\\\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mimportlib\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcross_encoder\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mCrossEncoder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CrossEncoder\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ParallelSentencesDataset, SentencesDataset\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mLoggingHandler\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LoggingHandler\n",
236 |
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sentence_transformers\\cross_encoder\\\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mCrossEncoder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CrossEncoder\n\u001b[0;32m 3\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCrossEncoder\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
237 |
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\sentence_transformers\\cross_encoder\\\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Callable, Dict, List, Literal, Optional, Tuple, Type, Union\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Tensor, nn\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01moptim\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Optimizer\n",
238 |
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\\u001b[0m\n\u001b[0;32m 1683\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to find torch_shm_manager at \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m path)\n\u001b[0;32m 1684\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m path\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m-> 1686\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mamp\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m autocast, GradScaler\n\u001b[0;32m 1688\u001b[0m \u001b[38;5;66;03m# Initializing the extension shadows the built-in python float / int classes;\u001b[39;00m\n\u001b[0;32m 1689\u001b[0m \u001b[38;5;66;03m# store them for later use by SymInt / SymFloat.\u001b[39;00m\n\u001b[0;32m 1690\u001b[0m py_float \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mfloat\u001b[39m\n",
239 |
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'torch.amp'"
240 |
241 |
242 |
243 |
"source": [
244 |
"#datasete similarity sentence yardımıyla keywords ve subheadings tanımlama \n",
245 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
246 |
"from sentence_transformers import SentenceTransformer\n",
247 |
248 |
249 |
"model = SentenceTransformer(\"emrecan/bert-base-turkish-cased-mean-nli-stsb-tr\")\n",
250 |
"#text dosyasını koleksiyon üzerinden çekme \n",
251 |
"# Database sınıfı: Veritabanı bağlantıları ve verileri çekme işlevleri\n",
252 |
"class Database:\n",
253 |
" @staticmethod\n",
254 |
" def get_mongodb():\n",
255 |
" # MongoDB bağlantı bilgilerini döndürecek şekilde tanımlanmıştır.\n",
256 |
" return 'mongodb://localhost:27017/', 'EgitimDatabase', 'train'\n",
257 |
258 |
" @staticmethod\n",
259 |
" def get_input_titles():\n",
260 |
" mongo_url, db_name, collection_name = Database.get_mongodb()\n",
261 |
" client = MongoClient(mongo_url)\n",
262 |
" db = client[db_name]\n",
263 |
" collection = db[collection_name]\n",
264 |
" query = {\"title\": {\"$exists\": True}}\n",
265 |
" cursor = collection.find(query, {\"title\": 1, \"_id\": 0})\n",
266 |
" title_from_db = [doc['title'] for doc in cursor]\n",
267 |
" return title_from_db\n",
268 |
" \n",
269 |
" @staticmethod\n",
270 |
" def get_input_texts():\n",
271 |
" mongo_url, db_name, collection_name = Database.get_mongodb()\n",
272 |
" client = MongoClient(mongo_url)\n",
273 |
" db = client[db_name]\n",
274 |
" collection = db[collection_name]\n",
275 |
" query = {\"text\": {\"$exists\": True}}\n",
276 |
" cursor = collection.find(query, {\"text\": 1, \"_id\": 0})\n",
277 |
" text_from_db = [doc['text'] for doc in cursor]\n",
278 |
" return text_from_db\n",
279 |
280 |
281 |
"#TF-IDF hesaplama (anahtar kelimeler için)\n",
282 |
283 |
284 |
"#IDF = log ( Doküman Sayısı / Terimin Geçtiği Doküman Sayısı )\n",
285 |
286 |
"#text ve title a göre keywords belirlenmesi\n",
287 |
288 |
289 |
290 |
291 |
"#sbert ile alt başlıkların oluşturulması\n",
292 |
293 |
"#kümelenme ile alt başlıkların belirlenmesi \n",
294 |
295 |
296 |
297 |
"#anahtar kelime ve alt başlıkların veri tabanına eklenmesi "
298 |
299 |
300 |
301 |
"cell_type": "markdown",
302 |
"metadata": {},
303 |
"source": []
304 |
305 |
306 |
"cell_type": "markdown",
307 |
"metadata": {},
308 |
"source": []
309 |
310 |
311 |
"cell_type": "code",
312 |
"execution_count": null,
313 |
"metadata": {},
314 |
"outputs": [],
315 |
"source": [
316 |
"#prompt oluşturarak generate etmek için hazırlık"
317 |
318 |
319 |
320 |
"cell_type": "markdown",
321 |
"metadata": {},
322 |
"source": [
323 |
"BERT modeliyle tokenizer atama"
324 |
325 |
326 |
327 |
"cell_type": "code",
328 |
"execution_count": null,
329 |
"metadata": {},
330 |
"outputs": [],
331 |
"source": [
332 |
"tokenizer= BertTokenizer.from_pretrained('bert-base-uncased')\n",
333 |
334 |
335 |
336 |
337 |
"input_file: Modelin işlem yapacağı giriş dosyasının yolunu belirtir. Bu dosya, metin verilerini içermelidir.\n",
338 |
339 |
"output_file: Modelin çıktılarının kaydedileceği dosyanın yolunu belirtir.\n",
340 |
341 |
"layers: Hangi BERT katmanlarının kullanılacağını belirler. Örneğin, \"-1,-2,-3,-4\" son dört katmanı ifade eder.\n",
342 |
343 |
"bert_config_file: Önceden eğitilmiş BERT modelinin yapılandırma dosyasının yolu. Bu dosya modelin mimarisini belirler.\n",
344 |
345 |
"max_seq_length: Giriş sekanslarının maksimum uzunluğu. Sekanslar bu uzunluktan uzunsa kesilir, kısa ise sıfır ile doldurulur.\n",
346 |
347 |
"init_checkpoint: Başlangıç ağırlıkları. Genellikle önceden eğitilmiş bir BERT modelinin ağırlıkları buradan yüklenir.\n",
348 |
349 |
"vocab_file: BERT modelinin eğitildiği kelime dağarcığının (vocabulary) dosya yolu. Modelin kelime parçacıklarını tanıması için gereklidir.\n",
350 |
351 |
"do_lower_case: Giriş metinlerinin küçük harfe mi dönüştürüleceğini belirler. Küçük harfli model için True, büyük harfli model için False olmalıdır.\n",
352 |
353 |
"batch_size: Tahminler sırasında kullanılacak veri kümesi boyutu.\n",
354 |
355 |
"use_tpu: TPU (Tensor Processing Unit) kullanılıp kullanılmayacağını belirler. True ise TPU, False ise GPU/CPU kullanılır.\n",
356 |
357 |
"master: TPU kullanılıyorsa, TPU'nun ana makinesinin adresi.\n",
358 |
359 |
"num_tpu_cores: TPU kullanılacaksa, toplam TPU çekirdek sayısını belirtir.\n",
360 |
361 |
"use_one_hot_embeddings: TPUs'da genellikle True olarak ayarlanır çünkü bu, tf.one_hot fonksiyonunu kullanarak embedding lookup işlemlerini hızlandırır. GPU/CPU kullanılıyorsa False tercih edilir.\"\"\"\n"
362 |
363 |
364 |
365 |
"cell_type": "markdown",
366 |
"metadata": {},
367 |
"source": [
368 |
"T5 Modeli"
369 |
370 |
371 |
372 |
"cell_type": "code",
373 |
"execution_count": null,
374 |
"metadata": {},
375 |
"outputs": [],
376 |
"source": [
377 |
"from transformers import pipeline\n",
378 |
"from dotenv import load_dotenv\n",
379 |
"import os \n",
380 |
"# Load model directly\n",
381 |
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
382 |
383 |
384 |
"#tokenizer ve modelin yüklenmesi\n",
385 |
"tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-small\")\n",
386 |
"model = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-small\")\n",
387 |
"prompt = \"Write an article about Machine Learning in Healthcare focusing on Introduction to ML and Applications in Healthcare.\"\n",
388 |
"#API anahtarını çevresel değişkenden al\n",
389 |
"api_key= os.getenv('HUGGINGFACE_API_KEY')\n",
390 |
"#env dosyasını yükleme\n",
391 |
392 |
393 |
394 |
"if api_key is None:\n",
395 |
" raise ValueError(\"Apı anahtarı .env dosyasında bulunamadı\")\n",
396 |
397 |
"# Başlıkları oluştur\n",
398 |
"headers = {\"Authorization\": f\"Bearer {api_key}\"}\n",
399 |
400 |
"inputs=tokenizer(prompt, return_tensors=\"pt\")\n",
401 |
"input_sequence = \"[CLS] Machine Learning in Healthcare [SEP] Introduction to ML [SEP] Applications in Healthcare [SEP] machine learning, healthcare, AI [SEP]\"\n",
402 |
"#deneme data parçası\n",
403 |
"data = {\n",
404 |
" \"title\": \"Machine Learning in Healthcare\",\n",
405 |
" \"sub_headings\": [\"Introduction to ML\", \"Applications in Healthcare\"],\n",
406 |
" \"keywords\": [\"machine learning\", \"healthcare\", \"AI\"]\n",
407 |
408 |
409 |
"# Girdiyi oluşturma\n",
410 |
"prompt = (\n",
411 |
" f\"Title: {data['title']}\\n\"\n",
412 |
" f\"Sub-headings: {', '.join(data['sub_headings'])}\\n\"\n",
413 |
" f\"Keywords: {', '.join(data['keywords'])}\\n\"\n",
414 |
" f\"Content: {input_sequence}\\n\"\n",
415 |
" \"Please generate a detailed article based on the above information.\"\n",
416 |
417 |
418 |
"#metin üretimi \n",
419 |
"output_sequences = model.generate(\n",
420 |
" inputs['input_ids'],\n",
421 |
" max_length=300, # Üretilecek metnin maksimum uzunluğu\n",
422 |
" min_length=150, # Üretilecek metnin minimum uzunluğu\n",
423 |
" num_return_sequences=1, # Döndürülecek metin sayısı\n",
424 |
" do_sample=True, # Örneklemeye izin ver\n",
425 |
" top_k=50, # Top-k sampling kullan\n",
426 |
" top_p=0.95, # Top-p sampling kullan\n",
427 |
" repetition_penalty=1.2, # Anlamsız tekrarları önlemek için ceza\n",
428 |
" eos_token_id=tokenizer.eos_token_id # Tam cümlelerin oluşturulmasını sağla\n",
429 |
430 |
431 |
432 |
"# Üretilen metni token'lardan çözüp string'e çevir\n",
433 |
"generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)\n",
434 |
435 |
436 |
437 |
438 |
439 |
"metadata": {
440 |
"kernelspec": {
441 |
"display_name": "base",
442 |
"language": "python",
443 |
"name": "python3"
444 |
445 |
"language_info": {
446 |
"codemirror_mode": {
447 |
"name": "ipython",
448 |
"version": 3
449 |
450 |
"file_extension": ".py",
451 |
"mimetype": "text/x-python",
452 |
"name": "python",
453 |
"nbconvert_exporter": "python",
454 |
"pygments_lexer": "ipython3",
455 |
"version": "3.10.11"
456 |
457 |
458 |
"nbformat": 4,
459 |
"nbformat_minor": 2
460 |
@@ -0,0 +1,69 @@
1 |
2 |
"cells": [
3 |
4 |
"cell_type": "code",
5 |
"execution_count": 3,
6 |
"metadata": {},
7 |
"outputs": [
8 |
9 |
"name": "stderr",
10 |
"output_type": "stream",
11 |
"text": [
12 |
"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\tqdm\\ TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See\n",
13 |
" from .autonotebook import tqdm as notebook_tqdm\n"
14 |
15 |
16 |
17 |
"ename": "OSError",
18 |
"evalue": "[WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies.",
19 |
"output_type": "error",
20 |
"traceback": [
21 |
22 |
"\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)",
23 |
"Cell \u001b[1;32mIn[3], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m \n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpymongo\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MongoClient\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder\n",
24 |
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING\n\u001b[0;32m 25\u001b[0m \u001b[38;5;66;03m# Check the dependencies satisfy the minimal versions required.\u001b[39;00m\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dependency_versions_check\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 28\u001b[0m OptionalDependencyNotAvailable,\n\u001b[0;32m 29\u001b[0m _LazyModule,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 48\u001b[0m logging,\n\u001b[0;32m 49\u001b[0m )\n\u001b[0;32m 52\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m) \u001b[38;5;66;03m# pylint: disable=invalid-name\u001b[39;00m\n",
25 |
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Team. All rights reserved.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdependency_versions_table\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m deps\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_version, require_version_core\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# define which module versions we always want to check at run time\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m# (usually the ones defined in `install_requires` in\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;66;03m# order specific notes:\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;66;03m# - tqdm must be checked before tokenizers\u001b[39;00m\n\u001b[0;32m 25\u001b[0m pkgs_to_check_at_runtime \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpython\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtqdm\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 37\u001b[0m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyyaml\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 38\u001b[0m ]\n",
26 |
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstants\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD\n\u001b[0;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdoc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 27\u001b[0m add_code_sample_docstrings,\n\u001b[0;32m 28\u001b[0m add_end_docstrings,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 32\u001b[0m replace_return_docstrings,\n\u001b[0;32m 33\u001b[0m )\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgeneric\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 35\u001b[0m ContextManagers,\n\u001b[0;32m 36\u001b[0m ExplicitEnum,\n\u001b[0;32m 37\u001b[0m ModelOutput,\n\u001b[0;32m 38\u001b[0m PaddingStrategy,\n\u001b[0;32m 39\u001b[0m TensorType,\n\u001b[0;32m 40\u001b[0m add_model_info_to_auto_map,\n\u001b[0;32m 41\u001b[0m add_model_info_to_custom_pipelines,\n\u001b[0;32m 42\u001b[0m cached_property,\n\u001b[0;32m 43\u001b[0m can_return_loss,\n\u001b[0;32m 44\u001b[0m expand_dims,\n\u001b[0;32m 45\u001b[0m filter_out_non_signature_kwargs,\n\u001b[0;32m 46\u001b[0m find_labels,\n\u001b[0;32m 47\u001b[0m flatten_dict,\n\u001b[0;32m 48\u001b[0m infer_framework,\n\u001b[0;32m 49\u001b[0m is_jax_tensor,\n\u001b[0;32m 50\u001b[0m is_numpy_array,\n\u001b[0;32m 51\u001b[0m is_tensor,\n\u001b[0;32m 52\u001b[0m is_tf_symbolic_tensor,\n\u001b[0;32m 53\u001b[0m is_tf_tensor,\n\u001b[0;32m 54\u001b[0m is_torch_device,\n\u001b[0;32m 55\u001b[0m is_torch_dtype,\n\u001b[0;32m 56\u001b[0m is_torch_tensor,\n\u001b[0;32m 57\u001b[0m reshape,\n\u001b[0;32m 58\u001b[0m squeeze,\n\u001b[0;32m 59\u001b[0m 
strtobool,\n\u001b[0;32m 60\u001b[0m tensor_size,\n\u001b[0;32m 61\u001b[0m to_numpy,\n\u001b[0;32m 62\u001b[0m to_py_obj,\n\u001b[0;32m 63\u001b[0m torch_float,\n\u001b[0;32m 64\u001b[0m torch_int,\n\u001b[0;32m 65\u001b[0m transpose,\n\u001b[0;32m 66\u001b[0m working_or_temp_dir,\n\u001b[0;32m 67\u001b[0m )\n\u001b[0;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 69\u001b[0m CLOUDFRONT_DISTRIB_PREFIX,\n\u001b[0;32m 70\u001b[0m HF_MODULES_CACHE,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 96\u001b[0m try_to_load_from_cache,\n\u001b[0;32m 97\u001b[0m )\n\u001b[0;32m 98\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimport_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 99\u001b[0m ACCELERATE_MIN_VERSION,\n\u001b[0;32m 100\u001b[0m ENV_VARS_TRUE_AND_AUTO_VALUES,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 219\u001b[0m torch_only_method,\n\u001b[0;32m 220\u001b[0m )\n",
27 |
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\transformers\\utils\\\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(\u001b[38;5;28mself\u001b[39m[k] \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_torch_available():\n\u001b[1;32m--> 462\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_pytree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_torch_pytree\u001b[39;00m\n\u001b[0;32m 464\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_model_output_flatten\u001b[39m(output: ModelOutput) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[List[Any], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_torch_pytree.Context\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 465\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mvalues()), \u001b[38;5;28mlist\u001b[39m(output\u001b[38;5;241m.\u001b[39mkeys())\n",
28 |
"File \u001b[1;32mc:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\\u001b[0m\n\u001b[0;32m 146\u001b[0m err \u001b[38;5;241m=\u001b[39m ctypes\u001b[38;5;241m.\u001b[39mWinError(ctypes\u001b[38;5;241m.\u001b[39mget_last_error())\n\u001b[0;32m 147\u001b[0m err\u001b[38;5;241m.\u001b[39mstrerror \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m Error loading \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdll\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m or one of its dependencies.\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 148\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[0;32m 150\u001b[0m kernel32\u001b[38;5;241m.\u001b[39mSetErrorMode(prev_error_mode)\n\u001b[0;32m 153\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_preload_cuda_deps\u001b[39m(lib_folder, lib_name):\n",
29 |
"\u001b[1;31mOSError\u001b[0m: [WinError 126] Belirtilen modül bulunamadı. Error loading \"c:\\gitProjects\\yeni\\.venv\\lib\\site-packages\\torch\\lib\\fbgemm.dll\" or one of its dependencies."
30 |
31 |
32 |
33 |
"source": [
34 |
"from datasets import load_dataset\n",
35 |
"import pandas as pd \n",
36 |
"from pymongo import MongoClient\n",
37 |
"from transformers import BertTokenizer, BertForMaskedLM, DPRContextEncoderTokenizer,DPRContextEncoder"
38 |
39 |
40 |
41 |
"cell_type": "code",
42 |
"execution_count": null,
43 |
"metadata": {},
44 |
"outputs": [],
45 |
"source": []
46 |
47 |
48 |
"metadata": {
49 |
"kernelspec": {
50 |
"display_name": ".venv",
51 |
"language": "python",
52 |
"name": "python3"
53 |
54 |
"language_info": {
55 |
"codemirror_mode": {
56 |
"name": "ipython",
57 |
"version": 3
58 |
59 |
"file_extension": ".py",
60 |
"mimetype": "text/x-python",
61 |
"name": "python",
62 |
"nbconvert_exporter": "python",
63 |
"pygments_lexer": "ipython3",
64 |
"version": "3.10.11"
65 |
66 |
67 |
"nbformat": 4,
68 |
"nbformat_minor": 2
69 |
@@ -0,0 +1,3 @@
1 |
home = C:\Users\info\AppData\Local\Programs\Python\Python310
2 |
include-system-site-packages = false
3 |
version = 3.10.11
@@ -1 +1,127 @@
1 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:8e1d337fc9fef9ba455b318d88c240308e260ae31a80552abfa690ecd897a05c
3 |
size 107383375
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:d8de4c861ff88766cd74eefde2e1eb501f782bfeca81779f4a29645735d01f2f
3 |
size 447530173
@@ -0,0 +1,54 @@
1 |
*.7z filter=lfs diff=lfs merge=lfs -text
2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
3 |
*.bin filter=lfs diff=lfs merge=lfs -text
4 |
*.bz2 filter=lfs diff=lfs merge=lfs -text
5 |
*.ckpt filter=lfs diff=lfs merge=lfs -text
6 |
*.ftz filter=lfs diff=lfs merge=lfs -text
7 |
*.gz filter=lfs diff=lfs merge=lfs -text
8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
10 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
11 |
*.lz4 filter=lfs diff=lfs merge=lfs -text
12 |
*.mlmodel filter=lfs diff=lfs merge=lfs -text
13 |
*.model filter=lfs diff=lfs merge=lfs -text
14 |
*.msgpack filter=lfs diff=lfs merge=lfs -text
15 |
*.npy filter=lfs diff=lfs merge=lfs -text
16 |
*.npz filter=lfs diff=lfs merge=lfs -text
17 |
*.onnx filter=lfs diff=lfs merge=lfs -text
18 |
*.ot filter=lfs diff=lfs merge=lfs -text
19 |
*.parquet filter=lfs diff=lfs merge=lfs -text
20 |
*.pb filter=lfs diff=lfs merge=lfs -text
21 |
*.pickle filter=lfs diff=lfs merge=lfs -text
22 |
*.pkl filter=lfs diff=lfs merge=lfs -text
23 |
*.pt filter=lfs diff=lfs merge=lfs -text
24 |
*.pth filter=lfs diff=lfs merge=lfs -text
25 |
*.rar filter=lfs diff=lfs merge=lfs -text
26 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
27 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
31 |
*.wasm filter=lfs diff=lfs merge=lfs -text
32 |
*.xz filter=lfs diff=lfs merge=lfs -text
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
36 |
# Audio files - uncompressed
37 |
*.pcm filter=lfs diff=lfs merge=lfs -text
38 |
*.sam filter=lfs diff=lfs merge=lfs -text
39 |
*.raw filter=lfs diff=lfs merge=lfs -text
40 |
# Audio files - compressed
41 |
*.aac filter=lfs diff=lfs merge=lfs -text
42 |
*.flac filter=lfs diff=lfs merge=lfs -text
43 |
*.mp3 filter=lfs diff=lfs merge=lfs -text
44 |
*.ogg filter=lfs diff=lfs merge=lfs -text
45 |
*.wav filter=lfs diff=lfs merge=lfs -text
46 |
# Image files - uncompressed
47 |
*.bmp filter=lfs diff=lfs merge=lfs -text
48 |
*.gif filter=lfs diff=lfs merge=lfs -text
49 |
*.png filter=lfs diff=lfs merge=lfs -text
50 |
*.tiff filter=lfs diff=lfs merge=lfs -text
51 |
# Image files - compressed
52 |
*.jpg filter=lfs diff=lfs merge=lfs -text
53 |
*.jpeg filter=lfs diff=lfs merge=lfs -text
54 |
*.webp filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,85 @@
1 |
2 |
3 |
- no-annotation
4 |
5 |
- tr
6 |
7 |
- crowdsourced
8 |
9 |
- cc-by-sa-3.0
10 |
- gfdl
11 |
multilinguality: []
12 |
pretty_name: Turkish Wikipedia 2023
13 |
14 |
- 100K<n<1M
15 |
16 |
- original
17 |
18 |
- wikipedia, wiki,
19 |
20 |
- fill-mask
21 |
- text-generation
22 |
23 |
- masked-language-modeling
24 |
25 |
26 |
- name: id
27 |
dtype: string
28 |
- name: url
29 |
dtype: string
30 |
- name: title
31 |
dtype: string
32 |
- name: text
33 |
dtype: string
34 |
35 |
- name: train
36 |
num_bytes: 956353353
37 |
num_examples: 520542
38 |
download_size: 529875169
39 |
dataset_size: 956353353
40 |
41 |
42 |
# 📖 Türkçe Vikipedi Mayıs 2023
43 |
Bu veri kümesi, Türkçe Vikipedi'den alınan makalelerin bir derlemesi olup, maskeli dil modelleme ve metin oluşturma görevleri için tasarlanmıştır.
44 |
45 |
46 |
## 🗣️ Etiketlemeler
47 |
Bu veri kümesindeki makaleler, özellikle belirli bir görev için etiketlenmemiş olup, veri kümesi etiketsizdir.
48 |
49 |
## 🌐 Dil
50 |
Bu veri kümesi Türkçe yazılmış olup, gönüllülerden oluşan bir ekip tarafından topluluk katılımı yöntemleri ile oluşturulmuştur.
51 |
52 |
## 📜 Lisans
53 |
CC-BY-SA 3.0 ve GFDL
54 |
55 |
## 💻 Kaynak Veri Kümeleri
56 |
Bu veri kümesi, Türkçe Vikipedi'den oluşturulan orijinal bir veri kümesidir.
57 |
58 |
59 |
Türkçe Vikipedi veri kümesini kullandığınız için teşekkürler! Dil modelleme ve metin oluşturma görevleriniz için faydalı olmasını umuyoruz.
60 |
61 |
62 |
63 |
# 📖 Turkish Wikipedia 2023
64 |
65 |
This dataset is a collection of articles from the Turkish Wikipedia and is designed to be used for masked language modeling and text generation tasks.
66 |
67 |
## 📚 Dataset Info
68 |
69 |
Processed and cleaned using the Hugging Face Wikipedia cleaner.
70 |
71 |
## 🗣️ Annotations
72 |
73 |
The articles in this dataset were not specifically annotated for any particular task, meaning that the dataset is unlabeled.
74 |
75 |
## 🌐 Language
76 |
77 |
This dataset is written in Turkish and was created using crowdsourcing methods by a team of volunteers.
78 |
79 |
## 📜 License
80 |
81 |
CC-BY-SA 3.0 and GFDL
82 |
83 |
## 💻 Source Datasets
84 |
85 |
This dataset is an original dataset created from the Turkish Wikipedia.
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:cf5ad76fe61cebe5b5a165c9c74a510cd9ee8013dc23c04fca44c58d7ffe13cb
3 |
size 328791077
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:2165870d3d8504c01cd1783ac654efe9144f9f1094aed360deb64a0eaa211246
3 |
size 201084092