Spaces:
Runtime error
Runtime error
File size: 3,836 Bytes
bfc0ec6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
"""Find secret keys in text.
# Code forked from
# https://github.com/bigcode-project/pii-lib/blob/main/utils/keys_detection.py
# under the Apache 2.0 License.
"""
import os
import tempfile
from typing import Iterator
from detect_secrets import SecretsCollection
from detect_secrets.settings import transient_settings
from ..schema import Item, lilac_span
# Secrets detection with detect-secrets tool
# Heuristic filters handed to detect-secrets as `filters_used`.
# This is a subset of the built-in filters documented at
# https://github.com/Yelp/detect-secrets/blob/master/docs/filters.md#built-in-filters
# (some of the built-in filters were removed based on their targets).
filters = [
    {'path': 'detect_secrets.filters.heuristic.is_potential_uuid'},
    {'path': 'detect_secrets.filters.heuristic.is_likely_id_string'},
    {'path': 'detect_secrets.filters.heuristic.is_templated_secret'},
    {'path': 'detect_secrets.filters.heuristic.is_sequential_string'},
]
# Detector plugins handed to detect-secrets as `plugins_used`, one entry per plugin name.
plugins = [
    {'name': 'ArtifactoryDetector'},
    {'name': 'AWSKeyDetector'},
    {'name': 'AzureStorageKeyDetector'},
    {'name': 'CloudantDetector'},
    {'name': 'DiscordBotTokenDetector'},
    {'name': 'GitHubTokenDetector'},
    {'name': 'IbmCloudIamDetector'},
    {'name': 'IbmCosHmacDetector'},
    {'name': 'JwtTokenDetector'},
    {'name': 'MailchimpDetector'},
    {'name': 'NpmDetector'},
    {'name': 'SendGridDetector'},
    {'name': 'SlackDetector'},
    {'name': 'SoftlayerDetector'},
    {'name': 'StripeDetector'},
    {'name': 'TwilioKeyDetector'},
    # Three plugins are deliberately left disabled:
    #   BasicAuthDetector, KeywordDetector, PrivateKeyDetector
]
def _is_hash(content: str, value: str) -> bool:
"""Second check if the value is a hash (after gibberish detector)."""
try:
index = content.index(value)
except ValueError:
return False
lines = content[:index].splitlines()
if not lines:
return False
target_line = lines[-1]
if len(value) in [32, 40, 64]:
# if 'sha' or 'md5' are in content:
keywords = ['sha', 'md5', 'hash', 'byte']
if any(x in target_line.lower() for x in keywords):
return True
return False
def _file_has_hashes(content: str, coeff: float = 0.02) -> bool:
"""Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines."""
lines = content.splitlines()
count_sha = 0
count_hash = 0
nlines = content.count('\n')
threshold = int(coeff * nlines)
for line in lines:
count_sha += line.lower().count('sha')
count_hash += line.lower().count('hash')
if count_sha > threshold or count_hash > threshold:
return True
return False
def _get_indexes(text: str, value: str) -> list[tuple[int, int]]:
string = text
indexes: list[int] = []
new_start = 0
while True:
try:
start = string.index(value)
indexes.append(new_start + start)
new_start = new_start + start + len(value)
string = text[new_start:]
except ValueError:
break
return [(x, x + len(value)) for x in indexes]
def find_secrets(content: str, suffix: str = '.txt') -> Iterator[Item]:
    """Detect secret keys in content using detect-secrets tool.

    The content is written to a temporary file (detect-secrets is invoked through
    `scan_file`), scanned with the module-level `plugins` and `filters`, and the temporary
    file is removed again even if writing or scanning fails.

    Args:
      content: The text to scan.
      suffix: File suffix given to the temporary file.

    Yields:
      `lilac_span(start, end)` for every occurrence in `content` of each reported secret
      value. Secrets with an empty value, and secrets for which `_is_hash` is true, are
      skipped; nothing is yielded when `_file_has_hashes(content)` is true.
    """
    # `delete=False` so the file can be closed before it is scanned by name; removal is
    # handled explicitly below.
    fp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, mode='w')
    try:
        with fp:
            fp.write(content)
        secrets = SecretsCollection()
        with transient_settings({'plugins_used': plugins, 'filters_used': filters}):
            secrets.scan_file(fp.name)
    finally:
        # Always clean up, otherwise a failed write/scan leaves the file behind.
        os.unlink(fp.name)

    secrets_set = list(secrets.data.values())
    if not secrets_set:
        return

    # This check depends only on `content`, so evaluate it once instead of once per secret.
    if _file_has_hashes(content):
        return

    # Only one file was scanned, so the first entry holds all of its secrets.
    for secret in secrets_set[0]:
        value = secret.secret_value
        if not value or _is_hash(content, value):
            continue
        for start, end in _get_indexes(content, value):
            yield lilac_span(start, end)
|