_________ / Zatta /tag_counter.py
Default38693's picture
Upload 8 files
ffc9a51
raw
history blame
1.28 kB
from pathlib import Path
from collections import defaultdict
# Picks out the tags that are shared by at least n tenths (n x 10%) of the caption files.
def count_tags(directory_path):
    """Count how many caption files in a directory contain each tag.

    Every ``*.txt`` file directly inside *directory_path* is read as a
    comma-separated list of tags. Whitespace around each tag is stripped,
    empty entries (empty file, trailing comma) are ignored, and a tag that
    is repeated inside one file is counted once for that file, so each
    count is a number of files rather than a number of occurrences.

    Args:
        directory_path: Directory holding the caption files (a ``str`` or
            path-like object accepted by ``pathlib.Path``).

    Returns:
        A ``(tag_count, total_files)`` tuple. ``tag_count`` is a
        ``defaultdict(int)`` mapping each tag to the number of files it
        appears in; ``total_files`` is the number of ``*.txt`` files read
        (empty files included).

    Raises:
        UnicodeDecodeError: If a caption file is not valid UTF-8.
    """
    dir_path = Path(directory_path)
    tag_count = defaultdict(int)
    total_files = 0

    for file_path in dir_path.glob('*.txt'):
        total_files += 1
        # Explicit encoding so the result does not depend on the platform
        # locale. NOTE(review): assumes caption files are UTF-8 -- confirm.
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_tags = f.read().split(',')

        # Strip per tag (not globally) so multi-word tags keep their inner
        # spaces. dict.fromkeys de-duplicates within the file while keeping
        # first-seen order, which keeps the final output order stable.
        tags_in_file = dict.fromkeys(tag.strip() for tag in raw_tags)
        tags_in_file.pop('', None)  # drop blanks from empty files / stray commas

        for tag in tags_in_file:
            tag_count[tag] += 1

    return tag_count, total_files
def find_common_tags(tag_count, total_files, threshold):
    """Return the tags whose share of files reaches *threshold*.

    Args:
        tag_count: Mapping of tag -> number of files containing the tag.
        total_files: Total number of files the counts were taken over.
        threshold: Minimum fraction (e.g. ``0.8`` for 80%) of files a tag
            must appear in to be returned.

    Returns:
        A list of the qualifying tags, in the iteration order of
        *tag_count*. The list is empty when *total_files* is zero or
        negative, since no ratio can be computed.
    """
    # Without this guard a non-empty tag_count with total_files == 0 would
    # raise ZeroDivisionError in the ratio below.
    if total_files <= 0:
        return []

    common_tags = [
        tag
        for tag, count in tag_count.items()
        if count / total_files >= threshold
    ]
    return common_tags
if __name__ == "__main__":
    # Script entry point: tally the tags of every caption file in one
    # directory and report the ones shared by most of the files.

    # Keep only tags used in at least this fraction of the caption files
    # (0.8 = 80%).
    threshold = 0.8
    # Enter the path to the caption directory here.
    directory_path = r"E:\Dataset\XXXXXXXX"

    tag_count, total_files = count_tags(directory_path)

    # Show the raw tallies before the summary line.
    print(tag_count)
    print(total_files)

    output = ", ".join(find_common_tags(tag_count, total_files, threshold))
    print(f"Common tags (used in {threshold * 100}% or more of the files): {output}")