|
from pathlib import Path |
|
from collections import defaultdict |
|
|
|
|
|
|
|
def count_tags(directory_path):
    """Count, for every tag, how many ``*.txt`` files in a directory use it.

    Each file is treated as one comma-separated list of tags. Every space
    character is removed before splitting (so ``"long hair"`` is counted as
    ``"longhair"``), remaining surrounding whitespace such as newlines is
    stripped from each tag, and empty entries (empty file, trailing comma,
    ``",,"``) are ignored. A tag is counted at most once per file, so a
    count is a number of files and can never exceed ``total_files``.

    Args:
        directory_path: Directory to scan. Only ``*.txt`` files directly
            inside it are read; subdirectories are not searched.

    Returns:
        A ``(tag_count, total_files)`` tuple. ``tag_count`` is a
        ``defaultdict(int)`` mapping each tag to the number of files that
        contain it; ``total_files`` is the number of ``*.txt`` files read.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    tag_count = defaultdict(int)
    total_files = 0

    for file_path in Path(directory_path).glob("*.txt"):
        total_files += 1

        # Decode explicitly instead of relying on the platform default.
        # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which
        # would otherwise become part of the first tag.
        raw_text = file_path.read_text(encoding="utf-8-sig")

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # key order of the result is deterministic for a given file order.
        stripped = (piece.strip() for piece in raw_text.replace(" ", "").split(","))
        tags_in_file = dict.fromkeys(tag for tag in stripped if tag)

        for tag in tags_in_file:
            tag_count[tag] += 1

    return tag_count, total_files
|
|
|
def find_common_tags(tag_count, total_files, threshold):
    """Return the tags whose share of files is at least ``threshold``.

    Args:
        tag_count: Mapping of tag -> count of files using that tag.
        total_files: Total number of files the counts were taken over.
        threshold: Minimum ratio ``count / total_files`` (e.g. ``0.8``) a
            tag must reach to be included; the comparison is inclusive.

    Returns:
        A list of qualifying tags in the iteration order of ``tag_count``.
        The list is empty when ``total_files`` is zero or negative, since no
        ratio can be computed in that case.
    """
    # Without this guard a non-empty tag_count with zero files would raise
    # ZeroDivisionError below.
    if total_files <= 0:
        return []

    return [
        tag
        for tag, count in tag_count.items()
        if count / total_files >= threshold
    ]
|
|
|
if __name__ == "__main__":
    # Minimum share of files a tag must appear in to be reported.
    threshold = 0.8
    dataset_dir = r"E:\Dataset\XXXXXXXX"

    counts, file_total = count_tags(dataset_dir)

    # Show the raw tallies before filtering.
    print(counts)
    print(file_total)

    output = ", ".join(find_common_tags(counts, file_total, threshold))

    print(f"Common tags (used in {threshold * 100}% or more of the files): {output}")
|
|