# titusz's picture
# Synced repo using 'sync_with_huggingface' GitHub Action
# 8c51bed verified
import argparse
import glob
from pathlib import Path
from loguru import logger
from iscc_sct.main import create
from charset_normalizer import from_bytes
def main(argv=None):
    """Command-line entry point: print Semantic Text-Codes for text files.

    Parses the command line and then does one of three things:

    * no ``path`` given: print the argparse help and return;
    * ``path`` equal to ``gui``: launch the Gradio demo from ``iscc_sct.demo``;
    * otherwise: treat ``path`` as a glob pattern and, for every match that is
      a file, read its bytes, decode them to text and print the result of
      ``create`` (the ``iscc`` attribute by default, ``repr`` of the whole
      result with ``--granular``).

    Bytes are decoded as UTF-8 first; if that fails, ``charset_normalizer`` is
    asked for the best matching encoding. Files whose decoded text is empty or
    whitespace-only are skipped, regardless of which decoding path was used.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) lets argparse read
            ``sys.argv[1:]`` as before.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
    parser.add_argument(
        "path",
        type=str,
        help="Path to text files (supports glob patterns) or 'gui' to launch Gradio demo.",
        nargs="?",
    )
    parser.add_argument(
        "-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
    )
    parser.add_argument(
        "-g", "--granular", action="store_true", help="Activate granular processing."
    )
    parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
    args = parser.parse_args(argv)

    if args.path is None:
        parser.print_help()
        return

    if not args.debug:
        # Without --debug the loguru handlers are removed, so the log calls
        # below produce no output.
        logger.remove()

    if args.path == "gui":  # pragma: no cover
        try:
            # Imported lazily so the GUI dependency is only needed for this mode.
            from iscc_sct.demo import demo

            demo.launch(inbrowser=True)
        except ImportError:
            print(
                "Error: Gradio is not installed. Please install it with 'pip install gradio' to use the GUI."
            )
        return

    for match in glob.glob(args.path):
        path = Path(match)
        if not path.is_file():
            # The pattern may also match directories; only files are processed.
            continue

        logger.debug(f"Processing {path.name}")
        data = path.read_bytes()

        try:
            text = data.decode("utf-8")
        except UnicodeDecodeError:
            logger.debug(f"Could not decode {path.name} as UTF-8.")
            charset_match = from_bytes(data).best()
            if not charset_match:  # pragma: no cover
                logger.error(f"SKIPPING {path.name} - failed to detect text encoding")
                continue
            logger.debug(f"Decode {path.name} with {charset_match.encoding}.")
            text = str(charset_match)

        # Checked after both decoding paths so that blank files in a non-UTF-8
        # encoding are skipped just like blank UTF-8 files.
        if not text.strip():
            logger.warning(f"SKIPPED empty: {path}")
            continue

        sct_meta = create(text, granular=args.granular, bits=args.bits)
        if args.granular:
            print(repr(sct_meta))
        else:
            print(sct_meta.iscc)
# Run the command-line interface only when this module is executed as a
# script; importing the module does not call main().
if __name__ == "__main__":  # pragma: no cover
    main()