"""Sync per-room object metadata from an S3 bucket into a Hugging Face dataset.

On import this module clones/pulls the dataset repository into
``sd-multiplayer-data``, creates the SQLite database from ``schema.sql`` if it
does not exist yet, and exposes a small FastAPI app whose startup hook re-runs
the sync every 30 minutes.
"""
import json
import os
import re
import sqlite3
import subprocess
from pathlib import Path

import boto3
import tqdm
from fastapi import FastAPI
from fastapi_utils.tasks import repeat_every
from huggingface_hub import Repository, HfFolder

AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY')
AWS_S3_BUCKET_NAME = os.getenv('AWS_S3_BUCKET_NAME')

s3 = boto3.client(service_name='s3',
                  aws_access_key_id=AWS_ACCESS_KEY_ID,
                  aws_secret_access_key=AWS_SECRET_KEY)
paginator = s3.get_paginator('list_objects_v2')

# Local checkout of the dataset repository; the SQLite file lives inside it,
# so the database itself is part of what gets pushed.
S3_DATA_FOLDER = Path("sd-multiplayer-data")
ROOMS_DATA_DB = S3_DATA_FOLDER / "rooms_data.db"

repo = Repository(
    local_dir=S3_DATA_FOLDER,
    repo_type="dataset",
    clone_from="huggingface-projects/sd-multiplayer-data",
    use_auth_token=True,
)
repo.git_pull()

if not ROOMS_DATA_DB.exists():
    print("Creating database")
    print("ROOMS_DATA_DB", ROOMS_DATA_DB)
    # Read the schema BEFORE connecting: sqlite3.connect() creates the file,
    # so a missing/unreadable schema.sql would otherwise leave an empty
    # database behind and this bootstrap would be skipped on the next start.
    schema_sql = Path("schema.sql").read_text()
    db = sqlite3.connect(ROOMS_DATA_DB)
    try:
        db.executescript(schema_sql)
        db.commit()
    finally:
        db.close()


def get_db(db_path):
    """Yield a single SQLite connection to *db_path* and close it afterwards.

    The connection uses ``sqlite3.Row`` as row factory (columns addressable by
    name) and ``check_same_thread=False``.

    NOTE(review): an exception thrown *into* the generator is rolled back and
    then swallowed (not re-raised). When the generator is consumed with a
    plain ``for`` loop, exceptions raised in the loop body are not thrown into
    it, so only the ``finally`` clean-up applies in that usage.
    """
    db = sqlite3.connect(db_path, check_same_thread=False)
    db.row_factory = sqlite3.Row
    try:
        yield db
    except Exception:
        db.rollback()
    finally:
        db.close()


def _parse_object_key(key):
    """Split an S3 object key into ``(uuid, x, y, prompt)``.

    The key is split on ``-``, ``/`` and ``.``; field 3 is taken as the uuid,
    field 4 must be ``"<x>_<y>"`` (two integers), and the prompt is field 4
    and everything after it joined with spaces.

    Raises:
        IndexError: the key has fewer than five fields.
        ValueError: field 4 is not exactly two ``_``-separated integers.
    """
    parts = re.split(r'[-/.]', key)
    object_uuid = parts[3]
    x, y = [int(n) for n in parts[4].split('_')]
    prompt = ' '.join(parts[4:])
    return object_uuid, x, y, prompt


def sync_rooms_to_dataset():
    """Mirror S3 object metadata for every room into SQLite, JSON and git.

    For each row of the ``rooms`` table, lists the bucket objects under
    ``<room_id>/``, inserts one ``rooms_data`` row per parseable object,
    dumps all ``rooms_data`` rows of that room to ``<room_id>.json`` in the
    dataset folder, and finally starts a background shell that amends the last
    commit and force-pushes the repository.

    Objects that cannot be parsed or inserted are reported with ``print`` and
    skipped; they do not abort the sync.
    """
    for room_data_db in get_db(ROOMS_DATA_DB):
        rooms = room_data_db.execute("SELECT * FROM rooms").fetchall()
        cursor = room_data_db.cursor()
        for row in tqdm.tqdm(rooms):
            room_id = row["room_id"]
            print("syncing room data: ", room_id)
            pages = paginator.paginate(
                Bucket=AWS_S3_BUCKET_NAME, Prefix=f'{room_id}/', Delimiter='/')
            for page in pages:
                # 'Contents' is absent when no object matches the prefix;
                # treat that as an empty page instead of iterating None.
                for obj in page.get('Contents') or []:
                    try:
                        key = obj.get('Key')
                        last_modified = obj.get('LastModified').isoformat()
                        object_uuid, x, y, prompt = _parse_object_key(key)
                        cursor.execute(
                            'INSERT INTO rooms_data VALUES (NULL, ?, ?, ?, ?, ?, ?, ?)',
                            (room_id, object_uuid, x, y, prompt, last_modified, key))
                    except Exception as e:
                        # Best effort per object: report and move on.
                        print(e)
            room_data_db.commit()

            room_rows = room_data_db.execute(
                "SELECT * FROM rooms_data WHERE room_id = ?", (room_id,)).fetchall()
            all_rows = [dict(room_row) for room_row in room_rows]
            data_path = S3_DATA_FOLDER / f"{room_id}.json"
            with open(data_path, 'w') as f:
                json.dump(all_rows, f, separators=(',', ':'))

        print("Updating repository")
        # Fire-and-forget: the git command runs in a background shell and is
        # not awaited, so this function returns before the push completes.
        subprocess.Popen(
            "git add . && git commit --amend -m 'update' && git push --force",
            cwd=S3_DATA_FOLDER, shell=True)


app = FastAPI()


@app.get("/")
def read_root():
    """Return a short description of the service."""
    return "Just a bot to sync data to huggingface datasets and tweet tha latest data"


@app.get("/sync")
def sync():
    """Run a sync on demand and return a confirmation message."""
    sync_rooms_to_dataset()
    return "Synced data to huggingface datasets"


@app.on_event("startup")
@repeat_every(seconds=1800)
def repeat_sync():
    """Run the sync at application startup and then every 1800 seconds."""
    sync_rooms_to_dataset()
    return "Synced data to huggingface datasets"