import os
from huggingface_hub import snapshot_download, delete_repo, metadata_update
import uuid
import json
import yaml
import subprocess

# Hugging Face token and source dataset repo id, both supplied via environment variables.
HF_TOKEN = os.environ.get("HF_TOKEN")
HF_DATASET = os.environ.get("DATA_PATH")


def download_dataset(hf_dataset_path: str):
    # download the dataset snapshot into a unique temp directory and return its path
    random_id = str(uuid.uuid4())
    snapshot_download(
        repo_id=hf_dataset_path,
        token=HF_TOKEN,
        local_dir=f"/tmp/{random_id}",
        repo_type="dataset",
    )
    return f"/tmp/{random_id}"


def process_dataset(dataset_dir: str):
    # The dataset dir contains images, a config.yaml, and optionally a metadata.jsonl
    # with the fields file_name and prompt. For each metadata entry, write a .txt
    # caption file named after the image with the prompt as its content, remove
    # metadata.jsonl, and return the path to the processed dataset.
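    # Illustrative example (made-up values): a metadata.jsonl line such as
    #     {"file_name": "0001.jpg", "prompt": "a photo of a red bicycle"}
    # yields a caption file 0001.txt whose content is "a photo of a red bicycle".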

    # check if config.yaml exists
    if not os.path.exists(os.path.join(dataset_dir, "config.yaml")):
        raise ValueError("config.yaml does not exist")

    # check if metadata.jsonl exists
    if os.path.exists(os.path.join(dataset_dir, "metadata.jsonl")):
        metadata = []
        with open(os.path.join(dataset_dir, "metadata.jsonl"), "r") as f:
            for line in f:
                if len(line.strip()) > 0:
                    metadata.append(json.loads(line))
        for item in metadata:
            txt_path = os.path.join(dataset_dir, item["file_name"])
            txt_path = txt_path.rsplit(".", 1)[0] + ".txt"
            with open(txt_path, "w") as f:
                f.write(item["prompt"])

        # remove metadata.jsonl
        os.remove(os.path.join(dataset_dir, "metadata.jsonl"))

    with open(os.path.join(dataset_dir, "config.yaml"), "r") as f:
        config = yaml.safe_load(f)
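    # Assumed config.yaml shape (illustrative sketch, not the full ai-toolkit schema);
    # only the nested keys this script reads or writes are shown:
    #
    #   config:
    #     process:
    #       - datasets:
    #           - folder_path: /path/to/dataset      # rewritten just below
    #         save:
    #           hf_repo_id: username/my-flux-lora    # read back in the __main__ block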

    # point the training config's dataset folder at the freshly downloaded directory
    config["config"]["process"][0]["datasets"][0]["folder_path"] = dataset_dir

    with open(os.path.join(dataset_dir, "config.yaml"), "w") as f:
        yaml.dump(config, f)

    return dataset_dir


def run_training(hf_dataset_path: str):
    dataset_dir = download_dataset(hf_dataset_path)
    dataset_dir = process_dataset(dataset_dir)

    # clone the ai-toolkit trainer and its submodules; fail fast if the clone does not succeed
    commands = "git clone https://github.com/ostris/ai-toolkit.git ai-toolkit && cd ai-toolkit && git submodule update --init --recursive"
    subprocess.run(commands, shell=True, check=True)

    # launch training with the processed config and hand the process back to the caller
    commands = f"python run.py {os.path.join(dataset_dir, 'config.yaml')}"
    process = subprocess.Popen(commands, shell=True, cwd="ai-toolkit", env=os.environ)

    return process, dataset_dir


if __name__ == "__main__":
    process, dataset_dir = run_training(HF_DATASET)
    process.wait()  # Wait for the training process to finish

    # read the destination model repo id back out of the training config
    with open(os.path.join(dataset_dir, "config.yaml"), "r") as f:
        config = yaml.safe_load(f)
    repo_id = config["config"]["process"][0]["save"]["hf_repo_id"]

    # model card tags to attach to the trained LoRA repo
    metadata = {
        "tags": [
            "autotrain",
            "spacerunner",
            "text-to-image",
            "flux",
            "lora",
            "diffusers",
            "template:sd-lora",
        ]
    }
    # tag the uploaded model repo, then delete the temporary dataset repo
    metadata_update(repo_id, metadata, token=HF_TOKEN, repo_type="model", overwrite=True)
    delete_repo(HF_DATASET, token=HF_TOKEN, repo_type="dataset", missing_ok=True)
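
# Example invocation (placeholder values; HF_TOKEN and DATA_PATH are the only
# environment variables this script reads):
#   HF_TOKEN=hf_xxx DATA_PATH=username/my-dataset python <this_script>.py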