MaureenZOU committed on
Commit
fcc479d
1 Parent(s): d80ff28
This view is limited to 50 files because it contains too many changes.   See raw diff
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.psd filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Image Editing With GPT3
3
- emoji: 🐨
4
  colorFrom: purple
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 3.16.1
8
  app_file: app.py
9
  pinned: false
10
  license: afl-3.0
 
1
  ---
2
+ title: X Decoder
3
+ emoji: 📈
4
  colorFrom: purple
5
+ colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 3.14.0
8
  app_file: app.py
9
  pinned: false
10
  license: afl-3.0
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Jianwei Yang (jianwyan@microsoft.com), Xueyan Zou (xueyan@cs.wisc.edu)
6
+ # --------------------------------------------------------
7
+
8
+ import os
9
+ os.system("python -m pip install git+https://github.com/MaureenZOU/detectron2-xyz.git")
10
+
11
+ import gradio as gr
12
+ import torch
13
+ import argparse
14
+
15
+ from xdecoder.BaseModel import BaseModel
16
+ from xdecoder import build_model
17
+ from utils.distributed import init_distributed
18
+ from utils.arguments import load_opt_from_config_files
19
+
20
+ from tasks import *
21
+
22
def parse_option(argv=None):
    """Parse the command-line options of the demo.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line, which is
            exactly what the original zero-argument call did.

    Returns:
        argparse.Namespace whose ``conf_files`` attribute is the path of the
        config file to load.
    """
    # add_help=False is kept as in the original: -h/--help is not registered.
    parser = argparse.ArgumentParser('X-Decoder All-in-One Demo', add_help=False)
    parser.add_argument(
        '--conf_files',
        default="configs/xdecoder/svlp_focalt_lang.yaml",
        metavar="FILE",
        help='path to config file',
    )
    return parser.parse_args(argv)
28
+
29
# ----------------------------------------------------------------------
# build args
# Resolve the config file named on the command line, then hand the loaded
# options to the project's init_distributed helper.
# ----------------------------------------------------------------------
args = parse_option()
opt = load_opt_from_config_files(args.conf_files)
opt = init_distributed(opt)

# META DATA
# Checkpoints are stored in the working directory under their release names.
_CHECKPOINT_BASE_URL = "https://projects4jw.blob.core.windows.net/x-decoder/release/"
pretrained_pth_last = "xdecoder_focalt_last.pt"
pretrained_pth_novg = "xdecoder_focalt_last_novg.pt"

for _checkpoint in (pretrained_pth_last, pretrained_pth_novg):
    if not os.path.exists(_checkpoint):
        # torch.hub raises on a failed download instead of silently carrying
        # on the way an unchecked `wget` shell call does, so a broken fetch is
        # reported here rather than as a confusing load error further down.
        torch.hub.download_url_to_file(
            _CHECKPOINT_BASE_URL + _checkpoint,
            os.path.abspath(_checkpoint),
        )

# ----------------------------------------------------------------------
# build model
# Two independent model instances, one per checkpoint, both in eval mode
# and moved to the GPU.
# NOTE(review): model_cap is not referenced by the callback in this file;
# confirm whether it is still needed before removing it.
# ----------------------------------------------------------------------
model_last = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth_last).eval().cuda()
model_cap = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth_novg).eval().cuda()

with torch.no_grad():
    # Presumably primes the language encoder's default text embeddings so the
    # first request does not hit an uninitialised state -- TODO confirm
    # against the lang_encoder implementation.
    model_last.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True)
    model_cap.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True)
56
+
57
+ '''
58
+ inference model
59
+ '''
60
+
61
@torch.no_grad()
def inference(image, instruction, *args, **kwargs):
    """Gradio callback: apply a text instruction to one image.

    Args:
        image: Input picture; it must provide ``convert("RGB")`` (the
            interface in this file requests PIL images via ``type='pil'``).
        instruction: Text entered in the "instruction" textbox.
        *args: Extra positional arguments, forwarded unchanged.
        **kwargs: Extra keyword arguments, forwarded unchanged.

    Returns:
        Whatever ``referring_inpainting_gpt3`` returns. The interface in this
        file declares three outputs (image, text, image), so presumably a
        matching 3-tuple -- verify against the ``tasks`` module.
    """
    # Normalise the colour mode before the image reaches the model.
    rgb_image = image.convert("RGB")

    # Run the model call under CUDA float16 autocast; gradients are already
    # disabled by the decorator.
    half_precision = torch.autocast(device_type='cuda', dtype=torch.float16)
    with half_precision:
        return referring_inpainting_gpt3(model_last, rgb_image, instruction, *args, **kwargs)
66
+
67
+ '''
68
+ launch app
69
+ '''
70
# Static text rendered on the demo page.
title = "X-Decoder + GPT-3 Instructional Image Editing"
description = "<p style='text-align: center'> <a href='https://x-decoder-vl.github.io/' target='_blank'>Project Page</a> | <a href='https://arxiv.org/pdf/2212.11270.pdf' target='_blank'>Paper</a> | <a href='https://github.com/microsoft/X-Decoder' target='_blank'>Github Repo</a> | <a href='https://youtu.be/wYp6vmyolqE' target='_blank'>Video</a> </p>"

article = "The Demo is Run on X-Decoder (Focal-T)."

# gr.Image replaces the deprecated gr.inputs.Image / gr.outputs.Image
# wrappers; the callback still receives and returns PIL images.
inputs = [gr.Image(type='pil'), gr.Textbox(label="instruction")]
gr.Interface(
    fn=inference,
    inputs=inputs,
    # Three outputs, in the order the callback is expected to return them.
    outputs=[
        gr.Image(
            type="pil",
            label="segmentation results"),
        gr.Textbox(label="text results"),
        gr.Image(
            type="pil",
            label="inpainting results"),
    ],
    # Example prompts are model inputs and are kept verbatim.
    examples=[
        ["./images/apples.jpg", "change green apple to a red apple"],
        ["./images/girl_and_two_boys.png", "remove the boy with blue backbag"],
        ["./images/dog.png", "remove the dog"],
    ],
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    cache_examples=True,
).launch(share=True)
configs/xdecoder/svlp_focalt_lang.yaml ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Xueyan Zou (xueyan@cs.wisc.edu)
6
+ # --------------------------------------------------------
7
+
8
+ ##################
9
+ # Task settings
10
+ ##################
11
+ VERBOSE: true
12
+ MODEL:
13
+ NAME: xdecoder_model
14
+ HEAD: xdecoder_head
15
+ DIM_PROJ: 512
16
+ BACKBONE_DIM: 768
17
+ TEXT:
18
+ ARCH: vlpencoder
19
+ NAME: transformer
20
+ TOKENIZER: clip
21
+ CONTEXT_LENGTH: 77 # 77
22
+ WIDTH: 512
23
+ HEADS: 8
24
+ LAYERS: 12 # 6
25
+ AUTOGRESSIVE: True
26
+ BACKBONE:
27
+ NAME: focal_dw
28
+ PRETRAINED: ''
29
+ LOAD_PRETRAINED: false
30
+ FOCAL:
31
+ PRETRAIN_IMG_SIZE: 224
32
+ PATCH_SIZE: 4
33
+ EMBED_DIM: 96
34
+ DEPTHS: [2, 2, 6, 2]
35
+ FOCAL_LEVELS: [3, 3, 3, 3]
36
+ FOCAL_WINDOWS: [3, 3, 3, 3]
37
+ DROP_PATH_RATE: 0.3
38
+ MLP_RATIO: 4.0
39
+ DROP_RATE: 0.0
40
+ PATCH_NORM: True
41
+ USE_CONV_EMBED: True
42
+ SCALING_MODULATOR: True
43
+ USE_CHECKPOINT: False
44
+ USE_POSTLN: true
45
+ USE_POSTLN_IN_MODULATION: false
46
+ USE_LAYERSCALE: True
47
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
48
+ OUT_INDICES: [0, 1, 2, 3]
49
+ ENCODER:
50
+ NAME: transformer_encoder_fpn
51
+ IGNORE_VALUE: 255
52
+ NUM_CLASSES: 133
53
+ LOSS_WEIGHT: 1.0
54
+ CONVS_DIM: 512
55
+ MASK_DIM: 512
56
+ NORM: "GN"
57
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
58
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
59
+ COMMON_STRIDE: 4
60
+ TRANSFORMER_ENC_LAYERS: 6
61
+ DECODER:
62
+ NAME: xdecoder
63
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
64
+ MASK: True
65
+ GROUNDING:
66
+ ENABLED: True
67
+ MAX_LEN: 5
68
+ TEXT_WEIGHT: 2.0
69
+ CLASS_WEIGHT: 0.5
70
+ DETECTION: False
71
+ CAPTION:
72
+ ENABLED: True
73
+ PHRASE_PROB: 0.0
74
+ SIM_THRES: 0.95
75
+ CAPTIONING:
76
+ ENABLED: True
77
+ STEP: 50
78
+ RETRIEVAL:
79
+ ENABLED: True
80
+ DIM_IMG: 768
81
+ ENSEMBLE: True
82
+ HIDDEN_DIM: 512
83
+ NUM_OBJECT_QUERIES: 101
84
+ NHEADS: 8
85
+ DROPOUT: 0.0
86
+ DIM_FEEDFORWARD: 2048
87
+ PRE_NORM: False
88
+ ENFORCE_INPUT_PROJ: False
89
+ SIZE_DIVISIBILITY: 32
90
+ TRAIN_NUM_POINTS: 12544
91
+ OVERSAMPLE_RATIO: 3.0
92
+ IMPORTANCE_SAMPLE_RATIO: 0.75
93
+ DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
94
+ TOP_GROUNDING_LAYERS: 3
95
+ TOP_CAPTION_LAYERS: 3
96
+ TOP_CAPTIONING_LAYERS: 3
97
+ TOP_RETRIEVAL_LAYERS: 3
98
+ TOP_OPENIMAGE_LAYERS: 10
99
+ TEST:
100
+ SEMANTIC_ON: True
101
+ INSTANCE_ON: True
102
+ PANOPTIC_ON: True
103
+ OVERLAP_THRESHOLD: 0.8
104
+ OBJECT_MASK_THRESHOLD: 0.4
105
+ SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
106
+ DETECTIONS_PER_IMAGE: 100
107
+
108
+ INPUT:
109
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
110
+ PIXEL_STD: [58.395, 57.120, 57.375]
images/apples.jpg ADDED
images/coco/000.jpg ADDED
images/coco/001.jpg ADDED
images/coco/002.jpg ADDED
images/coco/003.jpg ADDED
images/coco/004.jpg ADDED
images/coco/005.jpg ADDED
images/coco/006.jpg ADDED
images/coco/007.jpg ADDED
images/coco/008.jpg ADDED
images/coco/009.jpg ADDED
images/coco/010.jpg ADDED
images/coco/011.jpg ADDED
images/coco/012.jpg ADDED
images/coco/013.jpg ADDED
images/coco/014.jpg ADDED
images/coco/015.jpg ADDED
images/coco/016.jpg ADDED
images/coco/017.jpg ADDED
images/coco/018.jpg ADDED
images/coco/019.jpg ADDED
images/coco/020.jpg ADDED
images/coco/021.jpg ADDED
images/coco/022.jpg ADDED
images/coco/023.jpg ADDED
images/coco/024.jpg ADDED
images/coco/025.jpg ADDED
images/coco/026.jpg ADDED
images/coco/027.jpg ADDED
images/coco/028.jpg ADDED
images/coco/029.jpg ADDED
images/coco/030.jpg ADDED
images/coco/031.jpg ADDED
images/coco/032.jpg ADDED
images/coco/033.jpg ADDED
images/coco/034.jpg ADDED
images/coco/035.jpg ADDED
images/coco/036.jpg ADDED
images/coco/037.jpg ADDED
images/coco/038.jpg ADDED
images/coco/039.jpg ADDED
images/coco/040.jpg ADDED
images/coco/041.jpg ADDED
images/coco/042.jpg ADDED
images/coco/043.jpg ADDED