Commit fcc479d ("init"), committed by MaureenZOU. Parent(s): d80ff28.

Note: this view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
- .gitattributes +1 -0
- README.md +4 -4
- __init__.py +0 -0
- app.py +98 -0
- configs/xdecoder/svlp_focalt_lang.yaml +110 -0
- images/apples.jpg +0 -0
- images/coco/000.jpg +0 -0
- images/coco/001.jpg +0 -0
- images/coco/002.jpg +0 -0
- images/coco/003.jpg +0 -0
- images/coco/004.jpg +0 -0
- images/coco/005.jpg +0 -0
- images/coco/006.jpg +0 -0
- images/coco/007.jpg +0 -0
- images/coco/008.jpg +0 -0
- images/coco/009.jpg +0 -0
- images/coco/010.jpg +0 -0
- images/coco/011.jpg +0 -0
- images/coco/012.jpg +0 -0
- images/coco/013.jpg +0 -0
- images/coco/014.jpg +0 -0
- images/coco/015.jpg +0 -0
- images/coco/016.jpg +0 -0
- images/coco/017.jpg +0 -0
- images/coco/018.jpg +0 -0
- images/coco/019.jpg +0 -0
- images/coco/020.jpg +0 -0
- images/coco/021.jpg +0 -0
- images/coco/022.jpg +0 -0
- images/coco/023.jpg +0 -0
- images/coco/024.jpg +0 -0
- images/coco/025.jpg +0 -0
- images/coco/026.jpg +0 -0
- images/coco/027.jpg +0 -0
- images/coco/028.jpg +0 -0
- images/coco/029.jpg +0 -0
- images/coco/030.jpg +0 -0
- images/coco/031.jpg +0 -0
- images/coco/032.jpg +0 -0
- images/coco/033.jpg +0 -0
- images/coco/034.jpg +0 -0
- images/coco/035.jpg +0 -0
- images/coco/036.jpg +0 -0
- images/coco/037.jpg +0 -0
- images/coco/038.jpg +0 -0
- images/coco/039.jpg +0 -0
- images/coco/040.jpg +0 -0
- images/coco/041.jpg +0 -0
- images/coco/042.jpg +0 -0
- images/coco/043.jpg +0 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.psd filter=lfs diff=lfs merge=lfs -text
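The added rule routes Photoshop (*.psd) files through Git LFS, alongside the archive and TensorBoard-event patterns already present. As a quick illustration (a minimal sketch, not part of the commit; lfs_patterns is a hypothetical helper), the LFS-tracked patterns can be listed by scanning the attributes file:

# Minimal sketch: list the patterns this repo stores via Git LFS
# by looking for the lfs filter in .gitattributes. Hypothetical helper,
# not part of the commit.
def lfs_patterns(path=".gitattributes"):
    patterns = []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) > 1 and "filter=lfs" in parts[1:]:
                patterns.append(parts[0])
    return patterns

print(lfs_patterns())  # ends with: ..., '*.zip', '*.zst', '*tfevents*', '*.psd'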
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title:
-emoji:
+title: X Decoder
+emoji: 📈
 colorFrom: purple
-colorTo:
+colorTo: gray
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.14.0
 app_file: app.py
 pinned: false
 license: afl-3.0
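Assembled from the hunk above, the Space's metadata front matter after this commit reads:

---
title: X Decoder
emoji: 📈
colorFrom: purple
colorTo: gray
sdk: gradio
sdk_version: 3.14.0
app_file: app.py
pinned: false
license: afl-3.0
---

The pinned sdk_version: 3.14.0 matters here: Gradio 3.x still accepted the older gr.inputs / gr.outputs API that app.py below relies on.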
__init__.py ADDED
File without changes (empty file).
app.py ADDED
@@ -0,0 +1,98 @@
+# --------------------------------------------------------
+# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Jianwei Yang (jianwyan@microsoft.com), Xueyan Zou (xueyan@cs.wisc.edu)
+# --------------------------------------------------------
+
+import os
+os.system("python -m pip install git+https://github.com/MaureenZOU/detectron2-xyz.git")
+
+import gradio as gr
+import torch
+import argparse
+
+from xdecoder.BaseModel import BaseModel
+from xdecoder import build_model
+from utils.distributed import init_distributed
+from utils.arguments import load_opt_from_config_files
+
+from tasks import *
+
+def parse_option():
+    parser = argparse.ArgumentParser('X-Decoder All-in-One Demo', add_help=False)
+    parser.add_argument('--conf_files', default="configs/xdecoder/svlp_focalt_lang.yaml", metavar="FILE", help='path to config file')
+    args = parser.parse_args()
+
+    return args
+
+'''
+build args
+'''
+args = parse_option()
+opt = load_opt_from_config_files(args.conf_files)
+opt = init_distributed(opt)
+
+# META DATA
+pretrained_pth_last = os.path.join("xdecoder_focalt_last.pt")
+pretrained_pth_novg = os.path.join("xdecoder_focalt_last_novg.pt")
+
+if not os.path.exists(pretrained_pth_last):
+    os.system("wget {}".format("https://projects4jw.blob.core.windows.net/x-decoder/release/xdecoder_focalt_last.pt"))
+
+if not os.path.exists(pretrained_pth_novg):
+    os.system("wget {}".format("https://projects4jw.blob.core.windows.net/x-decoder/release/xdecoder_focalt_last_novg.pt"))
+
+
+'''
+build model
+'''
+model_last = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth_last).eval().cuda()
+model_cap = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth_novg).eval().cuda()
+
+with torch.no_grad():
+    model_last.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True)
+    model_cap.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True)
+
+'''
+inference model
+'''
+
+@torch.no_grad()
+def inference(image, instruction, *args, **kwargs):
+    image = image.convert("RGB")
+    with torch.autocast(device_type='cuda', dtype=torch.float16):
+        return referring_inpainting_gpt3(model_last, image, instruction, *args, **kwargs)
+
+'''
+launch app
+'''
+title = "X-Decoder + GPT-3 Instructional Image Editing"
+description = "<p style='text-align: center'> <a href='https://x-decoder-vl.github.io/' target='_blank'>Project Page</a> | <a href='https://arxiv.org/pdf/2212.11270.pdf' target='_blank'>Paper</a> | <a href='https://github.com/microsoft/X-Decoder' target='_blank'>Github Repo</a> | <a href='https://youtu.be/wYp6vmyolqE' target='_blank'>Video</a> </p>"
+
+article = "The Demo is Run on X-Decoder (Focal-T)."
+
+inputs = [gr.inputs.Image(type='pil'), gr.Textbox(label="instruction")]
+gr.Interface(
+    fn=inference,
+    inputs=inputs,
+    outputs=[
+        gr.outputs.Image(
+            type="pil",
+            label="segmentation results"),
+        gr.Textbox(label="text results"),
+        gr.outputs.Image(
+            type="pil",
+            label="inpainting results"),
+    ],
+    examples=[
+        ["./images/apples.jpg", "change green apple to a red apple"],
+        ["./images/girl_and_two_boys.png", "remove the boy with the blue backpack"],
+        ["./images/dog.png", "remove the dog"],
+    ],
+    title=title,
+    description=description,
+    article=article,
+    allow_flagging='never',
+    cache_examples=True,
+).launch(share=True)
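app.py wires inference() to a gr.Interface with three outputs, so referring_inpainting_gpt3 is presumably expected to return a (segmentation image, text, inpainted image) triple. A hypothetical smoke test under that assumption (requires a CUDA GPU, the downloaded checkpoints, and the repo's tasks module; edited.jpg is an illustrative output name):

# Hypothetical smoke test for inference(), run after the module-level
# setup above has built the models. Assumes the function returns the
# three values declared in the Interface outputs.
from PIL import Image

img = Image.open("./images/apples.jpg")
seg, text, inpainted = inference(img, "change green apple to a red apple")
print(text)                   # text result from the pipeline
inpainted.save("edited.jpg")  # illustrative output path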
configs/xdecoder/svlp_focalt_lang.yaml ADDED
@@ -0,0 +1,110 @@
+# --------------------------------------------------------
+# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Xueyan Zou (xueyan@cs.wisc.edu)
+# --------------------------------------------------------
+
+##################
+# Task settings
+##################
+VERBOSE: true
+MODEL:
+  NAME: xdecoder_model
+  HEAD: xdecoder_head
+  DIM_PROJ: 512
+  BACKBONE_DIM: 768
+  TEXT:
+    ARCH: vlpencoder
+    NAME: transformer
+    TOKENIZER: clip
+    CONTEXT_LENGTH: 77 # 77
+    WIDTH: 512
+    HEADS: 8
+    LAYERS: 12 # 6
+    AUTOGRESSIVE: True
+  BACKBONE:
+    NAME: focal_dw
+    PRETRAINED: ''
+    LOAD_PRETRAINED: false
+    FOCAL:
+      PRETRAIN_IMG_SIZE: 224
+      PATCH_SIZE: 4
+      EMBED_DIM: 96
+      DEPTHS: [2, 2, 6, 2]
+      FOCAL_LEVELS: [3, 3, 3, 3]
+      FOCAL_WINDOWS: [3, 3, 3, 3]
+      DROP_PATH_RATE: 0.3
+      MLP_RATIO: 4.0
+      DROP_RATE: 0.0
+      PATCH_NORM: True
+      USE_CONV_EMBED: True
+      SCALING_MODULATOR: True
+      USE_CHECKPOINT: False
+      USE_POSTLN: true
+      USE_POSTLN_IN_MODULATION: false
+      USE_LAYERSCALE: True
+      OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+      OUT_INDICES: [0, 1, 2, 3]
+  ENCODER:
+    NAME: transformer_encoder_fpn
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 133
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 512
+    MASK_DIM: 512
+    NORM: "GN"
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+  DECODER:
+    NAME: xdecoder
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    MASK: True
+    GROUNDING:
+      ENABLED: True
+      MAX_LEN: 5
+      TEXT_WEIGHT: 2.0
+      CLASS_WEIGHT: 0.5
+    DETECTION: False
+    CAPTION:
+      ENABLED: True
+      PHRASE_PROB: 0.0
+      SIM_THRES: 0.95
+    CAPTIONING:
+      ENABLED: True
+      STEP: 50
+    RETRIEVAL:
+      ENABLED: True
+      DIM_IMG: 768
+      ENSEMBLE: True
+    HIDDEN_DIM: 512
+    NUM_OBJECT_QUERIES: 101
+    NHEADS: 8
+    DROPOUT: 0.0
+    DIM_FEEDFORWARD: 2048
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+    DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
+    TOP_GROUNDING_LAYERS: 3
+    TOP_CAPTION_LAYERS: 3
+    TOP_CAPTIONING_LAYERS: 3
+    TOP_RETRIEVAL_LAYERS: 3
+    TOP_OPENIMAGE_LAYERS: 10
+    TEST:
+      SEMANTIC_ON: True
+      INSTANCE_ON: True
+      PANOPTIC_ON: True
+      OVERLAP_THRESHOLD: 0.8
+      OBJECT_MASK_THRESHOLD: 0.4
+      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
+      DETECTIONS_PER_IMAGE: 100
+
+INPUT:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
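app.py loads this file via load_opt_from_config_files; outside the demo, the same keys can be sanity-checked with plain PyYAML (a minimal sketch, assuming PyYAML is installed, the working directory is the repo root, and the nesting reconstructed above):

# Minimal sketch: read back a few values from the config above.
import yaml

with open("configs/xdecoder/svlp_focalt_lang.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["MODEL"]["BACKBONE"]["NAME"])               # focal_dw
print(cfg["MODEL"]["DECODER"]["NUM_OBJECT_QUERIES"])  # 101
print(cfg["INPUT"]["PIXEL_MEAN"])                     # [123.675, 116.28, 103.53]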
images/apples.jpg
ADDED
images/coco/000.jpg
ADDED
images/coco/001.jpg
ADDED
images/coco/002.jpg
ADDED
images/coco/003.jpg
ADDED
images/coco/004.jpg
ADDED
images/coco/005.jpg
ADDED
images/coco/006.jpg
ADDED
images/coco/007.jpg
ADDED
images/coco/008.jpg
ADDED
images/coco/009.jpg
ADDED
images/coco/010.jpg
ADDED
images/coco/011.jpg
ADDED
images/coco/012.jpg
ADDED
images/coco/013.jpg
ADDED
images/coco/014.jpg
ADDED
images/coco/015.jpg
ADDED
images/coco/016.jpg
ADDED
images/coco/017.jpg
ADDED
images/coco/018.jpg
ADDED
images/coco/019.jpg
ADDED
images/coco/020.jpg
ADDED
images/coco/021.jpg
ADDED
images/coco/022.jpg
ADDED
images/coco/023.jpg
ADDED
images/coco/024.jpg
ADDED
images/coco/025.jpg
ADDED
images/coco/026.jpg
ADDED
images/coco/027.jpg
ADDED
images/coco/028.jpg
ADDED
images/coco/029.jpg
ADDED
images/coco/030.jpg
ADDED
images/coco/031.jpg
ADDED
images/coco/032.jpg
ADDED
images/coco/033.jpg
ADDED
images/coco/034.jpg
ADDED
images/coco/035.jpg
ADDED
images/coco/036.jpg
ADDED
images/coco/037.jpg
ADDED
images/coco/038.jpg
ADDED
images/coco/039.jpg
ADDED
images/coco/040.jpg
ADDED
images/coco/041.jpg
ADDED
images/coco/042.jpg
ADDED
images/coco/043.jpg
ADDED