diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d550eaaeaf3ec284e5299a9a2b995efebadc790c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,153 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
\ No newline at end of file
diff --git a/app.py b/app.py
index 02cfb17df79ef9ec49c1f9a884450e6eee73c151..f30bd2bcfd5bb0f220bde2af8bad9b802329a0cb 100644
--- a/app.py
+++ b/app.py
@@ -27,15 +27,15 @@ def sadtalker_demo(result_dir='./tmp/'):
Homepage \
Github ")
- with gr.Row().style(equal_height=False):
+ with gr.Row():
with gr.Column(variant='panel'):
with gr.Tabs(elem_id="sadtalker_source_image"):
with gr.TabItem('Upload image'):
with gr.Row():
- source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256,width=256)
+ source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256)
with gr.Tabs(elem_id="sadtalker_driven_audio"):
- with gr.TabItem('Upload audio(wav only currently)'):
+                    with gr.TabItem('Upload audio (wav/mp3 only currently)'):
with gr.Column(variant='panel'):
driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")
@@ -43,12 +43,13 @@ def sadtalker_demo(result_dir='./tmp/'):
with gr.Tabs(elem_id="sadtalker_checkbox"):
with gr.TabItem('Settings'):
with gr.Column(variant='panel'):
- is_still_mode = gr.Checkbox(label="w/ Still Mode (fewer head motion)")
- enhancer = gr.Checkbox(label="w/ GFPGAN as Face enhancer")
+                            is_still_mode = gr.Checkbox(label="Still Mode (less head motion)").style(container=True)
+                            is_resize_mode = gr.Checkbox(label="Resize Mode (⚠️ requires a manually cropped source image, but can handle larger crops)").style(container=True)
+                            is_enhance_mode = gr.Checkbox(label="Enhance Mode (better face quality)").style(container=True)
submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
with gr.Tabs(elem_id="sadtalker_genearted"):
- gen_video = gr.Video(label="Generated video", format="mp4").style(height=256,width=256)
+ gen_video = gr.Video(label="Generated video", format="mp4").style(width=256)
gen_text = gr.Textbox(visible=False)
with gr.Row():
@@ -57,7 +58,22 @@ def sadtalker_demo(result_dir='./tmp/'):
'examples/source_image/art_10.png',
'examples/driven_audio/deyu.wav',
True,
+ False,
False
+ ],
+ [
+ 'examples/source_image/art_1.png',
+ 'examples/driven_audio/fayu.wav',
+ True,
+ True,
+ False
+ ],
+ [
+ 'examples/source_image/art_9.png',
+ 'examples/driven_audio/itosinger1.wav',
+ True,
+ False,
+ True
]
]
gr.Examples(examples=examples,
@@ -65,7 +81,8 @@ def sadtalker_demo(result_dir='./tmp/'):
source_image,
driven_audio,
is_still_mode,
- enhancer,
+ is_resize_mode,
+ is_enhance_mode,
gr.Textbox(value=result_dir, visible=False)],
outputs=[gen_video, gen_text],
fn=sad_talker.test,
@@ -76,7 +93,8 @@ def sadtalker_demo(result_dir='./tmp/'):
inputs=[source_image,
driven_audio,
is_still_mode,
- enhancer,
+ is_resize_mode,
+ is_enhance_mode,
gr.Textbox(value=result_dir, visible=False)],
outputs=[gen_video, gen_text]
)
diff --git a/modules/__pycache__/sadtalker_test.cpython-38.pyc b/modules/__pycache__/sadtalker_test.cpython-38.pyc
index 5eb11acf9e267816b4086e1b582013476729c533..6377ea45cbb1a59c93ae5dd63fb699a3f4288be2 100644
Binary files a/modules/__pycache__/sadtalker_test.cpython-38.pyc and b/modules/__pycache__/sadtalker_test.cpython-38.pyc differ
diff --git a/modules/__pycache__/text2speech.cpython-38.pyc b/modules/__pycache__/text2speech.cpython-38.pyc
index 4b4bb06e0b09743093bac9edf6c0f5a16acac5f8..90ad4127ce0050c2215bdb797974ad849d12a96c 100644
Binary files a/modules/__pycache__/text2speech.cpython-38.pyc and b/modules/__pycache__/text2speech.cpython-38.pyc differ
diff --git a/modules/sadtalker_test.py b/modules/sadtalker_test.py
index f2404421dc132dcf05cddcb9422f47175f229bc9..f15e3aafe95aea1d082cbca4a3d5f4c6ce10fea4 100644
--- a/modules/sadtalker_test.py
+++ b/modules/sadtalker_test.py
@@ -60,7 +60,7 @@ class SadTalker():
facerender_yaml_path, device)
self.device = device
- def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'):
+ def test(self, source_image, driven_audio, still_mode, resize_mode, use_enhancer, result_dir='./'):
time_tag = str(uuid.uuid4()) # strftime("%Y_%m_%d_%H.%M.%S")
save_dir = os.path.join(result_dir, time_tag)
@@ -91,7 +91,7 @@ class SadTalker():
#crop image and extract 3dmm from image
first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
os.makedirs(first_frame_dir, exist_ok=True)
- first_coeff_path, crop_pic_path = self.preprocess_model.generate(pic_path, first_frame_dir)
+        first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir, crop_or_resize='resize' if resize_mode else 'crop')
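+        # original_size is used later to scale the rendered video back to the source aspect ratio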
if first_coeff_path is None:
raise AttributeError("No face is detected")
@@ -101,7 +101,7 @@ class SadTalker():
#coeff2video
batch_size = 4
data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode)
- self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None)
+ self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size)
video_name = data['video_name']
print(f'The generated video is named {video_name} in {save_dir}')
diff --git a/src/__pycache__/generate_batch.cpython-38.pyc b/src/__pycache__/generate_batch.cpython-38.pyc
index 5032a29f3d4291f8f90539857bd726dc58679445..c68dd09e49933b52115307195bf3aa446d924922 100644
Binary files a/src/__pycache__/generate_batch.cpython-38.pyc and b/src/__pycache__/generate_batch.cpython-38.pyc differ
diff --git a/src/__pycache__/generate_facerender_batch.cpython-38.pyc b/src/__pycache__/generate_facerender_batch.cpython-38.pyc
index 3dbfc8c5c193db5bcb198d2fdd6c3775c9f1f9ff..6a30615ed3eaa5902a2fa553ed3ed17a9ae92a51 100644
Binary files a/src/__pycache__/generate_facerender_batch.cpython-38.pyc and b/src/__pycache__/generate_facerender_batch.cpython-38.pyc differ
diff --git a/src/__pycache__/test_audio2coeff.cpython-38.pyc b/src/__pycache__/test_audio2coeff.cpython-38.pyc
index 145f0df05b72a17711ca0d2bc8fa960f2e760d20..c2553cc97f50096d7c7005ad39274a8653cb6ad4 100644
Binary files a/src/__pycache__/test_audio2coeff.cpython-38.pyc and b/src/__pycache__/test_audio2coeff.cpython-38.pyc differ
diff --git a/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc b/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc
index 28f97d6138ff61dd5d40dbde67c8b62135623a9e..460563d74a990c40a3c5bd6f3209acca6d86b550 100644
Binary files a/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc and b/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc differ
diff --git a/src/audio2exp_models/__pycache__/networks.cpython-38.pyc b/src/audio2exp_models/__pycache__/networks.cpython-38.pyc
index 76c987956e3de068b8817aca70cdd05479e82232..766660615f22f94c740dd420ccef83ed442c4fac 100644
Binary files a/src/audio2exp_models/__pycache__/networks.cpython-38.pyc and b/src/audio2exp_models/__pycache__/networks.cpython-38.pyc differ
diff --git a/src/audio2exp_models/audio2exp.py b/src/audio2exp_models/audio2exp.py
index 8231007799891ca4dd7f81b04226d82ddfab292d..5f6e6b77b0ceb2089539caa440f7106c7b1e8aa2 100644
--- a/src/audio2exp_models/audio2exp.py
+++ b/src/audio2exp_models/audio2exp.py
@@ -1,3 +1,4 @@
+from tqdm import tqdm
import torch
from torch import nn
@@ -15,15 +16,24 @@ class Audio2Exp(nn.Module):
bs = mel_input.shape[0]
T = mel_input.shape[1]
- ref = batch['ref'][:, :, :64].repeat((1,T,1)) #bs T 64
- ratio = batch['ratio_gt'] #bs T
+ exp_coeff_pred = []
- audiox = mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16
- exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64
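+        # run the expression network on 10-frame mel chunks instead of the whole sequence at once, keeping peak memory low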
+ for i in tqdm(range(0, T, 10),'audio2exp:'): # every 10 frames
+
+ current_mel_input = mel_input[:,i:i+10]
+
+ ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1)) #bs T 64
+ ratio = batch['ratio_gt'][:, i:i+10] #bs T
+
+ audiox = current_mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16
+
+ curr_exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64
+
+ exp_coeff_pred += [curr_exp_coeff_pred]
# BS x T x 64
results_dict = {
- 'exp_coeff_pred': exp_coeff_pred
+ 'exp_coeff_pred': torch.cat(exp_coeff_pred, axis=1)
}
return results_dict
diff --git a/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc b/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc
index 57e4e23d75532766340f6ebb4ebc3eb038b73564..20fa93168344012f0bdb77727b5b5669fac8a10b 100644
Binary files a/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc differ
diff --git a/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc b/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc
index 253503524dff693f19b69d927124b99b3ac145e5..97d9bdf072c5bd356cc312357646c6eae2b798d0 100644
Binary files a/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc differ
diff --git a/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc b/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc
index ff9c8bc5279ab0e4675f9d23344f1af8c126f57e..0d9aaee3ad4caa8afc40f723d224eb5b25e8afcd 100644
Binary files a/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc and b/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc differ
diff --git a/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc b/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc
index 12552f8a5cdc38657fa5769fb9055641b0c511f6..c7ebfcd0dd3538cedeb7eba984f94d9763b392c6 100644
Binary files a/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc and b/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc differ
diff --git a/src/audio2pose_models/__pycache__/networks.cpython-38.pyc b/src/audio2pose_models/__pycache__/networks.cpython-38.pyc
index 14e8a5b95b2b165c9a8e501ac6b87801dea9481f..239626089b91321b1c00cfba2dfe0a3ba1ccb0b9 100644
Binary files a/src/audio2pose_models/__pycache__/networks.cpython-38.pyc and b/src/audio2pose_models/__pycache__/networks.cpython-38.pyc differ
diff --git a/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc b/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc
index 1609cc2e18291e92dc25d290d47e215079c91972..0e6b40591fd932ddb2cf686b72afd08c90de1a44 100644
Binary files a/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc and b/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc differ
diff --git a/src/audio2pose_models/audio2pose.py b/src/audio2pose_models/audio2pose.py
index fc2499bd22f7fecabd3849a117a35d6a44d16269..3a37179e221340662a817628df3d01ae9e34404f 100644
--- a/src/audio2pose_models/audio2pose.py
+++ b/src/audio2pose_models/audio2pose.py
@@ -76,6 +76,7 @@ class Audio2Pose(nn.Module):
batch['audio_emb'] = audio_emb
batch = self.netG.test(batch)
pose_motion_pred_list.append(batch['pose_motion_pred']) #list of bs seq_len 6
+
if re != 0:
z = torch.randn(bs, self.latent_dim).to(ref.device)
batch['z'] = z
diff --git a/src/audio2pose_models/audio_encoder.py b/src/audio2pose_models/audio_encoder.py
index 8dc0f372a20f874ec7513d37b61859dc46e2669a..0ce036df119f86ef28c3ac8d6c834264571c309a 100644
--- a/src/audio2pose_models/audio_encoder.py
+++ b/src/audio2pose_models/audio_encoder.py
@@ -19,7 +19,7 @@ class Conv2d(nn.Module):
return self.act(out)
class AudioEncoder(nn.Module):
- def __init__(self, wav2lip_checkpoint, device='cpu'):
+ def __init__(self, wav2lip_checkpoint):
super(AudioEncoder, self).__init__()
self.audio_encoder = nn.Sequential(
@@ -42,7 +42,7 @@ class AudioEncoder(nn.Module):
Conv2d(512, 512, kernel_size=1, stride=1, padding=0),)
#### load the pre-trained audio_encoder\
- wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=device)['state_dict']
+ wav2lip_state_dict = torch.load(wav2lip_checkpoint)['state_dict']
state_dict = self.audio_encoder.state_dict()
for k,v in wav2lip_state_dict.items():
diff --git a/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc b/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc
index d8dba7f62d303d81702181344c6a01a2d1671592..0469c877400338fae921f4aedf1159b03abbb101 100644
Binary files a/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc and b/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc differ
diff --git a/src/face3d/extract_kp_videos.py b/src/face3d/extract_kp_videos.py
index f1e3b7ee33ed44eb487a5856d8f835e7473e9466..f12e9ec3488d99a29620b744beaa46814b66db8f 100644
--- a/src/face3d/extract_kp_videos.py
+++ b/src/face3d/extract_kp_videos.py
@@ -71,7 +71,7 @@ def read_video(filename):
def run(data):
filename, opt, device = data
os.environ['CUDA_VISIBLE_DEVICES'] = device
- kp_extractor = KeypointExtractor(device)
+ kp_extractor = KeypointExtractor()
images = read_video(filename)
name = filename.split('/')[-2:]
os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True)
diff --git a/src/face3d/models/__pycache__/__init__.cpython-38.pyc b/src/face3d/models/__pycache__/__init__.cpython-38.pyc
index 9226ef2a75f88c1bf2b87e8e173b3539d67e6b7e..886f0b184346c5530d0bf8d6f4b2300079511225 100644
Binary files a/src/face3d/models/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/__pycache__/__init__.cpython-38.pyc differ
diff --git a/src/face3d/models/__pycache__/base_model.cpython-38.pyc b/src/face3d/models/__pycache__/base_model.cpython-38.pyc
index 960d0df171da67b761b6b279c66ee32d1d698611..e42691ec8e26c5c38baf6bd0172dff8110754da1 100644
Binary files a/src/face3d/models/__pycache__/base_model.cpython-38.pyc and b/src/face3d/models/__pycache__/base_model.cpython-38.pyc differ
diff --git a/src/face3d/models/__pycache__/networks.cpython-38.pyc b/src/face3d/models/__pycache__/networks.cpython-38.pyc
index 00d82c97fa7a1dca5d0495fb1a2cbeb664f09813..1a97b5cd3309786e87448c4478ae2d19a18e096b 100644
Binary files a/src/face3d/models/__pycache__/networks.cpython-38.pyc and b/src/face3d/models/__pycache__/networks.cpython-38.pyc differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc
index 0acba565bc3928508199e41a8da56f09ad955ccc..83f6ad3ed4af3cc3d3cfa9067e345cdffb058638 100644
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc
index 1d5ddd6e5ded7e34e7aaa8b3d945a0167454ee3c..f59247d26d9210b5fd2960df842753a903a90b3d 100644
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc
index 46f43a3ae7bc4653ccbef934b56566b667ec0028..d8edc64d28aa3e3fb8c26ba795d04a8ef35b1540 100644
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc differ
diff --git a/src/face3d/util/__pycache__/__init__.cpython-38.pyc b/src/face3d/util/__pycache__/__init__.cpython-38.pyc
index 72c836e4ebd2b5baf1273f61fc024eb1e0347085..22771f3169f2da9a37c1bd619a0e5d05003492b9 100644
Binary files a/src/face3d/util/__pycache__/__init__.cpython-38.pyc and b/src/face3d/util/__pycache__/__init__.cpython-38.pyc differ
diff --git a/src/face3d/util/__pycache__/load_mats.cpython-38.pyc b/src/face3d/util/__pycache__/load_mats.cpython-38.pyc
index 0457c4f9ef0a394f926a5f792178dc9b25c60cfb..8a48b59ca078ef709825d54c069f518c15103c4e 100644
Binary files a/src/face3d/util/__pycache__/load_mats.cpython-38.pyc and b/src/face3d/util/__pycache__/load_mats.cpython-38.pyc differ
diff --git a/src/face3d/util/__pycache__/preprocess.cpython-38.pyc b/src/face3d/util/__pycache__/preprocess.cpython-38.pyc
index 8100839a4aafba54f8a6a034b03e0660a48368ec..7900dafbd8b74629c391eb8972f615650d4461df 100644
Binary files a/src/face3d/util/__pycache__/preprocess.cpython-38.pyc and b/src/face3d/util/__pycache__/preprocess.cpython-38.pyc differ
diff --git a/src/facerender/__pycache__/animate.cpython-38.pyc b/src/facerender/__pycache__/animate.cpython-38.pyc
index 91ca41e74ce67feb6bdb9948a59183d7b92eb3b9..11fb3d0ee467093c0cb318003c52eb4c78f11cc9 100644
Binary files a/src/facerender/__pycache__/animate.cpython-38.pyc and b/src/facerender/__pycache__/animate.cpython-38.pyc differ
diff --git a/src/facerender/animate.py b/src/facerender/animate.py
index e9743a6cdf8eaa60f7cb25d97d831d99638d855e..be2d62ebaeffe06a8dee1e268d832690b1937320 100644
--- a/src/facerender/animate.py
+++ b/src/facerender/animate.py
@@ -1,4 +1,5 @@
import os
+import cv2
import yaml
import numpy as np
import warnings
@@ -106,7 +107,7 @@ class AnimateFromCoeff():
return checkpoint['epoch']
- def generate(self, x, video_save_dir, enhancer=None):
+ def generate(self, x, video_save_dir, enhancer=None, original_size=None):
source_image=x['source_image'].type(torch.FloatTensor)
source_semantics=x['source_semantics'].type(torch.FloatTensor)
@@ -137,6 +138,10 @@ class AnimateFromCoeff():
video.append(image)
result = img_as_ubyte(video)
+        ### the generated frames are 256x256, so rescale them back to the original aspect ratio
+ if original_size:
+ result = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in result ]
+
video_name = x['video_name'] + '.mp4'
path = os.path.join(video_save_dir, 'temp_'+video_name)
imageio.mimsave(path, result, fps=float(25))
@@ -146,6 +151,10 @@ class AnimateFromCoeff():
av_path_enhancer = os.path.join(video_save_dir, video_name_enhancer)
enhanced_path = os.path.join(video_save_dir, 'temp_'+video_name_enhancer)
enhanced_images = face_enhancer(result, method=enhancer)
+
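+        # rescale the enhanced frames as well, so both outputs keep the original aspect ratio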
+ if original_size:
+ enhanced_images = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in enhanced_images ]
+
imageio.mimsave(enhanced_path, enhanced_images, fps=float(25))
av_path = os.path.join(video_save_dir, video_name)
diff --git a/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc b/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc
index 35a44da77686faa8c09392f15f119518059cd288..5178c3763bc9f6fcff3a8a410deff7d3c30060db 100644
Binary files a/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc and b/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/generator.cpython-38.pyc b/src/facerender/modules/__pycache__/generator.cpython-38.pyc
index f18ac48e6c06f902f22d051588a9023bcd163dda..8d132f05d36e505f21c864d4c95931472ba58051 100644
Binary files a/src/facerender/modules/__pycache__/generator.cpython-38.pyc and b/src/facerender/modules/__pycache__/generator.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc b/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc
index 2eb85ab75af6c9d5c47d5a0ecbcb929310c2d902..ccc5d4543365bfc022a06a72d6ed9d388249279a 100644
Binary files a/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc and b/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc b/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc
index de5ff5d0819a9aa6adc7d3aaa92343361a098048..1b54bcc293d742f70db165849b9764666b0f9a8b 100644
Binary files a/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc and b/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/mapping.cpython-38.pyc b/src/facerender/modules/__pycache__/mapping.cpython-38.pyc
index b8ff701e5c431228481c008a4110c97f45b4803a..7e1a2baa2bfab28fe7e3904f94a644633124b56c 100644
Binary files a/src/facerender/modules/__pycache__/mapping.cpython-38.pyc and b/src/facerender/modules/__pycache__/mapping.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/util.cpython-38.pyc b/src/facerender/modules/__pycache__/util.cpython-38.pyc
index 92a821d2ec9c6c655bd82b333490adca2f020304..1e1c92955be38c880c52cc70b8051fd8ef4fa63a 100644
Binary files a/src/facerender/modules/__pycache__/util.cpython-38.pyc and b/src/facerender/modules/__pycache__/util.cpython-38.pyc differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc
index 1712d28c2f1564aecdec0971496514e6b9a85109..03d5fdb5ff0e14c08894b394b8c1cae7e1f324c4 100644
Binary files a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc
index d204555804699761f59a2d639d89d33e0e1a8637..20a4560fc425087d5d63c70cc08fd12c2d8a7ea1 100644
Binary files a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc
index efda35f1eb336382fc5cb5943db3aaeda8042de6..eb7252b8ad1b6aec2f5566979db0494f71a63d91 100644
Binary files a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc
index 6fdcebc6259dc3c8ebf212c6d26bf9f00316ede7..30c9811579d75333db1b60fe4622f682013f719b 100644
Binary files a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc differ
diff --git a/src/generate_batch.py b/src/generate_batch.py
index 35a785b182ff4bfbdcc55fd8cfc7c5471cb9a1af..2d9e19b6aa4c19c13caf0a208e1189cd6c19f796 100644
--- a/src/generate_batch.py
+++ b/src/generate_batch.py
@@ -1,18 +1,11 @@
import os
+
+from tqdm import tqdm
import torch
import numpy as np
import random
import scipy.io as scio
import src.utils.audio as audio
-import subprocess, platform
-
-from pydub import AudioSegment
-
-def mp3_to_wav(mp3_filename,wav_filename,frame_rate):
- mp3_file = AudioSegment.from_mp3(file=mp3_filename)
- mp3_file.set_frame_rate(frame_rate).export(wav_filename,format="wav")
-
-
def crop_pad_audio(wav, audio_length):
if len(wav) > audio_length:
@@ -33,7 +26,6 @@ def generate_blink_seq(num_frames):
ratio = np.zeros((num_frames,1))
frame_id = 0
while frame_id in range(num_frames):
- #start = random.choice(range(60,70))
start = 80
if frame_id+start+9<=num_frames - 1:
ratio[frame_id+start:frame_id+start+9, 0] = [0.5,0.6,0.7,0.9,1, 0.9, 0.7,0.6,0.5]
@@ -48,7 +40,6 @@ def generate_blink_seq_randomly(num_frames):
return ratio
frame_id = 0
while frame_id in range(num_frames):
- #start = random.choice(range(60,70))
start = random.choice(range(min(10,num_frames), min(int(num_frames/2), 70)))
if frame_id+start+5<=num_frames - 1:
ratio[frame_id+start:frame_id+start+5, 0] = [0.5, 0.9, 1.0, 0.9, 0.5]
@@ -60,8 +51,6 @@ def generate_blink_seq_randomly(num_frames):
def get_data(first_coeff_path, audio_path, device):
syncnet_mel_step_size = 16
- syncnet_T = 5
- MAX_FRAME = 32
fps = 25
pic_name = os.path.splitext(os.path.split(first_coeff_path)[-1])[0]
@@ -71,23 +60,14 @@ def get_data(first_coeff_path, audio_path, device):
source_semantics_dict = scio.loadmat(source_semantics_path)
ref_coeff = source_semantics_dict['coeff_3dmm'][:1,:70] #1 70
- print(audio_path)
- if '.mp3' in audio_path:
- print(audio_path)
- mp3_to_wav(audio_path, audio_path.replace('.mp3','.wav'), 16000)
- new_audio = audio_path.replace('.mp3','.wav')
- else:
- new_audio = audio_path
-
- wav = audio.load_wav(new_audio, 16000)
-
+ wav = audio.load_wav(audio_path, 16000)
wav_length, num_frames = parse_audio_length(len(wav), 16000, 25)
wav = crop_pad_audio(wav, wav_length)
orig_mel = audio.melspectrogram(wav).T
spec = orig_mel.copy() # nframes 80
indiv_mels = []
- for i in range(num_frames):
+ for i in tqdm(range(num_frames), 'mel:'):
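+            # for each video frame, take a 16-step mel window starting two frames earlier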
start_frame_num = i-2
start_idx = int(80. * (start_frame_num / float(fps)))
end_idx = start_idx + syncnet_mel_step_size
@@ -97,7 +77,6 @@ def get_data(first_coeff_path, audio_path, device):
indiv_mels.append(m.T)
indiv_mels = np.asarray(indiv_mels) # T 80 16
ratio = generate_blink_seq_randomly(num_frames) # T
-
indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1).unsqueeze(0) # bs T 1 80 16
ratio = torch.FloatTensor(ratio).unsqueeze(0) # bs T
diff --git a/src/gradio_demo.py b/src/gradio_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f78c97349652e23cf463c49527191fcec795564
--- /dev/null
+++ b/src/gradio_demo.py
@@ -0,0 +1,113 @@
+import torch, uuid
+from time import gmtime, strftime
+import os, sys, shutil
+from src.utils.preprocess import CropAndExtract
+from src.test_audio2coeff import Audio2Coeff
+from src.facerender.animate import AnimateFromCoeff
+from src.generate_batch import get_data
+from src.generate_facerender_batch import get_facerender_data
+from src.utils.text2speech import text2speech
+
+from pydub import AudioSegment
+
+def mp3_to_wav(mp3_filename,wav_filename,frame_rate):
+ mp3_file = AudioSegment.from_file(file=mp3_filename)
+ mp3_file.set_frame_rate(frame_rate).export(wav_filename,format="wav")
+
+
+class SadTalker():
+
+ def __init__(self, checkpoint_path='checkpoints', config_path='src/config'):
+
+ if torch.cuda.is_available() :
+ device = "cuda"
+ else:
+ device = "cpu"
+
+ os.environ['TORCH_HOME']= checkpoint_path
+
+ path_of_lm_croper = os.path.join( checkpoint_path, 'shape_predictor_68_face_landmarks.dat')
+ path_of_net_recon_model = os.path.join( checkpoint_path, 'epoch_20.pth')
+ dir_of_BFM_fitting = os.path.join( checkpoint_path, 'BFM_Fitting')
+ wav2lip_checkpoint = os.path.join( checkpoint_path, 'wav2lip.pth')
+
+ audio2pose_checkpoint = os.path.join( checkpoint_path, 'auido2pose_00140-model.pth')
+ audio2pose_yaml_path = os.path.join( config_path, 'auido2pose.yaml')
+
+ audio2exp_checkpoint = os.path.join( checkpoint_path, 'auido2exp_00300-model.pth')
+ audio2exp_yaml_path = os.path.join( config_path, 'auido2exp.yaml')
+
+ free_view_checkpoint = os.path.join( checkpoint_path, 'facevid2vid_00189-model.pth.tar')
+ mapping_checkpoint = os.path.join( checkpoint_path, 'mapping_00229-model.pth.tar')
+ facerender_yaml_path = os.path.join( config_path, 'facerender.yaml')
+
+ #init model
+ print(path_of_lm_croper)
+ self.preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)
+
+ print(audio2pose_checkpoint)
+ self.audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
+ audio2exp_checkpoint, audio2exp_yaml_path, wav2lip_checkpoint, device)
+ print(free_view_checkpoint)
+ self.animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
+ facerender_yaml_path, device)
+ self.device = device
+
+ def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'):
+
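+        # driven_audio may be a wav/mp3 file path; anything else is intended to be synthesized via text2speech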
+ time_tag = str(uuid.uuid4())
+ save_dir = os.path.join(result_dir, time_tag)
+ os.makedirs(save_dir, exist_ok=True)
+
+ input_dir = os.path.join(save_dir, 'input')
+ os.makedirs(input_dir, exist_ok=True)
+
+ print(source_image)
+ pic_path = os.path.join(input_dir, os.path.basename(source_image))
+ shutil.move(source_image, input_dir)
+
+ if os.path.isfile(driven_audio):
+ audio_path = os.path.join(input_dir, os.path.basename(driven_audio))
+
+ #### mp3 to wav
+ if '.mp3' in audio_path:
+ mp3_to_wav(driven_audio, audio_path.replace('.mp3', '.wav'), 16000)
+ audio_path = audio_path.replace('.mp3', '.wav')
+ else:
+ shutil.move(driven_audio, input_dir)
+ else:
+            # driven_audio is not a file: assume it is text and synthesize speech (hypothetical tts.wav output path)
+            audio_path = os.path.join(input_dir, 'tts.wav')
+            audio_path = text2speech(driven_audio, audio_path)
+
+
+ os.makedirs(save_dir, exist_ok=True)
+ pose_style = 0
+ #crop image and extract 3dmm from image
+ first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
+ os.makedirs(first_frame_dir, exist_ok=True)
+ first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir)
+
+ if first_coeff_path is None:
+ raise AttributeError("No face is detected")
+
+        #audio2coeff
+ batch = get_data(first_coeff_path, audio_path, self.device) # longer audio?
+ coeff_path = self.audio_to_coeff.generate(batch, save_dir, pose_style)
+ #coeff2video
+ batch_size = 4
+ data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode)
+ self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size)
+ video_name = data['video_name']
+ print(f'The generated video is named {video_name} in {save_dir}')
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        import gc; gc.collect()
+
+ if use_enhancer:
+ return os.path.join(save_dir, video_name+'_enhanced.mp4'), os.path.join(save_dir, video_name+'_enhanced.mp4')
+
+ else:
+ return os.path.join(save_dir, video_name+'.mp4'), os.path.join(save_dir, video_name+'.mp4')
+
+
+
\ No newline at end of file
diff --git a/src/test_audio2coeff.py b/src/test_audio2coeff.py
index 3de26514660d9a12853c45e4e5278c7cfce7a7cd..3db6be3af59b0319c50106d9a92c903118f28410 100644
--- a/src/test_audio2coeff.py
+++ b/src/test_audio2coeff.py
@@ -81,7 +81,7 @@ class Audio2Coeff():
savemat(os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])),
{'coeff_3dmm': coeffs_pred_numpy})
- torch.cuda.empty_cache()
+
return os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name']))
diff --git a/src/utils/__pycache__/audio.cpython-38.pyc b/src/utils/__pycache__/audio.cpython-38.pyc
index 71fec496164398d674b966d4146ed59d46ac7e0f..c9037ed6e9b29bf1f5ba29b25ed9c067103bb361 100644
Binary files a/src/utils/__pycache__/audio.cpython-38.pyc and b/src/utils/__pycache__/audio.cpython-38.pyc differ
diff --git a/src/utils/__pycache__/croper.cpython-38.pyc b/src/utils/__pycache__/croper.cpython-38.pyc
index 66904192497cde94208a03a315a9017c41850d1d..addfae662741dd661426427e2f29d506c399adba 100644
Binary files a/src/utils/__pycache__/croper.cpython-38.pyc and b/src/utils/__pycache__/croper.cpython-38.pyc differ
diff --git a/src/utils/__pycache__/face_enhancer.cpython-38.pyc b/src/utils/__pycache__/face_enhancer.cpython-38.pyc
index c2c91bd41ae88985f474551e32f12ba2b07bfc0e..51b465795f49c49c741a7fb510d02564337deb28 100644
Binary files a/src/utils/__pycache__/face_enhancer.cpython-38.pyc and b/src/utils/__pycache__/face_enhancer.cpython-38.pyc differ
diff --git a/src/utils/__pycache__/hparams.cpython-38.pyc b/src/utils/__pycache__/hparams.cpython-38.pyc
index d0e429fd2d0a979e1704da0a94b4cce691d2ccef..29278c1421204d040aa03f77ed43e18f9b60dad8 100644
Binary files a/src/utils/__pycache__/hparams.cpython-38.pyc and b/src/utils/__pycache__/hparams.cpython-38.pyc differ
diff --git a/src/utils/__pycache__/preprocess.cpython-38.pyc b/src/utils/__pycache__/preprocess.cpython-38.pyc
index 58abea029b3e9d96718a6812624cbba961ef5202..e5e0b7f2a4c29050bfbb30405816311acd3060f0 100644
Binary files a/src/utils/__pycache__/preprocess.cpython-38.pyc and b/src/utils/__pycache__/preprocess.cpython-38.pyc differ
diff --git a/src/utils/preprocess.py b/src/utils/preprocess.py
index e5362752551bba79bfacb3dd304b47ff525f4e2f..4e3dad8d4a49080a3300f672965a11a8a2054fa2 100644
--- a/src/utils/preprocess.py
+++ b/src/utils/preprocess.py
@@ -1,5 +1,5 @@
import numpy as np
-import cv2, os, sys,torch
+import cv2, os, sys, torch
from tqdm import tqdm
from PIL import Image
@@ -51,7 +51,7 @@ class CropAndExtract():
self.lm3d_std = load_lm3d(dir_of_BFM_fitting)
self.device = device
- def generate(self, input_path, save_dir):
+ def generate(self, input_path, save_dir, crop_or_resize='crop'):
pic_size = 256
pic_name = os.path.splitext(os.path.split(input_path)[-1])[0]
@@ -81,7 +81,7 @@ class CropAndExtract():
break
x_full_frames = [cv2.cvtColor(full_frames[0], cv2.COLOR_BGR2RGB) ]
- if True:
+ if crop_or_resize.lower() == 'crop': # default crop
x_full_frames, crop, quad = self.croper.crop(x_full_frames, xsize=pic_size)
clx, cly, crx, cry = crop
lx, ly, rx, ry = quad
@@ -90,7 +90,9 @@ class CropAndExtract():
original_size = (ox2 - ox1, oy2 - oy1)
else:
oy1, oy2, ox1, ox2 = 0, x_full_frames[0].shape[0], 0, x_full_frames[0].shape[1]
- frames_pil = [Image.fromarray(cv2.resize(frame,(pic_size,pic_size))) for frame in x_full_frames]
+ original_size = (ox2 - ox1, oy2 - oy1)
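+            # record the full-frame size as well, so the rendered video can be scaled back later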
+
+ frames_pil = [Image.fromarray(cv2.resize(frame,(pic_size, pic_size))) for frame in x_full_frames]
if len(frames_pil) == 0:
print('No face is detected in the input file')
return None, None
@@ -110,7 +112,7 @@ class CropAndExtract():
if not os.path.isfile(coeff_path):
# load 3dmm paramter generator from Deep3DFaceRecon_pytorch
video_coeffs, full_coeffs = [], []
- for idx in tqdm(range(len(frames_pil)), desc=' 3DMM Extraction In Video:'):
+ for idx in tqdm(range(len(frames_pil)), desc='3DMM Extraction In Video:'):
frame = frames_pil[idx]
W,H = frame.size
lm1 = lm[idx].reshape([-1, 2])
@@ -147,4 +149,4 @@ class CropAndExtract():
savemat(coeff_path, {'coeff_3dmm': semantic_npy, 'full_3dmm': np.array(full_coeffs)[0]})
- return coeff_path, png_path
\ No newline at end of file
+ return coeff_path, png_path, original_size
\ No newline at end of file
diff --git a/src/utils/text2speech.py b/src/utils/text2speech.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ecaef36961494c8b2b1f5771a70b997efa04ffd
--- /dev/null
+++ b/src/utils/text2speech.py
@@ -0,0 +1,12 @@
+import os
+
+def text2speech(txt, audio_path):
+ print(txt)
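+    # shells out to the "tts" command-line tool (e.g. the Coqui TTS CLI), which is assumed to be installed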
+ cmd = f'tts --text "{txt}" --out_path {audio_path}'
+ print(cmd)
+    # os.system returns the command's exit status instead of raising on failure, so check it explicitly
+    if os.system(cmd) == 0:
+        return audio_path
+    print("Error: failed to convert text to audio")
+    return None
\ No newline at end of file