vasilis
/

xls-r-et-V-3

Automatic Speech Recognition

mozilla-foundation/common_voice_8_0

robust-speech-event

Generated from Trainer

hf-asr-leaderboard

Inference Endpoints

Model card Files Files and versions Metrics Training metrics Community

xls-r-et-V-3 / transform_model.py

vasilis's picture

adds eval results

c2f4ff5 almost 3 years ago

2.11 kB

	'''
	Transform augmented model back to normal hf supported version.
	i.e remove first module
	'''
	from augmentation import AUG

	from torch import nn
	import transformers
	from transformers import Wav2Vec2ForCTC, AutoModelForPreTraining
	from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2NoLayerNormConvLayer, Wav2Vec2LayerNormConvLayer, Wav2Vec2GroupNormConvLayer


	def patch_init(cls):
	__class__ = cls # provide closure cell for super()

	def new_init(self, config):
	if config.feat_extract_norm == "group":
	conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] + [
	Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
	]
	elif config.feat_extract_norm == "layer":
	conv_layers = [
	Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
	]
	else:
	raise ValueError(
	f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
	)
	aug = AUG()
	from IPython import embed
	embed()
	conv_layers.insert(0, aug)
	self.conv_layers = nn.ModuleList(conv_layers)
	cls.__init__ = new_init

	patch_init(transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor)

	# model_path = "pytorch_model.bin"
	# Wav2Vec2ForPreTraining
	model = Wav2Vec2ForCTC.from_pretrained(".")
	from IPython import embed
	embed()
	# monkey patching from augmentation return model to normal state
	model.wav2vec2.feature_extractor.conv_layers = nn.Sequential(*list(model.wav2vec2.feature_extractor.conv_layers.children())[1:])


	model.save_pretrained(".")

	"""
	replace with temprarily then save model, patching didn't work, get loaded so far for some reason.
	"conv_dim": [
	1,
	512,
	512,
	512,
	512,
	512,
	512,
	512
	],
	"conv_kernel": [
	10,
	10,
	3,
	3,
	3,
	3,
	2,
	2
	],
	"conv_stride": [
	5,
	5,
	2,
	2,
	2,
	2,
	2,
	2
	"""