% Journal article (Speech Communication, vol. 143, 2022): Guided-GAN acoustic feature transformation for mismatched ASR data.
@article{492,
  author   = {Heymans, Walter and Davel, Marelie and Van Heerden, Charl},
  title    = {Efficient acoustic feature transformation in mismatched environments using a {Guided-GAN}},
  journal  = {Speech Communication},
  volume   = {143},
  pages    = {10--20},
  year     = {2022},
  month    = sep,
  doi      = {10.1016/j.specom.2022.07.002},
  language = {English},
  keywords = {Speech recognition, Generative adversarial networks, Mismatched data, Resource-scarce environments},
  abstract = {We propose a new framework to improve automatic speech recognition (ASR) systems in resource-scarce environments using a generative adversarial network (GAN) operating on acoustic input features. The GAN is used to enhance the features of mismatched data prior to decoding, or can optionally be used to fine-tune the acoustic model. We achieve improvements that are comparable to multi-style training (MTR), but at a lower computational cost. With less than one hour of data, an ASR system trained on good quality data, and evaluated on mismatched audio is improved by between 11.5\% and 19.7\% relative word error rate (WER). Experiments demonstrate that the framework can be very useful in under-resourced environments where training data and computational resources are limited. The GAN does not require parallel training data, because it utilises a baseline acoustic model to provide an additional loss term that guides the generator to create acoustic features that are better classified by the baseline.},
}