% Modipa and Davel (2022), "Two Sepedi-English code-switched speech corpora",
% Language Resources and Evaluation, volume 56.
% The doi field holds the bare DOI (no resolver prefix); the "Read here"
% link (rdcu.be) is kept in the url field only.
% NOTE(review): address should be the publisher's city; "South Africa" is
% kept as found in the original record -- confirm or remove.
@article{483,
  author    = {Modipa, Thipe and Davel, Marelie},
  title     = {Two {Sepedi-English} code-switched speech corpora},
  journal   = {Language Resources and Evaluation},
  volume    = {56},
  year      = {2022},
  publisher = {Springer},
  address   = {South Africa},
  doi       = {10.1007/s10579-022-09592-6},
  url       = {https://rdcu.be/cO6lD},
  keywords  = {code switching, speech corpus, multilingual speech recognition, Sepedi},
  abstract  = {We report on the development of two reference corpora for the analysis of Sepedi-English code-switched speech in the context of automatic speech recognition. For the first corpus, possible English events were obtained from an existing corpus of transcribed Sepedi-English speech. The second corpus is based on the analysis of radio broadcasts: actual instances of code switching were transcribed and reproduced by a number of native Sepedi speakers. We describe the process to develop and verify both corpora and perform an initial analysis of the newly produced data sets. We find that, in naturally occurring speech, the frequency of code switching is unexpectedly high for this language pair, and that the continuum of code switching (from unmodified embedded words to loanwords absorbed into the matrix language) makes this a particularly challenging task for speech recognition systems.},
}