If the audio is in Mandarin, I found that XFyun IFlySpeechRecognizer does a great job at this; it can even recognize some simple English words in mixed-language audio.
You don't need to train a machine learning model from scratch to do this.
The XFyun API has some limitations: the file size must be no larger than 2 MB, and the audio length must be no longer than 60 s.
Usage: ./test.sh test.mp4
#!/bin/zsh
# Split a video into chunks, convert every chunk to 16 kHz mono 16-bit PCM,
# and pass each PCM chunk to xfyun.py for speech recognition.
#
# Usage: ./test.sh test.mp4

if [[ -z "$1" ]]; then
  print "usage: $0 <video-file>" >&2
  exit 1
fi

# Use parameter expansion instead of `cut -d. -f1/-f2` so that paths with
# additional dots (e.g. ./clips/my.video.mp4) are split at the LAST dot.
filename="${1%.*}"
ext="${1##*.}"

# Split the input into pieces; the chunk files are picked up below through
# the "<filename>_*.<ext>" glob.
# NOTE(review): 2000 is presumably the chunk size in KB, chosen to stay under
# the XFyun upload limit - confirm against the MP4Box documentation.
mp4box -splits 2000 "$filename.$ext"

for name in "${filename}"_*."$ext"; do
  print "ffmpeg $name to pcm"
  ffmpeg -y -i "$name" -acodec pcm_s16le -f s16le -ac 1 -ar 16000 "$name.pcm"
  # The chunk video is no longer needed once its PCM has been written.
  rm "$name"
done

for name in "${filename}"_*.pcm; do
  print "xfyun.py $name"
  python3 xfyun.py "$name"
  rm "$name"
done
Now we can use XFyun to recognize the audio.
Here is their file, XFyun IFlySpeechRecognizer, but I couldn't execute their example code, so I made some modifications.
with Python3
import base64
import hashlib
import json
import sys
import time
import urllib.parse
import urllib.request


def main(argv):
    """Send one audio file to the XFyun IAT endpoint and print the result.

    Args:
        argv: Command-line argument list; argv[1] is the path of the audio
            file to upload (the request declares it as "raw" audio for the
            "sms16k" engine).

    Raises:
        SystemExit: If no audio path was supplied.
    """
    if len(argv) < 2:
        raise SystemExit("usage: python3 xfyun.py <audio.pcm>")

    # Read the audio inside a context manager so the handle is always closed.
    with open(argv[1], 'rb') as f:
        base64_audio = base64.b64encode(f.read())
    body = urllib.parse.urlencode({'audio': base64_audio}).encode('utf8')

    url = 'http://api.xfyun.cn/v1/service/v1/iat'
    api_key = ''   # fill in your XFyun API key
    x_appid = ''   # fill in your XFyun application id

    param = {"engine_type": "sms16k", "aue": "raw"}
    # Serialize without any spaces; the same base64 string is sent in the
    # X-Param header and hashed into the checksum below.
    param_str = json.dumps(param, separators=(',', ':'))
    param_b64str = base64.b64encode(param_str.encode('utf8')).decode('utf8')

    x_time = str(int(time.time()))
    # Checksum = MD5(api_key + timestamp + base64(param)).
    checksum = (api_key + x_time + param_b64str).encode('utf8')
    x_checksum = hashlib.md5(checksum).hexdigest()

    x_header = {
        'X-Appid': x_appid,
        'X-CurTime': x_time,
        'X-Param': param_b64str,
        'X-CheckSum': x_checksum,
    }
    req = urllib.request.Request(url, data=body, headers=x_header)
    # Close the HTTP response deterministically once it has been read.
    with urllib.request.urlopen(req) as response:
        result = response.read().decode('utf8')

    data = json.loads(result)['data']
    print(data)


if __name__ == '__main__':
    main(sys.argv)
More on Converting Video to Text

I tried XFyun, but its results were not very recognizable, so I am looking for another recognizer to do some testing.
The good news is that they all support the PCM format converted earlier, so I need to make some changes to the shell script. When you want to compare the results from different recognizers, simply run:
test.sh xfyun.py|baidu.py|qq.py test.mp4|.m4a
Honestly, to the naked eye, Google's result looks better.
#!/bin/zsh
# Split a media file into chunks, convert every chunk to 16 kHz mono 16-bit
# PCM, and run the chosen recognizer script on each PCM chunk.
#
# Usage: test.sh xfyun.py|baidu.py|qq.py test.mp4|.m4a

if [[ -z "$1" || -z "$2" ]]; then
  print "usage: $0 <recognizer.py> <media-file>" >&2
  exit 1
fi

py="$1"
# Use parameter expansion instead of `cut -d. -f1/-f2` so that paths with
# additional dots (e.g. ./clips/my.video.m4a) are split at the LAST dot.
filename="${2%.*}"
ext="${2##*.}"

# Split the input into pieces; the chunk files are picked up below through
# the "<filename>_*.<ext>" glob.
# NOTE(review): 2000 is presumably the chunk size in KB - confirm against the
# MP4Box documentation.
mp4box -splits 2000 "$filename.$ext"

for name in "${filename}"_*."$ext"; do
  print "ffmpeg $name to pcm"
  ffmpeg -y -i "$name" -acodec pcm_s16le -f s16le -ac 1 -ar 16000 "$name.pcm"
  # The chunk itself is no longer needed once its PCM has been written.
  rm "$name"
done

# The PCM chunks are left on disk after recognition (no rm in this loop).
for name in "${filename}"_*.pcm; do
  print "$py $name"
  python3 "$py" "$name"
done
# id_key.py
# Credentials for each speech-recognition service. The recognizer scripts
# read them as id_key.dic[<service>][<field>].
# Replace the empty strings with your own values and keep real secrets out of
# version control.
dic = {
    "xfyun": {
        "id": "",
        "key": "",
    },
    "baidu": {
        "id": "",
        "key": "",
        "secret": "",
    },
    "qq": {
        "id": "",
        "key": "",
    },
}
google: The file is submitted online there, so I skipped writing a *.py script :)
# baidu.py
"""Recognize one PCM audio file with Baidu's AipSpeech client.

Usage: python3 baidu.py <audio.pcm>
"""
import sys

from aip import AipSpeech

import id_key

app_id = id_key.dic['baidu']['id']
app_key = id_key.dic['baidu']['key']
secret = id_key.dic['baidu']['secret']

client = AipSpeech(app_id, app_key, secret)


def get_file_content(filePath):
    """Return the raw bytes stored in the file at filePath."""
    with open(filePath, 'rb') as fp:
        return fp.read()


def main(argv):
    """Upload the audio file named by argv[1] and print the service reply.

    Args:
        argv: Command-line argument list; argv[1] is the PCM file path.

    Raises:
        SystemExit: If no audio path was supplied.
    """
    if len(argv) < 2:
        raise SystemExit("usage: python3 baidu.py <audio.pcm>")

    audio_path = argv[1]
    print(audio_path)
    # The audio is declared as 16000 Hz PCM.
    # NOTE(review): dev_pid 1537 presumably selects the Mandarin model -
    # confirm against the Baidu speech API documentation.
    res = client.asr(get_file_content(audio_path), 'pcm', 16000, {
        'dev_pid': 1537,
    })
    print(res)


if __name__ == '__main__':
    main(sys.argv)
# qq.py
"""Recognize one PCM audio file with Tencent AI's online ASR endpoint.

Usage: python3 qq.py <audio.pcm>
"""
import base64
import hashlib
import json
import random
import sys
import time
import urllib.parse  # import the submodule explicitly; `import urllib` alone is not enough

import requests

import id_key

app_id = id_key.dic['qq']['id']
app_key = id_key.dic['qq']['key']
api_url = 'https://api.ai.qq.com/fcgi-bin/aai/aai_asr'


def md5(string):
    """Return the upper-case hexadecimal MD5 digest of the bytes in string."""
    return hashlib.md5(string).hexdigest().upper()


def urlencode(args):
    """URL-encode args with keys in sorted order, skipping falsy values."""
    tuples = [(k, args[k]) for k in sorted(args) if args[k]]
    return urllib.parse.urlencode(tuples)


def signify(args, app_key):
    """Return the request signature for args.

    The signature is the upper-case MD5 of the sorted, URL-encoded arguments
    followed by "&app_key=<app_key>".
    """
    query_str = urlencode(args) + '&app_key=' + app_key
    return md5(query_str.encode('utf-8'))


def http_post(api_url, args):
    """POST args as form data to api_url and return the decoded JSON reply."""
    resp = requests.post(url=api_url, data=args)
    return json.loads(resp.text)


class BaseASR(object):
    """Common state for ASR back-ends: endpoint URL and credentials."""

    # Maps an audio file extension to the format code sent in the request.
    ext2idx = {'pcm': '1', 'wav': '2', 'amr': '3', 'slk': '4'}

    def __init__(self, api_url, app_id, app_key):
        self.api_url = api_url
        self.app_id = app_id
        self.app_key = app_key

    def stt(self, audio_file, ext, rate):
        """Convert speech to text; subclasses must override this."""
        # The original raised the misspelled name `Exceptin`, which produced
        # a NameError instead of the intended error.
        raise NotImplementedError("Unimplemented!")


class BasicASR(BaseASR):
    """ Online ASR from Tencent
    https://ai.qq.com/doc/aaiasr.shtml
    """

    def __init__(self):
        super(BasicASR, self).__init__(api_url, app_id, app_key)

    def stt(self, audio_file, ext='pcm', rate=16000):
        """Upload audio_file and return the recognized text.

        Args:
            audio_file: Path of the audio file to recognize.
            ext: Audio format; only 'pcm' is accepted by this method.
            rate: Sample rate sent with the request.

        Returns:
            The recognized text as str, so that printing it shows the text
            itself rather than a bytes repr under Python 3.

        Raises:
            Exception: If ext is not 'pcm'.
        """
        if ext != 'pcm':
            raise Exception("Unsupport audio file format!")

        with open(audio_file, 'rb') as f:
            audio_data = f.read()

        args = {
            'app_id': self.app_id,
            'time_stamp': str(int(time.time())),
            'nonce_str': '%x' % random.randint(1048576, 104857600),
            'format': self.ext2idx[ext],
            'rate': str(rate),
            'speech': base64.b64encode(audio_data),
        }
        # The signature must be computed before 'sign' itself is added.
        args['sign'] = signify(args, self.app_key)

        resp = http_post(self.api_url, args)
        print(resp)
        return resp['data']['text']


def main(argv):
    """Recognize the audio file named by argv[1] and print the text.

    Raises:
        SystemExit: If no audio path was supplied.
    """
    if len(argv) < 2:
        raise SystemExit("usage: python3 qq.py <audio.pcm>")
    asr_engine = BasicASR()
    text = asr_engine.stt(argv[1])
    print(text)


if __name__ == '__main__':
    main(sys.argv)
# xfyun.py
"""Recognize one audio file with the XFyun IAT endpoint.

Usage: python3 xfyun.py <audio.pcm>
"""
import base64
import hashlib
import json
import sys
import time
import urllib.parse
import urllib.request

import id_key

app_id = id_key.dic['xfyun']['id']
api_key = id_key.dic['xfyun']['key']
url = 'http://api.xfyun.cn/v1/service/v1/iat'


def main(argv):
    """Send the audio file named by argv[1] to XFyun and print the result.

    Prints the "data" field of the JSON reply when it is non-empty, otherwise
    prints the whole raw reply so the error details stay visible.

    Args:
        argv: Command-line argument list; argv[1] is the audio file path.

    Raises:
        SystemExit: If no audio path was supplied.
    """
    if len(argv) < 2:
        raise SystemExit("usage: python3 xfyun.py <audio.pcm>")

    # Read the audio inside a context manager so the handle is always closed.
    with open(argv[1], 'rb') as f:
        base64_audio = base64.b64encode(f.read())
    body = urllib.parse.urlencode({'audio': base64_audio}).encode('utf8')

    param = {"engine_type": "sms16k", "aue": "raw"}
    # Serialize without any spaces; the same base64 string is sent in the
    # X-Param header and hashed into the checksum below.
    param_str = json.dumps(param, separators=(',', ':'))
    param_b64str = base64.b64encode(param_str.encode('utf8')).decode('utf8')

    x_time = str(int(time.time()))
    # Checksum = MD5(api_key + timestamp + base64(param)).
    checksum = (api_key + x_time + param_b64str).encode('utf8')
    x_checksum = hashlib.md5(checksum).hexdigest()

    x_header = {
        'X-Appid': app_id,
        'X-CurTime': x_time,
        'X-Param': param_b64str,
        'X-CheckSum': x_checksum,
    }
    req = urllib.request.Request(url, data=body, headers=x_header)
    # Close the HTTP response deterministically once it has been read.
    with urllib.request.urlopen(req) as response:
        result = response.read().decode('utf8')

    reply = json.loads(result)
    # A missing "data" key is treated like an empty one: show the full reply
    # instead of failing with a KeyError.
    data = reply.get('data', "")
    if data != "":
        print(data)
    else:
        print(result)


if __name__ == '__main__':
    main(sys.argv)
Translated by gpt-3.5-turbo