How to Convert Video to Text

If the audio is in Mandarin, I found that XFyun's IFlySpeechRecognizer does a great job at this; it can even recognize some simple English words in mixed-language audio.

You don't need to train a machine learning model from scratch to do this.

The XFyun API has some limitations: the file size must be at most 2 MB, and the audio length must be at most 60 s.
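Note that test.sh below splits the video file into 2 MB pieces, so depending on the video bitrate the converted PCM chunks can still land over or under these limits. If you want a quick sanity check before uploading, here is a small Python sketch of my own (not part of the original workflow); it assumes the 16 kHz, 16-bit, mono PCM produced by the ffmpeg command below, which works out to 32,000 bytes per second, and it treats 2 MB as 2 × 1024 × 1024 bytes, which may not be exactly how XFyun counts it:

import os
import sys

MAX_BYTES = 2 * 1024 * 1024   # assumed interpretation of the 2 MB limit
MAX_SECONDS = 60              # XFyun limit: 60 s per request
BYTES_PER_SECOND = 16000 * 2  # 16 kHz * 16-bit (2 bytes) * 1 channel

for path in sys.argv[1:]:
    size = os.path.getsize(path)
    seconds = size / BYTES_PER_SECOND
    status = 'OK' if size <= MAX_BYTES and seconds <= MAX_SECONDS else 'over the limit'
    print(f"{path}: {size} bytes, ~{seconds:.1f} s, {status}")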

Usage: ./test.sh test.mp4

#!/bin/zsh
filename=$(echo $1 | cut -d. -f1)
type=$(echo $1 | cut -d. -f2)

# print $filename.$type

# split video <= 2MB
mp4box -splits 2000 $filename.$type

# convert video → audio pcm
for name in ${filename}_*.$type; do
    print "ffmpeg $name to pcm"
    ffmpeg -y -i $name -acodec pcm_s16le -f s16le -ac 1 -ar 16000 $name.pcm
    rm $name
done

# audio to text
for name in ${filename}_*.pcm; do
    print "xfyun.py $name"
    python3 xfyun.py $name
    rm $name
done

Now we can use XFyun to recognize the audio.

XFyun provides sample code for IFlySpeechRecognizer, but I couldn't execute their example as-is, so I made some modifications.

xfyun.py with Python3:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import base64
import json
import time
import hashlib
import urllib.request
import urllib.parse
import sys

def main(argv):
    # read the PCM file and base64-encode it for the request body
    with open(argv[1], 'rb') as f:
        file_content = f.read()
    base64_audio = base64.b64encode(file_content)
    body = urllib.parse.urlencode({'audio': base64_audio}).encode('utf8')

    url = 'http://api.xfyun.cn/v1/service/v1/iat'
    api_key = ''   # fill in your XFyun API key
    x_appid = ''   # fill in your XFyun app id

    param = {"engine_type": "sms16k", "aue": "raw"}

    param_str = json.dumps(param)
    param_utf8 = param_str.replace(' ', '').encode('utf8')
    param_b64 = base64.b64encode(param_utf8)
    param_b64str = param_b64.decode('utf8')

    # X-CheckSum is the MD5 of api_key + timestamp + base64-encoded params
    x_time = str(int(time.time()))
    checksum = (api_key + x_time + param_b64str).encode('utf8')
    x_checksum = hashlib.md5(checksum).hexdigest()
    x_header = {'X-Appid': x_appid,
                'X-CurTime': x_time,
                'X-Param': param_b64str,
                'X-CheckSum': x_checksum}

    req = urllib.request.Request(url, data=body, headers=x_header)
    response = urllib.request.urlopen(req)
    result = response.read().decode('utf8')
    data = json.loads(result)['data']
    print(data)
    return

if __name__ == '__main__':
    main(sys.argv)

More on Converting Video to Text

I tried XFyun, but the results weren't very recognizable, so I looked for other recognizers to test against it.

The good news is that they all accept the PCM format converted earlier, so I only need to make small changes to the shell script. When you want to compare results from different recognizers, simply run:

./test.sh xfyun.py|baidu.py|qq.py test.mp4|test.m4a

Honestly, to the naked eye, Google's result looks better.

test.sh

#!/bin/zsh

py=$1

filename=$(echo $2 | cut -d. -f1)
type=$(echo $2 | cut -d. -f2)

# print $filename.$type

# split video <= 2MB
mp4box -splits 2000 $filename.$type

# convert video → audio pcm
for name in ${filename}_*.$type; do
    print "ffmpeg $name to pcm"
    ffmpeg -y -i $name -acodec pcm_s16le -f s16le -ac 1 -ar 16000 $name.pcm
    rm $name
done

# audio to text with the chosen recognizer script
for name in ${filename}_*.pcm; do
    print "$py $name"
    python3 $py $name
    # rm $name
done

id_key.py

dic = {
    "xfyun": {
        "id": "",
        "key": ""
    },
    "baidu": {
        "id": "",
        "key": "",
        "secret": ""
    },
    "qq": {
        "id": "",
        "key": ""
    }
}
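id_key.py just keeps the credentials in one place. If you would rather not keep keys in a file that might get committed, a minimal alternative sketch (my own suggestion, with hypothetical environment variable names) reads them from the environment instead and keeps the same dic structure, so the other scripts don't change:

import os

dic = {
    "xfyun": {
        "id": os.environ.get("XFYUN_APP_ID", ""),
        "key": os.environ.get("XFYUN_API_KEY", "")
    },
    "baidu": {
        "id": os.environ.get("BAIDU_APP_ID", ""),
        "key": os.environ.get("BAIDU_API_KEY", ""),
        "secret": os.environ.get("BAIDU_SECRET_KEY", "")
    },
    "qq": {
        "id": os.environ.get("QQ_APP_ID", ""),
        "key": os.environ.get("QQ_APP_KEY", "")
    }
}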

google

They let you submit the file online, so I skipped writing a *.py for it.
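If you do want to run Google through test.sh like the others, here is a rough google.py sketch that I have not tested against this workflow; it assumes the google-cloud-speech client library with credentials set up via GOOGLE_APPLICATION_CREDENTIALS, and it reads a PCM chunk path just like the other scripts:

# google.py -- hypothetical sketch, assumes the google-cloud-speech package
import sys
from google.cloud import speech

def main(path):
    client = speech.SpeechClient()
    with open(path, 'rb') as f:
        content = f.read()
    # the PCM produced by test.sh: 16-bit linear, 16 kHz, Mandarin
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='zh-CN',
    )
    audio = speech.RecognitionAudio(content=content)
    response = client.recognize(config=config, audio=audio)
    for result in response.results:
        print(result.alternatives[0].transcript)

if __name__ == '__main__':
    main(sys.argv[1])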

baidu.py

from aip import AipSpeech
import sys

import id_key
app_id = id_key.dic['baidu']['id']
app_key = id_key.dic['baidu']['key']
secret = id_key.dic['baidu']['secret']

client = AipSpeech(app_id, app_key, secret)

print(sys.argv[1])

def get_file_content(filePath):
    with open(filePath, 'rb') as fp:
        return fp.read()

# dev_pid selects the language model (1537 = Mandarin with simple English)
res = client.asr(get_file_content(sys.argv[1]), 'pcm', 16000, {
    'dev_pid': 1537,
})

print(res)
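The script above prints the whole response dict. As far as I remember, the Baidu SDK returns the transcript in the result field when err_no is 0, so you could print just the text with something like this (treat the field names as an assumption):

if res.get('err_no') == 0:
    print(''.join(res.get('result', [])))
else:
    print(res)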

qq.py

# -*- coding: UTF-8 -*-
import hashlib
import time, random, base64, json
import urllib.parse
import requests
import sys

import id_key
app_id = id_key.dic['qq']['id']
app_key = id_key.dic['qq']['key']

api_url = 'https://api.ai.qq.com/fcgi-bin/aai/aai_asr'

def md5(string):
    md = hashlib.md5()
    md.update(string)
    return md.hexdigest().upper()

def urlencode(args):
    # sort keys and drop empty values before building the query string
    tuples = [(k, args[k]) for k in sorted(args.keys()) if args[k]]
    query_str = urllib.parse.urlencode(tuples)
    return query_str

def signify(args, app_key):
    # sign = uppercased MD5 of the sorted query string plus '&app_key=...'
    query_str = urlencode(args)
    query_str = query_str + '&app_key=' + app_key
    signiture = md5(query_str.encode('utf-8'))
    return signiture

def http_post(api_url, args):
    resp = requests.post(url=api_url, data=args)
    resp = json.loads(resp.text)
    return resp


class BaseASR(object):
    ext2idx = {'pcm': '1', 'wav': '2', 'amr': '3', 'slk': '4'}

    def __init__(self, api_url, app_id, app_key):
        self.api_url = api_url
        self.app_id = app_id
        self.app_key = app_key

    def stt(self, audio_file, ext, rate):
        raise Exception("Unimplemented!")


class BasicASR(BaseASR):
    """ Online ASR from Tencent
    https://ai.qq.com/doc/aaiasr.shtml
    """
    def __init__(self):
        super(BasicASR, self).__init__(api_url, app_id, app_key)

    def stt(self, audio_file, ext='pcm', rate=16000):
        if ext == 'pcm':
            # wf = wave.open(audio_file)
            # nf = wf.getnframes()
            # audio_data = wf.readframes(nf)
            with open(audio_file, 'rb') as f:
                audio_data = f.read()
        else:
            raise Exception("Unsupported audio file format!")

        args = {
            'app_id': self.app_id,
            'time_stamp': str(int(time.time())),
            'nonce_str': '%.x' % random.randint(1048576, 104857600),
            'format': self.ext2idx[ext],
            'rate': str(rate),
            'speech': base64.b64encode(audio_data),
        }

        signiture = signify(args, self.app_key)
        args['sign'] = signiture
        resp = http_post(self.api_url, args)
        print(resp)
        text = resp['data']['text']
        return text


asr_engine = BasicASR()
text = asr_engine.stt(sys.argv[1])
print(text)

xfyun.py

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import base64
import json
import time
import hashlib
import urllib.request
import urllib.parse
import sys

import id_key
app_id = id_key.dic['xfyun']['id']
api_key = id_key.dic['xfyun']['key']


url = 'http://api.xfyun.cn/v1/service/v1/iat'

def main(argv):
    # read the PCM file and base64-encode it for the request body
    with open(argv[1], 'rb') as f:
        file_content = f.read()
    base64_audio = base64.b64encode(file_content)
    body = urllib.parse.urlencode({'audio': base64_audio}).encode('utf8')

    param = {"engine_type": "sms16k", "aue": "raw"}

    param_str = json.dumps(param)
    param_utf8 = param_str.replace(' ', '').encode('utf8')
    param_b64 = base64.b64encode(param_utf8)
    param_b64str = param_b64.decode('utf8')

    # X-CheckSum is the MD5 of api_key + timestamp + base64-encoded params
    x_time = str(int(time.time()))
    checksum = (api_key + x_time + param_b64str).encode('utf8')
    x_checksum = hashlib.md5(checksum).hexdigest()
    x_header = {'X-Appid': app_id,
                'X-CurTime': x_time,
                'X-Param': param_b64str,
                'X-CheckSum': x_checksum}

    req = urllib.request.Request(url, data=body, headers=x_header)
    response = urllib.request.urlopen(req)
    result = response.read().decode('utf8')
    dic = json.loads(result)

    # print the recognized text if present, otherwise the raw response
    if dic['data'] != "":
        print(dic['data'])
    else:
        print(result)
    return

if __name__ == '__main__':
    main(sys.argv)

Translated by gpt-3.5-turbo