如何将视频转换为文本

如果音频是普通话,我发现讯飞(XFyun)的 IFlySpeechRecognizer 在这件事上做得很好,它甚至可以识别混合语言音频中的一些简单英语单词。

不需要从一开始就训练机器学习模型来做这样的事情。

XFyun API 有一些限制,大小应小于等于 2MB,音频长度小于等于 60s。

Usage ./test.sh test.mp4

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#!/bin/zsh
# Usage: ./test.sh input.mp4
# Split the input video into <=2MB chunks (XFyun upload limit), convert
# each chunk to 16 kHz mono signed-16-bit PCM, then send every PCM file
# to the XFyun recognition script.

# Parameter expansion instead of `cut` so a name with more than one dot
# (e.g. my.video.mp4) keeps its full base name.
filename=${1%.*}
type=${1##*.}

# print $filename.$type

# split video into chunks <= 2MB
mp4box -splits 2000 "$filename.$type"

# convert each video chunk to raw PCM (16 kHz, mono, s16le)
for name in ${filename}_*.$type; do
    print "ffmpeg $name to pcm"
    ffmpeg -y -i "$name" -acodec pcm_s16le -f s16le -ac 1 -ar 16000 "$name.pcm"
    rm "$name"
done

# audio to text
for name in ${filename}_*.pcm; do
    print "xfyun.py $name"
    python3 xfyun.py "$name"
    rm "$name"
done

现在我们可以使用 XFyun 识别音频。

这是他们的 XFyun IFlySpeechRecognizer 官方文档,但其示例代码无法直接执行,因此我进行了一些修复。

xfyun.py with Python3:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import base64
import json
import time
import hashlib
import urllib.request
import urllib.parse
import sys


def main(argv):
    """Send a raw PCM audio file (argv[1]) to the XFyun IAT REST API
    and print the recognized text.

    The request is authenticated with an MD5 checksum built from
    api_key + current unix time + base64-encoded params, carried in
    the X-CheckSum header.
    """
    # Read the audio and base64-encode it into the form body.
    with open(argv[1], 'rb') as f:
        file_content = f.read()
    base64_audio = base64.b64encode(file_content)
    body = urllib.parse.urlencode({'audio': base64_audio}).encode('utf8')

    url = 'http://api.xfyun.cn/v1/service/v1/iat'
    api_key = ''  # fill in your XFyun API key
    x_appid = ''  # fill in your XFyun app id

    # sms16k engine with aue=raw: 16 kHz uncompressed PCM input.
    param = {"engine_type": "sms16k", "aue": "raw"}

    param_str = json.dumps(param)
    param_utf8 = param_str.replace(' ', '').encode('utf8')
    param_b64 = base64.b64encode(param_utf8)
    param_b64str = param_b64.decode('utf8')

    # X-CheckSum = md5(api_key + timestamp + base64 params).
    x_time = str(int(time.time()))
    checksum = (api_key + x_time + param_b64str).encode('utf8')
    x_checksum = hashlib.md5(checksum).hexdigest()
    x_header = {'X-Appid': x_appid,
                'X-CurTime': x_time,
                'X-Param': param_b64str,
                'X-CheckSum': x_checksum}

    req = urllib.request.Request(url, data=body, headers=x_header)
    response = urllib.request.urlopen(req)
    result = response.read().decode('utf8')
    data = json.loads(result)['data']
    print(data)


if __name__ == '__main__':
    main(sys.argv)

更多如何将视频转换为文本

我尝试了 XFyun,但是它的识别结果并不理想,因此我正在寻找其他识别器来做一些对比测试。

好消息是它们都支持早先转换的PCM类型,因此我需要对 shell 进行一些更改。当您想比较来自不同识别器的结果时,只需运行:

./test.sh {xfyun.py|baidu.py|qq.py} {test.mp4|test.m4a}

老实说,用人眼一看,Google 的结果看起来更好。

test.sh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#!/bin/zsh
# Usage: ./test.sh <recognizer.py> <input.mp4|input.m4a>
# Split the input into <=2MB chunks, convert each to 16 kHz mono PCM,
# then feed every PCM chunk to the chosen recognizer script.

py=$1

# Parameter expansion instead of `cut` so a name with more than one dot
# keeps its full base name.
filename=${2%.*}
type=${2##*.}

# print $filename.$type

# split video into chunks <= 2MB (original comment said 1MB, but
# `-splits 2000` means 2000 KB)
mp4box -splits 2000 "$filename.$type"

# convert each chunk to raw PCM (16 kHz, mono, s16le)
for name in ${filename}_*.$type; do
    print "ffmpeg $name to pcm"
    ffmpeg -y -i "$name" -acodec pcm_s16le -f s16le -ac 1 -ar 16000 "$name.pcm"
    rm "$name"
done

# audio to text; the .pcm files are kept so several recognizers can be
# compared on the same input
for name in ${filename}_*.pcm; do
    print "$py $name"
    python3 "$py" "$name"
    # rm $name
done

id_key.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Central store for all speech-API credentials.
# Fill in the id/key (and, for Baidu, the secret) from each provider's
# developer console before running the recognizer scripts.
dic = {
    "xfyun": {
        "id": "",
        "key": "",
    },
    "baidu": {
        "id": "",
        "key": "",
        "secret": "",
    },
    "qq": {
        "id": "",
        "key": "",
    },
}

google

他们在线上提交了文件,所以我跳过写了 *.py 😝

baidu.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Baidu ASR: send one PCM file (sys.argv[1]) to Baidu's speech service
# and print the raw response dict.
from aip import AipSpeech
import sys

import id_key

# Credentials come from the shared id_key config module.
app_id = id_key.dic['baidu']['id']
app_key = id_key.dic['baidu']['key']
secret = id_key.dic['baidu']['secret']

client = AipSpeech(app_id, app_key, secret)

print(sys.argv[1])


def get_file_content(filePath):
    """Return the raw bytes of *filePath*."""
    with open(filePath, 'rb') as fp:
        return fp.read()


# dev_pid 1537: Mandarin model per Baidu ASR docs — confirm against
# current API documentation.
res = client.asr(get_file_content(sys.argv[1]), 'pcm', 16000, {
    'dev_pid': 1537,
})

print(res)

qq.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# -*- coding: UTF-8 -*-
import hashlib
import time, random, base64, json

import id_key
# Tencent AI credentials, loaded from the shared id_key config module.
app_id = id_key.dic['qq']['id']
app_key = id_key.dic['qq']['key']

# Endpoint of Tencent AI's speech recognition ("aai_asr") service.
api_url='https://api.ai.qq.com/fcgi-bin/aai/aai_asr'

def md5(string):
    """Return the uppercase hex MD5 digest of *string* (bytes)."""
    md = hashlib.md5()
    md.update(string)
    # Return directly instead of rebinding a local named `md5`,
    # which shadowed this function's own name.
    return md.hexdigest().upper()

def signify(args, app_key):
    """Build the Tencent AI request signature.

    The signature is the uppercase MD5 of the sorted, url-encoded args
    with '&app_key=<key>' appended, as required by the API.
    """
    query_str = urlencode(args)
    query_str = query_str + '&app_key=' + app_key
    return md5(query_str.encode('utf-8'))

import urllib

def urlencode(args):
    """URL-encode *args* with keys sorted and falsy values dropped.

    Sorting and filtering match what Tencent AI's signature algorithm
    expects before hashing.
    """
    tuples = [(k, args[k]) for k in sorted(args.keys()) if args[k]]
    return urllib.parse.urlencode(tuples)


import requests
def http_post(api_url, args):
    """POST *args* as form data to *api_url* and return the parsed JSON."""
    resp = requests.post(url=api_url, data=args)
    return json.loads(resp.text)



class BaseASR(object):
    """Common base for the online ASR clients.

    ext2idx maps an audio file extension to the numeric format index
    used by the Tencent AI API.
    """
    ext2idx = {'pcm': '1', 'wav': '2', 'amr': '3', 'slk': '4'}

    def __init__(self, api_url, app_id, app_key):
        self.api_url = api_url
        self.app_id = app_id
        self.app_key = app_key

    def stt(self, audio_file, ext, rate):
        """Speech-to-text; subclasses must override.

        The original raised a NameError via the misspelled `Exceptin`;
        NotImplementedError expresses the intent.
        """
        raise NotImplementedError("Unimplemented!")

class BasicASR(BaseASR):
    """ Online ASR from Tencent
    https://ai.qq.com/doc/aaiasr.shtml
    """

    def __init__(self):
        # api_url / app_id / app_key are module-level config values.
        super(BasicASR, self).__init__(api_url, app_id, app_key)

    def stt(self, audio_file, ext='pcm', rate=16000):
        """Recognize *audio_file* (raw PCM) and return utf-8 encoded text.

        Raises Exception for any extension other than 'pcm'; a KeyError
        surfaces if the API response lacks data/text.
        """
        if ext == 'pcm':
            # `with` guarantees the file is closed (the original leaked
            # the handle on exceptions).
            with open(audio_file, 'rb') as f:
                audio_data = f.read()
        else:
            raise Exception("Unsupport audio file format!")

        args = {
            'app_id': self.app_id,
            'time_stamp': str(int(time.time())),
            # random hex nonce required by the API
            'nonce_str': '%.x' % random.randint(1048576, 104857600),
            'format': self.ext2idx[ext],
            'rate': str(rate),
            'speech': base64.b64encode(audio_data),
        }

        # Sign the sorted query string with the app key.
        signiture = signify(args, self.app_key)
        args['sign'] = signiture
        resp = http_post(self.api_url, args)
        print(resp)
        text = resp['data']['text'].encode('utf-8')

        return text

import sys

# Script entry: recognize the PCM file named on the command line and
# print the recognized text.
# NOTE(review): this runs at import time — there is no __main__ guard.
asr_engine = BasicASR()
text = asr_engine.stt(sys.argv[1])
print(text)

xfyun.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import base64
import json
import time
import hashlib
import urllib.request
import urllib.parse
import sys

import id_key

# XFyun credentials from the shared id_key config module.
app_id = id_key.dic['xfyun']['id']
api_key = id_key.dic['xfyun']['key']

# XFyun IAT (speech dictation) REST endpoint.
url = 'http://api.xfyun.cn/v1/service/v1/iat'


def main(argv):
    """Send the PCM file argv[1] to XFyun IAT and print the result.

    Prints the recognized text when the response's 'data' field is
    non-empty; otherwise prints the whole response so API errors are
    visible instead of a blank line.
    """
    # Read the audio and base64-encode it into the form body.
    with open(argv[1], 'rb') as f:
        file_content = f.read()
    base64_audio = base64.b64encode(file_content)
    body = urllib.parse.urlencode({'audio': base64_audio}).encode('utf8')

    # sms16k engine with aue=raw: 16 kHz uncompressed PCM input.
    param = {"engine_type": "sms16k", "aue": "raw"}

    param_str = json.dumps(param)
    param_utf8 = param_str.replace(' ', '').encode('utf8')
    param_b64 = base64.b64encode(param_utf8)
    param_b64str = param_b64.decode('utf8')

    # X-CheckSum = md5(api_key + timestamp + base64 params).
    x_time = str(int(time.time()))
    checksum = (api_key + x_time + param_b64str).encode('utf8')
    x_checksum = hashlib.md5(checksum).hexdigest()
    x_header = {'X-Appid': app_id,
                'X-CurTime': x_time,
                'X-Param': param_b64str,
                'X-CheckSum': x_checksum}

    req = urllib.request.Request(url, data=body, headers=x_header)
    response = urllib.request.urlopen(req)
    result = response.read().decode('utf8')
    dic = json.loads(result)

    if dic['data'] != "":
        print(dic['data'])
    else:
        print(result)


if __name__ == '__main__':
    main(sys.argv)