How to Convert Video to Text

Posted on 2018-07-22 Edited on 2023-09-29 Disqus: Word count in article: 7.7k Reading time ≈ 7 mins.

If the audio is in Mandarin, I found that XFyun FlySpeechRecognizer does a great job at this: it can even recognize some simple English words in mixed-language audio.

You don’t need to train a machine learning model from scratch to do this.

The XFyun API has some limitations: the file must be no larger than 2 MB, and the audio must be no longer than 60 seconds.

Usage: ./test.sh test.mp4

#!/bin/zsh
# Usage: ./test.sh <video file>
#
# Splits the video into chunks, converts every chunk to raw PCM audio, and
# runs xfyun.py on each PCM file. Chunk and PCM files are deleted once used.

if [[ -z $1 ]]; then
  print -u2 "Usage: $0 <video file>"
  exit 1
fi

# Split the argument on its LAST dot. Cutting on the first dot (as
# `cut -d. -f1` does) breaks names such as ./test.mp4 or my.talk.mp4.
filename=${1:r}
type=${1:e}

# print $filename.$type

# split video <= 2MB (NOTE(review): -splits is presumably in KB — confirm)
mp4box -splits 2000 $filename.$type

# video → audio pcm (signed 16-bit little-endian, mono, 16 kHz)
for name in ${filename}_*.$type; do
  print "ffmpeg $name to pcm"
  ffmpeg -y  -i $name -acodec pcm_s16le -f s16le -ac 1 -ar 16000 $name.pcm
  rm $name
done

# audio to text
for name in ${filename}_*.pcm; do
  print "xfyun.py $name"
  python3 xfyun.py $name
  rm $name
done

Now we can use XFyun to recognize the audio.

Here is their documentation for XFyun IFlySpeechRecognizer. I couldn’t run their example code as-is, so I made some modifications.

xfyun.py with Python3:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import base64
import json
import time
import hashlib
import urllib.request
import urllib.parse
import sys

def main(argv):
    """Post the audio file named by ``argv[1]`` to XFyun and print the result.

    The file is base64-encoded into the form body, the request is signed
    with an MD5 checksum of ``api_key + timestamp + X-Param``, and the
    ``data`` field of the JSON reply is printed.

    Args:
        argv: Command-line argument list; ``argv[1]`` is the audio file path.

    Raises:
        KeyError: If the JSON reply has no ``data`` field.
    """
    # Context manager closes the handle even if the read fails.
    with open(argv[1], 'rb') as f:
        file_content = f.read()
    base64_audio = base64.b64encode(file_content)
    body = urllib.parse.urlencode({'audio': base64_audio}).encode('utf8')

    url = 'http://api.xfyun.cn/v1/service/v1/iat'
    api_key = ''
    x_appid = ''

    param = {"engine_type": "sms16k", "aue": "raw"}

    # X-Param is the space-free JSON of ``param``, base64-encoded; the same
    # string is part of the checksum input below.
    param_str = json.dumps(param)
    param_utf8 = param_str.replace(' ', '').encode('utf8')
    param_b64 = base64.b64encode(param_utf8)
    param_b64str = param_b64.decode('utf8')

    x_time = str(int(time.time()))
    checksum = (api_key + x_time + param_b64str).encode('utf8')
    x_checksum = hashlib.md5(checksum).hexdigest()
    x_header = {'X-Appid': x_appid,
                'X-CurTime': x_time,
                'X-Param': param_b64str,
                'X-CheckSum': x_checksum}

    req = urllib.request.Request(url, data=body, headers=x_header)
    # Close the HTTP response deterministically once the body is read.
    with urllib.request.urlopen(req) as response:
        result = response.read().decode('utf8')
    data = json.loads(result)['data']
    print(data)
    return


if __name__ == '__main__':
    main(sys.argv)

id_key.py

# Credentials for each speech-recognition provider, keyed by provider name.
# Replace the empty strings with the values issued by each service.
dic = dict(
    xfyun=dict(id="", key=""),
    baidu=dict(id="", key="", secret=""),
    qq=dict(id="", key=""),
)

google

With Google, you can submit the file online, so I skipped writing a *.py script 😝

baidu.py

from aip import AipSpeech
import sys

import id_key
# Baidu credentials are read from the shared id_key module.
_baidu = id_key.dic['baidu']
app_id, app_key, secret = _baidu['id'], _baidu['key'], _baidu['secret']

# Speech client built from those credentials.
client = AipSpeech(app_id, app_key, secret)

# Echo the audio path given on the command line.
print(sys.argv[1])

def get_file_content(filePath):
    """Return the raw bytes stored in the file at ``filePath``."""
    with open(filePath, mode='rb') as audio_file:
        payload = audio_file.read()
    return payload

# Read the audio named on the command line and submit it as 16000 Hz 'pcm'.
# NOTE(review): dev_pid 1537 presumably selects Baidu's Mandarin model —
# confirm against the Baidu ASR documentation.
audio_bytes = get_file_content(sys.argv[1])
res = client.asr(audio_bytes, 'pcm', 16000, {'dev_pid': 1537})

print(res)

qq.py

# -*- coding: UTF-8 -*-
import hashlib
import time, random, base64, json

import id_key
# Tencent AI credentials are read from the shared id_key module.
_qq = id_key.dic['qq']
app_id, app_key = _qq['id'], _qq['key']

# Endpoint that recognition requests are posted to.
api_url = 'https://api.ai.qq.com/fcgi-bin/aai/aai_asr'

def md5(string):
    """Return the upper-case hexadecimal MD5 digest of ``string``.

    ``string`` is passed straight to ``hashlib``, so it must be bytes-like.
    """
    return hashlib.md5(string).hexdigest().upper()

def signify(args, app_key):
    """Compute the request signature for ``args``.

    The parameters are serialized with ``urlencode``, the text
    ``&app_key=<app_key>`` is appended, and the MD5 digest of the UTF-8
    encoded result (as produced by ``md5``) is returned.
    """
    to_sign = urlencode(args) + '&app_key=' + app_key
    return md5(to_sign.encode('utf-8'))

import urllib

def urlencode(args):
    """Serialize ``args`` as a URL query string with keys in sorted order.

    Entries whose value is falsy (empty string, ``None``, ``0`` ...) are
    left out of the result.

    Args:
        args: Mapping of parameter names to values.

    Returns:
        The percent-encoded ``key=value`` pairs joined by ``&``.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib import parse as url_parse

    pairs = [(key, args[key]) for key in sorted(args) if args[key]]
    return url_parse.urlencode(pairs)


import requests
def http_post(api_url, args):
    """POST ``args`` as form data to ``api_url`` and return the parsed JSON body."""
    reply = requests.post(url=api_url, data=args)
    return json.loads(reply.text)



class BaseASR(object):
    """Base class that stores the endpoint and credentials of an ASR backend.

    Subclasses are expected to override ``stt``.
    """

    # Maps an audio file extension to its format code string.
    ext2idx = {'pcm': '1', 'wav': '2', 'amr': '3', 'slk': '4'}

    def __init__(self, api_url, app_id, app_key):
        """Remember the service URL and the application id/key."""
        self.api_url = api_url
        self.app_id = app_id
        self.app_key = app_key

    def stt(self, audio_file, ext, rate):
        """Speech-to-text entry point; must be implemented by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        # Previously ``Exceptin`` (a typo), which raised NameError instead of
        # the intended "Unimplemented!" error.
        raise NotImplementedError("Unimplemented!")

class BasicASR(BaseASR):
    """ Online ASR from Tencent
    https://ai.qq.com/doc/aaiasr.shtml
    """
    def __init__(self):
        """Configure the engine from the module-level URL and credentials."""
        super(BasicASR, self).__init__(api_url, app_id, app_key)

    def stt(self, audio_file, ext='pcm', rate=16000):
        """Send ``audio_file`` for recognition and return the recognized text.

        Args:
            audio_file: Path of the audio file to upload.
            ext: Audio format; only ``'pcm'`` is accepted.
            rate: Sample rate, sent as a string in the request.

        Returns:
            The ``data.text`` field of the reply, encoded as UTF-8 bytes.

        Raises:
            Exception: If ``ext`` is anything other than ``'pcm'``.
        """
        if ext != 'pcm':
            raise Exception("Unsupport audio file format!")

        # Context manager closes the handle even if the read fails.
        with open(audio_file, 'rb') as f:
            audio_data = f.read()

        args = {
            'app_id': self.app_id,
            'time_stamp': str(int(time.time())),
            # Random hexadecimal string used as the request nonce.
            'nonce_str': '%.x' % random.randint(1048576, 104857600),
            'format': self.ext2idx[ext],
            'rate': str(rate),
            'speech': base64.b64encode(audio_data),
        }

        # The signature is computed over every other field, then added.
        args['sign'] = signify(args, self.app_key)
        resp = http_post(self.api_url, args)
        print(resp)
        return resp['data']['text'].encode('utf-8')

import sys

# Recognize the audio file named on the command line.
asr_engine = BasicASR()
text = asr_engine.stt(sys.argv[1])
# stt() returns UTF-8 bytes; decode them so the text prints as readable
# characters rather than as a b'\x..' literal.
print(text.decode('utf-8') if isinstance(text, bytes) else text)

xfyun.py

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import base64
import json
import time
import hashlib
import urllib.request
import urllib.parse
import sys

import id_key
# XFyun credentials are read from the shared id_key module.
_xfyun = id_key.dic['xfyun']
app_id, api_key = _xfyun['id'], _xfyun['key']

# Endpoint that recognition requests are posted to.
url = 'http://api.xfyun.cn/v1/service/v1/iat'

def main(argv):
    """Post the audio file named by ``argv[1]`` to XFyun and print the result.

    The file is base64-encoded into the form body and the request is signed
    with an MD5 checksum of ``api_key + timestamp + X-Param``. The ``data``
    field of the JSON reply is printed; if it is missing or empty, the raw
    reply is printed instead.

    Args:
        argv: Command-line argument list; ``argv[1]`` is the audio file path.
    """
    # Context manager closes the handle even if the read fails.
    with open(argv[1], 'rb') as f:
        file_content = f.read()
    base64_audio = base64.b64encode(file_content)
    body = urllib.parse.urlencode({'audio': base64_audio}).encode('utf8')

    param = {"engine_type": "sms16k", "aue": "raw"}

    # X-Param is the space-free JSON of ``param``, base64-encoded; the same
    # string is part of the checksum input below.
    param_str = json.dumps(param)
    param_utf8 = param_str.replace(' ', '').encode('utf8')
    param_b64 = base64.b64encode(param_utf8)
    param_b64str = param_b64.decode('utf8')

    x_time = str(int(time.time()))
    checksum = (api_key + x_time + param_b64str).encode('utf8')
    x_checksum = hashlib.md5(checksum).hexdigest()
    x_header = {'X-Appid': app_id,
                'X-CurTime': x_time,
                'X-Param': param_b64str,
                'X-CheckSum': x_checksum}

    req = urllib.request.Request(url, data=body, headers=x_header)
    # Close the HTTP response deterministically once the body is read.
    with urllib.request.urlopen(req) as response:
        result = response.read().decode('utf8')
    dic = json.loads(result)

    # A reply without a ``data`` key is treated like an empty one, so the
    # raw reply is shown instead of crashing with KeyError.
    if dic.get('data', "") != "":
        print(dic['data'])
    else:
        print(result)
    return


if __name__ == '__main__':
    main(sys.argv)

Translated by gpt-3.5-turbo

More on Converting Video to Text

id_key.py

google

baidu.py

qq.py

xfyun.py