Audio File to Text via SFSpeechRecognizer

It turned out to be much easier than I imagined: it's really just like calling an API 😄😄😄

import Speech

Info.plist

  • Privacy - Microphone Usage Description
  • Privacy - Speech Recognition Usage Description
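In the raw plist these correspond to the NSMicrophoneUsageDescription and NSSpeechRecognitionUsageDescription keys; the description strings below are placeholders to replace with your own wording (the microphone entry only matters if you also record live audio):

<key>NSSpeechRecognitionUsageDescription</key>
<string>Transcribes your audio files to text.</string>
<key>NSMicrophoneUsageDescription</key>
<string>Records audio for live transcription.</string>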

Authorization

SFSpeechRecognizer.requestAuthorization { authStatus in
    // The callback can arrive on a background queue, so hop back to main.
    OperationQueue.main.addOperation { [weak self] in
        switch authStatus {
        case .notDetermined, .denied, .restricted:
            print("speech recognition not authorized")
        case .authorized:
            self?.startRecognizeFile() // your own entry point; a full wiring sketch is at the end of the post
        @unknown default:
            break
        }
    }
}

SelectAudio

import UIKit

protocol ShareText {

}

extension ShareText where Self: UIViewController {
    func shareText(_ title: String, text: String) {
        // Set up the activity view controller.
        // Note: `title` is currently unused by this implementation.
        let textToShare = [text]
        let activityViewController = UIActivityViewController(activityItems: textToShare, applicationActivities: nil)
        activityViewController.popoverPresentationController?.sourceView = self.view // so that iPads won't crash

        // Exclude some activity types from the list (optional).
        activityViewController.excludedActivityTypes = [.airDrop, .message, .copyToPasteboard]

        // Present the view controller.
        self.present(activityViewController, animated: true, completion: nil)
    }
}
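Calling it is then a one-liner from any view controller that adopts the protocol; since `title` is unused by the implementation above, the first argument is cosmetic:

// Inside a UIViewController that conforms to ShareText
// ("recognized text…" stands in for a real transcript):
shareText("Transcript", text: "recognized text…")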
import UIKit

protocol SelectAudio {

}

extension SelectAudio where Self: UIViewController {
    func selectAudio() {
        // UTI strings: movies are included because their audio track can be transcribed too.
        let types: [String] = ["public.movie", "public.audio"]
        // Note: this initializer is deprecated since iOS 14; see the UTType-based sketch below.
        let documentPicker = UIDocumentPickerViewController(documentTypes: types, in: .import)
        documentPicker.delegate = self as? UIDocumentPickerDelegate
        documentPicker.modalPresentationStyle = .formSheet
        self.present(documentPicker, animated: true, completion: nil)
    }
}
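On iOS 14 and later the string-based initializer used above is deprecated. A minimal sketch of the UTType-based replacement, with `asCopy: true` matching the `.import` mode (the helper name is mine, not from the original post):

import UIKit
import UniformTypeIdentifiers

@available(iOS 14.0, *)
func makeAudioPicker() -> UIDocumentPickerViewController {
    // UTType.movie and UTType.audio mirror "public.movie" and "public.audio".
    let picker = UIDocumentPickerViewController(forOpeningContentTypes: [.movie, .audio], asCopy: true)
    picker.modalPresentationStyle = .formSheet
    return picker
}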

RecognizeFile



import AVFoundation
import Speech

protocol RecognizeFile {

}

extension RecognizeFile {

    /// Splits the file into one-minute chunks, recognizes them one at a time,
    /// and calls back repeatedly with (isFinal, isLastChunk, recognizedText).
    func recognizeFile(_ url: URL, completion: @escaping (Bool, Bool, String) -> Void) {

        self.divide(url) { urls in
            // A semaphore with value 1 serializes the chunks so that only one
            // recognition task runs at a time.
            let s = DispatchSemaphore(value: 1)

            urls.forEach { url in
                DispatchQueue.global().async {
                    _ = s.wait(timeout: DispatchTime.distantFuture)
                    print(url)
                    self.recognizeUrl(url) { isFinal, str in
                        completion(isFinal, urls.last == url, str)
                        if isFinal {
                            print(str)
                            s.signal() // let the next chunk start
                        }
                    }
                }
            }
        }
    }

    private func recognizeUrl(_ url: URL, completion: @escaping (Bool, String) -> Void) {
        guard let myRecognizer = SFSpeechRecognizer(locale: Locale(identifier: "zh_CN")) else {
            // A recognizer is not supported for the current locale. Report an
            // empty final result so the caller's semaphore is not left waiting.
            OperationQueue.main.addOperation { completion(true, "") }
            return
        }

        if !myRecognizer.isAvailable {
            // The recognizer is not available right now (e.g. no network).
            OperationQueue.main.addOperation { completion(true, "") }
            return
        }

        let request = SFSpeechURLRecognitionRequest(url: url)
        // Partial results are reported by default; set this to false if you
        // only want the final transcription per chunk.
        // request.shouldReportPartialResults = true
        myRecognizer.recognitionTask(with: request) { result, error in
            guard let result = result else {
                // Recognition failed; check `error` for details. Report it as
                // final so the chunk queue keeps moving.
                print("err: ", error.debugDescription)
                OperationQueue.main.addOperation { completion(true, "") }
                return
            }

            // Hand back the text recognized so far.
            let str = result.bestTranscription.formattedString
            OperationQueue.main.addOperation {
                completion(result.isFinal, str)
            }
        }
    }

    /// Cuts the asset into one-minute segments. SFSpeechRecognizer has
    /// historically limited a single request to roughly one minute of audio,
    /// which is why longer files are chunked here.
    func divide(_ url: URL, completion: @escaping ([URL]) -> Void) {
        let asset = AVURLAsset(url: url)

        // One chunk per started minute of audio.
        let n = Int(CMTimeGetSeconds(asset.duration) / 60)
        var urls = [URL?](repeating: nil, count: n + 1)

        let group = DispatchGroup()

        for i in 0...n {
            group.enter()
            exportAsset(asset, start: i) { url in
                urls[i] = url
                group.leave()
            }
        }

        group.notify(queue: .main) {
            print("divide complete")
            // Drop the chunks that failed to export.
            completion(urls.compactMap { $0 })
        }
    }

    func exportAsset(_ asset: AVAsset, start: Int, completion: @escaping (URL?) -> Void) {
        print("\(#function)")

        let documentsDirectory = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
        let tmpFolder = documentsDirectory.appendingPathComponent("tmp/")
        try? FileManager.default.createDirectory(at: tmpFolder, withIntermediateDirectories: true, attributes: nil)
        let trimmedSoundFileURL = tmpFolder.appendingPathComponent("\(start).m4a")
        print("saving to \(trimmedSoundFileURL.absoluteString)")

        // Remove any leftover file from a previous run.
        if FileManager.default.fileExists(atPath: trimmedSoundFileURL.path) {
            print("sound exists, removing \(trimmedSoundFileURL.absoluteString)")
            do {
                if try trimmedSoundFileURL.checkResourceIsReachable() {
                    print("is reachable")
                }
                try FileManager.default.removeItem(atPath: trimmedSoundFileURL.path)
            } catch {
                print("could not remove \(trimmedSoundFileURL)")
                print(error.localizedDescription)
            }
        }

        print("creating export session for \(asset)")

        if let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetAppleM4A) {
            exporter.outputFileType = AVFileType.m4a
            exporter.outputURL = trimmedSoundFileURL

            let duration = CMTimeGetSeconds(asset.duration)
            if duration < 5.0 {
                print("sound is not long enough")
                // Still report back, otherwise the caller's dispatch group never finishes.
                completion(nil)
                return
            }
            // Export the one-minute window starting at minute `start`.
            let startTime = CMTime(seconds: Double(start * 60), preferredTimescale: 1)
            let stopTime = CMTime(seconds: Double(start + 1) * 60, preferredTimescale: 1)
            exporter.timeRange = CMTimeRangeFromTimeToTime(start: startTime, end: stopTime)

            exporter.exportAsynchronously {
                print("export finished: \(exporter.status.rawValue)")

                switch exporter.status {
                case .failed, .cancelled:
                    print("export \(String(describing: exporter.error))")
                    completion(nil)
                default:
                    completion(trimmedSoundFileURL)
                }
            }
        } else {
            print("cannot create AVAssetExportSession for asset \(asset)")
            completion(nil)
        }
    }
}
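For completeness, here is one way the pieces could be wired together. This is a sketch, not part of the original post: the class name, the transcript accumulation, and the decision to share on the last chunk are all assumptions.

import UIKit
import Speech

class TranscribeViewController: UIViewController, ShareText, SelectAudio, RecognizeFile, UIDocumentPickerDelegate {

    private var transcript = ""

    // The entry point referenced by the authorization callback above.
    func startRecognizeFile() {
        selectAudio()
    }

    // Called by the picker presented in selectAudio().
    func documentPicker(_ controller: UIDocumentPickerViewController, didPickDocumentsAt urls: [URL]) {
        guard let url = urls.first else { return }
        transcript = ""
        recognizeFile(url) { [weak self] isFinal, isLastChunk, text in
            guard let self = self, isFinal else { return } // ignore partial results
            self.transcript += text
            if isLastChunk {
                self.shareText("Transcript", text: self.transcript)
            }
        }
    }
}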


Translated by gpt-3.5-turbo