syntax = "proto3";
package yandex.cloud.ai.stt.v2;
import "google/api/annotations.proto";
import "google/protobuf/duration.proto";
import "yandex/cloud/api/operation.proto";
import "yandex/cloud/operation/operation.proto";
option go_package = "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v2;stt";
option java_package = "yandex.cloud.api.ai.stt.v2";
// Speech-to-Text service (v2) offering two recognition modes:
// asynchronous long-running recognition and bidirectional streaming.
service SttService {
// Starts asynchronous recognition of the audio in the request.
// Returns a long-running operation; per the yandex.cloud.api.operation
// annotation below, its response resolves to LongRunningRecognitionResponse.
rpc LongRunningRecognize (LongRunningRecognitionRequest) returns (operation.Operation) {
option (google.api.http) = { post: "/speech/stt/v2/longRunningRecognize" body: "*" };
option (yandex.cloud.api.operation) = {
response: "LongRunningRecognitionResponse"
};
}
// Bidirectional streaming recognition: the client streams
// StreamingRecognitionRequest messages (each carrying either a config or an
// audio chunk — see its oneof) and the server streams back recognition
// results as StreamingRecognitionResponse messages.
rpc StreamingRecognize (stream StreamingRecognitionRequest) returns (stream StreamingRecognitionResponse) {
}
}
// Request body for SttService.LongRunningRecognize.
message LongRunningRecognitionRequest {
// Recognition settings (encoding, sample rate, language, model, ...).
RecognitionConfig config = 1;
// The audio to transcribe, supplied inline or by URI (see RecognitionAudio).
RecognitionAudio audio = 2;
}
// Final result of a completed LongRunningRecognize operation
// (delivered via the operation's response field).
message LongRunningRecognitionResponse {
// Recognized fragments; each fragment carries its own alternatives and a
// channel_tag (see SpeechRecognitionResult).
repeated SpeechRecognitionResult chunks = 1;
}
// A single client-to-server message on the StreamingRecognize stream.
message StreamingRecognitionRequest {
// Exactly one of the following per message.
oneof streaming_request {
// Recognition settings. NOTE(review): presumably must be the first message
// of the stream, before any audio — confirm against server behavior.
RecognitionConfig config = 1;
// A chunk of raw audio data in the format declared by the config.
bytes audio_content = 2;
}
}
// A single server-to-client message on the StreamingRecognize stream.
message StreamingRecognitionResponse {
// Recognition hypotheses for the audio processed so far; each chunk carries
// final / end_of_utterance flags (see SpeechRecognitionChunk).
repeated SpeechRecognitionChunk chunks = 1;
// Field 2 ("end_of_single_utterance") was removed. Its number and name are
// reserved so they cannot be reused on the wire or in JSON/codegen.
reserved 2;
reserved "end_of_single_utterance";
}
// Audio payload for long-running recognition.
message RecognitionAudio {
// Exactly one source must be set.
oneof audio_source {
// Raw audio bytes sent inline with the request. NOTE(review): protobuf
// messages are a poor fit for very large blobs — confirm the service's
// size limit for inline content.
bytes content = 1;
// Location of the audio to fetch. NOTE(review): likely an Object Storage
// URI — the accepted schemes are not visible here; confirm.
string uri = 2;
}
}
// Recognition settings plus the cloud context they run under.
message RecognitionConfig {
// How to decode and transcribe the audio.
RecognitionSpec specification = 1;
// ID of the Yandex Cloud folder. NOTE(review): presumably the folder the
// request is authorized against and billed to — confirm in service docs.
string folder_id = 2;
}
// Parameters controlling how the audio is decoded and transcribed.
message RecognitionSpec {
// Codec/container format of the supplied audio.
// NOTE(review): values are not prefixed with the enum name (style guide
// suggests AUDIO_ENCODING_LINEAR16_PCM etc.), but renaming published enum
// values breaks generated code — leave as-is.
enum AudioEncoding {
AUDIO_ENCODING_UNSPECIFIED = 0;
// 16-bit signed little-endian (Linear PCM)
LINEAR16_PCM = 1;
// Opus audio in an OGG container.
OGG_OPUS = 2;
// transcription only
MP3 = 3;
}
// Field 6 was removed; number reserved to prevent wire-incompatible reuse.
reserved 6;
// Format of the audio; see AudioEncoding.
AudioEncoding audio_encoding = 1;
// Sample rate of the audio in hertz.
// 8000, 16000, 48000 only for pcm
int64 sample_rate_hertz = 2;
// Language of the audio; code in BCP-47 (e.g. "ru-RU").
string language_code = 3;
// If true, profanity is filtered from the transcription results.
bool profanity_filter = 4;
// Recognition model to use. NOTE(review): the set of valid model names is
// not visible in this file — see the service documentation.
string model = 5;
// If set true, tentative hypotheses may be returned as they become available (final=false flag)
// If false or omitted, only final=true result(s) are returned.
// Makes sense only for StreamingRecognize requests.
bool partial_results = 7;
// NOTE(review): presumably ends recognition after the first detected
// utterance (streaming only) — inferred from the name, confirm.
bool single_utterance = 8;
// Number of channels in the audio. Used only for long running recognize.
int64 audio_channel_count = 9;
// If true, disables text normalization of the recognized output
// (raw recognizer output is returned instead).
bool raw_results = 10;
// Rewrite text in literature style (default: false)
bool literature_text = 11;
}
// One streaming recognition update for a stretch of audio.
message SpeechRecognitionChunk {
// Candidate transcriptions. NOTE(review): presumably ordered by decreasing
// likelihood — confirm.
repeated SpeechRecognitionAlternative alternatives = 1;
// This flag shows that the received chunk contains a part of the recognized text that won't be changed.
// NOTE(review): "final" is a reserved word in Java and C++; generated
// accessors get mangled. The name is frozen by the published API.
bool final = 2;
// This flag shows that the received chunk is the end of an utterance.
bool end_of_utterance = 3;
}
// One recognized fragment of a long-running recognition result.
message SpeechRecognitionResult {
// Candidate transcriptions for this fragment.
repeated SpeechRecognitionAlternative alternatives = 1;
// Audio channel this fragment was recognized on
// (relevant when RecognitionSpec.audio_channel_count > 1).
int64 channel_tag = 2;
}
// A single transcription hypothesis.
message SpeechRecognitionAlternative {
// The recognized text.
string text = 1;
// Recognition confidence. NOTE(review): range/meaning not specified in this
// file — typically 0..1, confirm against service docs.
float confidence = 2;
// Per-word timing and confidence for this hypothesis (see WordInfo).
repeated WordInfo words = 3;
}
// Timing and confidence for a single recognized word.
message WordInfo {
// Time offset at which the word starts. NOTE(review): reference point
// (start of audio vs. start of utterance) not stated here — confirm.
google.protobuf.Duration start_time = 1;
// Time offset at which the word ends.
google.protobuf.Duration end_time = 2;
// The recognized word itself.
string word = 3;
// Per-word recognition confidence (see note on
// SpeechRecognitionAlternative.confidence regarding the range).
float confidence = 4;
}