contrib/libs/yandex-cloud-api-protos/yandex/cloud/ai/stt/v2/stt_service.proto


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126

syntax = "proto3";

package yandex.cloud.ai.stt.v2;

import "google/api/annotations.proto";
import "google/protobuf/duration.proto";
import "yandex/cloud/api/operation.proto";
import "yandex/cloud/operation/operation.proto";

option go_package = "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v2;stt";
option java_package = "yandex.cloud.api.ai.stt.v2";

service SttService {
  rpc LongRunningRecognize (LongRunningRecognitionRequest) returns (operation.Operation) {
    option (google.api.http) = { post: "/speech/stt/v2/longRunningRecognize" body: "*" };
    option (yandex.cloud.api.operation) = {
      response: "LongRunningRecognitionResponse"
    };
  }

  rpc StreamingRecognize (stream StreamingRecognitionRequest) returns (stream StreamingRecognitionResponse) {
  }
}

message LongRunningRecognitionRequest {
  RecognitionConfig config = 1;
  RecognitionAudio audio = 2;
}

message LongRunningRecognitionResponse {
  repeated SpeechRecognitionResult chunks = 1;
}

message StreamingRecognitionRequest {
  oneof streaming_request {
    RecognitionConfig config = 1;
    bytes audio_content = 2;
  }
}

message StreamingRecognitionResponse {
  repeated SpeechRecognitionChunk chunks = 1;
  reserved 2;
  reserved "end_of_single_utterance";
}

message RecognitionAudio {
  oneof audio_source {
    bytes content = 1;
    string uri = 2;
  }
}

message RecognitionConfig {
  RecognitionSpec specification = 1;
  string folder_id = 2;
}

message RecognitionSpec {
  enum AudioEncoding {
    AUDIO_ENCODING_UNSPECIFIED = 0;

    // 16-bit signed little-endian (Linear PCM)
    LINEAR16_PCM = 1;

    OGG_OPUS = 2;

    // transcription only
    MP3 = 3;
  }

  reserved 6;

  AudioEncoding audio_encoding = 1;

  // 8000, 16000, 48000 only for pcm
  int64 sample_rate_hertz = 2;

  // code in BCP-47
  string language_code = 3;

  bool profanity_filter = 4;
  string model = 5;

  // If set true, tentative hypotheses may be returned as they become available (final=false flag)
  // If false or omitted, only final=true result(s) are returned.
  // Makes sense only for StreamingRecognize requests.
  bool partial_results = 7;

  bool single_utterance = 8;

  // Used only for long running recognize.
  int64 audio_channel_count = 9;

  // This mark allows disable normalization text
  bool raw_results = 10;

  // Rewrite text in literature style (default: false)
  bool literature_text = 11;
}

message SpeechRecognitionChunk {
  repeated SpeechRecognitionAlternative alternatives = 1;
  // This flag shows that the received chunk contains a part of the recognized text that won't be changed.
  bool final = 2;
  // This flag shows that the received chunk is the end of an utterance.
  bool end_of_utterance = 3;
}

message SpeechRecognitionResult {
  repeated SpeechRecognitionAlternative alternatives = 1;
  int64 channel_tag = 2;
}

message SpeechRecognitionAlternative {
  string text = 1;
  float confidence = 2;
  repeated WordInfo words = 3;
}

message WordInfo {
  google.protobuf.Duration start_time = 1;
  google.protobuf.Duration end_time = 2;
  string word = 3;
  float confidence = 4;
}