syntax = "proto3";

package speechkit.stt.v3;

option go_package = "github.com/yandex-cloud/go-genproto/yandex/cloud/ai/stt/v3;stt";
option java_package = "yandex.cloud.api.ai.stt.v3";


// Options
message TextNormalizationOptions {
  // Text normalization mode.
  enum TextNormalization {
    TEXT_NORMALIZATION_UNSPECIFIED = 0;
    // Enable normalization
    TEXT_NORMALIZATION_ENABLED = 1;
    // Disable normalization
    TEXT_NORMALIZATION_DISABLED = 2;
  }
  enum PhoneFormattingMode {
    PHONE_FORMATTING_MODE_UNSPECIFIED = 0;
    // Disable phone formatting
    PHONE_FORMATTING_MODE_DISABLED = 1;
  }
  TextNormalization text_normalization = 1;
  // Profanity filter (default: false).
  bool profanity_filter = 2;
  // Rewrite text in literature style (default: false).
  bool literature_text = 3;
  // Defines the phone formatting mode.
  PhoneFormattingMode phone_formatting_mode = 4;
}
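
// A sketch of these options in protobuf text format (the values are
// illustrative, not defaults taken from the schema):
//
//   text_normalization: TEXT_NORMALIZATION_ENABLED
//   profanity_filter: true
//   literature_text: false
//   phone_formatting_mode: PHONE_FORMATTING_MODE_DISABLED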


message DefaultEouClassifier {
  enum EouSensitivity {
    EOU_SENSITIVITY_UNSPECIFIED = 0;
    DEFAULT = 1;
    HIGH = 2;
  }
  // EOU sensitivity. Currently there are two levels: a faster one with more errors (HIGH) and a more conservative one (DEFAULT, our default).
  EouSensitivity type = 1;
  // Hint for the maximum pause between words. The EOU detector can use this information to distinguish the end of an utterance from slow speech (e.g., "one <long pause> two <long pause> three").
  int64 max_pause_between_words_hint_ms = 2;
}

// Use end-of-utterance signals provided by the user.
message ExternalEouClassifier {

}

message EouClassifierOptions {
  // Type of EOU classifier.
  oneof Classifier {
    // EOU classifier provided by SpeechKit. Default.
    DefaultEouClassifier default_classifier = 1;
    // EOU is enforced by external messages from the user.
    ExternalEouClassifier external_classifier = 2;
  }
}
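
// A sketch in protobuf text format: selecting the built-in classifier with
// high sensitivity (the hint value is an arbitrary assumption). Setting
// external_classifier {} instead hands EOU control to the user.
//
//   default_classifier {
//     type: HIGH
//     max_pause_between_words_hint_ms: 1500
//   }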

message RecognitionClassifier {
  // Type of recognition classifier trigger.
  enum TriggerType {
    TRIGGER_TYPE_UNSPECIFIED = 0 [deprecated = true];
    // Apply classifier to utterance responses
    ON_UTTERANCE = 1;
    // Apply classifier to final responses
    ON_FINAL = 2;
    // Apply classifier to partial responses
    ON_PARTIAL = 3;
  }

  // Classifier name
  string classifier = 1;
  // The types of responses to which classification results will be attached
  repeated TriggerType triggers = 2;
}

message RecognitionClassifierOptions {
  // List of classifiers to use
  repeated RecognitionClassifier classifiers = 1;
}
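
// A sketch in protobuf text format (the classifier name is a placeholder,
// not a real classifier): trigger classification on every final response.
//
//   classifiers {
//     classifier: "<classifier name>"
//     triggers: ON_FINAL
//   }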

message SpeechAnalysisOptions {
  // Analyse speech for every speaker
  bool enable_speaker_analysis = 1;
  // Analyse conversation of two speakers
  bool enable_conversation_analysis = 2;
  // Quantile levels in range (0, 1) for descriptive statistics
  repeated double descriptive_statistics_quantiles = 3;
}
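
// A sketch in protobuf text format: enable both analyses and request the
// median and the 90th percentile (the quantile levels are illustrative).
//
//   enable_speaker_analysis: true
//   enable_conversation_analysis: true
//   descriptive_statistics_quantiles: [0.5, 0.9]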

// Raw audio format spec (there is no container to infer the type from). Used in AudioFormat options.
message RawAudio {
  enum AudioEncoding {
    AUDIO_ENCODING_UNSPECIFIED = 0;
    // Audio bit depth 16-bit signed little-endian (Linear PCM).
    LINEAR16_PCM = 1;
  }

  // Type of audio encoding
  AudioEncoding audio_encoding = 1;
  // PCM sample rate
  int64 sample_rate_hertz = 2;
  // PCM channel count. Currently only single channel audio is supported in real-time recognition.
  int64 audio_channel_count = 3;
}

// Audio with fixed type in container. Used in AudioFormat options.
message ContainerAudio {
  enum ContainerAudioType {
    CONTAINER_AUDIO_TYPE_UNSPECIFIED = 0;

    // Audio bit depth 16-bit signed little-endian (Linear PCM).
    WAV = 1;

    // Data is encoded using the OPUS audio codec and compressed using the OGG container format.
    OGG_OPUS = 2;

    // Data is encoded using MPEG-1/2 Layer III and compressed using the MP3 container format.
    MP3 = 3;
  }
  // Type of audio container.
  ContainerAudioType container_audio_type = 1;
}

// Audio format options.
message AudioFormatOptions {
  oneof AudioFormat {

    // Audio without container.
    RawAudio raw_audio = 1;

    // Audio is wrapped in container.
    ContainerAudio container_audio = 2;
  }
}
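
// A sketch in protobuf text format of the two mutually exclusive variants.
// Raw 16-bit PCM, 16 kHz, mono (the values are illustrative):
//
//   raw_audio {
//     audio_encoding: LINEAR16_PCM
//     sample_rate_hertz: 16000
//     audio_channel_count: 1
//   }
//
// Or a container, which carries its own parameters:
//
//   container_audio {
//     container_audio_type: OGG_OPUS
//   }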

// Type of restriction for the list of languages expected in the incoming speech stream.
message LanguageRestrictionOptions {
  enum LanguageRestrictionType {
    LANGUAGE_RESTRICTION_TYPE_UNSPECIFIED = 0;
    // Allow list. The incoming audio can contain only the listed languages.
    WHITELIST = 1;
    // Deny list. The incoming audio cannot contain the listed languages.
    BLACKLIST = 2;
  }
  // Language restriction type
  LanguageRestrictionType restriction_type = 1;
  // The list of language codes that restricts recognition when the auto model is used
  repeated string language_code = 2;
}
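
// A sketch in protobuf text format: restrict the auto model to two languages
// (the language codes are illustrative).
//
//   restriction_type: WHITELIST
//   language_code: "ru-RU"
//   language_code: "en-US"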

message RecognitionModelOptions {
  enum AudioProcessingType {
    AUDIO_PROCESSING_TYPE_UNSPECIFIED = 0;
    // Process audio in a mode optimized for real-time recognition, i.e., send partial and final responses as soon as possible
    REAL_TIME = 1;
    // Process audio after all data has been received
    FULL_DATA = 2;
  }
  // Sets the recognition model for the cloud version of SpeechKit. Possible values: 'general', 'general:rc', 'general:deprecated'.
  // The model is ignored for SpeechKit Hybrid.
  string model = 1;

  // Specifies the input audio format.
  AudioFormatOptions audio_format = 2;

  // Text normalization options.
  TextNormalizationOptions text_normalization = 3;

  // Possible languages in audio.
  LanguageRestrictionOptions language_restriction = 4;

  // How to process the audio data (in real time, after all data is received, etc.). Default is REAL_TIME.
  AudioProcessingType audio_processing_type = 5;
}
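
// A sketch in protobuf text format: the 'general' model over an OggOpus
// stream, recognized in real time (an illustrative combination).
//
//   model: "general"
//   audio_format {
//     container_audio { container_audio_type: OGG_OPUS }
//   }
//   text_normalization { text_normalization: TEXT_NORMALIZATION_ENABLED }
//   audio_processing_type: REAL_TIME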

message SpeakerLabelingOptions {
  enum SpeakerLabeling {
    SPEAKER_LABELING_UNSPECIFIED = 0;
    // Enable speaker labeling
    SPEAKER_LABELING_ENABLED = 1;
    // Disable speaker labeling
    SPEAKER_LABELING_DISABLED = 2;
  }
  // Enables or disables speaker labeling. Default is SPEAKER_LABELING_DISABLED.
  SpeakerLabeling speaker_labeling = 1;
}

message StreamingOptions {
  // Configuration for speech recognition model.
  RecognitionModelOptions recognition_model = 1;

  // Configuration for end of utterance detection model.
  EouClassifierOptions eou_classifier = 2;

  // Configuration for classifiers over speech recognition.
  RecognitionClassifierOptions recognition_classifier = 3;

  // Configuration for speech analysis over speech recognition.
  SpeechAnalysisOptions speech_analysis = 4;

  // Configuration for speaker labeling
  SpeakerLabelingOptions speaker_labeling = 5;
}
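
// A sketch in protobuf text format of complete session options; messages
// left unset keep their defaults (all values below are illustrative).
//
//   recognition_model {
//     model: "general"
//     audio_format {
//       raw_audio {
//         audio_encoding: LINEAR16_PCM
//         sample_rate_hertz: 8000
//         audio_channel_count: 1
//       }
//     }
//   }
//   eou_classifier {
//     default_classifier { type: DEFAULT }
//   }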

// Data chunk with audio.
message AudioChunk {
  // Bytes with audio data.
  bytes data = 1;
}

// Data chunk with silence.
message SilenceChunk {

  // Duration of silence chunk in ms.
  int64 duration_ms = 1;
}

// Force EOU
message Eou {
}

// Streaming audio request.
// Events are control messages from the user.
// The first message must contain session options.
// Subsequent messages are audio data chunks or control messages.
message StreamingRequest {
  oneof Event {

    // Session options. Should be the first message from the user.
    StreamingOptions session_options = 1;

    // Chunk with audio data.
    AudioChunk chunk = 2;

    // Chunk with silence.
    SilenceChunk silence_chunk = 3;

    // Request to end the current utterance. Works only with the external EOU classifier.
    Eou eou = 4;
  }
}
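
// A sketch of the expected event sequence; each line below is a separate
// StreamingRequest message, sent in order (payloads are placeholders):
//
//   session_options { recognition_model { model: "general" } }
//   chunk { data: "<first audio bytes>" }
//   chunk { data: "<more audio bytes>" }
//   eou {}   # only meaningful with the external EOU classifier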

message RecognizeFileRequest {
  oneof AudioSource {
    // Bytes with audio data.
    bytes content = 1;
    // S3 data URL.
    string uri = 2;
  }
  // Configuration for speech recognition model.
  RecognitionModelOptions recognition_model = 3;

  // Configuration for classifiers over speech recognition.
  RecognitionClassifierOptions recognition_classifier = 4;

  // Configuration for speech analysis over speech recognition.
  SpeechAnalysisOptions speech_analysis = 5;

  // Configuration for speaker labeling
  SpeakerLabelingOptions speaker_labeling = 6;
}
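
// A sketch in protobuf text format: recognize an object from storage
// (the URI is a placeholder); content and uri are mutually exclusive.
//
//   uri: "<S3 object URL>"
//   recognition_model {
//     model: "general"
//     text_normalization { text_normalization: TEXT_NORMALIZATION_ENABLED }
//   }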

// Responses.


// Recognized word.
message Word {

  // Word text.
  string text = 1;

  // Estimation of word start time in ms.
  int64 start_time_ms = 2;

  // Estimation of word end time in ms.
  int64 end_time_ms = 3;
}

// Estimation of language and its probability.
message LanguageEstimation {

  // Language code in ISO 639-1 format.
  string language_code = 1;

  // Estimation of language probability.
  double probability = 2;
}

// Recognition of a specific time frame.
message Alternative {

  // Words in time frame.
  repeated Word words = 1;

  // Text in time frame.
  string text = 2;

  // Start of time frame.
  int64 start_time_ms = 3;

  // End of time frame.
  int64 end_time_ms = 4;

  // The hypothesis confidence. Currently not used.
  double confidence = 5;

  // Distribution over possible languages.
  repeated LanguageEstimation languages = 6;
}

// Update information for external End of Utterance.
message EouUpdate {

  // EOU estimated time.
  int64 time_ms = 2;
}

// Update of hypothesis.
message AlternativeUpdate {

  // List of hypotheses for time frames.
  repeated Alternative alternatives = 1;

  string channel_tag = 2 [deprecated = true];
}

// AudioCursors represent the state of the ASR recognition stream.
message AudioCursors {

  // Amount of audio data received by the server, in ms. This cursor is moved after each audio chunk is received.
  int64 received_data_ms = 1;

  // Input stream reset time.
  int64 reset_time_ms = 2;

  // How much audio has been processed so far; this time includes trimmed silences as well. This cursor is moved after the server has received enough data
  // to update the recognition results (silence included).
  int64 partial_time_ms = 3;

  // Time of the last final. This cursor is moved when the server decides that recognition from the start of the audio up to final_time_ms will not change anymore;
  // usually this event is followed by EOU detection (but this could change in the future).
  int64 final_time_ms = 4;

  // Index of the last final the server has sent. Incremented after each new final.
  int64 final_index = 5;

  // Estimated time of EOU. This cursor is updated after each new EOU is sent.
  // For the external classifier it equals received_data_ms at the moment the EOU event arrives.
  // For the internal classifier it is an estimate; the time is not exact and has the same guarantees as word timings.
  int64 eou_time_ms = 6;
}
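
// A worked example (values illustrative): after 5 s of audio has been
// received, with partials covering up to 4.8 s and one final fixed at 3.2 s,
// the cursors could read:
//
//   received_data_ms: 5000
//   partial_time_ms: 4800
//   final_time_ms: 3200
//   final_index: 1
//   eou_time_ms: 3200
//
// so that final_time_ms <= partial_time_ms <= received_data_ms holds.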

enum CodeType {
  CODE_TYPE_UNSPECIFIED = 0 [deprecated = true];
  // All good.
  WORKING = 1;
  // For example, if speech is not sent in real time, or the context is unknown and we have made a fallback.
  WARNING = 2;
  // The session was closed.
  CLOSED = 3;
}

// Refinement for a final hypothesis. For example, text normalization is a refinement.
message FinalRefinement {

  // Index of final for which server sends additional information.
  int64 final_index = 1;

  // Type of refinement.
  oneof Type {

    // Normalized text instead of the raw one.
    AlternativeUpdate normalized_text = 2;
  }
}

// Status message
message StatusCode {

  // Code type.
  CodeType code_type = 1;

  // Human-readable message.
  string message = 2;
}

// Session identifier.
message SessionUuid {

  // Internal session identifier.
  string uuid = 1;

  // User session identifier.
  string user_request_id = 2;
}

message PhraseHighlight {
  // Text transcription of the highlighted audio segment
  string text = 1;

  // Start time of the highlighted audio segment
  int64 start_time_ms = 2;

  // End time of the highlighted audio segment
  int64 end_time_ms = 3;
}

message RecognitionClassifierLabel {
  // The label of the class predicted by the classifier
  string label = 1;

  // The prediction confidence
  double confidence = 2;
}

message RecognitionClassifierResult {
  // Name of the triggered classifier
  string classifier = 1;

  // List of highlights, i.e., the parts of the phrase that determine the result of the classification
  repeated PhraseHighlight highlights = 2;

  // Classifier predictions
  repeated RecognitionClassifierLabel labels = 3;
}

message RecognitionClassifierUpdate {
  enum WindowType {
    WINDOW_TYPE_UNSPECIFIED = 0 [deprecated = true];
    // The result of applying the classifier to the last utterance response
    LAST_UTTERANCE = 1;
    // The result of applying the classifier to the last final response
    LAST_FINAL = 2;
    // The result of applying the classifier to the last partial response
    LAST_PARTIAL = 3;
  }

  // Response window type
  WindowType window_type = 1;

  // Start time of the audio segment used for classification
  int64 start_time_ms = 2;

  // End time of the audio segment used for classification
  int64 end_time_ms = 3;

  // Result for dictionary-based classifier
  RecognitionClassifierResult classifier_result = 4;
}

message DescriptiveStatistics {
  message Quantile {
    // Quantile level in range (0, 1)
    double level = 1;

    // Quantile value
    double value = 2;
  }

  // Minimum observed value
  double min = 1;

  // Maximum observed value
  double max = 2;

  // Estimated mean of distribution
  double mean = 3;

  // Estimated standard deviation of distribution
  double std = 4;

  // List of evaluated quantiles
  repeated Quantile quantiles = 5;
}

message AudioSegmentBoundaries {
  // Audio segment start time
  int64 start_time_ms = 1;

  // Audio segment end time
  int64 end_time_ms = 2;
}

message SpeakerAnalysis {
  enum WindowType {
    WINDOW_TYPE_UNSPECIFIED = 0 [deprecated = true];
    // Stats for all received audio.
    TOTAL = 1;
    // Stats for last utterance.
    LAST_UTTERANCE = 2;
  }

  // Speaker tag
  string speaker_tag = 1;

  // Response window type
  WindowType window_type = 2;

  // Audio segment boundaries
  AudioSegmentBoundaries speech_boundaries = 3;

  // Total speech duration
  int64 total_speech_ms = 4;

  // Speech ratio within audio segment
  double speech_ratio = 5;

  // Total silence duration
  int64 total_silence_ms = 6;

  // Silence ratio within audio segment
  double silence_ratio = 7;

  // Number of words in recognized speech
  int64 words_count = 8;

  // Number of letters in recognized speech
  int64 letters_count = 9;

  // Descriptive statistics for words per second distribution
  DescriptiveStatistics words_per_second = 10;

  // Descriptive statistics for letters per second distribution
  DescriptiveStatistics letters_per_second = 11;

  // Descriptive statistics for words per utterance distribution
  DescriptiveStatistics words_per_utterance = 12;

  // Descriptive statistics for letters per utterance distribution
  DescriptiveStatistics letters_per_utterance = 13;

  // Number of utterances
  int64 utterance_count = 14;

  // Descriptive statistics for utterance duration distribution
  DescriptiveStatistics utterance_duration_estimation = 15;
}

message ConversationAnalysis {
  message InterruptsEvaluation {
    // Speaker tag
    string speaker_tag = 1;

    // Number of interrupts made by the speaker
    int64 interrupts_count = 2;

    // Total duration of all interrupts
    int64 interrupts_duration_ms = 3;

    // Boundaries for every interrupt
    repeated AudioSegmentBoundaries interrupts = 4;
  }

  // Audio segment boundaries
  AudioSegmentBoundaries conversation_boundaries = 1;

  // Total simultaneous silence duration
  int64 total_simultaneous_silence_duration_ms = 2;

  // Simultaneous silence ratio within audio segment
  double total_simultaneous_silence_ratio = 3;

  // Descriptive statistics for simultaneous silence duration distribution
  DescriptiveStatistics simultaneous_silence_duration_estimation = 4;

  // Total simultaneous speech duration
  int64 total_simultaneous_speech_duration_ms = 5;

  // Simultaneous speech ratio within audio segment
  double total_simultaneous_speech_ratio = 6;

  // Descriptive statistics for simultaneous speech duration distribution
  DescriptiveStatistics simultaneous_speech_duration_estimation = 7;

  // Interrupts description for every speaker
  repeated InterruptsEvaluation speaker_interrupts = 8;

  // Total speech duration, including both simultaneous and separate speech
  int64 total_speech_duration_ms = 9;

  // Total speech ratio within audio segment
  double total_speech_ratio = 10;
}

// Responses from the server.
// Each response contains a session uuid,
// AudioCursors,
// and a specific event.
message StreamingResponse {

  // Session identifier
  SessionUuid session_uuid = 1;

  // Progress bar for streaming session recognition: how much data has been received; final and partial times; etc.
  AudioCursors audio_cursors = 2;

  // Wall clock on the server side. This is the time when the server wrote results to the stream.
  int64 response_wall_time_ms = 3;

  oneof Event {

    // Partial results; the server sends them regularly after enough audio data has been received from the user. These are the current text estimates
    // from final_time_ms to partial_time_ms, and they could change as new data arrives.
    AlternativeUpdate partial = 4;

    // Final results; the recognition is now fixed up to final_time_ms. For now, a final is sent only if the EOU event was triggered. This could change in future releases.
    AlternativeUpdate final = 5;

    // After the EOU classifier triggers, the server sends a final message, then an EouUpdate with the EOU time.
    // Before eou_update, a final with the same time is sent; there could be several finals before an eou update.
    EouUpdate eou_update = 6;

    // For each final, if normalization is enabled, the server sends the normalized text (or some other advanced post-processing).
    // Final normalization introduces additional latency.
    FinalRefinement final_refinement = 7;

    // Status messages, sent by the server at a fixed interval (keep-alive).
    StatusCode status_code = 8;

    // Result of the triggered classifier
    RecognitionClassifierUpdate classifier_update = 10;

    // Speech statistics for every speaker
    SpeakerAnalysis speaker_analysis = 11;

    // Conversation statistics
    ConversationAnalysis conversation_analysis = 12;
  }

  // Tag to distinguish audio channels.
  string channel_tag = 9;
}
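
// A sketch of two consecutive StreamingResponse messages (all values are
// illustrative): a final, then its refinement once normalization completes.
//
//   session_uuid { uuid: "<internal id>" }
//   audio_cursors { received_data_ms: 5000 final_time_ms: 3200 final_index: 1 }
//   final {
//     alternatives { text: "hello world" start_time_ms: 0 end_time_ms: 3200 }
//   }
//
//   final_refinement {
//     final_index: 1
//     normalized_text {
//       alternatives { text: "Hello world." }
//     }
//   }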