
Example 26 with RecognizeOptions

Use of com.ibm.watson.speech_to_text.v1.model.RecognizeOptions in the java-sdk project by watson-developer-cloud.

From the class SpeechToTextIT, method testRecognizeFileStringRecognizeOptions.

/**
 * Test recognize file string recognize options.
 *
 * @throws FileNotFoundException the file not found exception
 */
@Test
public void testRecognizeFileStringRecognizeOptions() throws FileNotFoundException {
    File audio = new File(SAMPLE_WAV);
    String contentType = HttpMediaType.AUDIO_WAV;
    RecognizeOptions options =
        new RecognizeOptions.Builder()
            .audio(audio)
            .timestamps(true)
            .wordConfidence(true)
            .model(EN_BROADBAND16K)
            .contentType(contentType)
            .profanityFilter(false)
            .audioMetrics(true)
            .build();
    SpeechRecognitionResults results = service.recognize(options).execute().getResult();
    assertNotNull(results.getResults().get(0).getAlternatives().get(0).getTranscript());
    assertNotNull(results.getResults().get(0).getAlternatives().get(0).getTimestamps());
    assertNotNull(results.getResults().get(0).getAlternatives().get(0).getWordConfidence());
    assertNotNull(results.getAudioMetrics());
}
Also used : File(java.io.File) SpeechRecognitionResults(com.ibm.watson.speech_to_text.v1.model.SpeechRecognitionResults) RecognizeOptions(com.ibm.watson.speech_to_text.v1.model.RecognizeOptions) WatsonServiceTest(com.ibm.watson.common.WatsonServiceTest) Test(org.junit.Test)
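
For context, the integration test above depends on a preconfigured service field and the test-class constants SAMPLE_WAV and EN_BROADBAND16K. A minimal sketch of that setup, assuming an IAM API key and illustrative stand-in values for the constants (authenticator construction varies by sdk-core version):

import com.ibm.cloud.sdk.core.security.IamAuthenticator;
import com.ibm.watson.speech_to_text.v1.SpeechToText;

public class SpeechToTextITSetup {
    // Illustrative stand-ins for the test-class constants; the real values may differ.
    static final String SAMPLE_WAV = "src/test/resources/speech_to_text/sample1.wav";
    static final String EN_BROADBAND16K = "en-US_BroadbandModel";

    static SpeechToText createService() {
        // "{apikey}" is a placeholder, not a real credential.
        IamAuthenticator authenticator = new IamAuthenticator("{apikey}");
        SpeechToText service = new SpeechToText(authenticator);
        service.setServiceUrl("https://api.us-south.speech-to-text.watson.cloud.ibm.com");
        return service;
    }
}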

Example 27 with RecognizeOptions

Use of com.ibm.watson.speech_to_text.v1.model.RecognizeOptions in the java-sdk project by watson-developer-cloud.

From the class SpeechToTextTest, method testRecognizeWOptions.

// Test the recognize operation with a valid options model parameter
@Test
public void testRecognizeWOptions() throws Throwable {
    // Register a mock response
    String mockResponseBody = "{\"results\": [{\"final\": true, \"alternatives\": [{\"transcript\": \"transcript\", \"confidence\": 0, \"timestamps\": [[\"timestamps\"]], \"word_confidence\": [[\"wordConfidence\"]]}], \"keywords_result\": {\"mapKey\": [{\"normalized_text\": \"normalizedText\", \"start_time\": 9, \"end_time\": 7, \"confidence\": 0}]}, \"word_alternatives\": [{\"start_time\": 9, \"end_time\": 7, \"alternatives\": [{\"confidence\": 0, \"word\": \"word\"}]}], \"end_of_utterance\": \"end_of_data\"}], \"result_index\": 11, \"speaker_labels\": [{\"from\": 4, \"to\": 2, \"speaker\": 7, \"confidence\": 10, \"final\": true}], \"processing_metrics\": {\"processed_audio\": {\"received\": 8, \"seen_by_engine\": 12, \"transcription\": 13, \"speaker_labels\": 13}, \"wall_clock_since_first_byte_received\": 31, \"periodic\": true}, \"audio_metrics\": {\"sampling_interval\": 16, \"accumulated\": {\"final\": true, \"end_time\": 7, \"signal_to_noise_ratio\": 18, \"speech_ratio\": 11, \"high_frequency_loss\": 17, \"direct_current_offset\": [{\"begin\": 5, \"end\": 3, \"count\": 5}], \"clipping_rate\": [{\"begin\": 5, \"end\": 3, \"count\": 5}], \"speech_level\": [{\"begin\": 5, \"end\": 3, \"count\": 5}], \"non_speech_level\": [{\"begin\": 5, \"end\": 3, \"count\": 5}]}}, \"warnings\": [\"warnings\"]}";
    String recognizePath = "/v1/recognize";
    server.enqueue(
        new MockResponse()
            .setHeader("Content-type", "application/json")
            .setResponseCode(200)
            .setBody(mockResponseBody));
    // Construct an instance of the RecognizeOptions model
    RecognizeOptions recognizeOptionsModel =
        new RecognizeOptions.Builder()
            .audio(TestUtilities.createMockStream("This is a mock file."))
            .contentType("application/octet-stream")
            .model("en-US_BroadbandModel")
            .languageCustomizationId("testString")
            .acousticCustomizationId("testString")
            .baseModelVersion("testString")
            .customizationWeight(Double.valueOf("72.5"))
            .inactivityTimeout(Long.valueOf("26"))
            .keywords(new java.util.ArrayList<String>(java.util.Arrays.asList("testString")))
            .keywordsThreshold(Float.valueOf("36.0"))
            .maxAlternatives(Long.valueOf("26"))
            .wordAlternativesThreshold(Float.valueOf("36.0"))
            .wordConfidence(false)
            .timestamps(false)
            .profanityFilter(true)
            .smartFormatting(false)
            .speakerLabels(false)
            .customizationId("testString")
            .grammarName("testString")
            .redaction(false)
            .audioMetrics(false)
            .endOfPhraseSilenceTime(Double.valueOf("72.5"))
            .splitTranscriptAtPhraseEnd(false)
            .speechDetectorSensitivity(Float.valueOf("36.0"))
            .backgroundAudioSuppression(Float.valueOf("36.0"))
            .lowLatency(false)
            .build();
    // Invoke recognize() with a valid options model and verify the result
    Response<SpeechRecognitionResults> response = speechToTextService.recognize(recognizeOptionsModel).execute();
    assertNotNull(response);
    SpeechRecognitionResults responseObj = response.getResult();
    assertNotNull(responseObj);
    // Verify the contents of the request sent to the mock server
    RecordedRequest request = server.takeRequest();
    assertNotNull(request);
    assertEquals(request.getMethod(), "POST");
    // Verify request path
    String parsedPath = TestUtilities.parseReqPath(request);
    assertEquals(parsedPath, recognizePath);
    // Verify query params
    Map<String, String> query = TestUtilities.parseQueryString(request);
    assertNotNull(query);
    assertEquals(query.get("model"), "en-US_BroadbandModel");
    assertEquals(query.get("language_customization_id"), "testString");
    assertEquals(query.get("acoustic_customization_id"), "testString");
    assertEquals(query.get("base_model_version"), "testString");
    assertEquals(Double.valueOf(query.get("customization_weight")), Double.valueOf("72.5"));
    assertEquals(Long.valueOf(query.get("inactivity_timeout")), Long.valueOf("26"));
    assertEquals(query.get("keywords"), RequestUtils.join(new java.util.ArrayList<String>(java.util.Arrays.asList("testString")), ","));
    assertEquals(Float.valueOf(query.get("keywords_threshold")), Float.valueOf("36.0"));
    assertEquals(Long.valueOf(query.get("max_alternatives")), Long.valueOf("26"));
    assertEquals(Float.valueOf(query.get("word_alternatives_threshold")), Float.valueOf("36.0"));
    assertEquals(Boolean.valueOf(query.get("word_confidence")), Boolean.valueOf(false));
    assertEquals(Boolean.valueOf(query.get("timestamps")), Boolean.valueOf(false));
    assertEquals(Boolean.valueOf(query.get("profanity_filter")), Boolean.valueOf(true));
    assertEquals(Boolean.valueOf(query.get("smart_formatting")), Boolean.valueOf(false));
    assertEquals(Boolean.valueOf(query.get("speaker_labels")), Boolean.valueOf(false));
    assertEquals(query.get("customization_id"), "testString");
    assertEquals(query.get("grammar_name"), "testString");
    assertEquals(Boolean.valueOf(query.get("redaction")), Boolean.valueOf(false));
    assertEquals(Boolean.valueOf(query.get("audio_metrics")), Boolean.valueOf(false));
    assertEquals(Double.valueOf(query.get("end_of_phrase_silence_time")), Double.valueOf("72.5"));
    assertEquals(Boolean.valueOf(query.get("split_transcript_at_phrase_end")), Boolean.valueOf(false));
    assertEquals(Float.valueOf(query.get("speech_detector_sensitivity")), Float.valueOf("36.0"));
    assertEquals(Float.valueOf(query.get("background_audio_suppression")), Float.valueOf("36.0"));
    assertEquals(Boolean.valueOf(query.get("low_latency")), Boolean.valueOf(false));
}
Also used : RecordedRequest(okhttp3.mockwebserver.RecordedRequest) MockResponse(okhttp3.mockwebserver.MockResponse) SpeechRecognitionResults(com.ibm.watson.speech_to_text.v1.model.SpeechRecognitionResults) RecognizeOptions(com.ibm.watson.speech_to_text.v1.model.RecognizeOptions) Test(org.testng.annotations.Test)
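
The unit test above assumes a MockWebServer fixture (server) and a speechToTextService client pointed at it; TestUtilities is the SDK's own unit-test helper class. A minimal sketch of such a fixture, assuming no authentication is needed against the local mock:

import com.ibm.cloud.sdk.core.security.NoAuthAuthenticator;
import com.ibm.watson.speech_to_text.v1.SpeechToText;
import okhttp3.mockwebserver.MockWebServer;

public class SpeechToTextTestFixture {
    MockWebServer server;
    SpeechToText speechToTextService;

    void setUp() throws Exception {
        server = new MockWebServer();
        server.start();
        // A local mock needs no credentials, so NoAuthAuthenticator suffices.
        speechToTextService = new SpeechToText("speech_to_text", new NoAuthAuthenticator());
        // Point the client at the mock server so requests can be recorded and verified.
        speechToTextService.setServiceUrl(server.url("/").toString());
    }

    void tearDown() throws Exception {
        server.shutdown();
    }
}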

Example 28 with RecognizeOptions

Use of com.ibm.watson.speech_to_text.v1.model.RecognizeOptions in the java-sdk project by watson-developer-cloud.

From the class SpeechToText, method recognize.

/**
 * Recognize audio.
 *
 * <p>Sends audio and returns transcription results for a recognition request. You can pass a
 * maximum of 100 MB and a minimum of 100 bytes of audio with a request. The service automatically
 * detects the endianness of the incoming audio and, for audio that includes multiple channels,
 * downmixes the audio to one-channel mono during transcoding. The method returns only final
 * results; to enable interim results, use the WebSocket API. (With the `curl` command, use the
 * `--data-binary` option to upload the file for the request.)
 *
 * <p>**See also:** [Making a basic HTTP
 * request](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-http#HTTP-basic).
 *
 * <p>### Streaming mode
 *
 * <p>For requests to transcribe live audio as it becomes available, you must set the
 * `Transfer-Encoding` header to `chunked` to use streaming mode. In streaming mode, the service
 * closes the connection (status code 408) if it does not receive at least 15 seconds of audio
 * (including silence) in any 30-second period. The service also closes the connection (status
 * code 400) if it detects no speech for `inactivity_timeout` seconds of streaming audio; use the
 * `inactivity_timeout` parameter to change the default of 30 seconds.
 *
 * <p>**See also:**
 * * [Audio transmission](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#transmission)
 * * [Timeouts](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#timeouts)
 *
 * <p>### Audio formats (content types)
 *
 * <p>The service accepts audio in the following formats (MIME types).
 * * For formats that are labeled **Required**, you must use the `Content-Type` header with the
 * request to specify the format of the audio.
 * * For all other formats, you can omit the `Content-Type` header or specify
 * `application/octet-stream` with the header to have the service automatically detect the format
 * of the audio. (With the `curl` command, you can specify either `"Content-Type:"` or
 * `"Content-Type: application/octet-stream"`.)
 *
 * <p>Where indicated, the format that you specify must include the sampling rate and can
 * optionally include the number of channels and the endianness of the audio.
 * * `audio/alaw` (**Required.** Specify the sampling rate (`rate`) of the audio.)
 * * `audio/basic` (**Required.** Use only with narrowband models.)
 * * `audio/flac`
 * * `audio/g729` (Use only with narrowband models.)
 * * `audio/l16` (**Required.** Specify the sampling rate (`rate`) and optionally the number of
 * channels (`channels`) and endianness (`endianness`) of the audio.)
 * * `audio/mp3`
 * * `audio/mpeg`
 * * `audio/mulaw` (**Required.** Specify the sampling rate (`rate`) of the audio.)
 * * `audio/ogg` (The service automatically detects the codec of the input audio.)
 * * `audio/ogg;codecs=opus`
 * * `audio/ogg;codecs=vorbis`
 * * `audio/wav` (Provide audio with a maximum of nine channels.)
 * * `audio/webm` (The service automatically detects the codec of the input audio.)
 * * `audio/webm;codecs=opus`
 * * `audio/webm;codecs=vorbis`
 *
 * <p>The sampling rate of the audio must match the sampling rate of the model for the recognition
 * request: for broadband models, at least 16 kHz; for narrowband models, at least 8 kHz. If the
 * sampling rate of the audio is higher than the minimum required rate, the service down-samples
 * the audio to the appropriate rate. If the sampling rate of the audio is lower than the minimum
 * required rate, the request fails.
 *
 * <p>**See also:** [Supported audio
 * formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats).
 *
 * <p>### Next-generation models
 *
 * <p>The service supports next-generation `Multimedia` (16 kHz) and `Telephony` (8 kHz) models
 * for many languages. Next-generation models have higher throughput than the service's previous
 * generation of `Broadband` and `Narrowband` models. When you use next-generation models, the
 * service can return transcriptions more quickly and also provide noticeably better transcription
 * accuracy.
 *
 * <p>You specify a next-generation model by using the `model` query parameter, as you do for a
 * previous-generation model. Many next-generation models also support the `low_latency`
 * parameter, which is not available with previous-generation models. Next-generation models do
 * not support all of the parameters that are available for use with previous-generation models.
 *
 * <p>**Important:** Effective 15 March 2022, previous-generation models for all languages other
 * than Arabic and Japanese are deprecated. The deprecated models remain available until 15
 * September 2022, when they will be removed from the service and the documentation. You must
 * migrate to the equivalent next-generation model by the end-of-service date. For more
 * information, see [Migrating to next-generation
 * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-migrate).
 *
 * <p>**See also:**
 * * [Next-generation languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng)
 * * [Supported features for next-generation models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-features)
 *
 * <p>### Multipart speech recognition
 *
 * <p>**Note:** The asynchronous HTTP interface, WebSocket interface, and Watson SDKs do not
 * support multipart speech recognition.
 *
 * <p>The HTTP `POST` method of the service also supports multipart speech recognition. With
 * multipart requests, you pass all audio data as multipart form data. You specify some parameters
 * as request headers and query parameters, but you pass JSON metadata as form data to control
 * most aspects of the transcription. You can use multipart recognition to pass multiple audio
 * files with a single request.
 *
 * <p>Use the multipart approach with browsers for which JavaScript is disabled or when the
 * parameters used with the request are greater than the 8 KB limit imposed by most HTTP servers
 * and proxies. You can encounter this limit, for example, if you want to spot a very large number
 * of keywords.
 *
 * <p>**See also:** [Making a multipart HTTP
 * request](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-http#HTTP-multi).
 *
 * @param recognizeOptions the {@link RecognizeOptions} containing the options for the call
 * @return a {@link ServiceCall} with a result of type {@link SpeechRecognitionResults}
 */
public ServiceCall<SpeechRecognitionResults> recognize(RecognizeOptions recognizeOptions) {
    com.ibm.cloud.sdk.core.util.Validator.notNull(recognizeOptions, "recognizeOptions cannot be null");
    RequestBuilder builder = RequestBuilder.post(RequestBuilder.resolveRequestUrl(getServiceUrl(), "/v1/recognize"));
    Map<String, String> sdkHeaders = SdkCommon.getSdkHeaders("speech_to_text", "v1", "recognize");
    for (Entry<String, String> header : sdkHeaders.entrySet()) {
        builder.header(header.getKey(), header.getValue());
    }
    builder.header("Accept", "application/json");
    if (recognizeOptions.contentType() != null) {
        builder.header("Content-Type", recognizeOptions.contentType());
    }
    if (recognizeOptions.model() != null) {
        builder.query("model", String.valueOf(recognizeOptions.model()));
    }
    if (recognizeOptions.languageCustomizationId() != null) {
        builder.query("language_customization_id", String.valueOf(recognizeOptions.languageCustomizationId()));
    }
    if (recognizeOptions.acousticCustomizationId() != null) {
        builder.query("acoustic_customization_id", String.valueOf(recognizeOptions.acousticCustomizationId()));
    }
    if (recognizeOptions.baseModelVersion() != null) {
        builder.query("base_model_version", String.valueOf(recognizeOptions.baseModelVersion()));
    }
    if (recognizeOptions.customizationWeight() != null) {
        builder.query("customization_weight", String.valueOf(recognizeOptions.customizationWeight()));
    }
    if (recognizeOptions.inactivityTimeout() != null) {
        builder.query("inactivity_timeout", String.valueOf(recognizeOptions.inactivityTimeout()));
    }
    if (recognizeOptions.keywords() != null) {
        builder.query("keywords", RequestUtils.join(recognizeOptions.keywords(), ","));
    }
    if (recognizeOptions.keywordsThreshold() != null) {
        builder.query("keywords_threshold", String.valueOf(recognizeOptions.keywordsThreshold()));
    }
    if (recognizeOptions.maxAlternatives() != null) {
        builder.query("max_alternatives", String.valueOf(recognizeOptions.maxAlternatives()));
    }
    if (recognizeOptions.wordAlternativesThreshold() != null) {
        builder.query("word_alternatives_threshold", String.valueOf(recognizeOptions.wordAlternativesThreshold()));
    }
    if (recognizeOptions.wordConfidence() != null) {
        builder.query("word_confidence", String.valueOf(recognizeOptions.wordConfidence()));
    }
    if (recognizeOptions.timestamps() != null) {
        builder.query("timestamps", String.valueOf(recognizeOptions.timestamps()));
    }
    if (recognizeOptions.profanityFilter() != null) {
        builder.query("profanity_filter", String.valueOf(recognizeOptions.profanityFilter()));
    }
    if (recognizeOptions.smartFormatting() != null) {
        builder.query("smart_formatting", String.valueOf(recognizeOptions.smartFormatting()));
    }
    if (recognizeOptions.speakerLabels() != null) {
        builder.query("speaker_labels", String.valueOf(recognizeOptions.speakerLabels()));
    }
    if (recognizeOptions.customizationId() != null) {
        builder.query("customization_id", String.valueOf(recognizeOptions.customizationId()));
    }
    if (recognizeOptions.grammarName() != null) {
        builder.query("grammar_name", String.valueOf(recognizeOptions.grammarName()));
    }
    if (recognizeOptions.redaction() != null) {
        builder.query("redaction", String.valueOf(recognizeOptions.redaction()));
    }
    if (recognizeOptions.audioMetrics() != null) {
        builder.query("audio_metrics", String.valueOf(recognizeOptions.audioMetrics()));
    }
    if (recognizeOptions.endOfPhraseSilenceTime() != null) {
        builder.query("end_of_phrase_silence_time", String.valueOf(recognizeOptions.endOfPhraseSilenceTime()));
    }
    if (recognizeOptions.splitTranscriptAtPhraseEnd() != null) {
        builder.query("split_transcript_at_phrase_end", String.valueOf(recognizeOptions.splitTranscriptAtPhraseEnd()));
    }
    if (recognizeOptions.speechDetectorSensitivity() != null) {
        builder.query("speech_detector_sensitivity", String.valueOf(recognizeOptions.speechDetectorSensitivity()));
    }
    if (recognizeOptions.backgroundAudioSuppression() != null) {
        builder.query("background_audio_suppression", String.valueOf(recognizeOptions.backgroundAudioSuppression()));
    }
    if (recognizeOptions.lowLatency() != null) {
        builder.query("low_latency", String.valueOf(recognizeOptions.lowLatency()));
    }
    builder.bodyContent(recognizeOptions.contentType(), null, null, recognizeOptions.audio());
    ResponseConverter<SpeechRecognitionResults> responseConverter = ResponseConverterUtils.getValue(new com.google.gson.reflect.TypeToken<SpeechRecognitionResults>() {
    }.getType());
    return createServiceCall(builder.build(), responseConverter);
}
Also used : RequestBuilder(com.ibm.cloud.sdk.core.http.RequestBuilder) SpeechRecognitionResults(com.ibm.watson.speech_to_text.v1.model.SpeechRecognitionResults)
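
To make the streaming-mode and audio-format guidance in the recognize() documentation above concrete, here is a sketch that sends raw 16 kHz PCM, a format whose Content-Type must carry the sampling rate, and relaxes the inactivity timeout. The file name and timeout value are illustrative, and service construction is assumed to happen elsewhere:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import com.ibm.watson.speech_to_text.v1.SpeechToText;
import com.ibm.watson.speech_to_text.v1.model.RecognizeOptions;
import com.ibm.watson.speech_to_text.v1.model.SpeechRecognitionResults;

public class StreamingRecognizeSketch {
    static SpeechRecognitionResults transcribe(SpeechToText service) throws IOException {
        // audio/l16 is a "Required" format: the sampling rate must be specified.
        InputStream audio = new FileInputStream("audio-16khz.raw"); // hypothetical file
        RecognizeOptions options = new RecognizeOptions.Builder()
            .audio(audio)
            .contentType("audio/l16; rate=16000")
            // Allow up to 60 seconds of silence before the service closes the
            // connection with status code 400 (the default is 30 seconds).
            .inactivityTimeout(60L)
            .build();
        return service.recognize(options).execute().getResult();
    }
}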
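
Likewise, a sketch of selecting a next-generation model together with the low_latency parameter described above. The model name follows the documented Telephony naming scheme and the audio file is hypothetical:

import java.io.File;
import java.io.FileNotFoundException;
import com.ibm.watson.speech_to_text.v1.SpeechToText;
import com.ibm.watson.speech_to_text.v1.model.RecognizeOptions;
import com.ibm.watson.speech_to_text.v1.model.SpeechRecognitionResults;

public class NextGenRecognizeSketch {
    static SpeechRecognitionResults transcribe(SpeechToText service) throws FileNotFoundException {
        RecognizeOptions options = new RecognizeOptions.Builder()
            .audio(new File("call.mp3")) // hypothetical 8 kHz telephony recording
            .contentType("audio/mp3")
            .model("en-US_Telephony") // next-generation 8 kHz model
            .lowLatency(true) // not available with previous-generation models
            .build();
        return service.recognize(options).execute().getResult();
    }
}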

Aggregations

RecognizeOptions (com.ibm.watson.developer_cloud.speech_to_text.v1.model.RecognizeOptions): 18 uses
Test (org.junit.Test): 18 uses
SpeechRecognitionResults (com.ibm.watson.developer_cloud.speech_to_text.v1.model.SpeechRecognitionResults): 16 uses
File (java.io.File): 13 uses
SpeechRecognitionResults (com.ibm.watson.speech_to_text.v1.model.SpeechRecognitionResults): 9 uses
FileInputStream (java.io.FileInputStream): 9 uses
RecognizeOptions (com.ibm.watson.speech_to_text.v1.model.RecognizeOptions): 8 uses
MockResponse (okhttp3.mockwebserver.MockResponse): 8 uses
WatsonServiceUnitTest (com.ibm.watson.developer_cloud.WatsonServiceUnitTest): 7 uses
RecordedRequest (okhttp3.mockwebserver.RecordedRequest): 7 uses
WatsonServiceTest (com.ibm.watson.developer_cloud.WatsonServiceTest): 6 uses
WatsonServiceTest (com.ibm.watson.common.WatsonServiceTest): 5 uses
BaseRecognizeCallback (com.ibm.watson.developer_cloud.speech_to_text.v1.websocket.BaseRecognizeCallback): 5 uses
ByteString (okio.ByteString): 5 uses
JsonObject (com.google.gson.JsonObject): 4 uses
JsonParser (com.google.gson.JsonParser): 4 uses
Authenticator (com.ibm.cloud.sdk.core.security.Authenticator): 2 uses
IamAuthenticator (com.ibm.cloud.sdk.core.security.IamAuthenticator): 2 uses
NotFoundException (com.ibm.watson.developer_cloud.service.exception.NotFoundException): 2 uses
WordAlternativeResults (com.ibm.watson.developer_cloud.speech_to_text.v1.model.WordAlternativeResults): 2 uses