use of com.ibm.watson.speech_to_text.v1.model.RecognizeWithWebsocketsOptions in project openhab-addons by openhab.
the class WatsonSTTService method recognize.
@Override
public STTServiceHandle recognize(STTListener sttListener, AudioStream audioStream, Locale locale, Set<String> set)
        throws STTException {
    if (config.apiKey.isBlank() || config.instanceUrl.isBlank()) {
        throw new STTException("service is not correctly configured");
    }
    String contentType = getContentType(audioStream);
    if (contentType == null) {
        throw new STTException("Unsupported format, unable to resolve audio content type");
    }
    logger.debug("Content-Type: {}", contentType);
    var speechToText = new SpeechToText(new IamAuthenticator.Builder().apikey(config.apiKey).build());
    speechToText.setServiceUrl(config.instanceUrl);
    if (config.optOutLogging) {
        // Ask the service not to use this request's data to improve its models.
        speechToText.setDefaultHeaders(Map.of("X-Watson-Learning-Opt-Out", "1"));
    }
    RecognizeWithWebsocketsOptions wsOptions = new RecognizeWithWebsocketsOptions.Builder()
            .audio(audioStream)
            .contentType(contentType)
            .redaction(config.redaction)
            .smartFormatting(config.smartFormatting)
            .model(locale.toLanguageTag() + "_BroadbandModel")
            .interimResults(true)
            .backgroundAudioSuppression(config.backgroundAudioSuppression)
            .speechDetectorSensitivity(config.speechDetectorSensitivity)
            .inactivityTimeout(config.inactivityTimeout)
            .build();
    final AtomicReference<@Nullable WebSocket> socketRef = new AtomicReference<>();
    final AtomicBoolean aborted = new AtomicBoolean(false);
    executor.submit(() -> {
        // Allow one retry on a transient TLS peer-verification failure; abort on anything else.
        int retries = 2;
        while (retries > 0) {
            try {
                socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions,
                        new TranscriptionListener(sttListener, config, aborted)));
                break;
            } catch (RuntimeException e) {
                var cause = e.getCause();
                if (cause instanceof SSLPeerUnverifiedException) {
                    logger.debug("Retrying on error: {}", cause.getMessage());
                    retries--;
                } else {
                    var errorMessage = e.getMessage();
                    logger.warn("Aborting on error: {}", errorMessage);
                    sttListener.sttEventReceived(
                            new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error"));
                    break;
                }
            }
        }
    });
    return new STTServiceHandle() {
        @Override
        public void abort() {
            if (!aborted.getAndSet(true)) {
                var socket = socketRef.get();
                if (socket != null) {
                    // 1000 is the WebSocket "normal closure" status code.
                    socket.close(1000, null);
                    socket.cancel();
                    try {
                        Thread.sleep(100);
                    } catch (InterruptedException ignored) {
                    }
                }
            }
        }
    };
}
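The TranscriptionListener passed to recognizeUsingWebSocket above is openHAB's own RecognizeCallback implementation and is not part of this snippet. Purely as an illustrative sketch (the body is an assumption, not the actual openhab-addons code), a minimal callback built on the SDK's BaseRecognizeCallback, the same base class the examples below use, could look like this:

// Illustrative sketch only; the real TranscriptionListener lives in openhab-addons.
class SimpleTranscriptionListener extends BaseRecognizeCallback {
    @Override
    public void onTranscription(SpeechRecognitionResults results) {
        // Interim and final results arrive here; forward or print them.
        System.out.println(results);
    }

    @Override
    public void onError(Exception e) {
        System.err.println("Recognition failed: " + e.getMessage());
    }
}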
use of com.ibm.watson.speech_to_text.v1.model.RecognizeWithWebsocketsOptions in project java-sdk by watson-developer-cloud.
the class SpeechToText method recognizeUsingWebSocket.
/**
* Sends audio and returns transcription results for recognition requests over a WebSocket
* connection. Requests and responses are enabled over a single TCP connection that abstracts much
* of the complexity of the request to offer efficient implementation, low latency, high
* throughput, and an asynchronous response. By default, only final results are returned for any
* request; to enable interim results, set the interimResults parameter to true.
*
* <p>The service imposes a data size limit of 100 MB per utterance (per recognition request). You
* can send multiple utterances over a single WebSocket connection. The service automatically
* detects the endianness of the incoming audio and, for audio that includes multiple channels,
* downmixes the audio to one-channel mono during transcoding. (For the audio/l16 format, you can
* specify the endianness.)
*
* @param recognizeOptions the recognize options
* @param callback the {@link RecognizeCallback} instance where results will be sent
* @return the {@link WebSocket}
*/
public WebSocket recognizeUsingWebSocket(RecognizeWithWebsocketsOptions recognizeOptions, RecognizeCallback callback) {
    com.ibm.cloud.sdk.core.util.Validator.notNull(recognizeOptions, "recognizeOptions cannot be null");
    com.ibm.cloud.sdk.core.util.Validator.notNull(recognizeOptions.audio(), "audio cannot be null");
    com.ibm.cloud.sdk.core.util.Validator.notNull(callback, "callback cannot be null");
    HttpUrl.Builder urlBuilder = HttpUrl.parse(getServiceUrl() + "/v1/recognize").newBuilder();
    if (recognizeOptions.model() != null) {
        urlBuilder.addQueryParameter("model", recognizeOptions.model());
    }
    if (recognizeOptions.customizationId() != null) {
        urlBuilder.addQueryParameter("customization_id", recognizeOptions.customizationId());
    }
    if (recognizeOptions.languageCustomizationId() != null) {
        urlBuilder.addQueryParameter("language_customization_id", recognizeOptions.languageCustomizationId());
    }
    if (recognizeOptions.acousticCustomizationId() != null) {
        urlBuilder.addQueryParameter("acoustic_customization_id", recognizeOptions.acousticCustomizationId());
    }
    if (recognizeOptions.baseModelVersion() != null) {
        urlBuilder.addQueryParameter("base_model_version", recognizeOptions.baseModelVersion());
    }
    String url = urlBuilder.toString().replace("https://", "wss://");
    Request.Builder builder = new Request.Builder().url(url);
    setAuthentication(builder);
    setDefaultHeaders(builder);
    OkHttpClient client = configureHttpClient();
    return client.newWebSocket(builder.build(), new SpeechToTextWebSocketListener(recognizeOptions, callback));
}
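None of the examples that follow exercise the customization query parameters handled above. A minimal sketch, assuming an existing custom language model (the ID, the audio stream, and the callback are placeholders, and the languageCustomizationId builder method is assumed to mirror the getter read above), would pass the ID through the options builder:

// Sketch: routing a recognition request through a custom language model.
RecognizeWithWebsocketsOptions options = new RecognizeWithWebsocketsOptions.Builder()
        .audio(audio) // any supported audio stream
        .contentType(HttpMediaType.AUDIO_WAV)
        .languageCustomizationId("<customization_id>") // placeholder ID
        .build();
service.recognizeUsingWebSocket(options, callback);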
use of com.ibm.watson.speech_to_text.v1.model.RecognizeWithWebsocketsOptions in project java-sdk by watson-developer-cloud.
the class RecognizeUsingWebSocketsExample method main.
// lock is a class-level latch, released in onDisconnected below.
private static final CountDownLatch lock = new CountDownLatch(1);

public static void main(String[] args) throws FileNotFoundException, InterruptedException {
    Authenticator authenticator = new IamAuthenticator("<iam_api_key>");
    SpeechToText service = new SpeechToText(authenticator);
    FileInputStream audio = new FileInputStream("src/test/resources/speech_to_text/sample1.wav");
    RecognizeWithWebsocketsOptions options = new RecognizeWithWebsocketsOptions.Builder()
            .audio(audio)
            .interimResults(true)
            .contentType(HttpMediaType.AUDIO_WAV)
            .build();
    service.recognizeUsingWebSocket(options, new BaseRecognizeCallback() {
        @Override
        public void onTranscription(SpeechRecognitionResults speechResults) {
            System.out.println(speechResults);
        }

        @Override
        public void onDisconnected() {
            lock.countDown();
        }
    });
    // Wait up to a minute for the transcription to complete.
    lock.await(1, TimeUnit.MINUTES);
}
use of com.ibm.watson.speech_to_text.v1.model.RecognizeWithWebsocketsOptions in project java-sdk by watson-developer-cloud.
the class MicrophoneWithWebSocketsExample method main.
/**
* The main method.
*
* @param args the arguments
* @throws Exception the exception
*/
public static void main(final String[] args) throws Exception {
    Authenticator authenticator = new IamAuthenticator("<iam_api_key>");
    SpeechToText service = new SpeechToText(authenticator);
    // Signed PCM AudioFormat: 16 kHz sample rate, 16 bit sample size, mono
    int sampleRate = 16000;
    AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
    DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
    if (!AudioSystem.isLineSupported(info)) {
        System.out.println("Line not supported");
        System.exit(0);
    }
    TargetDataLine line = (TargetDataLine) AudioSystem.getLine(info);
    line.open(format);
    line.start();
    AudioInputStream audio = new AudioInputStream(line);
    RecognizeWithWebsocketsOptions options = new RecognizeWithWebsocketsOptions.Builder()
            .audio(audio)
            .interimResults(true)
            .timestamps(true)
            .wordConfidence(true)
            .contentType(HttpMediaType.AUDIO_RAW + ";rate=" + sampleRate)
            .build();
    service.recognizeUsingWebSocket(options, new BaseRecognizeCallback() {
        @Override
        public void onTranscription(SpeechRecognitionResults speechResults) {
            System.out.println(speechResults);
        }
    });
    System.out.println("Listening to your voice for the next 30s...");
    Thread.sleep(30 * 1000);
    // Closing the WebSocket's underlying InputStream closes the WebSocket itself.
    line.stop();
    line.close();
    System.out.println("Fin.");
}
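Because recognizeUsingWebSocket returns the underlying okhttp3.WebSocket (see the SDK method above), an alternative shutdown is to keep the returned socket and close it explicitly, as the openHAB abort handle in the first snippet does. In this sketch, options and callback stand for the values built in the example above:

// Alternative shutdown: close the returned socket directly.
WebSocket socket = service.recognizeUsingWebSocket(options, callback);
// ... later: 1000 is the WebSocket "normal closure" status code.
socket.close(1000, null);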
use of com.ibm.watson.speech_to_text.v1.model.RecognizeWithWebsocketsOptions in project java-sdk by watson-developer-cloud.
the class RecognizeUsingWebSocketsWithSpeakerLabelsExample method main.
/**
* The main method.
*
* @param args the arguments
* @throws FileNotFoundException the file not found exception
* @throws InterruptedException the interrupted exception
*/
// lock is a class-level latch, released in onDisconnected below.
private static final CountDownLatch lock = new CountDownLatch(1);

public static void main(String[] args) throws FileNotFoundException, InterruptedException {
    FileInputStream audio = new FileInputStream("src/test/resources/speech_to_text/twospeakers.wav");
    Authenticator authenticator = new IamAuthenticator("<iam_api_key>");
    SpeechToText service = new SpeechToText(authenticator);
    RecognizeWithWebsocketsOptions options = new RecognizeWithWebsocketsOptions.Builder()
            .audio(audio)
            .interimResults(true)
            .speakerLabels(true)
            .model(RecognizeOptions.Model.EN_US_NARROWBANDMODEL)
            .contentType(HttpMediaType.AUDIO_WAV)
            .build();
    // RecoTokens is a helper class defined in the full example; see the sketch below.
    RecoTokens recoTokens = new RecoTokens();
    service.recognizeUsingWebSocket(options, new BaseRecognizeCallback() {
        @Override
        public void onTranscription(SpeechRecognitionResults speechResults) {
            recoTokens.add(speechResults);
        }

        @Override
        public void onDisconnected() {
            lock.countDown();
        }
    });
    lock.await(1, TimeUnit.MINUTES);
}
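RecoTokens is a helper class from the full SDK example, used there to post-process the speaker-labeled results; its implementation is not shown in this snippet. A minimal, purely hypothetical stand-in that only buffers the incoming results would be:

// Hypothetical stand-in for the example's RecoTokens helper: it only collects
// results; the real class does more with the speaker-labeled output.
class RecoTokens {
    private final List<SpeechRecognitionResults> results = new ArrayList<>();

    public void add(SpeechRecognitionResults speechResults) {
        results.add(speechResults);
    }
}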