Search in sources :

Example 1 with TextEmbedding

Use of ai.djl.modality.nlp.embedding.TextEmbedding in project djl by deepjavalibrary.

The method getDataset of the class TrainSeq2Seq.

/**
 * Builds and prepares the Tatoeba English-French dataset for the requested usage.
 *
 * <p>The sample limit is {@code arguments.getLimit()} when {@code usage} is
 * {@code Dataset.Usage.TRAIN} and one tenth of that value for any other usage. Source text is
 * tokenized, lower-cased with the English locale, punctuation-separated and truncated to 10;
 * target text goes through the same steps with the French locale, is truncated to 8 and then
 * passed through a {@code TextTerminator}.
 *
 * @param usage the dataset split to load
 * @param arguments supplies the batch size and the sample limit
 * @param sourceEmbedding embedding for the source text, or {@code null} to configure an
 *     embedding size of 32 instead
 * @param targetEmbedding embedding for the target text, or {@code null} to configure an
 *     embedding size of 32 instead
 * @return the dataset, after {@code prepare} has been invoked on it
 * @throws IOException propagated from building or preparing the dataset
 * @throws TranslateException propagated from building or preparing the dataset
 */
public static TextDataset getDataset(Dataset.Usage usage, Arguments arguments, TextEmbedding sourceEmbedding, TextEmbedding targetEmbedding) throws IOException, TranslateException {
    // Anything other than the training split only gets a tenth of the configured limit.
    long limit = arguments.getLimit();
    if (usage != Dataset.Usage.TRAIN) {
        limit /= 10;
    }

    // Data and labels use the same padding setup; only the pad value differs (zeros vs. ones).
    // NOTE(review): the trailing 10 presumably is the padded length — confirm against addPad.
    PaddingStackBatchifier dataBatchifier = PaddingStackBatchifier.builder()
            .optIncludeValidLengths(true)
            .addPad(0, 0, manager -> manager.zeros(new Shape(1)), 10)
            .build();
    PaddingStackBatchifier labelBatchifier = PaddingStackBatchifier.builder()
            .optIncludeValidLengths(true)
            .addPad(0, 0, manager -> manager.ones(new Shape(1)), 10)
            .build();

    TatoebaEnglishFrenchDataset.Builder datasetBuilder = TatoebaEnglishFrenchDataset.builder()
            .setSampling(arguments.getBatchSize(), true, false)
            .optDataBatchifier(dataBatchifier)
            .optLabelBatchifier(labelBatchifier)
            .optUsage(usage)
            .optPrefetchNumber(8)
            .optLimit(limit);

    Configuration sourceConfig = new Configuration().setTextProcessors(Arrays.asList(
            new SimpleTokenizer(),
            new LowerCaseConvertor(Locale.ENGLISH),
            new PunctuationSeparator(),
            new TextTruncator(10)));
    Configuration targetConfig = new Configuration().setTextProcessors(Arrays.asList(
            new SimpleTokenizer(),
            new LowerCaseConvertor(Locale.FRENCH),
            new PunctuationSeparator(),
            new TextTruncator(8),
            new TextTerminator()));

    // Without a caller-supplied embedding, fall back to an embedding size of 32.
    if (sourceEmbedding == null) {
        sourceConfig.setEmbeddingSize(32);
    } else {
        sourceConfig.setTextEmbedding(sourceEmbedding);
    }
    if (targetEmbedding == null) {
        targetConfig.setEmbeddingSize(32);
    } else {
        targetConfig.setTextEmbedding(targetEmbedding);
    }

    TatoebaEnglishFrenchDataset corpus = datasetBuilder
            .setSourceConfiguration(sourceConfig)
            .setTargetConfiguration(targetConfig)
            .build();
    corpus.prepare(new ProgressBar());
    return corpus;
}
Also used : Metrics(ai.djl.metric.Metrics) Arrays(java.util.Arrays) TextTruncator(ai.djl.modality.nlp.preprocess.TextTruncator) MaskedSoftmaxCrossEntropyLoss(ai.djl.training.loss.MaskedSoftmaxCrossEntropyLoss) Shape(ai.djl.ndarray.types.Shape) ProgressBar(ai.djl.training.util.ProgressBar) Block(ai.djl.nn.Block) Configuration(ai.djl.basicdataset.utils.TextData.Configuration) LSTM(ai.djl.nn.recurrent.LSTM) TextDataset(ai.djl.basicdataset.nlp.TextDataset) TranslateException(ai.djl.translate.TranslateException) TrainingListener(ai.djl.training.listener.TrainingListener) TatoebaEnglishFrenchDataset(ai.djl.basicdataset.nlp.TatoebaEnglishFrenchDataset) DefaultTrainingConfig(ai.djl.training.DefaultTrainingConfig) Locale(java.util.Locale) LowerCaseConvertor(ai.djl.modality.nlp.preprocess.LowerCaseConvertor) Trainer(ai.djl.training.Trainer) Engine(ai.djl.engine.Engine) PaddingStackBatchifier(ai.djl.translate.PaddingStackBatchifier) SaveModelTrainingListener(ai.djl.training.listener.SaveModelTrainingListener) ExecutorService(java.util.concurrent.ExecutorService) Model(ai.djl.Model) SimpleTextEncoder(ai.djl.basicmodelzoo.nlp.SimpleTextEncoder) Arguments(ai.djl.examples.training.util.Arguments) Accuracy(ai.djl.training.evaluator.Accuracy) EasyTrain(ai.djl.training.EasyTrain) IOException(java.io.IOException) EncoderDecoder(ai.djl.modality.nlp.EncoderDecoder) TextTerminator(ai.djl.modality.nlp.preprocess.TextTerminator) Executors(java.util.concurrent.Executors) Dataset(ai.djl.training.dataset.Dataset) SimpleTextDecoder(ai.djl.basicmodelzoo.nlp.SimpleTextDecoder) TextEmbedding(ai.djl.modality.nlp.embedding.TextEmbedding) PunctuationSeparator(ai.djl.modality.nlp.preprocess.PunctuationSeparator) TrainableTextEmbedding(ai.djl.modality.nlp.embedding.TrainableTextEmbedding) TrainingResult(ai.djl.training.TrainingResult) SimpleTokenizer(ai.djl.modality.nlp.preprocess.SimpleTokenizer) PunctuationSeparator(ai.djl.modality.nlp.preprocess.PunctuationSeparator) 
Shape(ai.djl.ndarray.types.Shape) Configuration(ai.djl.basicdataset.utils.TextData.Configuration) LowerCaseConvertor(ai.djl.modality.nlp.preprocess.LowerCaseConvertor) SimpleTokenizer(ai.djl.modality.nlp.preprocess.SimpleTokenizer) TextTerminator(ai.djl.modality.nlp.preprocess.TextTerminator) TatoebaEnglishFrenchDataset(ai.djl.basicdataset.nlp.TatoebaEnglishFrenchDataset) TextTruncator(ai.djl.modality.nlp.preprocess.TextTruncator) ProgressBar(ai.djl.training.util.ProgressBar)

Aggregations

Model (ai.djl.Model)1 TatoebaEnglishFrenchDataset (ai.djl.basicdataset.nlp.TatoebaEnglishFrenchDataset)1 TextDataset (ai.djl.basicdataset.nlp.TextDataset)1 Configuration (ai.djl.basicdataset.utils.TextData.Configuration)1 SimpleTextDecoder (ai.djl.basicmodelzoo.nlp.SimpleTextDecoder)1 SimpleTextEncoder (ai.djl.basicmodelzoo.nlp.SimpleTextEncoder)1 Engine (ai.djl.engine.Engine)1 Arguments (ai.djl.examples.training.util.Arguments)1 Metrics (ai.djl.metric.Metrics)1 EncoderDecoder (ai.djl.modality.nlp.EncoderDecoder)1 TextEmbedding (ai.djl.modality.nlp.embedding.TextEmbedding)1 TrainableTextEmbedding (ai.djl.modality.nlp.embedding.TrainableTextEmbedding)1 LowerCaseConvertor (ai.djl.modality.nlp.preprocess.LowerCaseConvertor)1 PunctuationSeparator (ai.djl.modality.nlp.preprocess.PunctuationSeparator)1 SimpleTokenizer (ai.djl.modality.nlp.preprocess.SimpleTokenizer)1 TextTerminator (ai.djl.modality.nlp.preprocess.TextTerminator)1 TextTruncator (ai.djl.modality.nlp.preprocess.TextTruncator)1 Shape (ai.djl.ndarray.types.Shape)1 Block (ai.djl.nn.Block)1 LSTM (ai.djl.nn.recurrent.LSTM)1