Search in sources:

Example 1 with TextFieldProcessor

Use of org.tribuo.data.columnar.processors.field.TextFieldProcessor in project tribuo by Oracle.

From the class JsonDataSourceTest, method buildRowProcessor.

/**
 * Assembles the {@link RowProcessor} returned to callers in this test class.
 *
 * <p>The processor is configured with:
 * <ul>
 *   <li>two metadata extractors, an {@code IntExtractor} for "id" and a
 *       {@code DateExtractor} for "timestamp";</li>
 *   <li>a {@code FieldResponseProcessor} constructed with "disposition", "UNK" and a
 *       {@code MockOutputFactory};</li>
 *   <li>field processors keyed by "height", "description" and "transport";</li>
 *   <li>one regex-keyed processor registered under the pattern "extra.*";</li>
 *   <li>a {@code null} second argument and an empty set as the final argument.</li>
 * </ul>
 *
 * @return the configured row processor
 */
private static RowProcessor<MockOutput> buildRowProcessor() {
    // Metadata extractors.
    List<FieldExtractor<?>> metadata = new ArrayList<>();
    metadata.add(new IntExtractor("id"));
    metadata.add(new DateExtractor("timestamp", "timestamp", "dd/MM/yyyy HH:mm"));

    // Response processor built from the "disposition" field with "UNK" as its second argument.
    MockOutputFactory outputFactory = new MockOutputFactory();
    ResponseProcessor<MockOutput> response = new FieldResponseProcessor<>("disposition", "UNK", outputFactory);

    // Text handling for the "description" field.
    BreakIteratorTokenizer usTokenizer = new BreakIteratorTokenizer(Locale.US);
    BasicPipeline descriptionPipeline = new BasicPipeline(usTokenizer, 2);

    // Processors looked up by exact field name.
    Map<String, FieldProcessor> byFieldName = new HashMap<>();
    byFieldName.put("height", new DoubleFieldProcessor("height"));
    byFieldName.put("description", new TextFieldProcessor("description", descriptionPipeline));
    byFieldName.put("transport", new IdentityProcessor("transport"));

    // Processors looked up by regex key.
    Map<String, FieldProcessor> byRegex = new HashMap<>();
    byRegex.put("extra.*", new DoubleFieldProcessor("regex"));

    return new RowProcessor<>(metadata, null, response, byFieldName, byRegex, Collections.emptySet());
}
Also used : TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) IntExtractor(org.tribuo.data.columnar.extractors.IntExtractor) DateExtractor(org.tribuo.data.columnar.extractors.DateExtractor) MockOutput(org.tribuo.test.MockOutput) HashMap(java.util.HashMap) MockOutputFactory(org.tribuo.test.MockOutputFactory) BasicPipeline(org.tribuo.data.text.impl.BasicPipeline) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) FieldResponseProcessor(org.tribuo.data.columnar.processors.response.FieldResponseProcessor) ArrayList(java.util.ArrayList) FieldProcessor(org.tribuo.data.columnar.FieldProcessor) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) BreakIteratorTokenizer(org.tribuo.util.tokens.impl.BreakIteratorTokenizer) FieldExtractor(org.tribuo.data.columnar.FieldExtractor) IdentityProcessor(org.tribuo.data.columnar.processors.field.IdentityProcessor) RowProcessor(org.tribuo.data.columnar.RowProcessor)

Example 2 with TextFieldProcessor

Use of org.tribuo.data.columnar.processors.field.TextFieldProcessor in project tribuo by Oracle.

From the class LIMEColumnarTest, method generateBinarisedDataset.

/**
 * Loads the columnar test CSV into a dataset using identity processors for fields
 * "A" and "D", double processors for "B" and "C", and a text processor for "TextField".
 *
 * @return a pair of the row processor used for loading and the resulting dataset
 * @throws URISyntaxException if the test resource URL cannot be converted to a URI
 */
private Pair<RowProcessor<Label>, Dataset<Label>> generateBinarisedDataset() throws URISyntaxException {
    // Per-field processors, keyed by column name.
    Map<String, FieldProcessor> processors = new HashMap<>();
    processors.put("A", new IdentityProcessor("A"));
    processors.put("B", new DoubleFieldProcessor("B"));
    processors.put("C", new DoubleFieldProcessor("C"));
    processors.put("D", new IdentityProcessor("D"));
    BasicPipeline textPipeline = new BasicPipeline(tokenizer, 2);
    processors.put("TextField", new TextFieldProcessor("TextField", textPipeline));

    // Response comes from the "Response" column; "N" is passed as the second constructor argument.
    ResponseProcessor<Label> response = new FieldResponseProcessor<>("Response", "N", new LabelFactory());
    RowProcessor<Label> rowProcessor = new RowProcessor<>(response, processors);

    // Read the CSV bundled as a classpath resource next to the test.
    String csvResource = "/org/tribuo/classification/explanations/lime/test-columnar.csv";
    CSVDataSource<Label> csv = new CSVDataSource<>(LIMEColumnarTest.class.getResource(csvResource).toURI(), rowProcessor, true);
    Dataset<Label> loaded = new MutableDataset<>(csv);

    return new Pair<>(rowProcessor, loaded);
}
Also used : TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) HashMap(java.util.HashMap) BasicPipeline(org.tribuo.data.text.impl.BasicPipeline) Label(org.tribuo.classification.Label) FieldResponseProcessor(org.tribuo.data.columnar.processors.response.FieldResponseProcessor) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) CSVDataSource(org.tribuo.data.csv.CSVDataSource) FieldProcessor(org.tribuo.data.columnar.FieldProcessor) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) LabelFactory(org.tribuo.classification.LabelFactory) IdentityProcessor(org.tribuo.data.columnar.processors.field.IdentityProcessor) RowProcessor(org.tribuo.data.columnar.RowProcessor) MutableDataset(org.tribuo.MutableDataset) Pair(com.oracle.labs.mlrg.olcut.util.Pair)

Example 3 with TextFieldProcessor

Use of org.tribuo.data.columnar.processors.field.TextFieldProcessor in project tribuo by Oracle.

From the class LIMEColumnarTest, method generateCategoricalDataset.

/**
 * Loads the columnar test CSV into a dataset, with the identity processors for fields
 * "A" and "D" overridden to report {@code GeneratedFeatureType.CATEGORICAL} as their
 * feature type. Fields "B" and "C" use double processors and "TextField" uses a text
 * processor.
 *
 * @return a pair of the row processor used for loading and the resulting dataset
 * @throws URISyntaxException if the test resource URL cannot be converted to a URI
 */
private Pair<RowProcessor<Label>, Dataset<Label>> generateCategoricalDataset() throws URISyntaxException {
    // Identity processors whose feature type is forced to CATEGORICAL.
    FieldProcessor categoricalA = new IdentityProcessor("A") {

        @Override
        public GeneratedFeatureType getFeatureType() {
            return GeneratedFeatureType.CATEGORICAL;
        }
    };
    FieldProcessor categoricalD = new IdentityProcessor("D") {

        @Override
        public GeneratedFeatureType getFeatureType() {
            return GeneratedFeatureType.CATEGORICAL;
        }
    };
    BasicPipeline textPipeline = new BasicPipeline(tokenizer, 2);

    // Per-field processors, keyed by column name.
    Map<String, FieldProcessor> processors = new HashMap<>();
    processors.put("A", categoricalA);
    processors.put("B", new DoubleFieldProcessor("B"));
    processors.put("C", new DoubleFieldProcessor("C"));
    processors.put("D", categoricalD);
    processors.put("TextField", new TextFieldProcessor("TextField", textPipeline));

    // Response comes from the "Response" column; "N" is passed as the second constructor argument.
    ResponseProcessor<Label> response = new FieldResponseProcessor<>("Response", "N", new LabelFactory());
    RowProcessor<Label> rowProcessor = new RowProcessor<>(response, processors);

    // Read the CSV bundled as a classpath resource next to the test.
    String csvResource = "/org/tribuo/classification/explanations/lime/test-columnar.csv";
    CSVDataSource<Label> csv = new CSVDataSource<>(LIMEColumnarTest.class.getResource(csvResource).toURI(), rowProcessor, true);
    Dataset<Label> loaded = new MutableDataset<>(csv);

    return new Pair<>(rowProcessor, loaded);
}
Also used : TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) HashMap(java.util.HashMap) BasicPipeline(org.tribuo.data.text.impl.BasicPipeline) Label(org.tribuo.classification.Label) FieldResponseProcessor(org.tribuo.data.columnar.processors.response.FieldResponseProcessor) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) CSVDataSource(org.tribuo.data.csv.CSVDataSource) FieldProcessor(org.tribuo.data.columnar.FieldProcessor) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) LabelFactory(org.tribuo.classification.LabelFactory) IdentityProcessor(org.tribuo.data.columnar.processors.field.IdentityProcessor) RowProcessor(org.tribuo.data.columnar.RowProcessor) MutableDataset(org.tribuo.MutableDataset) Pair(com.oracle.labs.mlrg.olcut.util.Pair)

Example 4 with TextFieldProcessor

Use of org.tribuo.data.columnar.processors.field.TextFieldProcessor in project tribuo by Oracle.

From the class RowProcessorTest, method replaceNewlinesWithSpacesTest.

/**
 * Processes the same row twice, once with the trailing boolean {@link RowProcessor}
 * constructor argument set to {@code false} and once with it set to {@code true}, and
 * checks that the two runs yield the different feature sets asserted below.
 */
@Test
public void replaceNewlinesWithSpacesTest() {
    // Text munger: splits on runs of blank lines and rejoins the pieces with " *\n\n".
    final Pattern blankLines = Pattern.compile("(\n[\\s-]*\n)+");
    final Function<CharSequence, CharSequence> munger = text ->
            (text == null || text.length() == 0)
                    ? text
                    : blankLines.splitAsStream(text).collect(Collectors.joining(" *\n\n"));

    // Text field processing: munging tokenizer feeding a token pipeline.
    Tokenizer mungingTokenizer = new MungingTokenizer(new BreakIteratorTokenizer(Locale.US), munger);
    TokenPipeline pipeline = new TokenPipeline(mungingTokenizer, 2, false);
    final Map<String, FieldProcessor> processors = new HashMap<>();
    processors.put("order_text", new TextFieldProcessor("order_text", pipeline));
    MockResponseProcessor responseProcessor = new MockResponseProcessor("Label");

    // The single input row; the text field contains a run of newlines.
    Map<String, String> inputRow = new HashMap<>();
    inputRow.put("order_text", "Jimmy\n\n\n\nHoffa");
    inputRow.put("Label", "Sheep");

    // First pass: trailing constructor flag is false.
    RowProcessor<MockOutput> rowProcessor = new RowProcessor<>(Collections.emptyList(), null, responseProcessor, processors, Collections.emptyMap(), Collections.emptySet(), false);
    Example<MockOutput> extracted = rowProcessor.generateExample(inputRow, true).get();
    assertEquals(5, extracted.size());
    assertEquals("Sheep", extracted.getOutput().label);
    Iterator<Feature> features = extracted.iterator();
    Feature head = features.next();
    assertEquals("order_text@1-N=*", head.getName());
    assertEquals(1.0, head.getValue());
    // Remaining features, in iteration order; only their names are checked.
    String[] restWithFlagOff = {
        "order_text@1-N=Hoffa",
        "order_text@1-N=Jimmy",
        "order_text@2-N=*/Hoffa",
        "order_text@2-N=Jimmy/*"
    };
    for (String expectedName : restWithFlagOff) {
        assertEquals(expectedName, features.next().getName());
    }
    assertFalse(features.hasNext());

    // Second pass: same row with the trailing constructor flag set to true (noted in the
    // original test as the default); the "*" features are expected to disappear.
    rowProcessor = new RowProcessor<>(Collections.emptyList(), null, responseProcessor, processors, Collections.emptyMap(), Collections.emptySet(), true);
    extracted = rowProcessor.generateExample(inputRow, true).get();
    assertEquals(3, extracted.size());
    assertEquals("Sheep", extracted.getOutput().label);
    features = extracted.iterator();
    head = features.next();
    assertEquals("order_text@1-N=Hoffa", head.getName());
    assertEquals(1.0, head.getValue());
    String[] restWithFlagOn = {
        "order_text@1-N=Jimmy",
        "order_text@2-N=Jimmy/Hoffa"
    };
    for (String expectedName : restWithFlagOn) {
        assertEquals(expectedName, features.next().getName());
    }
    assertFalse(features.hasNext());
}
Also used : Pattern(java.util.regex.Pattern) TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) MockOutput(org.tribuo.test.MockOutput) HashMap(java.util.HashMap) Feature(org.tribuo.Feature) BreakIteratorTokenizer(org.tribuo.util.tokens.impl.BreakIteratorTokenizer) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) TokenPipeline(org.tribuo.data.text.impl.TokenPipeline) Tokenizer(org.tribuo.util.tokens.Tokenizer) BreakIteratorTokenizer(org.tribuo.util.tokens.impl.BreakIteratorTokenizer) Test(org.junit.jupiter.api.Test)

Aggregations

HashMap (java.util.HashMap)4 DoubleFieldProcessor (org.tribuo.data.columnar.processors.field.DoubleFieldProcessor)4 TextFieldProcessor (org.tribuo.data.columnar.processors.field.TextFieldProcessor)4 FieldProcessor (org.tribuo.data.columnar.FieldProcessor)3 RowProcessor (org.tribuo.data.columnar.RowProcessor)3 IdentityProcessor (org.tribuo.data.columnar.processors.field.IdentityProcessor)3 FieldResponseProcessor (org.tribuo.data.columnar.processors.response.FieldResponseProcessor)3 BasicPipeline (org.tribuo.data.text.impl.BasicPipeline)3 Pair (com.oracle.labs.mlrg.olcut.util.Pair)2 MutableDataset (org.tribuo.MutableDataset)2 Label (org.tribuo.classification.Label)2 LabelFactory (org.tribuo.classification.LabelFactory)2 CSVDataSource (org.tribuo.data.csv.CSVDataSource)2 MockOutput (org.tribuo.test.MockOutput)2 BreakIteratorTokenizer (org.tribuo.util.tokens.impl.BreakIteratorTokenizer)2 ArrayList (java.util.ArrayList)1 Pattern (java.util.regex.Pattern)1 Test (org.junit.jupiter.api.Test)1 Feature (org.tribuo.Feature)1 FieldExtractor (org.tribuo.data.columnar.FieldExtractor)1