Search in sources:

Example 1 with IdentityProcessor

use of org.tribuo.data.columnar.processors.field.IdentityProcessor in project tribuo by oracle.

Found in class JsonDataSourceTest, method buildRowProcessor.

/**
 * Assembles the {@link RowProcessor} used by this test class.
 *
 * <p>The processor is wired with:
 * <ul>
 *   <li>two metadata extractors, for the "id" and "timestamp" fields;</li>
 *   <li>field processors for "height", "description" and "transport";</li>
 *   <li>one regex-keyed processor registered under the pattern {@code extra.*};</li>
 *   <li>a response processor reading the "disposition" field with "UNK" as its second argument.</li>
 * </ul>
 *
 * @return a freshly constructed row processor producing {@code MockOutput}s.
 */
private static RowProcessor<MockOutput> buildRowProcessor() {
    // Metadata side: an integer id and a timestamp parsed with the given date pattern.
    List<FieldExtractor<?>> metadata = new ArrayList<>();
    metadata.add(new IntExtractor("id"));
    metadata.add(new DateExtractor("timestamp", "timestamp", "dd/MM/yyyy HH:mm"));

    // Processors bound to exact field names.
    BasicPipeline descriptionPipeline = new BasicPipeline(new BreakIteratorTokenizer(Locale.US), 2);
    Map<String, FieldProcessor> byFieldName = new HashMap<>();
    byFieldName.put("height", new DoubleFieldProcessor("height"));
    byFieldName.put("description", new TextFieldProcessor("description", descriptionPipeline));
    byFieldName.put("transport", new IdentityProcessor("transport"));

    // Processors bound to a field-name pattern rather than a single name.
    Map<String, FieldProcessor> byFieldPattern = new HashMap<>();
    byFieldPattern.put("extra.*", new DoubleFieldProcessor("regex"));

    ResponseProcessor<MockOutput> response =
            new FieldResponseProcessor<>("disposition", "UNK", new MockOutputFactory());

    // NOTE(review): the null second argument is presumably "no weight extractor" — confirm
    // against the RowProcessor constructor.
    return new RowProcessor<>(metadata, null, response, byFieldName, byFieldPattern, Collections.emptySet());
}
Also used : TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) IntExtractor(org.tribuo.data.columnar.extractors.IntExtractor) DateExtractor(org.tribuo.data.columnar.extractors.DateExtractor) MockOutput(org.tribuo.test.MockOutput) HashMap(java.util.HashMap) MockOutputFactory(org.tribuo.test.MockOutputFactory) BasicPipeline(org.tribuo.data.text.impl.BasicPipeline) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) FieldResponseProcessor(org.tribuo.data.columnar.processors.response.FieldResponseProcessor) ArrayList(java.util.ArrayList) FieldProcessor(org.tribuo.data.columnar.FieldProcessor) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) BreakIteratorTokenizer(org.tribuo.util.tokens.impl.BreakIteratorTokenizer) FieldExtractor(org.tribuo.data.columnar.FieldExtractor) IdentityProcessor(org.tribuo.data.columnar.processors.field.IdentityProcessor) RowProcessor(org.tribuo.data.columnar.RowProcessor)

Example 2 with IdentityProcessor

use of org.tribuo.data.columnar.processors.field.IdentityProcessor in project tribuo by oracle.

Found in class LIMEColumnarTest, method generateBinarisedDataset.

/**
 * Loads the columnar test CSV into a dataset, using plain {@code IdentityProcessor}s for
 * columns "A" and "D".
 *
 * <p>Columns "B" and "C" go through {@code DoubleFieldProcessor}, and "TextField" goes through
 * a {@code TextFieldProcessor} backed by a {@code BasicPipeline} built from this class's
 * {@code tokenizer}. The response is read from the "Response" column.
 *
 * @return the row processor together with the dataset it produced.
 * @throws URISyntaxException if the test resource URL cannot be converted to a URI.
 */
private Pair<RowProcessor<Label>, Dataset<Label>> generateBinarisedDataset() throws URISyntaxException {
    // Per-column feature processors.
    BasicPipeline textPipeline = new BasicPipeline(tokenizer, 2);
    Map<String, FieldProcessor> columnProcessors = new HashMap<>();
    columnProcessors.put("A", new IdentityProcessor("A"));
    columnProcessors.put("B", new DoubleFieldProcessor("B"));
    columnProcessors.put("C", new DoubleFieldProcessor("C"));
    columnProcessors.put("D", new IdentityProcessor("D"));
    columnProcessors.put("TextField", new TextFieldProcessor("TextField", textPipeline));

    // Response column handling.
    ResponseProcessor<Label> labelProcessor =
            new FieldResponseProcessor<>("Response", "N", new LabelFactory());

    RowProcessor<Label> rowProcessor = new RowProcessor<>(labelProcessor, columnProcessors);

    // Read the bundled CSV through the row processor and materialise it.
    CSVDataSource<Label> csvSource = new CSVDataSource<>(
            LIMEColumnarTest.class.getResource("/org/tribuo/classification/explanations/lime/test-columnar.csv").toURI(),
            rowProcessor,
            true);
    Dataset<Label> loaded = new MutableDataset<>(csvSource);

    return new Pair<>(rowProcessor, loaded);
}
Also used : TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) HashMap(java.util.HashMap) BasicPipeline(org.tribuo.data.text.impl.BasicPipeline) Label(org.tribuo.classification.Label) FieldResponseProcessor(org.tribuo.data.columnar.processors.response.FieldResponseProcessor) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) CSVDataSource(org.tribuo.data.csv.CSVDataSource) FieldProcessor(org.tribuo.data.columnar.FieldProcessor) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) LabelFactory(org.tribuo.classification.LabelFactory) IdentityProcessor(org.tribuo.data.columnar.processors.field.IdentityProcessor) RowProcessor(org.tribuo.data.columnar.RowProcessor) MutableDataset(org.tribuo.MutableDataset) Pair(com.oracle.labs.mlrg.olcut.util.Pair)

Example 3 with IdentityProcessor

use of org.tribuo.data.columnar.processors.field.IdentityProcessor in project tribuo by oracle.

Found in class LIMEColumnarTest, method generateCategoricalDataset.

/**
 * Loads the columnar test CSV into a dataset, with columns "A" and "D" handled by
 * {@code IdentityProcessor} subclasses that report {@code GeneratedFeatureType.CATEGORICAL}.
 *
 * <p>Columns "B" and "C" go through {@code DoubleFieldProcessor}, and "TextField" goes through
 * a {@code TextFieldProcessor} backed by a {@code BasicPipeline} built from this class's
 * {@code tokenizer}. The response is read from the "Response" column.
 *
 * @return the row processor together with the dataset it produced.
 * @throws URISyntaxException if the test resource URL cannot be converted to a URI.
 */
private Pair<RowProcessor<Label>, Dataset<Label>> generateCategoricalDataset() throws URISyntaxException {
    // Identity processors whose reported feature type is overridden to CATEGORICAL.
    FieldProcessor categoricalA = new IdentityProcessor("A") {

        @Override
        public GeneratedFeatureType getFeatureType() {
            return GeneratedFeatureType.CATEGORICAL;
        }
    };
    FieldProcessor categoricalD = new IdentityProcessor("D") {

        @Override
        public GeneratedFeatureType getFeatureType() {
            return GeneratedFeatureType.CATEGORICAL;
        }
    };

    // Per-column feature processors.
    BasicPipeline textPipeline = new BasicPipeline(tokenizer, 2);
    Map<String, FieldProcessor> columnProcessors = new HashMap<>();
    columnProcessors.put("A", categoricalA);
    columnProcessors.put("B", new DoubleFieldProcessor("B"));
    columnProcessors.put("C", new DoubleFieldProcessor("C"));
    columnProcessors.put("D", categoricalD);
    columnProcessors.put("TextField", new TextFieldProcessor("TextField", textPipeline));

    // Response column handling.
    ResponseProcessor<Label> labelProcessor =
            new FieldResponseProcessor<>("Response", "N", new LabelFactory());

    RowProcessor<Label> rowProcessor = new RowProcessor<>(labelProcessor, columnProcessors);

    // Read the bundled CSV through the row processor and materialise it.
    CSVDataSource<Label> csvSource = new CSVDataSource<>(
            LIMEColumnarTest.class.getResource("/org/tribuo/classification/explanations/lime/test-columnar.csv").toURI(),
            rowProcessor,
            true);
    Dataset<Label> loaded = new MutableDataset<>(csvSource);

    return new Pair<>(rowProcessor, loaded);
}
Also used : TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) HashMap(java.util.HashMap) BasicPipeline(org.tribuo.data.text.impl.BasicPipeline) Label(org.tribuo.classification.Label) FieldResponseProcessor(org.tribuo.data.columnar.processors.response.FieldResponseProcessor) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) CSVDataSource(org.tribuo.data.csv.CSVDataSource) FieldProcessor(org.tribuo.data.columnar.FieldProcessor) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) LabelFactory(org.tribuo.classification.LabelFactory) IdentityProcessor(org.tribuo.data.columnar.processors.field.IdentityProcessor) RowProcessor(org.tribuo.data.columnar.RowProcessor) MutableDataset(org.tribuo.MutableDataset) Pair(com.oracle.labs.mlrg.olcut.util.Pair)

Example 4 with IdentityProcessor

use of org.tribuo.data.columnar.processors.field.IdentityProcessor in project tribuo by oracle.

Found in class RowProcessorTest, method testInvalidRegexMapping.

/**
 * Verifies that three different invalid regex mappings are each rejected with an
 * {@link IllegalArgumentException} when a {@code RowProcessor} is built from them and
 * {@code expandRegexMapping} is called with the field names below.
 *
 * <p>NOTE(review): the reason each mapping counts as invalid is inferred from the field
 * names, not visible in this test — confirm against {@code RowProcessor.expandRegexMapping}.
 */
@Test
public void testInvalidRegexMapping() {
    List<String> fieldNames = Arrays.asList("Armadillos", "Armadas", "Archery", "Battleship", "Battles", "Carrots", "Label");
    Map<String, FieldProcessor> fixed = new HashMap<>();
    fixed.put("Battles", new IdentityProcessor("Battles"));
    Map<String, FieldProcessor> regex = new HashMap<>();

    // Case 1: includes the pattern "Monkeys" — presumably invalid because no field matches it.
    regex.put("Arma*", new IdentityProcessor("Arma*"));
    regex.put("Monkeys", new IdentityProcessor("Monkeys"));
    assertRegexMappingRejected(fieldNames, fixed, regex);

    // Case 2: "Battle*" — presumably invalid because it collides with the fixed "Battles" mapping.
    regex.clear();
    regex.put("Battle*", new IdentityProcessor("Battle*"));
    assertRegexMappingRejected(fieldNames, fixed, regex);

    // Case 3: "Arm*" and "Armadil*" — presumably invalid because two patterns match the same field.
    regex.clear();
    regex.put("Arm*", new IdentityProcessor("Arm*"));
    regex.put("Armadil*", new IdentityProcessor("Armadil*"));
    assertRegexMappingRejected(fieldNames, fixed, regex);
}

/**
 * Builds a {@code RowProcessor} from the supplied mappings, expands the regex mapping against
 * {@code fieldNames}, and fails the test unless an {@link IllegalArgumentException} is thrown.
 *
 * <p>Any other exception is attached as the cause of the failure so the test report shows
 * what was actually thrown.
 *
 * @param fieldNames the field names to expand the regex mapping against.
 * @param fixed      the fixed (exact name) field processors.
 * @param regex      the regex-keyed field processors expected to be rejected.
 */
private void assertRegexMappingRejected(List<String> fieldNames,
                                        Map<String, FieldProcessor> fixed,
                                        Map<String, FieldProcessor> regex) {
    try {
        RowProcessor<MockOutput> rowProcessor = new RowProcessor<>(Collections.emptyList(), null, new MockResponseProcessor("Label"), fixed, regex, new HashSet<>());
        rowProcessor.expandRegexMapping(fieldNames);
        fail("Should have thrown an IllegalArgumentException");
    } catch (IllegalArgumentException expected) {
        // pass: this is the exception the test is looking for.
    } catch (Exception e) {
        // Keep the unexpected exception as the cause instead of discarding it.
        fail("Incorrect exception thrown.", e);
    }
}
Also used : MockOutput(org.tribuo.test.MockOutput) HashMap(java.util.HashMap) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) PropertyException(com.oracle.labs.mlrg.olcut.config.PropertyException) IdentityProcessor(org.tribuo.data.columnar.processors.field.IdentityProcessor) Test(org.junit.jupiter.api.Test)

Example 5 with IdentityProcessor

use of org.tribuo.data.columnar.processors.field.IdentityProcessor in project tribuo by oracle.

Found in class RowProcessorTest, method metadataExtractorTest.

/**
 * Exercises a {@code RowProcessor} configured with metadata extractors and a weight extractor.
 *
 * <p>The test checks the features, output and weight of the generated example, then the
 * extracted metadata values and their declared types. It ends by checking that constructing
 * a {@code RowProcessor} from a list containing two extractors that both use the metadata
 * name "ID" throws a {@code PropertyException}.
 */
@Test
public void metadataExtractorTest() {
    // Input row, keyed by column name.
    Map<String, String> inputRow = new HashMap<>();
    inputRow.put("Armadillos", "1");
    inputRow.put("Armadas", "2");
    inputRow.put("Archery", "3");
    inputRow.put("Battleship", "4");
    inputRow.put("Battles", "5");
    inputRow.put("Carrots", "20010506");
    inputRow.put("Carrot-time", "14/10/2020 16:07+01");
    inputRow.put("Mass", "9000");
    inputRow.put("Label", "Sheep");

    // Feature processors: only "Battles" and "Armadas" become features.
    Map<String, FieldProcessor> featureProcessors = new HashMap<>();
    featureProcessors.put("Battles", new IdentityProcessor("Battles"));
    featureProcessors.put("Armadas", new DoubleFieldProcessor("Armadas"));

    // Metadata extractors; "Armadillos" is read twice under two different metadata names.
    List<FieldExtractor<?>> extractors = new ArrayList<>();
    extractors.add(new IdentityExtractor("Armadillos", Example.NAME));
    extractors.add(new IntExtractor("Armadillos", "ID"));
    extractors.add(new DateExtractor("Carrots", "Date", "uuuuMMdd"));
    extractors.add(new OffsetDateTimeExtractor("Carrot-time", "OffsetDateTime", "dd/MM/yyyy HH:mmx"));

    FloatExtractor massAsWeight = new FloatExtractor("Mass");
    MockResponseProcessor labelProcessor = new MockResponseProcessor("Label");

    RowProcessor<MockOutput> rowProcessor =
            new RowProcessor<>(extractors, massAsWeight, labelProcessor, featureProcessors, Collections.emptySet());
    Example<MockOutput> generated = rowProcessor.generateExample(inputRow, true).get();

    // The example carries exactly the two configured features plus the label.
    assertEquals(2, generated.size());
    assertEquals("Sheep", generated.getOutput().label);
    Iterator<Feature> features = generated.iterator();
    Feature current = features.next();
    assertEquals("Armadas@value", current.getName());
    assertEquals(2.0, current.getValue());
    current = features.next();
    assertEquals("Battles@5", current.getName());
    assertEquals(IdentityProcessor.FEATURE_VALUE, current.getValue());
    assertEquals(9000f, generated.getWeight());

    // The metadata values match what the extractors should have parsed.
    Map<String, Object> extractedMetadata = generated.getMetadata();
    assertEquals(4, extractedMetadata.size());
    assertEquals("1", extractedMetadata.get(Example.NAME));
    assertEquals(1, extractedMetadata.get("ID"));
    assertEquals(LocalDate.of(2001, 5, 6), extractedMetadata.get("Date"));
    OffsetDateTime expectedTimestamp =
            OffsetDateTime.of(LocalDate.of(2020, 10, 14), LocalTime.of(16, 7), ZoneOffset.ofHours(1));
    assertEquals(expectedTimestamp, extractedMetadata.get("OffsetDateTime"));

    // The processor reports the value type of each metadata entry.
    Map<String, Class<?>> declaredTypes = rowProcessor.getMetadataTypes();
    assertEquals(4, declaredTypes.size());
    assertEquals(String.class, declaredTypes.get(Example.NAME));
    assertEquals(Integer.class, declaredTypes.get("ID"));
    assertEquals(LocalDate.class, declaredTypes.get("Date"));
    assertEquals(OffsetDateTime.class, declaredTypes.get("OffsetDateTime"));

    // Two extractors writing to the same metadata name ("ID") must be rejected with a
    // PropertyException at construction time.
    List<FieldExtractor<?>> clashingExtractors = new ArrayList<>();
    clashingExtractors.add(new IdentityExtractor("Armadillos", Example.NAME));
    clashingExtractors.add(new IntExtractor("Armadillos", "ID"));
    clashingExtractors.add(new DateExtractor("Carrots", "ID", "uuuuMMdd"));
    assertThrows(PropertyException.class,
            () -> new RowProcessor<>(clashingExtractors, massAsWeight, labelProcessor, featureProcessors, Collections.emptySet()));
}
Also used : IntExtractor(org.tribuo.data.columnar.extractors.IntExtractor) MockOutput(org.tribuo.test.MockOutput) HashMap(java.util.HashMap) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) ArrayList(java.util.ArrayList) FloatExtractor(org.tribuo.data.columnar.extractors.FloatExtractor) Feature(org.tribuo.Feature) DoubleFieldProcessor(org.tribuo.data.columnar.processors.field.DoubleFieldProcessor) TextFieldProcessor(org.tribuo.data.columnar.processors.field.TextFieldProcessor) IdentityProcessor(org.tribuo.data.columnar.processors.field.IdentityProcessor) IdentityExtractor(org.tribuo.data.columnar.extractors.IdentityExtractor) DateExtractor(org.tribuo.data.columnar.extractors.DateExtractor) OffsetDateTimeExtractor(org.tribuo.data.columnar.extractors.OffsetDateTimeExtractor) Test(org.junit.jupiter.api.Test)

Aggregations

IdentityProcessor (org.tribuo.data.columnar.processors.field.IdentityProcessor): 7; HashMap (java.util.HashMap): 6; DoubleFieldProcessor (org.tribuo.data.columnar.processors.field.DoubleFieldProcessor): 5; TextFieldProcessor (org.tribuo.data.columnar.processors.field.TextFieldProcessor): 5; FieldProcessor (org.tribuo.data.columnar.FieldProcessor): 4; FieldResponseProcessor (org.tribuo.data.columnar.processors.response.FieldResponseProcessor): 4; MockOutput (org.tribuo.test.MockOutput): 4; RowProcessor (org.tribuo.data.columnar.RowProcessor): 3; BasicPipeline (org.tribuo.data.text.impl.BasicPipeline): 3; Pair (com.oracle.labs.mlrg.olcut.util.Pair): 2; ArrayList (java.util.ArrayList): 2; BeforeEach (org.junit.jupiter.api.BeforeEach): 2; Test (org.junit.jupiter.api.Test): 2; MutableDataset (org.tribuo.MutableDataset): 2; Label (org.tribuo.classification.Label): 2; LabelFactory (org.tribuo.classification.LabelFactory): 2; DateExtractor (org.tribuo.data.columnar.extractors.DateExtractor): 2; IntExtractor (org.tribuo.data.columnar.extractors.IntExtractor): 2; CSVDataSource (org.tribuo.data.csv.CSVDataSource): 2; MockOutputFactory (org.tribuo.test.MockOutputFactory): 2