Search in sources :

Example 1 with ColumnarFeature

use of org.tribuo.data.columnar.ColumnarFeature in project tribuo by oracle.

the class UniqueProcessor method process.

/**
 * Collapses features that share a name into a single feature per name.
 * <p>
 * Features are bucketed by {@code getName()} in first-seen order, and each
 * bucket is reduced with {@code uniqueList} using the configured reduction type.
 *
 * @param features The features to reduce.
 * @return The input list itself when it is empty, otherwise a new list holding
 *         one reduced feature per distinct name.
 */
@Override
public List<ColumnarFeature> process(List<ColumnarFeature> features) {
    // Nothing to reduce: hand the caller's list straight back.
    if (features.isEmpty()) {
        return features;
    }

    // LinkedHashMap keeps the buckets in the order the names were first encountered.
    Map<String, List<ColumnarFeature>> bucketsByName = new LinkedHashMap<>();
    for (ColumnarFeature feature : features) {
        String name = feature.getName();
        List<ColumnarFeature> bucket = bucketsByName.get(name);
        if (bucket == null) {
            bucket = new ArrayList<>();
            bucketsByName.put(name, bucket);
        }
        bucket.add(feature);
    }

    // Reduce every bucket down to a single feature.
    List<ColumnarFeature> reduced = new ArrayList<>();
    for (List<ColumnarFeature> bucket : bucketsByName.values()) {
        reduced.add(uniqueList(reductionType, bucket));
    }
    return reduced;
}
Also used : LinkedHashMap(java.util.LinkedHashMap) List(java.util.List) ConfiguredObjectProvenanceImpl(com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl) ColumnarFeature(org.tribuo.data.columnar.ColumnarFeature) Map(java.util.Map) Config(com.oracle.labs.mlrg.olcut.config.Config) ConfiguredObjectProvenance(com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance) FeatureProcessor(org.tribuo.data.columnar.FeatureProcessor) Comparator(java.util.Comparator) ArrayList(java.util.ArrayList)

Example 2 with ColumnarFeature

use of org.tribuo.data.columnar.ColumnarFeature in project tribuo by oracle.

the class DateFieldProcessor method process.

/**
 * Parses the supplied value as a date and emits one feature per configured
 * date feature type.
 *
 * @param value The raw field value to parse with this processor's formatter.
 * @return One feature per configured feature type, or an empty list when the
 *         value cannot be parsed (two warnings are logged in that case).
 */
@Override
public List<ColumnarFeature> process(String value) {
    try {
        LocalDate parsedDate = LocalDate.parse(value, formatter);
        List<ColumnarFeature> extracted = new ArrayList<>(featureTypes.size());
        for (DateFeatureType type : featureTypes) {
            // The feature is keyed by the field name plus the feature type's string form.
            int extractedValue = type.extract(parsedDate);
            extracted.add(new ColumnarFeature(fieldName, type.toString(), extractedValue));
        }
        return extracted;
    } catch (DateTimeParseException e) {
        // Unparseable input is reported and skipped rather than propagated.
        logger.log(Level.WARNING, e.getParsedString());
        String message = String.format("Unable to parse date %s with formatter %s", value, formatter.toString());
        logger.log(Level.WARNING, message);
        return Collections.emptyList();
    }
}
Also used : DateTimeParseException(java.time.format.DateTimeParseException) ColumnarFeature(org.tribuo.data.columnar.ColumnarFeature) ArrayList(java.util.ArrayList) LocalDate(java.time.LocalDate)

Example 3 with ColumnarFeature

use of org.tribuo.data.columnar.ColumnarFeature in project tribuo by oracle.

the class RegexFieldProcessor method process.

/**
 * Emits features describing how the configured regex matches the field value.
 * <p>
 * Each configured mode is evaluated independently:
 * <ul>
 * <li>{@code MATCH_ALL} adds "MATCHES_ALL" when the entire value matches.</li>
 * <li>{@code MATCH_CONTAINS} adds "CONTAINS_MATCH" when any part of the value matches.</li>
 * <li>{@code GROUPS} adds "GROUPS(text)" for every capturing group that
 *     participated in each match found in the value.</li>
 * </ul>
 *
 * @param value The field value to run the regex against.
 * @return The (possibly empty) list of features, each with value 1.0.
 */
@Override
public List<ColumnarFeature> process(String value) {
    List<ColumnarFeature> features = new ArrayList<>();
    Matcher m = regex.matcher(value);
    for (Mode mode : modes) {
        // matches() and find() advance the matcher's internal position, so without a
        // reset an earlier mode's successful match changes where the next mode starts
        // searching (e.g. a successful MATCH_ALL would make MATCH_CONTAINS fail).
        m.reset();
        switch(mode) {
            case MATCH_ALL:
                if (m.matches()) {
                    features.add(new ColumnarFeature(fieldName, "MATCHES_ALL", 1.0));
                }
                break;
            case MATCH_CONTAINS:
                if (m.find()) {
                    features.add(new ColumnarFeature(fieldName, "CONTAINS_MATCH", 1.0));
                }
                break;
            case GROUPS:
                // Walk every match, and within each match every capturing group.
                // Group 0 (the whole match) is deliberately excluded.
                while (m.find()) {
                    for (int group = 1; group <= m.groupCount(); group++) {
                        String captured = m.group(group);
                        // A group that did not participate in this match returns null; skip it.
                        if (captured != null) {
                            features.add(new ColumnarFeature(fieldName, "GROUPS(" + captured + ")", 1.0));
                        }
                    }
                }
                break;
        }
    }
    return features;
}
Also used : ColumnarFeature(org.tribuo.data.columnar.ColumnarFeature) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList)

Example 4 with ColumnarFeature

use of org.tribuo.data.columnar.ColumnarFeature in project tribuo by oracle.

the class TextFieldProcessor method wrapFeatures.

/**
 * Re-wraps each {@link Feature} produced by a text pipeline as a
 * {@link ColumnarFeature} carrying the supplied field name.
 * @param fieldName The field name given to every generated columnar feature.
 * @param inputFeatures The text pipeline features to wrap.
 * @return An immutable empty list when there is nothing to wrap, otherwise a
 *         new list with one columnar feature per input feature, in input order.
 */
public static List<ColumnarFeature> wrapFeatures(String fieldName, List<Feature> inputFeatures) {
    // Guard clause: avoid allocating when there is no input.
    if (inputFeatures.isEmpty()) {
        return Collections.emptyList();
    }

    List<ColumnarFeature> wrapped = new ArrayList<>();
    for (Feature source : inputFeatures) {
        wrapped.add(new ColumnarFeature(fieldName, source.getName(), source.getValue()));
    }
    return wrapped;
}
Also used : ColumnarFeature(org.tribuo.data.columnar.ColumnarFeature) ArrayList(java.util.ArrayList) Feature(org.tribuo.Feature)

Example 5 with ColumnarFeature

use of org.tribuo.data.columnar.ColumnarFeature in project tribuo by oracle.

the class DateFieldProcessorTest method testValidBehaviour.

/**
 * Checks the features produced by {@code DateFieldProcessor} for well-formed dates.
 * <p>
 * Three pattern/input pairs are exercised ("uuuu-MM-dd", "MM-dd-uuuu", "dd-MM-uuuu"),
 * each with every {@code DateFeatureType} enabled. For each one the test asserts that
 * the number of features equals the number of feature types and that the expected
 * feature values are present. Many values are asserted twice: once computed from an
 * independently parsed {@link LocalDate} and once as a hard-coded literal.
 * <p>
 * Finally a processor restricted to DAY, MONTH and YEAR is checked to emit exactly
 * those three features.
 */
@Test
public void testValidBehaviour() {
    // --- ISO ordering (year-month-day): 26 January 1994 ---
    String isoFormat = "uuuu-MM-dd";
    DateTimeFormatter isoFormatter = DateTimeFormatter.ofPattern(isoFormat, Locale.US);
    String isoInput = "1994-01-26";
    DateFieldProcessor isoProc = new DateFieldProcessor("test-iso", EnumSet.allOf(DateFieldProcessor.DateFeatureType.class), isoFormat);
    // Parsed separately from the processor so expected values can be derived independently.
    LocalDate isoDate = LocalDate.parse(isoInput, isoFormatter);
    List<ColumnarFeature> isoFeatures = isoProc.process(isoInput);
    // One feature per feature type.
    assertEquals(DateFieldProcessor.DateFeatureType.values().length, isoFeatures.size());
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "DAY", isoDate.getDayOfMonth())));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "DAY", 26)));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "DAY_OF_WEEK", isoDate.getDayOfWeek().getValue())));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "DAY_OF_WEEK", DayOfWeek.WEDNESDAY.getValue())));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "DAY_OF_YEAR", isoDate.getDayOfYear())));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "DAY_OF_YEAR", 26)));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "WEEK_OF_YEAR", isoDate.get(WeekFields.ISO.weekOfWeekBasedYear()))));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "WEEK_OF_MONTH", 4)));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "EVEN_OR_ODD_DAY", 0)));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "EVEN_OR_ODD_WEEK", 0)));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "EVEN_OR_ODD_MONTH", 1)));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "EVEN_OR_ODD_YEAR", 0)));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "CALENDAR_QUARTER", 1)));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "DAY_OF_QUARTER", 26)));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "MONTH", isoDate.getMonthValue())));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "MONTH", Month.JANUARY.getValue())));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "YEAR", isoDate.getYear())));
    assertTrue(isoFeatures.contains(new ColumnarFeature("test-iso", "YEAR", 1994)));
    // --- US ordering (month-day-year): 8 September 1966 ---
    String usFormat = "MM-dd-uuuu";
    DateTimeFormatter usFormatter = DateTimeFormatter.ofPattern(usFormat, Locale.US);
    String usInput = "09-08-1966";
    DateFieldProcessor usProc = new DateFieldProcessor("test-us", EnumSet.allOf(DateFieldProcessor.DateFeatureType.class), usFormat);
    LocalDate usDate = LocalDate.parse(usInput, usFormatter);
    List<ColumnarFeature> usFeatures = usProc.process(usInput);
    assertEquals(DateFieldProcessor.DateFeatureType.values().length, usFeatures.size());
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "DAY", usDate.getDayOfMonth())));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "DAY", 8)));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "DAY_OF_WEEK", usDate.getDayOfWeek().getValue())));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "DAY_OF_WEEK", DayOfWeek.THURSDAY.getValue())));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "DAY_OF_YEAR", usDate.getDayOfYear())));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "DAY_OF_YEAR", 251)));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "WEEK_OF_YEAR", usDate.get(WeekFields.ISO.weekOfWeekBasedYear()))));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "WEEK_OF_MONTH", 2)));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "EVEN_OR_ODD_DAY", 1)));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "EVEN_OR_ODD_WEEK", 0)));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "EVEN_OR_ODD_MONTH", 1)));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "EVEN_OR_ODD_YEAR", 0)));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "CALENDAR_QUARTER", 3)));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "DAY_OF_QUARTER", 70)));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "MONTH", usDate.getMonthValue())));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "MONTH", Month.SEPTEMBER.getValue())));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "YEAR", usDate.getYear())));
    assertTrue(usFeatures.contains(new ColumnarFeature("test-us", "YEAR", 1966)));
    // --- UK ordering (day-month-year): 23 November 1963 ---
    String ukFormat = "dd-MM-uuuu";
    DateTimeFormatter ukFormatter = DateTimeFormatter.ofPattern(ukFormat, Locale.US);
    String ukInput = "23-11-1963";
    DateFieldProcessor ukProc = new DateFieldProcessor("test-uk", EnumSet.allOf(DateFieldProcessor.DateFeatureType.class), ukFormat);
    LocalDate ukDate = LocalDate.parse(ukInput, ukFormatter);
    List<ColumnarFeature> ukFeatures = ukProc.process(ukInput);
    assertEquals(DateFieldProcessor.DateFeatureType.values().length, ukFeatures.size());
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "DAY", ukDate.getDayOfMonth())));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "DAY", 23)));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "DAY_OF_WEEK", ukDate.getDayOfWeek().getValue())));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "DAY_OF_WEEK", DayOfWeek.SATURDAY.getValue())));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "DAY_OF_YEAR", ukDate.getDayOfYear())));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "DAY_OF_YEAR", 327)));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "WEEK_OF_YEAR", ukDate.get(WeekFields.ISO.weekOfWeekBasedYear()))));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "WEEK_OF_MONTH", 3)));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "EVEN_OR_ODD_DAY", 1)));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "EVEN_OR_ODD_WEEK", 1)));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "EVEN_OR_ODD_MONTH", 1)));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "EVEN_OR_ODD_YEAR", 1)));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "CALENDAR_QUARTER", 4)));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "DAY_OF_QUARTER", 54)));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "MONTH", ukDate.getMonthValue())));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "MONTH", Month.NOVEMBER.getValue())));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "YEAR", ukDate.getYear())));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "YEAR", 1963)));
    // --- Restricted feature set: only DAY, MONTH and YEAR requested, same UK input ---
    ukProc = new DateFieldProcessor("test-uk", EnumSet.of(DateFieldProcessor.DateFeatureType.DAY, DateFieldProcessor.DateFeatureType.MONTH, DateFieldProcessor.DateFeatureType.YEAR), ukFormat);
    ukFeatures = ukProc.process(ukInput);
    // Exactly the three requested features and nothing else.
    assertEquals(3, ukFeatures.size());
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "DAY", 23)));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "MONTH", Month.NOVEMBER.getValue())));
    assertTrue(ukFeatures.contains(new ColumnarFeature("test-uk", "YEAR", 1963)));
}
Also used : ColumnarFeature(org.tribuo.data.columnar.ColumnarFeature) DateTimeFormatter(java.time.format.DateTimeFormatter) LocalDate(java.time.LocalDate) Test(org.junit.jupiter.api.Test)

Aggregations

ColumnarFeature (org.tribuo.data.columnar.ColumnarFeature): 5, ArrayList (java.util.ArrayList): 4, LocalDate (java.time.LocalDate): 2, Config (com.oracle.labs.mlrg.olcut.config.Config): 1, ConfiguredObjectProvenance (com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance): 1, ConfiguredObjectProvenanceImpl (com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl): 1, DateTimeFormatter (java.time.format.DateTimeFormatter): 1, DateTimeParseException (java.time.format.DateTimeParseException): 1, Comparator (java.util.Comparator): 1, LinkedHashMap (java.util.LinkedHashMap): 1, List (java.util.List): 1, Map (java.util.Map): 1, Matcher (java.util.regex.Matcher): 1, Test (org.junit.jupiter.api.Test): 1, Feature (org.tribuo.Feature): 1, FeatureProcessor (org.tribuo.data.columnar.FeatureProcessor): 1