use of org.tribuo.data.columnar.ColumnarFeature in project tribuo by oracle.
the class UniqueProcessor method process.
/**
 * Groups the supplied features by name and reduces each group to a single feature.
 * <p>
 * An empty input list is returned as-is. Otherwise each group of same-named features
 * is handed to {@code uniqueList} together with {@code reductionType}, and the results
 * are returned in the order in which each name was first encountered.
 *
 * @param features The features to process.
 * @return One feature per distinct feature name, or the input list itself if it is empty.
 */
@Override
public List<ColumnarFeature> process(List<ColumnarFeature> features) {
    if (features.isEmpty()) {
        return features;
    }

    // LinkedHashMap keeps the buckets in first-seen order of the feature names.
    Map<String, List<ColumnarFeature>> buckets = new LinkedHashMap<>();
    for (ColumnarFeature feature : features) {
        String name = feature.getName();
        List<ColumnarFeature> bucket = buckets.get(name);
        if (bucket == null) {
            bucket = new ArrayList<>();
            buckets.put(name, bucket);
        }
        bucket.add(feature);
    }

    // Collapse every bucket down to a single representative feature.
    List<ColumnarFeature> reduced = new ArrayList<>();
    for (List<ColumnarFeature> bucket : buckets.values()) {
        reduced.add(uniqueList(reductionType, bucket));
    }
    return reduced;
}
use of org.tribuo.data.columnar.ColumnarFeature in project tribuo by oracle.
the class DateFieldProcessor method process.
/**
 * Parses the field value as a {@link LocalDate} using {@code formatter} and emits one
 * feature for each entry of {@code featureTypes}.
 * <p>
 * If the value cannot be parsed, two warnings are logged (the text that failed to parse,
 * then a message naming the value and the formatter) and an empty list is returned.
 *
 * @param value The raw field value.
 * @return The extracted date features, or an empty list if the value is not a parseable date.
 */
@Override
public List<ColumnarFeature> process(String value) {
    try {
        LocalDate parsedDate = LocalDate.parse(value, formatter);
        List<ColumnarFeature> extracted = new ArrayList<>(featureTypes.size());
        for (DateFeatureType type : featureTypes) {
            int extractedValue = type.extract(parsedDate);
            extracted.add(new ColumnarFeature(fieldName, type.toString(), extractedValue));
        }
        return extracted;
    } catch (DateTimeParseException parseFailure) {
        // Best effort: an unparseable date yields no features rather than an exception.
        logger.log(Level.WARNING, parseFailure.getParsedString());
        String message = String.format("Unable to parse date %s with formatter %s", value, formatter.toString());
        logger.log(Level.WARNING, message);
        return Collections.emptyList();
    }
}
use of org.tribuo.data.columnar.ColumnarFeature in project tribuo by oracle.
the class RegexFieldProcessor method process.
/**
 * Applies the configured regex to the field value and emits a feature with value 1.0
 * for every enabled mode that fires.
 * <ul>
 * <li>{@code MATCH_ALL} emits {@code MATCHES_ALL} when the entire value matches.</li>
 * <li>{@code MATCH_CONTAINS} emits {@code CONTAINS_MATCH} when the regex matches
 * somewhere in the value.</li>
 * <li>{@code GROUPS} emits {@code GROUPS(text)} for each capturing group that
 * participated in each match found in the value.</li>
 * </ul>
 * Each mode is evaluated against the whole value, independently of the other modes.
 *
 * @param value The raw field value.
 * @return The features triggered by the enabled modes; empty if none fired.
 */
@Override
public List<ColumnarFeature> process(String value) {
    List<ColumnarFeature> features = new ArrayList<>();
    Matcher m = regex.matcher(value);
    for (Mode mode : modes) {
        // matches() and find() leave the matcher positioned after the last match, so
        // without a reset a later find() would start part-way through (or at the end
        // of) the input and miss matches that an earlier mode already consumed.
        m.reset();
        switch (mode) {
            case MATCH_ALL:
                if (m.matches()) {
                    features.add(new ColumnarFeature(fieldName, "MATCHES_ALL", 1.0));
                }
                break;
            case MATCH_CONTAINS:
                if (m.find()) {
                    features.add(new ColumnarFeature(fieldName, "CONTAINS_MATCH", 1.0));
                }
                break;
            case GROUPS:
                while (m.find()) {
                    // Group 0 is the whole match; capturing groups are 1..groupCount().
                    // Indexing by group (not by match number) avoids an
                    // IndexOutOfBoundsException when there are more matches than groups.
                    for (int g = 1; g <= m.groupCount(); g++) {
                        String groupText = m.group(g);
                        // group(g) is null when the group did not take part in this match.
                        if (groupText != null) {
                            features.add(new ColumnarFeature(fieldName, "GROUPS(" + groupText + ")", 1.0));
                        }
                    }
                }
                break;
        }
    }
    return features;
}
use of org.tribuo.data.columnar.ColumnarFeature in project tribuo by oracle.
the class TextFieldProcessor method wrapFeatures.
/**
 * Wraps each {@link Feature} produced by a text pipeline in a {@link ColumnarFeature}
 * carrying the supplied field name, keeping the original feature name and value.
 * @param fieldName The field name to prepend.
 * @param inputFeatures The features to convert.
 * @return A list of columnar features, in the same order as the input; an empty list if the input is empty.
 */
public static List<ColumnarFeature> wrapFeatures(String fieldName, List<Feature> inputFeatures) {
    // Nothing to wrap: hand back the shared immutable empty list.
    if (inputFeatures.isEmpty()) {
        return Collections.emptyList();
    }

    List<ColumnarFeature> wrapped = new ArrayList<>();
    for (Feature source : inputFeatures) {
        wrapped.add(new ColumnarFeature(fieldName, source.getName(), source.getValue()));
    }
    return wrapped;
}
use of org.tribuo.data.columnar.ColumnarFeature in project tribuo by oracle.
the class DateFieldProcessorTest method testValidBehaviour.
/**
 * Checks that {@link DateFieldProcessor} emits the expected features for the same kind of
 * input expressed in three date layouts (ISO, US and UK ordering), first with every
 * {@code DateFeatureType} enabled and finally with only DAY, MONTH and YEAR enabled.
 */
@Test
public void testValidBehaviour() {
    // ---- ISO layout: year-month-day ----
    String isoFormat = "uuuu-MM-dd";
    String isoInput = "1994-01-26";
    DateTimeFormatter isoFormatter = DateTimeFormatter.ofPattern(isoFormat, Locale.US);
    LocalDate isoDate = LocalDate.parse(isoInput, isoFormatter);
    DateFieldProcessor isoProc = new DateFieldProcessor("test-iso", EnumSet.allOf(DateFieldProcessor.DateFeatureType.class), isoFormat);
    List<ColumnarFeature> isoFeatures = isoProc.process(isoInput);
    assertEquals(DateFieldProcessor.DateFeatureType.values().length, isoFeatures.size());
    assertHasFeature(isoFeatures, "test-iso", "DAY", isoDate.getDayOfMonth());
    assertHasFeature(isoFeatures, "test-iso", "DAY", 26);
    assertHasFeature(isoFeatures, "test-iso", "DAY_OF_WEEK", isoDate.getDayOfWeek().getValue());
    assertHasFeature(isoFeatures, "test-iso", "DAY_OF_WEEK", DayOfWeek.WEDNESDAY.getValue());
    assertHasFeature(isoFeatures, "test-iso", "DAY_OF_YEAR", isoDate.getDayOfYear());
    assertHasFeature(isoFeatures, "test-iso", "DAY_OF_YEAR", 26);
    assertHasFeature(isoFeatures, "test-iso", "WEEK_OF_YEAR", isoDate.get(WeekFields.ISO.weekOfWeekBasedYear()));
    assertHasFeature(isoFeatures, "test-iso", "WEEK_OF_MONTH", 4);
    assertHasFeature(isoFeatures, "test-iso", "EVEN_OR_ODD_DAY", 0);
    assertHasFeature(isoFeatures, "test-iso", "EVEN_OR_ODD_WEEK", 0);
    assertHasFeature(isoFeatures, "test-iso", "EVEN_OR_ODD_MONTH", 1);
    assertHasFeature(isoFeatures, "test-iso", "EVEN_OR_ODD_YEAR", 0);
    assertHasFeature(isoFeatures, "test-iso", "CALENDAR_QUARTER", 1);
    assertHasFeature(isoFeatures, "test-iso", "DAY_OF_QUARTER", 26);
    assertHasFeature(isoFeatures, "test-iso", "MONTH", isoDate.getMonthValue());
    assertHasFeature(isoFeatures, "test-iso", "MONTH", Month.JANUARY.getValue());
    assertHasFeature(isoFeatures, "test-iso", "YEAR", isoDate.getYear());
    assertHasFeature(isoFeatures, "test-iso", "YEAR", 1994);

    // ---- US layout: month-day-year ----
    String usFormat = "MM-dd-uuuu";
    String usInput = "09-08-1966";
    DateTimeFormatter usFormatter = DateTimeFormatter.ofPattern(usFormat, Locale.US);
    LocalDate usDate = LocalDate.parse(usInput, usFormatter);
    DateFieldProcessor usProc = new DateFieldProcessor("test-us", EnumSet.allOf(DateFieldProcessor.DateFeatureType.class), usFormat);
    List<ColumnarFeature> usFeatures = usProc.process(usInput);
    assertEquals(DateFieldProcessor.DateFeatureType.values().length, usFeatures.size());
    assertHasFeature(usFeatures, "test-us", "DAY", usDate.getDayOfMonth());
    assertHasFeature(usFeatures, "test-us", "DAY", 8);
    assertHasFeature(usFeatures, "test-us", "DAY_OF_WEEK", usDate.getDayOfWeek().getValue());
    assertHasFeature(usFeatures, "test-us", "DAY_OF_WEEK", DayOfWeek.THURSDAY.getValue());
    assertHasFeature(usFeatures, "test-us", "DAY_OF_YEAR", usDate.getDayOfYear());
    assertHasFeature(usFeatures, "test-us", "DAY_OF_YEAR", 251);
    assertHasFeature(usFeatures, "test-us", "WEEK_OF_YEAR", usDate.get(WeekFields.ISO.weekOfWeekBasedYear()));
    assertHasFeature(usFeatures, "test-us", "WEEK_OF_MONTH", 2);
    assertHasFeature(usFeatures, "test-us", "EVEN_OR_ODD_DAY", 1);
    assertHasFeature(usFeatures, "test-us", "EVEN_OR_ODD_WEEK", 0);
    assertHasFeature(usFeatures, "test-us", "EVEN_OR_ODD_MONTH", 1);
    assertHasFeature(usFeatures, "test-us", "EVEN_OR_ODD_YEAR", 0);
    assertHasFeature(usFeatures, "test-us", "CALENDAR_QUARTER", 3);
    assertHasFeature(usFeatures, "test-us", "DAY_OF_QUARTER", 70);
    assertHasFeature(usFeatures, "test-us", "MONTH", usDate.getMonthValue());
    assertHasFeature(usFeatures, "test-us", "MONTH", Month.SEPTEMBER.getValue());
    assertHasFeature(usFeatures, "test-us", "YEAR", usDate.getYear());
    assertHasFeature(usFeatures, "test-us", "YEAR", 1966);

    // ---- UK layout: day-month-year ----
    String ukFormat = "dd-MM-uuuu";
    String ukInput = "23-11-1963";
    DateTimeFormatter ukFormatter = DateTimeFormatter.ofPattern(ukFormat, Locale.US);
    LocalDate ukDate = LocalDate.parse(ukInput, ukFormatter);
    DateFieldProcessor ukProc = new DateFieldProcessor("test-uk", EnumSet.allOf(DateFieldProcessor.DateFeatureType.class), ukFormat);
    List<ColumnarFeature> ukFeatures = ukProc.process(ukInput);
    assertEquals(DateFieldProcessor.DateFeatureType.values().length, ukFeatures.size());
    assertHasFeature(ukFeatures, "test-uk", "DAY", ukDate.getDayOfMonth());
    assertHasFeature(ukFeatures, "test-uk", "DAY", 23);
    assertHasFeature(ukFeatures, "test-uk", "DAY_OF_WEEK", ukDate.getDayOfWeek().getValue());
    assertHasFeature(ukFeatures, "test-uk", "DAY_OF_WEEK", DayOfWeek.SATURDAY.getValue());
    assertHasFeature(ukFeatures, "test-uk", "DAY_OF_YEAR", ukDate.getDayOfYear());
    assertHasFeature(ukFeatures, "test-uk", "DAY_OF_YEAR", 327);
    assertHasFeature(ukFeatures, "test-uk", "WEEK_OF_YEAR", ukDate.get(WeekFields.ISO.weekOfWeekBasedYear()));
    assertHasFeature(ukFeatures, "test-uk", "WEEK_OF_MONTH", 3);
    assertHasFeature(ukFeatures, "test-uk", "EVEN_OR_ODD_DAY", 1);
    assertHasFeature(ukFeatures, "test-uk", "EVEN_OR_ODD_WEEK", 1);
    assertHasFeature(ukFeatures, "test-uk", "EVEN_OR_ODD_MONTH", 1);
    assertHasFeature(ukFeatures, "test-uk", "EVEN_OR_ODD_YEAR", 1);
    assertHasFeature(ukFeatures, "test-uk", "CALENDAR_QUARTER", 4);
    assertHasFeature(ukFeatures, "test-uk", "DAY_OF_QUARTER", 54);
    assertHasFeature(ukFeatures, "test-uk", "MONTH", ukDate.getMonthValue());
    assertHasFeature(ukFeatures, "test-uk", "MONTH", Month.NOVEMBER.getValue());
    assertHasFeature(ukFeatures, "test-uk", "YEAR", ukDate.getYear());
    assertHasFeature(ukFeatures, "test-uk", "YEAR", 1963);

    // ---- UK layout again, restricted to DAY, MONTH and YEAR ----
    DateFieldProcessor ukSubsetProc = new DateFieldProcessor("test-uk", EnumSet.of(DateFieldProcessor.DateFeatureType.DAY, DateFieldProcessor.DateFeatureType.MONTH, DateFieldProcessor.DateFeatureType.YEAR), ukFormat);
    List<ColumnarFeature> ukSubsetFeatures = ukSubsetProc.process(ukInput);
    assertEquals(3, ukSubsetFeatures.size());
    assertHasFeature(ukSubsetFeatures, "test-uk", "DAY", 23);
    assertHasFeature(ukSubsetFeatures, "test-uk", "MONTH", Month.NOVEMBER.getValue());
    assertHasFeature(ukSubsetFeatures, "test-uk", "YEAR", 1963);
}

/**
 * Asserts that the list contains a {@link ColumnarFeature} equal to one built from the
 * given field name, feature name and value.
 *
 * @param features The features produced by the processor.
 * @param fieldName The field name the feature was generated from.
 * @param featureName The name of the date feature.
 * @param expectedValue The expected feature value.
 */
private static void assertHasFeature(List<ColumnarFeature> features, String fieldName, String featureName, double expectedValue) {
    assertTrue(features.contains(new ColumnarFeature(fieldName, featureName, expectedValue)));
}
Aggregations