Use of com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig in project pinot by linkedin.
Class ThirdeyePinotSchemaUtils, method createSchema:
public static Schema createSchema(String configPath) throws IOException {
  FileSystem fs = FileSystem.get(new Configuration());
  ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.decode(fs.open(new Path(configPath)));
  LOGGER.info("{}", thirdeyeConfig);
  return createSchema(thirdeyeConfig);
}
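As a usage sketch (not from the source), the helper could be called from driver code once the ThirdEye config file exists on HDFS; the path below is made up for illustration:

// Hypothetical caller: derive the Pinot schema from a ThirdEye config file on HDFS.
Schema pinotSchema = ThirdeyePinotSchemaUtils.createSchema("/thirdeye/collection/thirdeye_config.yml");
System.out.println(pinotSchema);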
Use of com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig in project pinot by linkedin.
Class DerivedColumnNoTransformationTest, method setUp:
@Before
public void setUp() throws Exception {
  DerivedColumnNoTransformationPhaseMapper mapper = new DerivedColumnNoTransformationPhaseMapper();
  mapDriver = MapDriver.newMapDriver(mapper);
  Configuration configuration = mapDriver.getConfiguration();
  configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  // Build a minimal ThirdEyeConfig from properties and serialize it into the job configuration.
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString(), "collection");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString(), "d1,d2,d3");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString(), "m1,m2");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), "INT,INT");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_NAME.toString(), "hoursSinceEpoch");
  ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  configuration.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(),
      OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
  // Register the input and output Avro schemas with the driver.
  Schema inputSchema = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA));
  setUpAvroSerialization(mapDriver.getConfiguration(), inputSchema);
  Schema outputSchema = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(NO_TRANSFORMATION_SCHEMA));
  configuration.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(),
      outputSchema.toString());
  configuration.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH.toString(), TOPK_PATH);
  // Write phase output to a temporary directory.
  TemporaryPath tmpPath = new TemporaryPath();
  outputPath = tmpPath.toString();
  configuration.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH.toString(), outputPath);
}
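After this setup, such a test typically pushes a record through the MRUnit driver and runs it. A minimal sketch, assuming a record built against inputSchema with the dimension, metric, and time fields configured above; the values and field types (strings, ints, a long time column) are assumptions:

// Hypothetical test body: feed one Avro record through the configured mapper.
GenericRecord record = new GenericData.Record(inputSchema);
record.put("d1", "abc1");
record.put("d2", "pqr1");
record.put("d3", "xyz1");
record.put("m1", 10);
record.put("m2", 20);
record.put("hoursSinceEpoch", 1234567L);  // assumed long time column
mapDriver.withInput(new AvroKey<>(record), NullWritable.get());
mapDriver.run();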
Use of com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig in project pinot by linkedin.
Class DerivedColumnTransformationTest, method setUp:
@Before
public void setUp() throws Exception {
  DerivedColumnTransformationPhaseMapper mapper = new DerivedColumnTransformationPhaseMapper();
  mapDriver = MapDriver.newMapDriver(mapper);
  Configuration configuration = mapDriver.getConfiguration();
  configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  // Same base config as the no-transformation test, plus a top-k spec:
  // keep only the top 1 value of dimension d2, ranked by metric m1.
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString(), "collection");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString(), "d1,d2,d3");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString(), "m1,m2");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), "INT,INT");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_NAME.toString(), "hoursSinceEpoch");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_DIMENSION_NAMES.toString(), "d2,");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_METRICS.toString() + ".d2", "m1");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_KVALUES.toString() + ".d2", "1");
  ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  configuration.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(),
      OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
  // Register the input and output Avro schemas with the driver.
  Schema inputSchema = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA));
  setUpAvroSerialization(mapDriver.getConfiguration(), inputSchema);
  Schema outputSchema = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(TRANSFORMATION_SCHEMA));
  configuration.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(),
      outputSchema.toString());
  configuration.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH.toString(),
      ClassLoader.getSystemResource(TOPK_PATH).toString());
  // Write phase output to a temporary directory.
  TemporaryPath tmpPath = new TemporaryPath();
  outputPath = tmpPath.toString();
  configuration.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH.toString(), outputPath);
}
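For reference, the mapper side recovers this serialized config from the Configuration. A minimal sketch of that round trip (not part of this test; the variable names are illustrative):

// Hypothetical round-trip: read the JSON back out of the Configuration and
// rebuild the ThirdEyeConfig with the same Jackson ObjectMapper.
String json = configuration.get(
    DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString());
ThirdEyeConfig roundTripped = OBJECT_MAPPER.readValue(json, ThirdEyeConfig.class);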
Use of com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig in project pinot by linkedin.
Class DerivedColumnTransformationPhaseJob, method run:
public Job run() throws Exception {
  Job job = Job.getInstance(getConf());
  job.setJobName(name);
  job.setJarByClass(DerivedColumnTransformationPhaseJob.class);
  Configuration configuration = job.getConfiguration();
  FileSystem fs = FileSystem.get(configuration);
  // Input Path
  String inputPathDir = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH);
  LOGGER.info("Input path dir: " + inputPathDir);
  for (String inputPath : inputPathDir.split(",")) {
    LOGGER.info("Adding input:" + inputPath);
    Path input = new Path(inputPath);
    FileInputFormat.addInputPath(job, input);
  }
  // Topk path
  String topkPath = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH);
  LOGGER.info("Topk path : " + topkPath);
  // Output path
  Path outputPath = new Path(getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH));
  LOGGER.info("Output path dir: " + outputPath.toString());
  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(job, outputPath);
  // Schema
  Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
  LOGGER.info("Schema : {}", avroSchema.toString(true));
  // ThirdEyeConfig
  String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
  ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(),
      OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
  LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode());
  // New schema
  Schema outputSchema = newSchema(thirdeyeConfig);
  job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(), outputSchema.toString());
  // Map config
  job.setMapperClass(DerivedColumnTransformationPhaseMapper.class);
  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setMapOutputKeyClass(AvroKey.class);
  job.setMapOutputValueClass(NullWritable.class);
  AvroJob.setOutputKeySchema(job, outputSchema);
  LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class);
  AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, outputSchema);
  job.setNumReduceTasks(0);
  job.waitForCompletion(true);
  return job;
}
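A sketch of launching this phase from driver code. The constructor signature (job name plus Properties) is an assumption inferred from the name and props fields used in run(), and the paths are hypothetical:

// Hypothetical driver: property keys come from DerivedColumnTransformationPhaseConstants.
Properties props = new Properties();
props.setProperty(DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH.toString(), "/thirdeye/avro/input");  // made-up path
props.setProperty(DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH.toString(), "/thirdeye/topk");  // made-up path
props.setProperty(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH.toString(), "/thirdeye/derived");  // made-up path
// Assumed constructor: (String jobName, Properties properties).
DerivedColumnTransformationPhaseJob job =
    new DerivedColumnTransformationPhaseJob("derived_column_transformation_phase", props);
job.run();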
Use of com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig in project pinot by linkedin.
Class DerivedSchemaGenerationTest, method testDerivedColumnsSchemaGeneration:
@Test
public void testDerivedColumnsSchemaGeneration() throws Exception {
  ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  Schema outputSchema = job.newSchema(thirdeyeConfig);
  Assert.assertEquals(inputSchema.getFields().size(), outputSchema.getFields().size(),
      "Input schema should be the same as output schema if no topk/whitelist in config");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_DIMENSION_NAMES.toString(), "d2,");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_METRICS.toString() + ".d2", "m1");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_KVALUES.toString() + ".d2", "1");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_WHITELIST_DIMENSION_NAMES.toString(), "d2,d3");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_WHITELIST_DIMENSION.toString() + ".d2", "a,b,c");
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_WHITELIST_DIMENSION.toString() + ".d3", "x,y");
  thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  outputSchema = job.newSchema(thirdeyeConfig);
  Assert.assertEquals(inputSchema.getFields().size() + 1, outputSchema.getFields().size(),
      "Input schema should not be the same as output schema if topk/whitelist in config");
  Assert.assertEquals(outputSchema.getField("d2_topk") != null, true,
      "Output schema should have _topk entries for columns in topk");
}
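Why the field count grows by exactly one: each top-k dimension contributes a new "<dimension>_topk" column (here d2_topk), while whitelist entries only restrict the values of existing columns and add no fields of their own, which is what the size() + 1 expectation above encodes.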