Search in sources :

Example 1 with ThirdEyeConfig

use of com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig in project pinot by linkedin.

the class ThirdeyePinotSchemaUtils method createSchema.

/**
 * Reads an encoded ThirdEye config from {@code configPath} and builds a schema from it.
 *
 * <p>The file system is obtained from a freshly constructed Hadoop {@code Configuration}.
 * The stream opened on the config file is closed as soon as decoding finishes, whether
 * decoding succeeds or throws.
 *
 * @param configPath path of the encoded ThirdEye config on that file system
 * @return the result of {@code createSchema(ThirdEyeConfig)} for the decoded config
 * @throws IOException if the file system cannot be obtained or the config cannot be opened or read
 */
public static Schema createSchema(String configPath) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    ThirdEyeConfig thirdeyeConfig;
    // The caller owns the stream returned by fs.open(); close it here so it is not leaked.
    // Fully qualified type name so this method does not depend on an extra import.
    try (java.io.InputStream configStream = fs.open(new Path(configPath))) {
        thirdeyeConfig = ThirdEyeConfig.decode(configStream);
    }
    LOGGER.info("{}", thirdeyeConfig);
    return createSchema(thirdeyeConfig);
}
Also used : Path(org.apache.hadoop.fs.Path) ThirdEyeConfig(com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem)

Example 2 with ThirdEyeConfig

use of com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig in project pinot by linkedin.

the class DerivedColumnNoTransformationTest method setUp.

/**
 * Builds the MRUnit map driver around the no-transformation mapper and fills its
 * configuration with everything the mapper reads: serializations, the JSON-encoded
 * ThirdEye config, the input/output Avro schemas, the top-k path and the output path.
 *
 * @throws Exception if the config cannot be built or serialized, or a schema resource cannot be parsed
 */
@Before
public void setUp() throws Exception {
    mapDriver = MapDriver.newMapDriver(new DerivedColumnNoTransformationPhaseMapper());
    Configuration conf = mapDriver.getConfiguration();
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");

    // ThirdEye config without any top-k entries: three dimensions, two INT metrics, one time column.
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString(), "collection");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString(), "d1,d2,d3");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString(), "m1,m2");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), "INT,INT");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_NAME.toString(), "hoursSinceEpoch");
    String thirdeyeConfigJson = OBJECT_MAPPER.writeValueAsString(ThirdEyeConfig.fromProperties(props));
    conf.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(),
        thirdeyeConfigJson);

    // Each schema gets its own parser instance, exactly as before.
    Schema avroInput = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA));
    setUpAvroSerialization(conf, avroInput);
    Schema avroOutput = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(NO_TRANSFORMATION_SCHEMA));
    conf.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(),
        avroOutput.toString());

    conf.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH.toString(),
        TOPK_PATH);

    outputPath = new TemporaryPath().toString();
    conf.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH.toString(),
        outputPath);
}
Also used : ThirdEyeConfig(com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig) Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.avro.Schema) TemporaryPath(org.apache.hadoop.mrunit.testutil.TemporaryPath) Before(org.junit.Before)

Example 3 with ThirdEyeConfig

use of com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig in project pinot by linkedin.

the class DerivedColumnTransformationTest method setUp.

/**
 * Builds the MRUnit map driver around the transformation mapper and fills its
 * configuration with everything the mapper reads: serializations, a JSON-encoded
 * ThirdEye config that declares a top-k dimension, the input/output Avro schemas,
 * the top-k resource location and the output path.
 *
 * @throws Exception if the config cannot be built or serialized, or a schema resource cannot be parsed
 */
@Before
public void setUp() throws Exception {
    mapDriver = MapDriver.newMapDriver(new DerivedColumnTransformationPhaseMapper());
    Configuration conf = mapDriver.getConfiguration();
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");

    // Base ThirdEye config: three dimensions, two INT metrics, one time column.
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString(), "collection");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString(), "d1,d2,d3");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString(), "m1,m2");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), "INT,INT");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TIMECOLUMN_NAME.toString(), "hoursSinceEpoch");
    // Top-k settings for dimension d2: metric m1 with k value 1.
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_DIMENSION_NAMES.toString(), "d2,");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_METRICS.toString() + ".d2", "m1");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_KVALUES.toString() + ".d2", "1");
    String thirdeyeConfigJson = OBJECT_MAPPER.writeValueAsString(ThirdEyeConfig.fromProperties(props));
    conf.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(),
        thirdeyeConfigJson);

    // Each schema gets its own parser instance, exactly as before.
    Schema avroInput = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA));
    setUpAvroSerialization(conf, avroInput);
    Schema avroOutput = new Schema.Parser().parse(ClassLoader.getSystemResourceAsStream(TRANSFORMATION_SCHEMA));
    conf.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(),
        avroOutput.toString());

    // The top-k path is passed as the URL string of the classpath resource.
    String topkLocation = ClassLoader.getSystemResource(TOPK_PATH).toString();
    conf.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH.toString(),
        topkLocation);

    outputPath = new TemporaryPath().toString();
    conf.set(DerivedColumnTransformationPhaseConstants.DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH.toString(),
        outputPath);
}
Also used : ThirdEyeConfig(com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig) Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.avro.Schema) TemporaryPath(org.apache.hadoop.mrunit.testutil.TemporaryPath) Before(org.junit.Before)

Example 4 with ThirdEyeConfig

use of com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig in project pinot by linkedin.

the class DerivedColumnTransformationPhaseJob method run.

/**
 * Configures and runs the map-only derived column transformation job.
 *
 * <p>Steps, in order: register every comma-separated input path, read the top-k path,
 * delete any existing output path (recursively) and set it as the job output, read the
 * Avro schema for the input, resolve the metric types property against that schema and
 * write it back into {@code props}, store the JSON-encoded ThirdEye config and the
 * derived output schema in the job configuration, wire up the mapper and Avro
 * input/output formats, then block until the job finishes.
 *
 * @return the job after {@code waitForCompletion} returns; its success flag is not
 *         checked here, so callers should inspect the returned job
 * @throws Exception if any configuration, file system or job step fails
 */
public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName(name);
    job.setJarByClass(DerivedColumnTransformationPhaseJob.class);
    Configuration jobConf = job.getConfiguration();
    FileSystem fileSystem = FileSystem.get(jobConf);

    // Input Path
    String inputDirs = getAndSetConfiguration(jobConf, DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH);
    LOGGER.info("Input path dir: " + inputDirs);
    for (String dir : inputDirs.split(",")) {
        LOGGER.info("Adding input:" + dir);
        FileInputFormat.addInputPath(job, new Path(dir));
    }

    // Topk path
    String topkLocation = getAndSetConfiguration(jobConf, DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH);
    LOGGER.info("Topk path : " + topkLocation);

    // Output path: any previous output is removed recursively before the job writes to it.
    String outputDir = getAndSetConfiguration(jobConf, DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH);
    Path outputPath = new Path(outputDir);
    LOGGER.info("Output path dir: " + outputPath.toString());
    if (fileSystem.exists(outputPath)) {
        fileSystem.delete(outputPath, true);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // Schema
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputDirs);
    LOGGER.info("Schema : {}", avroSchema.toString(true));

    // ThirdEyeConfig: the metric types property is resolved against the Avro schema and stored back in props.
    String metricNamesKey = ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString();
    String metricTypesKey = ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString();
    String resolvedMetricTypes = ThirdeyeAvroUtils.getMetricTypesProperty(
        props.getProperty(metricNamesKey), props.getProperty(metricTypesKey), avroSchema);
    props.setProperty(metricTypesKey, resolvedMetricTypes);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    jobConf.set(DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(),
        OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
    LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode());

    // New schema
    Schema derivedSchema = newSchema(thirdeyeConfig);
    jobConf.set(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(), derivedSchema.toString());

    // Map config
    job.setMapperClass(DerivedColumnTransformationPhaseMapper.class);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, derivedSchema);
    LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class);
    AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, derivedSchema);

    // Zero reducers: mapper output is the job output.
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    return job;
}
Also used : Path(org.apache.hadoop.fs.Path) ThirdEyeConfig(com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) Schema(org.apache.avro.Schema) Job(org.apache.hadoop.mapreduce.Job) AvroJob(org.apache.avro.mapreduce.AvroJob)

Example 5 with ThirdEyeConfig

use of com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig in project pinot by linkedin.

the class DerivedSchemaGenerationTest method testDerivedColumnsSchemaGeneration.

/**
 * Checks the schema produced by {@code job.newSchema} for two configs.
 *
 * <p>With the initial {@code props}, the output schema must have as many fields as the
 * input schema. After top-k settings for {@code d2} and whitelist settings for
 * {@code d2}/{@code d3} are added, the output schema must have exactly one more field
 * and must contain a {@code d2_topk} field.
 *
 * @throws Exception if the config cannot be built from {@code props} or schema generation fails
 */
@Test
public void testDerivedColumnsSchemaGeneration() throws Exception {
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    Schema outputSchema = job.newSchema(thirdeyeConfig);
    // NOTE(review): assumes Assert is org.testng.Assert, whose assertEquals takes
    // (actual, expected, message); the value under test therefore goes first.
    Assert.assertEquals(outputSchema.getFields().size(), inputSchema.getFields().size(),
        "Input schema should be same as output schema if no topk/whitelist in config");

    // Add top-k settings for d2 and whitelists for d2 and d3, then regenerate the schema.
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_DIMENSION_NAMES.toString(), "d2,");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_METRICS.toString() + ".d2", "m1");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_TOPK_KVALUES.toString() + ".d2", "1");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_WHITELIST_DIMENSION_NAMES.toString(), "d2,d3");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_WHITELIST_DIMENSION.toString() + ".d2", "a,b,c");
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_WHITELIST_DIMENSION.toString() + ".d3", "x,y");
    thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    outputSchema = job.newSchema(thirdeyeConfig);

    Assert.assertEquals(outputSchema.getFields().size(), inputSchema.getFields().size() + 1,
        "Input schema should not be same as output schema if topk/whitelist in config");
    // assertNotNull fails with the message directly, rather than comparing a boolean to true.
    Assert.assertNotNull(outputSchema.getField("d2_topk"),
        "Output schema should have _topk entries for columns in topk");
}
Also used : ThirdEyeConfig(com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig) Schema(org.apache.avro.Schema) BeforeTest(org.testng.annotations.BeforeTest) Test(org.testng.annotations.Test)

Aggregations

ThirdEyeConfig (com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig): 8, Schema (org.apache.avro.Schema): 7, Configuration (org.apache.hadoop.conf.Configuration): 7, FileSystem (org.apache.hadoop.fs.FileSystem): 5, Path (org.apache.hadoop.fs.Path): 5, Job (org.apache.hadoop.mapreduce.Job): 4, AvroJob (org.apache.avro.mapreduce.AvroJob): 2, TemporaryPath (org.apache.hadoop.mrunit.testutil.TemporaryPath): 2, Before (org.junit.Before): 2, ArrayList (java.util.ArrayList): 1, FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 1, FileStatus (org.apache.hadoop.fs.FileStatus): 1, Counter (org.apache.hadoop.mapreduce.Counter): 1, BeforeTest (org.testng.annotations.BeforeTest): 1, Test (org.testng.annotations.Test): 1