
Example 11 with TablePrefixLayoutStrategy

Use of org.apache.rya.api.layout.TablePrefixLayoutStrategy in project incubator-rya by apache.

The class CopyTool, method runQueryCopy:

private int runQueryCopy() throws Exception {
    log.info("Setting up Copy Tool with a query-based ruleset...");
    setup();
    if (!useCopyFileOutput) {
        createChildInstance(conf);
    }
    // Set up the configuration
    final AccumuloRdfConfiguration aconf = new AccumuloRdfConfiguration(conf);
    aconf.setBoolean(ConfigUtils.USE_MOCK_INSTANCE, mock);
    aconf.setTablePrefix(tablePrefix);
    aconf.setFlush(false);
    ConfigUtils.setIndexers(aconf);
    // Since we're copying at the statement-level, ignore any given list of tables and determine
    // which tables we might need to create based on which indexers are desired.
    final TablePrefixLayoutStrategy prefixStrategy = new TablePrefixLayoutStrategy(tablePrefix);
    tables.clear();
    // Always include core tables
    tables.add(prefixStrategy.getSpo());
    tables.add(prefixStrategy.getOsp());
    tables.add(prefixStrategy.getPo());
    // Copy namespaces if they exist
    tables.add(prefixStrategy.getNs());
    // Add tables associated with any configured indexers
    /* TODO: SEE RYA-160
        if (aconf.getBoolean(ConfigUtils.USE_FREETEXT, false)) {
            tables.add(ConfigUtils.getFreeTextDocTablename(conf));
            tables.add(ConfigUtils.getFreeTextTermTablename(conf));
        }
        if (aconf.getBoolean(ConfigUtils.USE_GEO, false)) {
            tables.add(ConfigUtils.getGeoTablename(conf));
        }
        if (aconf.getBoolean(ConfigUtils.USE_TEMPORAL, false)) {
            tables.add(ConfigUtils.getTemporalTableName(conf));
        }
        if (aconf.getBoolean(ConfigUtils.USE_ENTITY, false)) {
            tables.add(ConfigUtils.getEntityTableName(conf));
        }
        */
    // Ignore anything else, e.g. statistics -- must be recalculated for the child if desired
    // Extract the ruleset, and copy the namespace table directly
    final AccumuloQueryRuleset ruleset = new AccumuloQueryRuleset(aconf);
    ruleset.addTable(prefixStrategy.getNs());
    for (final String line : ruleset.toString().split("\n")) {
        log.info(line);
    }
    // Create a Job and configure its input and output
    final Job job = Job.getInstance(aconf);
    job.setJarByClass(this.getClass());
    setupMultiTableInputFormat(job, ruleset);
    setupAccumuloOutput(job, "");
    if (useCopyFileOutput) {
        // Configure job for file output
        job.setJobName("Ruleset-based export to file: " + tablePrefix + " -> " + localBaseOutputDir);
        // Map (row) to (table+key, key+value)
        job.setMapperClass(RowRuleMapper.class);
        job.setMapOutputKeyClass(GroupedRow.class);
        job.setMapOutputValueClass(GroupedRow.class);
        // Group according to table and sort according to key
        job.setGroupingComparatorClass(GroupedRow.GroupComparator.class);
        job.setSortComparatorClass(GroupedRow.SortComparator.class);
        // Reduce ([table+row], rows): output each row to the file for that table, in sorted order
        job.setReducerClass(MultipleFileReducer.class);
        job.setOutputKeyClass(Key.class);
        job.setOutputValueClass(Value.class);
    } else {
        // Configure job for table output
        job.setJobName("Ruleset-based copy: " + tablePrefix + " -> " + childTablePrefix);
        // Map (row): convert to statement, insert to child (for namespace table, output row directly)
        job.setMapperClass(AccumuloRyaRuleMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Mutation.class);
        job.setNumReduceTasks(0);
        // Create the child tables, so mappers don't try to do this in parallel
        for (final String parentTable : tables) {
            final String childTable = parentTable.replaceFirst(tablePrefix, childTablePrefix);
            createTableIfNeeded(childTable);
        }
    }
    // Run the job and copy files to local filesystem if needed
    final Date beginTime = new Date();
    log.info("Job started: " + beginTime);
    final boolean success = job.waitForCompletion(true);
    if (success) {
        if (useCopyFileOutput) {
            log.info("Moving data from HDFS to the local file system");
            final Path baseOutputPath = new Path(baseOutputDir);
            for (final FileStatus status : FileSystem.get(conf).listStatus(baseOutputPath)) {
                if (status.isDirectory()) {
                    final String tableName = status.getPath().getName();
                    final Path hdfsPath = getPath(baseOutputDir, tableName);
                    final Path localPath = getPath(localBaseOutputDir, tableName);
                    log.info("HDFS directory: " + hdfsPath.toString());
                    log.info("Local directory: " + localPath.toString());
                    copyHdfsToLocal(hdfsPath, localPath);
                }
            }
        }
        final Date endTime = new Date();
        log.info("Job finished: " + endTime);
        log.info("The job took " + (endTime.getTime() - beginTime.getTime()) / 1000 + " seconds.");
        return 0;
    } else {
        log.error("Job failed!!!");
        return 1;
    }
}
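
For context, a minimal sketch of the prefix strategy on its own (the "demo_" prefix and the TablePrefixSketch class are placeholders, not part of CopyTool): the strategy is constructed with a table prefix and asked for the core SPO/PO/OSP and namespace table names, the same calls runQueryCopy makes above.

import org.apache.rya.api.layout.TablePrefixLayoutStrategy;

public class TablePrefixSketch {
    public static void main(final String[] args) {
        // Hypothetical prefix; CopyTool would pass tablePrefix or childTablePrefix instead.
        final TablePrefixLayoutStrategy prefixStrategy = new TablePrefixLayoutStrategy("demo_");
        // Core Rya tables are derived from the prefix.
        System.out.println("SPO table: " + prefixStrategy.getSpo());
        System.out.println("PO table:  " + prefixStrategy.getPo());
        System.out.println("OSP table: " + prefixStrategy.getOsp());
        // Namespace table, which the ruleset above copies directly.
        System.out.println("NS table:  " + prefixStrategy.getNs());
    }
}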
Also used: Path (org.apache.hadoop.fs.Path), AccumuloQueryRuleset (org.apache.rya.accumulo.mr.merge.util.AccumuloQueryRuleset), FileStatus (org.apache.hadoop.fs.FileStatus), TablePrefixLayoutStrategy (org.apache.rya.api.layout.TablePrefixLayoutStrategy), Job (org.apache.hadoop.mapreduce.Job), GroupedRow (org.apache.rya.accumulo.mr.merge.util.GroupedRow), AccumuloRdfConfiguration (org.apache.rya.accumulo.AccumuloRdfConfiguration), Date (java.util.Date)

Aggregations

TablePrefixLayoutStrategy (org.apache.rya.api.layout.TablePrefixLayoutStrategy) 11
AccumuloRdfConfiguration (org.apache.rya.accumulo.AccumuloRdfConfiguration) 7
BatchWriterConfig (org.apache.accumulo.core.client.BatchWriterConfig) 4
MockInstance (org.apache.accumulo.core.client.mock.MockInstance) 4
Before (org.junit.Before) 4
PasswordToken (org.apache.accumulo.core.client.security.tokens.PasswordToken) 3
RdfCloudTripleStoreConfiguration (org.apache.rya.api.RdfCloudTripleStoreConfiguration) 3
ProspectorServiceEvalStatsDAO (org.apache.rya.prospector.service.ProspectorServiceEvalStatsDAO) 3
MongoDBRdfConfiguration (org.apache.rya.mongodb.MongoDBRdfConfiguration) 2
StatefulMongoDBRdfConfiguration (org.apache.rya.mongodb.StatefulMongoDBRdfConfiguration) 2
RdfCloudTripleStore (org.apache.rya.rdftriplestore.RdfCloudTripleStore) 2
InferenceEngine (org.apache.rya.rdftriplestore.inference.InferenceEngine) 2
MongoClient (com.mongodb.MongoClient) 1
ArrayList (java.util.ArrayList) 1
Date (java.util.Date) 1
TableNotFoundException (org.apache.accumulo.core.client.TableNotFoundException) 1
TableOperations (org.apache.accumulo.core.client.admin.TableOperations) 1
Configuration (org.apache.hadoop.conf.Configuration) 1
FileStatus (org.apache.hadoop.fs.FileStatus) 1
Path (org.apache.hadoop.fs.Path) 1