Search in sources :

Example 1 with DataLoadMetrics

use of org.apache.carbondata.core.util.DataLoadMetrics in project carbondata by apache.

From the class HiveCarbonUtil, the method getCarbonLoadModel:

/**
 * Builds a {@link CarbonLoadModel} for a Hive-managed Carbon table.
 *
 * @param tableName         table name
 * @param databaseName      database name
 * @param location          table location on the file system
 * @param sortColumnsString comma-separated sort columns (used only when the schema is built here)
 * @param columns           column names, also used as the load file header
 * @param columnTypes       column types matching {@code columns}
 * @param configuration     Hadoop configuration used for file-system access
 * @return a load model with parsing skipped and fresh metrics attached
 * @throws RuntimeException if the schema cannot be read, inferred, or built
 */
public static CarbonLoadModel getCarbonLoadModel(String tableName, String databaseName, String location, String sortColumnsString, String[] columns, String[] columnTypes, Configuration configuration) {
    // Resolve the CarbonTable first: a schema file at the table location means a
    // transactional (store-managed) table; otherwise the table is treated as
    // non-transactional and its schema is inferred or built.
    CarbonTable resolvedTable;
    try {
        AbsoluteTableIdentifier identifier =
                AbsoluteTableIdentifier.from(location, databaseName, tableName, "");
        String schemaFilePath = CarbonTablePath.getSchemaFilePath(location, configuration);
        if (FileFactory.getCarbonFile(schemaFilePath).exists()) {
            resolvedTable = SchemaReader.readCarbonTableFromStore(identifier);
            resolvedTable.setTransactionalTable(true);
        } else {
            // No schema file: infer the schema from an existing carbondata file if
            // one is present, else build it from the supplied column information.
            String existingDataFile = CarbonUtil.getFilePathExternalFilePath(location, configuration);
            if (existingDataFile == null) {
                resolvedTable = CarbonTable.buildFromTableInfo(getTableInfo(tableName, databaseName,
                        location, sortColumnsString, columns, columnTypes, new ArrayList<>()));
            } else {
                resolvedTable = CarbonTable.buildFromTableInfo(
                        SchemaReader.inferSchema(identifier, false, configuration));
            }
            resolvedTable.setTransactionalTable(false);
        }
    } catch (SQLException | IOException e) {
        throw new RuntimeException("Unable to fetch schema for the table: " + tableName, e);
    }
    // Build the load model, passing the column list as the file header option.
    Map<String, String> loadOptions = new HashMap<>();
    loadOptions.put("fileheader", Strings.mkString(columns, ","));
    CarbonLoadModel builtModel;
    try {
        builtModel = new CarbonLoadModelBuilder(resolvedTable)
                .build(loadOptions, System.currentTimeMillis(), "");
    } catch (InvalidLoadOptionException | IOException e) {
        throw new RuntimeException(e);
    }
    builtModel.setSkipParsers();
    builtModel.setMetrics(new DataLoadMetrics());
    return builtModel;
}
Also used : InvalidLoadOptionException(org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException) DataLoadMetrics(org.apache.carbondata.core.util.DataLoadMetrics) SQLException(java.sql.SQLException) HashMap(java.util.HashMap) IOException(java.io.IOException) CarbonLoadModelBuilder(org.apache.carbondata.processing.loading.model.CarbonLoadModelBuilder) CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) AbsoluteTableIdentifier(org.apache.carbondata.core.metadata.AbsoluteTableIdentifier) CarbonLoadModel(org.apache.carbondata.processing.loading.model.CarbonLoadModel)

Example 2 with DataLoadMetrics

use of org.apache.carbondata.core.util.DataLoadMetrics in project carbondata by apache.

From the class CarbonTableOutputFormat, the method getLoadModel:

/**
 * Builds (or deserializes) the {@link CarbonLoadModel} for a write job from the
 * job {@link Configuration}, applying carbon property defaults for any option
 * not set on the configuration.
 *
 * @param conf job configuration to read load options from
 * @return a fully populated load model
 * @throws IOException if a serialized model in the configuration cannot be decoded
 */
public static CarbonLoadModel getLoadModel(Configuration conf) throws IOException {
    // Fast path: a caller may have serialized a complete load model into the
    // configuration; if so, deserialize and return it unchanged.
    String serialized = conf.get(LOAD_MODEL);
    if (serialized != null) {
        return (CarbonLoadModel) ObjectSerializationUtil.convertStringToObject(serialized);
    }
    CarbonProperties props = CarbonProperties.getInstance();
    CarbonLoadModel loadModel = new CarbonLoadModel();
    loadModel.setDatabaseName(CarbonTableOutputFormat.getDatabaseName(conf));
    loadModel.setTableName(CarbonTableOutputFormat.getTableName(conf));
    loadModel.setCarbonTransactionalTable(true);
    loadModel.setMetrics(new DataLoadMetrics());
    CarbonTable table = getCarbonTable(conf);
    // Global dictionary is not supported since 2.0; fail fast if the table
    // still declares DICTIONARY_INCLUDE.
    if (table.getTableInfo().getFactTable().getTableProperties()
            .containsKey(CarbonCommonConstants.DICTIONARY_INCLUDE)) {
        DeprecatedFeatureException.globalDictNotSupported();
    }
    // Column compressor: the table property wins, otherwise the system default.
    String compressor =
            table.getTableInfo().getFactTable().getTableProperties().get(CarbonCommonConstants.COMPRESSOR);
    if (compressor == null) {
        compressor = CompressorFactory.getInstance().getCompressor().getName();
    }
    loadModel.setColumnCompressor(compressor);
    loadModel.setCarbonDataLoadSchema(new CarbonDataLoadSchema(table));
    loadModel.setTablePath(getTablePath(conf));
    setFileHeader(conf, loadModel);
    loadModel.setSerializationNullFormat(conf.get(SERIALIZATION_NULL_FORMAT, "\\N"));
    // Bad-record handling: the job configuration overrides carbon property defaults.
    loadModel.setBadRecordsLoggerEnable(conf.get(BAD_RECORDS_LOGGER_ENABLE,
            props.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE,
                    CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE_DEFAULT)));
    loadModel.setBadRecordsAction(conf.get(BAD_RECORDS_LOGGER_ACTION,
            props.getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION,
                    CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION_DEFAULT)));
    loadModel.setIsEmptyDataBadRecord(conf.get(IS_EMPTY_DATA_BAD_RECORD,
            props.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD,
                    CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD_DEFAULT)));
    loadModel.setSkipEmptyLine(conf.get(SKIP_EMPTY_LINE,
            props.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_SKIP_EMPTY_LINE)));
    // Complex-type delimiters: use the configured list, or the four built-in
    // levels when none are configured.
    String delimiters = conf.get(COMPLEX_DELIMITERS);
    if (delimiters == null) {
        delimiters = ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_1.value() + ","
                + ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_2.value() + ","
                + ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_3.value() + ","
                + ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_4.value();
    }
    // Register each configured delimiter level in order, capped at four levels.
    String[] parts = delimiters.split(",");
    int levels = Math.min(parts.length, 4);
    for (int i = 0; i < levels; i++) {
        loadModel.setComplexDelimiter(parts[i]);
    }
    loadModel.setDateFormat(conf.get(DATE_FORMAT,
            props.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_DATEFORMAT,
                    CarbonLoadOptionConstants.CARBON_OPTIONS_DATEFORMAT_DEFAULT)));
    loadModel.setTimestampFormat(conf.get(TIMESTAMP_FORMAT,
            props.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_TIMESTAMPFORMAT,
                    CarbonLoadOptionConstants.CARBON_OPTIONS_TIMESTAMPFORMAT_DEFAULT)));
    loadModel.setGlobalSortPartitions(conf.get(GLOBAL_SORT_PARTITIONS,
            props.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_GLOBAL_SORT_PARTITIONS, null)));
    // Bad-record location resolution order: job configuration, then table
    // property, then carbon property (with the system default as last resort).
    String badRecordsPath = conf.get(BAD_RECORD_PATH);
    if (StringUtils.isEmpty(badRecordsPath)) {
        badRecordsPath = table.getTableInfo().getFactTable().getTableProperties().get("bad_record_path");
    }
    if (StringUtils.isEmpty(badRecordsPath)) {
        badRecordsPath = props.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORD_PATH,
                props.getProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC,
                        CarbonCommonConstants.CARBON_BADRECORDS_LOC_DEFAULT_VAL));
    }
    loadModel.setBadRecordsLocation(badRecordsPath);
    return loadModel;
}
Also used : CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) DataLoadMetrics(org.apache.carbondata.core.util.DataLoadMetrics) CarbonProperties(org.apache.carbondata.core.util.CarbonProperties) CarbonLoadModel(org.apache.carbondata.processing.loading.model.CarbonLoadModel) CarbonDataLoadSchema(org.apache.carbondata.processing.loading.model.CarbonDataLoadSchema)

Example 3 with DataLoadMetrics

use of org.apache.carbondata.core.util.DataLoadMetrics in project carbondata by apache.

From the class CarbonTableOutputFormat, the method getRecordWriter:

@Override
public RecordWriter<NullWritable, ObjectArrayWritable> getRecordWriter(final TaskAttemptContext taskAttemptContext) throws IOException {
    final CarbonLoadModel loadModel = getLoadModel(taskAttemptContext.getConfiguration());
    loadModel.setMetrics(new DataLoadMetrics());
    String appName = taskAttemptContext.getConfiguration().get(CarbonCommonConstants.CARBON_WRITTEN_BY_APPNAME);
    if (null != appName) {
        CarbonProperties.getInstance().addProperty(CarbonCommonConstants.CARBON_WRITTEN_BY_APPNAME, appName);
    }
    // if loadModel having taskNo already(like in SDK) then no need to overwrite
    short sdkWriterCores = loadModel.getSdkWriterCores();
    int itrSize = (sdkWriterCores > 0) ? sdkWriterCores : 1;
    final CarbonOutputIteratorWrapper[] iterators = new CarbonOutputIteratorWrapper[itrSize];
    for (int i = 0; i < itrSize; i++) {
        iterators[i] = new CarbonOutputIteratorWrapper();
    }
    if (null == loadModel.getTaskNo() || loadModel.getTaskNo().isEmpty()) {
        loadModel.setTaskNo(taskAttemptContext.getConfiguration().get("carbon.outputformat.taskno", String.valueOf(DEFAULT_TASK_NO.getAndIncrement())));
    }
    loadModel.setDataWritePath(taskAttemptContext.getConfiguration().get("carbon.outputformat.writepath"));
    final String[] tempStoreLocations = getTempStoreLocations(taskAttemptContext);
    DataTypeUtil.clearFormatter();
    final DataLoadExecutor dataLoadExecutor = new DataLoadExecutor();
    final ExecutorService executorService = Executors.newFixedThreadPool(1, new CarbonThreadFactory("CarbonRecordWriter:" + loadModel.getTableName(), true));
    // It should be started in new thread as the underlying iterator uses blocking queue.
    Future future = executorService.submit(() -> {
        ThreadLocalSessionInfo.getOrCreateCarbonSessionInfo().getNonSerializableExtraInfo().put("carbonConf", taskAttemptContext.getConfiguration());
        try {
            dataLoadExecutor.execute(loadModel, tempStoreLocations, iterators);
        } catch (Exception e) {
            executorService.shutdownNow();
            for (CarbonOutputIteratorWrapper iterator : iterators) {
                iterator.closeWriter(true);
            }
            try {
                dataLoadExecutor.close();
            } catch (Exception ex) {
                // As already exception happened before close() send that exception.
                throw new RuntimeException(e);
            }
            throw new RuntimeException(e);
        } finally {
            ThreadLocalSessionInfo.unsetAll();
        }
    });
    if (sdkWriterCores > 0) {
        // CarbonMultiRecordWriter handles the load balancing of the write rows in round robin.
        return new CarbonMultiRecordWriter(iterators, dataLoadExecutor, loadModel, future, executorService);
    } else {
        return new CarbonRecordWriter(iterators[0], dataLoadExecutor, loadModel, future, executorService);
    }
}
Also used : DataLoadMetrics(org.apache.carbondata.core.util.DataLoadMetrics) IOException(java.io.IOException) DeprecatedFeatureException(org.apache.carbondata.common.exceptions.DeprecatedFeatureException) ExecutionException(java.util.concurrent.ExecutionException) CarbonOutputIteratorWrapper(org.apache.carbondata.processing.loading.iterator.CarbonOutputIteratorWrapper) CarbonLoadModel(org.apache.carbondata.processing.loading.model.CarbonLoadModel) ExecutorService(java.util.concurrent.ExecutorService) CarbonThreadFactory(org.apache.carbondata.core.util.CarbonThreadFactory) Future(java.util.concurrent.Future) DataLoadExecutor(org.apache.carbondata.processing.loading.DataLoadExecutor)

Example 4 with DataLoadMetrics

use of org.apache.carbondata.core.util.DataLoadMetrics in project carbondata by apache.

From the class CarbonLoadModelBuilder, the method build:

/**
 * Builds a CarbonLoadModel for data loading: copies table identity onto the
 * model, validates the user-supplied options, and transfers every resolved
 * option from {@code optionsFinal} onto the model.
 *
 * @param options Load options from user input (only "line_separator" and
 *                "binary_decoder" are read directly from here)
 * @param optionsFinal Load options that populated with default values for optional options
 * @param carbonLoadModel The output load model, mutated in place
 * @param hadoopConf hadoopConf is needed to read CSV header if there 'fileheader' is not set in
 *                   user provided load options
 * @param partitions partition name map to path; entries with a non-null path are
 *                   excluded from the CSV header columns (unless isDataFrame)
 * @param isDataFrame true if build for load for dataframe
 * @throws InvalidLoadOptionException if an option fails validation
 * @throws IOException if reading the CSV header or load metadata fails
 */
public void build(Map<String, String> options, Map<String, String> optionsFinal, CarbonLoadModel carbonLoadModel, Configuration hadoopConf, Map<String, String> partitions, boolean isDataFrame) throws InvalidLoadOptionException, IOException {
    // Copy table identity onto the model.
    carbonLoadModel.setTableName(table.getTableName());
    carbonLoadModel.setDatabaseName(table.getDatabaseName());
    carbonLoadModel.setTablePath(table.getTablePath());
    // NOTE(review): duplicate of the setTableName call above — harmless but redundant.
    carbonLoadModel.setTableName(table.getTableName());
    carbonLoadModel.setCarbonTransactionalTable(table.isTransactionalTable());
    CarbonDataLoadSchema dataLoadSchema = new CarbonDataLoadSchema(table);
    // Need to fill dimension relation
    carbonLoadModel.setCarbonDataLoadSchema(dataLoadSchema);
    // Pull the resolved (defaulted) options used below.
    String sort_scope = optionsFinal.get("sort_scope");
    String bad_records_logger_enable = optionsFinal.get("bad_records_logger_enable");
    String bad_records_action = optionsFinal.get("bad_records_action");
    String bad_record_path = optionsFinal.get("bad_record_path");
    String global_sort_partitions = optionsFinal.get("global_sort_partitions");
    String timestampformat = optionsFinal.get("timestampformat");
    String dateFormat = optionsFinal.get("dateformat");
    String delimiter = optionsFinal.get("delimiter");
    String complex_delimiter_level1 = optionsFinal.get("complex_delimiter_level_1");
    String complex_delimiter_level2 = optionsFinal.get("complex_delimiter_level_2");
    String complex_delimiter_level3 = optionsFinal.get("complex_delimiter_level_3");
    String complex_delimiter_level4 = optionsFinal.get("complex_delimiter_level_4");
    validateDateTimeFormat(timestampformat, "TimestampFormat");
    validateDateTimeFormat(dateFormat, "DateFormat");
    // Bad-record logging/redirect requires an explicit bad-record location.
    if (Boolean.parseBoolean(bad_records_logger_enable) || LoggerAction.REDIRECT.name().equalsIgnoreCase(bad_records_action)) {
        if (!StringUtils.isEmpty(bad_record_path)) {
            bad_record_path = CarbonUtil.checkAndAppendHDFSUrl(bad_record_path);
        } else {
            throw new InvalidLoadOptionException("Cannot redirect bad records as bad record location is not provided.");
        }
    }
    carbonLoadModel.setBadRecordsLocation(bad_record_path);
    validateGlobalSortPartitions(global_sort_partitions);
    carbonLoadModel.setEscapeChar(checkDefaultValue(optionsFinal.get("escapechar"), "\\"));
    carbonLoadModel.setQuoteChar(CarbonUtil.unescapeChar(checkDefaultValue(optionsFinal.get("quotechar"), "\"")));
    carbonLoadModel.setCommentChar(checkDefaultValue(optionsFinal.get("commentchar"), "#"));
    // Line separator is optional and read from the raw user options.
    String lineSeparator = CarbonUtil.unescapeChar(options.get("line_separator"));
    if (lineSeparator != null) {
        carbonLoadModel.setLineSeparator(lineSeparator);
    }
    // if there isn't file header in csv file and load sql doesn't provide FILEHEADER option,
    // we should use table schema to generate file header.
    String fileHeader = optionsFinal.get("fileheader");
    String headerOption = optionsFinal.get("header");
    if (StringUtils.isNotEmpty(headerOption)) {
        if (!headerOption.equalsIgnoreCase("true") && !headerOption.equalsIgnoreCase("false")) {
            throw new InvalidLoadOptionException("'header' option should be either 'true' or 'false'.");
        }
        // whether the csv file has file header, the default value is true
        if (Boolean.valueOf(headerOption)) {
            if (!StringUtils.isEmpty(fileHeader)) {
                throw new InvalidLoadOptionException("When 'header' option is true, 'fileheader' option is not required.");
            }
        } else {
            if (StringUtils.isEmpty(fileHeader)) {
                // Derive the header from the table's create-order columns.
                List<CarbonColumn> columns = table.getCreateOrderColumn();
                List<String> columnNames = new ArrayList<>();
                List<String> partitionColumns = new ArrayList<>();
                for (int i = 0; i < columns.size(); i++) {
                    columnNames.add(columns.get(i).getColName());
                }
                // NOTE(review): partitionColumns is never populated, so this
                // addAll is a no-op — confirm whether partition columns were
                // meant to be appended here.
                columnNames.addAll(partitionColumns);
                fileHeader = Strings.mkString(columnNames.toArray(new String[columnNames.size()]), ",");
            }
        }
    }
    String binaryDecoder = options.get("binary_decoder");
    carbonLoadModel.setBinaryDecoder(binaryDecoder);
    carbonLoadModel.setTimestampFormat(timestampformat);
    carbonLoadModel.setDateFormat(dateFormat);
    carbonLoadModel.setDefaultTimestampFormat(CarbonProperties.getInstance().getProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT));
    carbonLoadModel.setDefaultDateFormat(CarbonProperties.getInstance().getProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT));
    carbonLoadModel.setSerializationNullFormat(TableOptionConstant.SERIALIZATION_NULL_FORMAT.getName() + "," + optionsFinal.get("serialization_null_format"));
    carbonLoadModel.setBadRecordsLoggerEnable(TableOptionConstant.BAD_RECORDS_LOGGER_ENABLE.getName() + "," + bad_records_logger_enable);
    carbonLoadModel.setBadRecordsAction(TableOptionConstant.BAD_RECORDS_ACTION.getName() + "," + bad_records_action.toUpperCase());
    carbonLoadModel.setIsEmptyDataBadRecord(DataLoadProcessorConstants.IS_EMPTY_DATA_BAD_RECORD + "," + optionsFinal.get("is_empty_data_bad_record"));
    carbonLoadModel.setSkipEmptyLine(optionsFinal.get("skip_empty_line"));
    carbonLoadModel.setSortScope(sort_scope);
    if (global_sort_partitions == null) {
        global_sort_partitions = table.getGlobalSortPartitions();
    }
    carbonLoadModel.setGlobalSortPartitions(global_sort_partitions);
    // The field delimiter must differ from the complex-type delimiters.
    // NOTE(review): level 4 is never compared against the field delimiter, and
    // levels 2/3 are not compared against each other — confirm whether this
    // partial check is intentional.
    if (delimiter.equalsIgnoreCase(complex_delimiter_level1) || complex_delimiter_level1.equalsIgnoreCase(complex_delimiter_level2) || delimiter.equalsIgnoreCase(complex_delimiter_level2) || delimiter.equalsIgnoreCase(complex_delimiter_level3)) {
        throw new InvalidLoadOptionException("Field Delimiter and Complex types delimiter are same");
    } else {
        // Register levels 1-4 in order; setComplexDelimiter is presumably
        // accumulating (each call adds the next level) — verify against
        // CarbonLoadModel.
        carbonLoadModel.setComplexDelimiter(complex_delimiter_level1);
        carbonLoadModel.setComplexDelimiter(complex_delimiter_level2);
        carbonLoadModel.setComplexDelimiter(complex_delimiter_level3);
        carbonLoadModel.setComplexDelimiter(complex_delimiter_level4);
    }
    carbonLoadModel.setCsvDelimiter(CarbonUtil.unescapeChar(delimiter));
    carbonLoadModel.setCsvHeader(fileHeader);
    // For non-dataframe loads, static partitions (those with a value/path) are
    // excluded from the CSV header columns.
    List<String> ignoreColumns = new ArrayList<>();
    if (!isDataFrame) {
        for (Map.Entry<String, String> partition : partitions.entrySet()) {
            if (partition.getValue() != null) {
                ignoreColumns.add(partition.getKey());
            }
        }
    }
    carbonLoadModel.setCsvHeaderColumns(LoadOption.getCsvHeaderColumns(carbonLoadModel, hadoopConf, ignoreColumns));
    int validatedMaxColumns = validateMaxColumns(carbonLoadModel.getCsvHeaderColumns(), optionsFinal.get("maxcolumns"));
    carbonLoadModel.setMaxColumns(String.valueOf(validatedMaxColumns));
    // Transactional tables also need existing load metadata on the model.
    if (carbonLoadModel.isCarbonTransactionalTable()) {
        carbonLoadModel.readAndSetLoadMetadataDetails();
    }
    carbonLoadModel.setSortColumnsBoundsStr(optionsFinal.get("sort_column_bounds"));
    carbonLoadModel.setLoadMinSize(optionsFinal.get(CarbonCommonConstants.CARBON_LOAD_MIN_SIZE_INMB));
    // Final validations, then attach fresh load metrics.
    validateAndSetLoadMinSize(carbonLoadModel);
    validateAndSetColumnCompressor(carbonLoadModel);
    validateAndSetBinaryDecoder(carbonLoadModel);
    validateRangeColumn(optionsFinal, carbonLoadModel);
    carbonLoadModel.setMetrics(new DataLoadMetrics());
}
Also used : InvalidLoadOptionException(org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException) DataLoadMetrics(org.apache.carbondata.core.util.DataLoadMetrics) CarbonColumn(org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn) ArrayList(java.util.ArrayList) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

DataLoadMetrics (org.apache.carbondata.core.util.DataLoadMetrics)4 CarbonLoadModel (org.apache.carbondata.processing.loading.model.CarbonLoadModel)3 IOException (java.io.IOException)2 HashMap (java.util.HashMap)2 InvalidLoadOptionException (org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException)2 CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable)2 SQLException (java.sql.SQLException)1 ArrayList (java.util.ArrayList)1 Map (java.util.Map)1 ExecutionException (java.util.concurrent.ExecutionException)1 ExecutorService (java.util.concurrent.ExecutorService)1 Future (java.util.concurrent.Future)1 DeprecatedFeatureException (org.apache.carbondata.common.exceptions.DeprecatedFeatureException)1 AbsoluteTableIdentifier (org.apache.carbondata.core.metadata.AbsoluteTableIdentifier)1 CarbonColumn (org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn)1 CarbonProperties (org.apache.carbondata.core.util.CarbonProperties)1 CarbonThreadFactory (org.apache.carbondata.core.util.CarbonThreadFactory)1 DataLoadExecutor (org.apache.carbondata.processing.loading.DataLoadExecutor)1 CarbonOutputIteratorWrapper (org.apache.carbondata.processing.loading.iterator.CarbonOutputIteratorWrapper)1 CarbonDataLoadSchema (org.apache.carbondata.processing.loading.model.CarbonDataLoadSchema)1