Use of org.apache.carbondata.core.util.DataLoadMetrics in project carbondata by apache.
The class HiveCarbonUtil, method getCarbonLoadModel.
public static CarbonLoadModel getCarbonLoadModel(String tableName, String databaseName,
    String location, String sortColumnsString, String[] columns, String[] columnTypes,
    Configuration configuration) {
  CarbonLoadModel loadModel;
  CarbonTable carbonTable;
  try {
    String schemaFilePath = CarbonTablePath.getSchemaFilePath(location, configuration);
    AbsoluteTableIdentifier absoluteTableIdentifier =
        AbsoluteTableIdentifier.from(location, databaseName, tableName, "");
    if (FileFactory.getCarbonFile(schemaFilePath).exists()) {
      // A schema file exists, so this is a transactional table: read the stored schema.
      carbonTable = SchemaReader.readCarbonTableFromStore(absoluteTableIdentifier);
      carbonTable.setTransactionalTable(true);
    } else {
      String carbonDataFile = CarbonUtil.getFilePathExternalFilePath(location, configuration);
      if (carbonDataFile == null) {
        // No carbondata file at the location: build the schema from the Hive column definitions.
        carbonTable = CarbonTable.buildFromTableInfo(getTableInfo(tableName, databaseName,
            location, sortColumnsString, columns, columnTypes, new ArrayList<>()));
      } else {
        // Infer the schema from an existing carbondata file at the location.
        carbonTable = CarbonTable.buildFromTableInfo(
            SchemaReader.inferSchema(absoluteTableIdentifier, false, configuration));
      }
      carbonTable.setTransactionalTable(false);
    }
  } catch (SQLException | IOException e) {
    throw new RuntimeException("Unable to fetch schema for the table: " + tableName, e);
  }
  CarbonLoadModelBuilder carbonLoadModelBuilder = new CarbonLoadModelBuilder(carbonTable);
  Map<String, String> options = new HashMap<>();
  options.put("fileheader", Strings.mkString(columns, ","));
  try {
    loadModel = carbonLoadModelBuilder.build(options, System.currentTimeMillis(), "");
  } catch (InvalidLoadOptionException | IOException e) {
    throw new RuntimeException(e);
  }
  loadModel.setSkipParsers();
  loadModel.setMetrics(new DataLoadMetrics());
  return loadModel;
}
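For orientation, a minimal sketch of how this helper might be called. The table name, column schema, and location are hypothetical, and imports are elided as in the snippet above:

  // Hypothetical call site; the schema and location are illustrative only.
  Configuration conf = new Configuration();
  String[] columns = new String[] { "id", "name", "salary" };
  String[] columnTypes = new String[] { "int", "string", "double" };
  CarbonLoadModel loadModel = HiveCarbonUtil.getCarbonLoadModel(
      "employee",                                      // table name
      "default",                                       // database name
      "hdfs://namenode/user/hive/warehouse/employee",  // table location
      "id",                                            // sort columns
      columns, columnTypes, conf);
  // The returned model already carries the fileheader option, the skip-parsers
  // flag, and a fresh DataLoadMetrics instance.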
Use of org.apache.carbondata.core.util.DataLoadMetrics in project carbondata by apache.
The class CarbonTableOutputFormat, method getLoadModel.
public static CarbonLoadModel getLoadModel(Configuration conf) throws IOException {
  CarbonLoadModel model;
  String encodedString = conf.get(LOAD_MODEL);
  if (encodedString != null) {
    // Fast path: a pre-built load model was serialized into the configuration.
    model = (CarbonLoadModel) ObjectSerializationUtil.convertStringToObject(encodedString);
    return model;
  }
  model = new CarbonLoadModel();
  CarbonProperties carbonProperty = CarbonProperties.getInstance();
  model.setDatabaseName(CarbonTableOutputFormat.getDatabaseName(conf));
  model.setTableName(CarbonTableOutputFormat.getTableName(conf));
  model.setCarbonTransactionalTable(true);
  model.setMetrics(new DataLoadMetrics());
  CarbonTable carbonTable = getCarbonTable(conf);
  // Global dictionary is not supported since 2.0.
  if (carbonTable.getTableInfo().getFactTable().getTableProperties()
      .containsKey(CarbonCommonConstants.DICTIONARY_INCLUDE)) {
    DeprecatedFeatureException.globalDictNotSupported();
  }
  String columnCompressor = carbonTable.getTableInfo().getFactTable().getTableProperties()
      .get(CarbonCommonConstants.COMPRESSOR);
  if (null == columnCompressor) {
    columnCompressor = CompressorFactory.getInstance().getCompressor().getName();
  }
  model.setColumnCompressor(columnCompressor);
  model.setCarbonDataLoadSchema(new CarbonDataLoadSchema(carbonTable));
  model.setTablePath(getTablePath(conf));
  setFileHeader(conf, model);
  model.setSerializationNullFormat(conf.get(SERIALIZATION_NULL_FORMAT, "\\N"));
  model.setBadRecordsLoggerEnable(conf.get(BAD_RECORDS_LOGGER_ENABLE,
      carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE,
          CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE_DEFAULT)));
  model.setBadRecordsAction(conf.get(BAD_RECORDS_LOGGER_ACTION,
      carbonProperty.getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION,
          CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION_DEFAULT)));
  model.setIsEmptyDataBadRecord(conf.get(IS_EMPTY_DATA_BAD_RECORD,
      carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD,
          CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD_DEFAULT)));
  model.setSkipEmptyLine(conf.get(SKIP_EMPTY_LINE,
      carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_SKIP_EMPTY_LINE)));
  String complexDelim = conf.get(COMPLEX_DELIMITERS);
  if (null == complexDelim) {
    complexDelim = ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_1.value() + ","
        + ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_2.value() + ","
        + ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_3.value() + ","
        + ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_4.value();
  }
  // setComplexDelimiter appends to the model's delimiter list, so each configured
  // level (up to four) is registered in order.
  String[] split = complexDelim.split(",");
  model.setComplexDelimiter(split[0]);
  if (split.length > 3) {
    model.setComplexDelimiter(split[1]);
    model.setComplexDelimiter(split[2]);
    model.setComplexDelimiter(split[3]);
  } else if (split.length > 2) {
    model.setComplexDelimiter(split[1]);
    model.setComplexDelimiter(split[2]);
  } else if (split.length > 1) {
    model.setComplexDelimiter(split[1]);
  }
  model.setDateFormat(conf.get(DATE_FORMAT,
      carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_DATEFORMAT,
          CarbonLoadOptionConstants.CARBON_OPTIONS_DATEFORMAT_DEFAULT)));
  model.setTimestampFormat(conf.get(TIMESTAMP_FORMAT,
      carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_TIMESTAMPFORMAT,
          CarbonLoadOptionConstants.CARBON_OPTIONS_TIMESTAMPFORMAT_DEFAULT)));
  model.setGlobalSortPartitions(conf.get(GLOBAL_SORT_PARTITIONS,
      carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_GLOBAL_SORT_PARTITIONS,
          null)));
  // Bad-records path resolution order: job configuration, then table property,
  // then the carbon system property.
  String badRecordsPath = conf.get(BAD_RECORD_PATH);
  if (StringUtils.isEmpty(badRecordsPath)) {
    badRecordsPath = carbonTable.getTableInfo().getFactTable().getTableProperties()
        .get("bad_record_path");
    if (StringUtils.isEmpty(badRecordsPath)) {
      badRecordsPath = carbonProperty.getProperty(
          CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORD_PATH,
          carbonProperty.getProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC,
              CarbonCommonConstants.CARBON_BADRECORDS_LOC_DEFAULT_VAL));
    }
  }
  model.setBadRecordsLocation(badRecordsPath);
  return model;
}
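The fast path at the top of getLoadModel can be exercised by pre-serializing a model into the job configuration. A sketch, assuming access to the same LOAD_MODEL key (internal to this class) and using ObjectSerializationUtil.convertObjectToString, the inverse of convertStringToObject used above; the database and table names are illustrative:

  // Round-trip sketch: pre-serialize a load model so getLoadModel takes the fast path.
  CarbonLoadModel model = new CarbonLoadModel();
  model.setDatabaseName("default");
  model.setTableName("employee");
  String encoded = ObjectSerializationUtil.convertObjectToString(model);
  conf.set(LOAD_MODEL, encoded);  // same conf key read by getLoadModel above
  CarbonLoadModel restored = CarbonTableOutputFormat.getLoadModel(conf);  // decoded, not rebuilt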
Use of org.apache.carbondata.core.util.DataLoadMetrics in project carbondata by apache.
The class CarbonTableOutputFormat, method getRecordWriter.
@Override
public RecordWriter<NullWritable, ObjectArrayWritable> getRecordWriter(
    final TaskAttemptContext taskAttemptContext) throws IOException {
  final CarbonLoadModel loadModel = getLoadModel(taskAttemptContext.getConfiguration());
  loadModel.setMetrics(new DataLoadMetrics());
  String appName =
      taskAttemptContext.getConfiguration().get(CarbonCommonConstants.CARBON_WRITTEN_BY_APPNAME);
  if (null != appName) {
    CarbonProperties.getInstance()
        .addProperty(CarbonCommonConstants.CARBON_WRITTEN_BY_APPNAME, appName);
  }
  // One iterator per writer core; a single iterator unless SDK writer cores are configured.
  short sdkWriterCores = loadModel.getSdkWriterCores();
  int itrSize = (sdkWriterCores > 0) ? sdkWriterCores : 1;
  final CarbonOutputIteratorWrapper[] iterators = new CarbonOutputIteratorWrapper[itrSize];
  for (int i = 0; i < itrSize; i++) {
    iterators[i] = new CarbonOutputIteratorWrapper();
  }
  // If the loadModel already has a taskNo (as in the SDK), do not overwrite it.
  if (null == loadModel.getTaskNo() || loadModel.getTaskNo().isEmpty()) {
    loadModel.setTaskNo(taskAttemptContext.getConfiguration()
        .get("carbon.outputformat.taskno", String.valueOf(DEFAULT_TASK_NO.getAndIncrement())));
  }
  loadModel.setDataWritePath(
      taskAttemptContext.getConfiguration().get("carbon.outputformat.writepath"));
  final String[] tempStoreLocations = getTempStoreLocations(taskAttemptContext);
  DataTypeUtil.clearFormatter();
  final DataLoadExecutor dataLoadExecutor = new DataLoadExecutor();
  final ExecutorService executorService = Executors.newFixedThreadPool(1,
      new CarbonThreadFactory("CarbonRecordWriter:" + loadModel.getTableName(), true));
  // The load must be started in a new thread because the underlying iterator uses a
  // blocking queue.
  Future future = executorService.submit(() -> {
    ThreadLocalSessionInfo.getOrCreateCarbonSessionInfo().getNonSerializableExtraInfo()
        .put("carbonConf", taskAttemptContext.getConfiguration());
    try {
      dataLoadExecutor.execute(loadModel, tempStoreLocations, iterators);
    } catch (Exception e) {
      executorService.shutdownNow();
      for (CarbonOutputIteratorWrapper iterator : iterators) {
        iterator.closeWriter(true);
      }
      try {
        dataLoadExecutor.close();
      } catch (Exception ex) {
        // An exception already occurred before close(), so propagate the original one.
        throw new RuntimeException(e);
      }
      throw new RuntimeException(e);
    } finally {
      ThreadLocalSessionInfo.unsetAll();
    }
  });
  if (sdkWriterCores > 0) {
    // CarbonMultiRecordWriter load-balances the written rows across iterators in round-robin.
    return new CarbonMultiRecordWriter(iterators, dataLoadExecutor, loadModel, future,
        executorService);
  } else {
    return new CarbonRecordWriter(iterators[0], dataLoadExecutor, loadModel, future,
        executorService);
  }
}
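A sketch of task-side usage of the returned writer, assuming outputFormat is a configured CarbonTableOutputFormat instance and taskAttemptContext comes from the MapReduce framework; the row values are illustrative and error handling is omitted:

  // Hypothetical task-side usage; the row values are illustrative only.
  RecordWriter<NullWritable, ObjectArrayWritable> writer =
      outputFormat.getRecordWriter(taskAttemptContext);
  ObjectArrayWritable row = new ObjectArrayWritable();
  row.set(new Object[] { "1", "alice", "75000" });
  writer.write(NullWritable.get(), row);
  // close() waits for the background DataLoadExecutor thread to finish the load.
  writer.close(taskAttemptContext);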
Use of org.apache.carbondata.core.util.DataLoadMetrics in project carbondata by apache.
The class CarbonLoadModelBuilder, method build.
/**
 * Build a CarbonLoadModel for data loading.
 * @param options load options from user input
 * @param optionsFinal load options populated with default values for the optional options
 * @param carbonLoadModel the output load model
 * @param hadoopConf needed to read the CSV header when 'fileheader' is not set in the
 *                   user-provided load options
 * @param partitions map from partition name to path
 * @param isDataFrame true if building a load for a dataframe
 */
public void build(Map<String, String> options, Map<String, String> optionsFinal,
    CarbonLoadModel carbonLoadModel, Configuration hadoopConf,
    Map<String, String> partitions, boolean isDataFrame)
    throws InvalidLoadOptionException, IOException {
  carbonLoadModel.setTableName(table.getTableName());
  carbonLoadModel.setDatabaseName(table.getDatabaseName());
  carbonLoadModel.setTablePath(table.getTablePath());
  carbonLoadModel.setCarbonTransactionalTable(table.isTransactionalTable());
  CarbonDataLoadSchema dataLoadSchema = new CarbonDataLoadSchema(table);
  // Need to fill the dimension relation.
  carbonLoadModel.setCarbonDataLoadSchema(dataLoadSchema);
  String sort_scope = optionsFinal.get("sort_scope");
  String bad_records_logger_enable = optionsFinal.get("bad_records_logger_enable");
  String bad_records_action = optionsFinal.get("bad_records_action");
  String bad_record_path = optionsFinal.get("bad_record_path");
  String global_sort_partitions = optionsFinal.get("global_sort_partitions");
  String timestampformat = optionsFinal.get("timestampformat");
  String dateFormat = optionsFinal.get("dateformat");
  String delimiter = optionsFinal.get("delimiter");
  String complex_delimiter_level1 = optionsFinal.get("complex_delimiter_level_1");
  String complex_delimiter_level2 = optionsFinal.get("complex_delimiter_level_2");
  String complex_delimiter_level3 = optionsFinal.get("complex_delimiter_level_3");
  String complex_delimiter_level4 = optionsFinal.get("complex_delimiter_level_4");
  validateDateTimeFormat(timestampformat, "TimestampFormat");
  validateDateTimeFormat(dateFormat, "DateFormat");
  if (Boolean.parseBoolean(bad_records_logger_enable)
      || LoggerAction.REDIRECT.name().equalsIgnoreCase(bad_records_action)) {
    if (!StringUtils.isEmpty(bad_record_path)) {
      bad_record_path = CarbonUtil.checkAndAppendHDFSUrl(bad_record_path);
    } else {
      throw new InvalidLoadOptionException(
          "Cannot redirect bad records as bad record location is not provided.");
    }
  }
  carbonLoadModel.setBadRecordsLocation(bad_record_path);
  validateGlobalSortPartitions(global_sort_partitions);
  carbonLoadModel.setEscapeChar(checkDefaultValue(optionsFinal.get("escapechar"), "\\"));
  carbonLoadModel.setQuoteChar(
      CarbonUtil.unescapeChar(checkDefaultValue(optionsFinal.get("quotechar"), "\"")));
  carbonLoadModel.setCommentChar(checkDefaultValue(optionsFinal.get("commentchar"), "#"));
  String lineSeparator = CarbonUtil.unescapeChar(options.get("line_separator"));
  if (lineSeparator != null) {
    carbonLoadModel.setLineSeparator(lineSeparator);
  }
  // If the CSV file has no header and the load SQL does not provide the FILEHEADER
  // option, use the table schema to generate the file header.
  String fileHeader = optionsFinal.get("fileheader");
  String headerOption = optionsFinal.get("header");
  if (StringUtils.isNotEmpty(headerOption)) {
    if (!headerOption.equalsIgnoreCase("true") && !headerOption.equalsIgnoreCase("false")) {
      throw new InvalidLoadOptionException("'header' option should be either 'true' or 'false'.");
    }
    // Whether the CSV file has a header; the default is true.
    if (Boolean.valueOf(headerOption)) {
      if (!StringUtils.isEmpty(fileHeader)) {
        throw new InvalidLoadOptionException(
            "When 'header' option is true, 'fileheader' option is not required.");
      }
    } else {
      if (StringUtils.isEmpty(fileHeader)) {
        List<CarbonColumn> columns = table.getCreateOrderColumn();
        List<String> columnNames = new ArrayList<>();
        for (int i = 0; i < columns.size(); i++) {
          columnNames.add(columns.get(i).getColName());
        }
        fileHeader = Strings.mkString(columnNames.toArray(new String[columnNames.size()]), ",");
      }
    }
  }
  String binaryDecoder = options.get("binary_decoder");
  carbonLoadModel.setBinaryDecoder(binaryDecoder);
  carbonLoadModel.setTimestampFormat(timestampformat);
  carbonLoadModel.setDateFormat(dateFormat);
  carbonLoadModel.setDefaultTimestampFormat(CarbonProperties.getInstance().getProperty(
      CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
      CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT));
  carbonLoadModel.setDefaultDateFormat(CarbonProperties.getInstance().getProperty(
      CarbonCommonConstants.CARBON_DATE_FORMAT,
      CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT));
  carbonLoadModel.setSerializationNullFormat(
      TableOptionConstant.SERIALIZATION_NULL_FORMAT.getName() + ","
          + optionsFinal.get("serialization_null_format"));
  carbonLoadModel.setBadRecordsLoggerEnable(
      TableOptionConstant.BAD_RECORDS_LOGGER_ENABLE.getName() + "," + bad_records_logger_enable);
  carbonLoadModel.setBadRecordsAction(
      TableOptionConstant.BAD_RECORDS_ACTION.getName() + "," + bad_records_action.toUpperCase());
  carbonLoadModel.setIsEmptyDataBadRecord(
      DataLoadProcessorConstants.IS_EMPTY_DATA_BAD_RECORD + ","
          + optionsFinal.get("is_empty_data_bad_record"));
  carbonLoadModel.setSkipEmptyLine(optionsFinal.get("skip_empty_line"));
  carbonLoadModel.setSortScope(sort_scope);
  if (global_sort_partitions == null) {
    global_sort_partitions = table.getGlobalSortPartitions();
  }
  carbonLoadModel.setGlobalSortPartitions(global_sort_partitions);
  if (delimiter.equalsIgnoreCase(complex_delimiter_level1)
      || complex_delimiter_level1.equalsIgnoreCase(complex_delimiter_level2)
      || delimiter.equalsIgnoreCase(complex_delimiter_level2)
      || delimiter.equalsIgnoreCase(complex_delimiter_level3)) {
    throw new InvalidLoadOptionException("Field Delimiter and Complex types delimiter are same");
  } else {
    // setComplexDelimiter appends, so all four levels are registered in order.
    carbonLoadModel.setComplexDelimiter(complex_delimiter_level1);
    carbonLoadModel.setComplexDelimiter(complex_delimiter_level2);
    carbonLoadModel.setComplexDelimiter(complex_delimiter_level3);
    carbonLoadModel.setComplexDelimiter(complex_delimiter_level4);
  }
  carbonLoadModel.setCsvDelimiter(CarbonUtil.unescapeChar(delimiter));
  carbonLoadModel.setCsvHeader(fileHeader);
  // Columns whose values come from the partition path can be ignored in the CSV header.
  List<String> ignoreColumns = new ArrayList<>();
  if (!isDataFrame) {
    for (Map.Entry<String, String> partition : partitions.entrySet()) {
      if (partition.getValue() != null) {
        ignoreColumns.add(partition.getKey());
      }
    }
  }
  carbonLoadModel.setCsvHeaderColumns(
      LoadOption.getCsvHeaderColumns(carbonLoadModel, hadoopConf, ignoreColumns));
  int validatedMaxColumns = validateMaxColumns(carbonLoadModel.getCsvHeaderColumns(),
      optionsFinal.get("maxcolumns"));
  carbonLoadModel.setMaxColumns(String.valueOf(validatedMaxColumns));
  if (carbonLoadModel.isCarbonTransactionalTable()) {
    carbonLoadModel.readAndSetLoadMetadataDetails();
  }
  carbonLoadModel.setSortColumnsBoundsStr(optionsFinal.get("sort_column_bounds"));
  carbonLoadModel.setLoadMinSize(optionsFinal.get(CarbonCommonConstants.CARBON_LOAD_MIN_SIZE_INMB));
  validateAndSetLoadMinSize(carbonLoadModel);
  validateAndSetColumnCompressor(carbonLoadModel);
  validateAndSetBinaryDecoder(carbonLoadModel);
  validateRangeColumn(optionsFinal, carbonLoadModel);
  carbonLoadModel.setMetrics(new DataLoadMetrics());
}
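A minimal sketch of driving this overload directly, assuming LoadOption.fillOptionWithDefaultValue is available to populate optionsFinal with defaults (the simpler build overload used by HiveCarbonUtil above performs an equivalent step internally); the CarbonTable instance and CSV header are illustrative:

  // Hypothetical driver code; carbonTable and the header are illustrative only.
  CarbonLoadModelBuilder builder = new CarbonLoadModelBuilder(carbonTable);
  Map<String, String> options = new HashMap<>();
  options.put("fileheader", "id,name,salary");
  Map<String, String> optionsFinal = LoadOption.fillOptionWithDefaultValue(options);
  CarbonLoadModel model = new CarbonLoadModel();
  builder.build(options, optionsFinal, model, new Configuration(),
      new HashMap<String, String>(), /* isDataFrame */ false);
  // model now carries the delimiters, bad-record settings, CSV header columns,
  // and a fresh DataLoadMetrics instance set on the last line of build().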