Use of org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn in project carbondata by apache.
The class CarbonLoadModelBuilder, method build.
/**
 * Build CarbonLoadModel for data loading.
 * @param options Load options from user input
 * @param optionsFinal Load options populated with default values for optional options
 * @param carbonLoadModel The output load model
 * @param hadoopConf Needed to read the CSV header when 'fileheader' is not set in the
 *                   user-provided load options
 * @param partitions Map of partition name to path
 * @param isDataFrame True if building the load model for a dataframe load
 */
public void build(Map<String, String> options, Map<String, String> optionsFinal,
    CarbonLoadModel carbonLoadModel, Configuration hadoopConf,
    Map<String, String> partitions, boolean isDataFrame)
    throws InvalidLoadOptionException, IOException {
  carbonLoadModel.setTableName(table.getTableName());
  carbonLoadModel.setDatabaseName(table.getDatabaseName());
  carbonLoadModel.setTablePath(table.getTablePath());
  carbonLoadModel.setCarbonTransactionalTable(table.isTransactionalTable());
  CarbonDataLoadSchema dataLoadSchema = new CarbonDataLoadSchema(table);
  // Need to fill dimension relation
  carbonLoadModel.setCarbonDataLoadSchema(dataLoadSchema);
  String sort_scope = optionsFinal.get("sort_scope");
  String bad_records_logger_enable = optionsFinal.get("bad_records_logger_enable");
  String bad_records_action = optionsFinal.get("bad_records_action");
  String bad_record_path = optionsFinal.get("bad_record_path");
  String global_sort_partitions = optionsFinal.get("global_sort_partitions");
  String timestampformat = optionsFinal.get("timestampformat");
  String dateFormat = optionsFinal.get("dateformat");
  String delimiter = optionsFinal.get("delimiter");
  String complex_delimiter_level1 = optionsFinal.get("complex_delimiter_level_1");
  String complex_delimiter_level2 = optionsFinal.get("complex_delimiter_level_2");
  String complex_delimiter_level3 = optionsFinal.get("complex_delimiter_level_3");
  String complex_delimiter_level4 = optionsFinal.get("complex_delimiter_level_4");
  validateDateTimeFormat(timestampformat, "TimestampFormat");
  validateDateTimeFormat(dateFormat, "DateFormat");
  if (Boolean.parseBoolean(bad_records_logger_enable)
      || LoggerAction.REDIRECT.name().equalsIgnoreCase(bad_records_action)) {
    if (!StringUtils.isEmpty(bad_record_path)) {
      bad_record_path = CarbonUtil.checkAndAppendHDFSUrl(bad_record_path);
    } else {
      throw new InvalidLoadOptionException(
          "Cannot redirect bad records as bad record location is not provided.");
    }
  }
  carbonLoadModel.setBadRecordsLocation(bad_record_path);
  validateGlobalSortPartitions(global_sort_partitions);
  carbonLoadModel.setEscapeChar(checkDefaultValue(optionsFinal.get("escapechar"), "\\"));
  carbonLoadModel.setQuoteChar(
      CarbonUtil.unescapeChar(checkDefaultValue(optionsFinal.get("quotechar"), "\"")));
  carbonLoadModel.setCommentChar(checkDefaultValue(optionsFinal.get("commentchar"), "#"));
  String lineSeparator = CarbonUtil.unescapeChar(options.get("line_separator"));
  if (lineSeparator != null) {
    carbonLoadModel.setLineSeparator(lineSeparator);
  }
  // If the CSV file has no file header and the load SQL does not provide the FILEHEADER
  // option, the table schema is used to generate the file header.
  String fileHeader = optionsFinal.get("fileheader");
  String headerOption = optionsFinal.get("header");
  if (StringUtils.isNotEmpty(headerOption)) {
    if (!headerOption.equalsIgnoreCase("true") && !headerOption.equalsIgnoreCase("false")) {
      throw new InvalidLoadOptionException(
          "'header' option should be either 'true' or 'false'.");
    }
    // 'header' indicates whether the CSV file has a file header; the default value is true
    if (Boolean.parseBoolean(headerOption)) {
      if (!StringUtils.isEmpty(fileHeader)) {
        throw new InvalidLoadOptionException(
            "When 'header' option is true, 'fileheader' option is not required.");
      }
    } else {
      if (StringUtils.isEmpty(fileHeader)) {
        List<CarbonColumn> columns = table.getCreateOrderColumn();
        List<String> columnNames = new ArrayList<>();
        for (CarbonColumn column : columns) {
          columnNames.add(column.getColName());
        }
        fileHeader = Strings.mkString(columnNames.toArray(new String[columnNames.size()]), ",");
      }
    }
  }
  String binaryDecoder = options.get("binary_decoder");
  carbonLoadModel.setBinaryDecoder(binaryDecoder);
  carbonLoadModel.setTimestampFormat(timestampformat);
  carbonLoadModel.setDateFormat(dateFormat);
  carbonLoadModel.setDefaultTimestampFormat(CarbonProperties.getInstance().getProperty(
      CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
      CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT));
  carbonLoadModel.setDefaultDateFormat(CarbonProperties.getInstance().getProperty(
      CarbonCommonConstants.CARBON_DATE_FORMAT,
      CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT));
  carbonLoadModel.setSerializationNullFormat(
      TableOptionConstant.SERIALIZATION_NULL_FORMAT.getName() + ","
          + optionsFinal.get("serialization_null_format"));
  carbonLoadModel.setBadRecordsLoggerEnable(
      TableOptionConstant.BAD_RECORDS_LOGGER_ENABLE.getName() + "," + bad_records_logger_enable);
  carbonLoadModel.setBadRecordsAction(
      TableOptionConstant.BAD_RECORDS_ACTION.getName() + "," + bad_records_action.toUpperCase());
  carbonLoadModel.setIsEmptyDataBadRecord(
      DataLoadProcessorConstants.IS_EMPTY_DATA_BAD_RECORD + ","
          + optionsFinal.get("is_empty_data_bad_record"));
  carbonLoadModel.setSkipEmptyLine(optionsFinal.get("skip_empty_line"));
  carbonLoadModel.setSortScope(sort_scope);
  if (global_sort_partitions == null) {
    global_sort_partitions = table.getGlobalSortPartitions();
  }
  carbonLoadModel.setGlobalSortPartitions(global_sort_partitions);
  if (delimiter.equalsIgnoreCase(complex_delimiter_level1)
      || complex_delimiter_level1.equalsIgnoreCase(complex_delimiter_level2)
      || delimiter.equalsIgnoreCase(complex_delimiter_level2)
      || delimiter.equalsIgnoreCase(complex_delimiter_level3)) {
    throw new InvalidLoadOptionException("Field Delimiter and Complex types delimiter are same");
  } else {
    carbonLoadModel.setComplexDelimiter(complex_delimiter_level1);
    carbonLoadModel.setComplexDelimiter(complex_delimiter_level2);
    carbonLoadModel.setComplexDelimiter(complex_delimiter_level3);
    carbonLoadModel.setComplexDelimiter(complex_delimiter_level4);
  }
  carbonLoadModel.setCsvDelimiter(CarbonUtil.unescapeChar(delimiter));
  carbonLoadModel.setCsvHeader(fileHeader);
  List<String> ignoreColumns = new ArrayList<>();
  if (!isDataFrame) {
    for (Map.Entry<String, String> partition : partitions.entrySet()) {
      if (partition.getValue() != null) {
        ignoreColumns.add(partition.getKey());
      }
    }
  }
  carbonLoadModel.setCsvHeaderColumns(
      LoadOption.getCsvHeaderColumns(carbonLoadModel, hadoopConf, ignoreColumns));
  int validatedMaxColumns = validateMaxColumns(
      carbonLoadModel.getCsvHeaderColumns(), optionsFinal.get("maxcolumns"));
  carbonLoadModel.setMaxColumns(String.valueOf(validatedMaxColumns));
  if (carbonLoadModel.isCarbonTransactionalTable()) {
    carbonLoadModel.readAndSetLoadMetadataDetails();
  }
  carbonLoadModel.setSortColumnsBoundsStr(optionsFinal.get("sort_column_bounds"));
  carbonLoadModel.setLoadMinSize(
      optionsFinal.get(CarbonCommonConstants.CARBON_LOAD_MIN_SIZE_INMB));
  validateAndSetLoadMinSize(carbonLoadModel);
  validateAndSetColumnCompressor(carbonLoadModel);
  validateAndSetBinaryDecoder(carbonLoadModel);
  validateRangeColumn(optionsFinal, carbonLoadModel);
  carbonLoadModel.setMetrics(new DataLoadMetrics());
}
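For orientation, a minimal driver sketch for this builder. This is hedged: the CarbonTable instance, Hadoop Configuration, and option values are illustrative, and LoadOption.fillOptionWithDefaultValue is assumed to be the helper that produces optionsFinal (names may differ across versions).

  // Illustrative only: assumes a CarbonTable 'carbonTable' and a Configuration 'hadoopConf'.
  Map<String, String> options = new HashMap<>();
  options.put("delimiter", ",");            // CSV field delimiter
  options.put("header", "false");           // CSV files carry no header row,
  options.put("fileheader", "id,name,ts");  // so the header is supplied explicitly
  options.put("bad_records_action", "FAIL");

  // Fill defaults for every option the user did not set (assumed helper).
  Map<String, String> optionsFinal = LoadOption.fillOptionWithDefaultValue(options);

  CarbonLoadModel loadModel = new CarbonLoadModel();
  new CarbonLoadModelBuilder(carbonTable)
      .build(options, optionsFinal, loadModel, hadoopConf,
          new HashMap<String, String>(),  // no partition spec
          false);                         // not loading from a dataframe

Note that both maps are passed: build reads a few keys (for example 'line_separator' and 'binary_decoder') from the raw user map only, and everything else from optionsFinal.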
Use of org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn in project carbondata by apache.
The class DataLoadProcessBuilder, method getReArrangedDataFields.
private static void getReArrangedDataFields(CarbonLoadModel loadModel, CarbonTable carbonTable,
    List<CarbonDimension> dimensions, List<CarbonMeasure> measures,
    List<DataField> complexDataFields, List<DataField> partitionColumns,
    List<DataField> dataFields) {
  // Re-arrange the data fields, as partition column data will be present at the end.
  List<ColumnSchema> partitionColumnSchemaList;
  if (carbonTable.getPartitionInfo() != null) {
    partitionColumnSchemaList = carbonTable.getPartitionInfo().getColumnSchemaList();
  } else {
    partitionColumnSchemaList = new ArrayList<>();
  }
  // For 1.1 compatibility: in 1.1, dimensions did not keep sort columns at the beginning,
  // so sort columns need to be moved to the beginning now.
  List<DataField> sortDataFields = new ArrayList<>();
  List<DataField> noSortDataFields = new ArrayList<>();
  for (CarbonColumn column : dimensions) {
    DataField dataField = new DataField(column);
    if (column.isComplex()) {
      List<CarbonDimension> childDimensions =
          ((CarbonDimension) dataField.getColumn()).getListOfChildDimensions();
      for (CarbonDimension childDimension : childDimensions) {
        if (childDimension.getDataType() == DataTypes.DATE) {
          childDimension.setDateFormat(loadModel.getDateFormat());
        } else if (childDimension.getDataType() == DataTypes.TIMESTAMP) {
          childDimension.setTimestampFormat(loadModel.getTimestampFormat());
        }
      }
      if (partitionColumnSchemaList.size() != 0
          && partitionColumnSchemaList.contains(column.getColumnSchema())) {
        partitionColumns.add(dataField);
      } else {
        complexDataFields.add(dataField);
      }
    } else {
      if (column.getDataType() == DataTypes.DATE) {
        dataField.setDateFormat(loadModel.getDateFormat());
        column.setDateFormat(loadModel.getDateFormat());
      } else if (column.getDataType() == DataTypes.TIMESTAMP) {
        dataField.setTimestampFormat(loadModel.getTimestampFormat());
        column.setTimestampFormat(loadModel.getTimestampFormat());
      }
      if (partitionColumnSchemaList.size() != 0
          && partitionColumnSchemaList.contains(column.getColumnSchema())) {
        partitionColumns.add(dataField);
      } else {
        if (dataField.getColumn().getColumnSchema().isSortColumn()) {
          sortDataFields.add(dataField);
        } else {
          noSortDataFields.add(dataField);
        }
      }
    }
  }
  if (sortDataFields.size() != 0) {
    dataFields.addAll(sortDataFields);
  }
  if (noSortDataFields.size() != 0) {
    dataFields.addAll(noSortDataFields);
  }
  if (complexDataFields.size() != 0) {
    dataFields.addAll(complexDataFields);
  }
  for (CarbonColumn column : measures) {
    if (partitionColumnSchemaList.size() != 0
        && partitionColumnSchemaList.contains(column.getColumnSchema())) {
      partitionColumns.add(new DataField(column));
    } else {
      // This dummy measure is added when no measure is present; it does not need to be loaded.
      if (!(column.getColName().equals("default_dummy_measure"))) {
        dataFields.add(new DataField(column));
      }
    }
  }
  if (partitionColumns.size() != 0) {
    // Add partition columns at the end, re-arranged to match the column schema order.
    List<DataField> reArrangedPartitionColumns = new ArrayList<>();
    for (ColumnSchema col : partitionColumnSchemaList) {
      for (DataField field : partitionColumns) {
        if (field.getColumn().getColumnSchema().equals(col)) {
          reArrangedPartitionColumns.add(field);
          break;
        }
      }
    }
    dataFields.addAll(reArrangedPartitionColumns);
  }
}
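The net effect is a fixed bucket order. A toy sketch with invented column names mirrors what the method produces: sort-column dimensions first, then no-sort dimensions, then complex dimensions, then measures, with partition columns re-ordered to schema order and appended last.

  // Toy illustration only; the column names are made up.
  List<String> ordered = new ArrayList<>();
  ordered.addAll(Arrays.asList("id"));              // sort-column dimensions
  ordered.addAll(Arrays.asList("name"));            // no-sort dimensions
  ordered.addAll(Arrays.asList("address_struct"));  // complex dimensions
  ordered.addAll(Arrays.asList("salary"));          // measures (dummy measure excluded)
  ordered.addAll(Arrays.asList("country"));         // partition columns, in schema order
  // ordered == [id, name, address_struct, salary, country]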
Use of org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn in project carbondata by apache.
The class DataLoadProcessBuilder, method getDataFields.
private static void getDataFields(CarbonLoadModel loadModel, List<CarbonDimension> dimensions,
    List<CarbonMeasure> measures, List<DataField> complexDataFields,
    List<DataField> dataFields) {
  // Add non-complex dimensions first, then append complex data types and measures.
  for (CarbonColumn column : dimensions) {
    DataField dataField = new DataField(column);
    if (column.getDataType() == DataTypes.DATE) {
      dataField.setDateFormat(loadModel.getDateFormat());
      column.setDateFormat(loadModel.getDateFormat());
    } else if (column.getDataType() == DataTypes.TIMESTAMP) {
      dataField.setTimestampFormat(loadModel.getTimestampFormat());
      column.setTimestampFormat(loadModel.getTimestampFormat());
    }
    if (column.isComplex()) {
      complexDataFields.add(dataField);
      List<CarbonDimension> childDimensions =
          ((CarbonDimension) dataField.getColumn()).getListOfChildDimensions();
      for (CarbonDimension childDimension : childDimensions) {
        if (childDimension.getDataType() == DataTypes.DATE) {
          childDimension.setDateFormat(loadModel.getDateFormat());
        } else if (childDimension.getDataType() == DataTypes.TIMESTAMP) {
          childDimension.setTimestampFormat(loadModel.getTimestampFormat());
        }
      }
    } else {
      dataFields.add(dataField);
    }
  }
  dataFields.addAll(complexDataFields);
  for (CarbonColumn column : measures) {
    // This dummy measure is added when no measure is present; it does not need to be loaded.
    if (!(column.getColName().equals("default_dummy_measure"))) {
      dataFields.add(new DataField(column));
    }
  }
}
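Compared to the re-arranged variant above, this one keeps the incoming dimension order and only defers complex types; a toy sketch with invented names:

  // Toy illustration only; the names are made up.
  List<String> fields = new ArrayList<>();
  List<String> complex = new ArrayList<>();
  for (String dim : Arrays.asList("name", "event_time", "address_struct")) {
    if (dim.equals("address_struct")) {  // stand-in for column.isComplex()
      complex.add(dim);
    } else {
      fields.add(dim);
    }
  }
  fields.addAll(complex);
  fields.add("salary");  // measures last; a "default_dummy_measure" would be skipped
  // fields == [name, event_time, address_struct, salary]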
Use of org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn in project carbondata by apache.
The class SchemaGenerator, method getMinMaxLength.
/**
 * Method to get the min/max length of each column. It returns the length only for the
 * columns that will be cached.
 *
 * @param segmentProperties segment properties of the segment
 * @param minMaxCacheColumns columns whose min/max values are to be cached; null means all columns
 * @return min/max length for each column to be cached
 */
private static int[] getMinMaxLength(SegmentProperties segmentProperties,
    List<CarbonColumn> minMaxCacheColumns) {
  int[] minMaxLen = null;
  if (null != minMaxCacheColumns) {
    int[] columnValueLength = segmentProperties.createColumnValueLength();
    minMaxLen = new int[minMaxCacheColumns.size()];
    int counter = 0;
    for (CarbonColumn column : minMaxCacheColumns) {
      minMaxLen[counter++] =
          columnValueLength[BlockletIndexUtil.getColumnOrdinal(segmentProperties, column)];
    }
  } else {
    minMaxLen = segmentProperties.createColumnValueLength();
  }
  return minMaxLen;
}
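The projection itself is simple ordinal indexing; a toy sketch with invented lengths and ordinals:

  // Toy illustration only; values are made up.
  int[] allLengths = {8, 4, 0, 16};  // one entry per column in the segment
  int[] cachedOrdinals = {0, 3};     // ordinals of the min/max cache columns
  int[] cachedLengths = new int[cachedOrdinals.length];
  for (int i = 0; i < cachedOrdinals.length; i++) {
    cachedLengths[i] = allLengths[cachedOrdinals[i]];
  }
  // cachedLengths == {8, 16}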
Use of org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn in project carbondata by apache.
The class BlockletIndexUtil, method getMinMaxForColumnsToBeCached.
/**
 * Method to get the min/max values for the columns to be cached.
 *
 * @param segmentProperties segment properties of the segment
 * @param minMaxCacheColumns columns whose min/max values are to be cached; null means all columns
 * @param minMaxValuesForAllColumns min/max values of all columns
 * @return min/max values for the columns to be cached
 */
public static byte[][] getMinMaxForColumnsToBeCached(SegmentProperties segmentProperties,
    List<CarbonColumn> minMaxCacheColumns, byte[][] minMaxValuesForAllColumns) {
  byte[][] minMaxValuesForColumnsToBeCached = minMaxValuesForAllColumns;
  if (null != minMaxCacheColumns) {
    minMaxValuesForColumnsToBeCached = new byte[minMaxCacheColumns.size()][];
    int counter = 0;
    for (CarbonColumn column : minMaxCacheColumns) {
      minMaxValuesForColumnsToBeCached[counter++] =
          minMaxValuesForAllColumns[getColumnOrdinal(segmentProperties, column)];
    }
  }
  return minMaxValuesForColumnsToBeCached;
}
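This is the same ordinal projection, applied to the serialized min/max byte arrays; when minMaxCacheColumns is null the input array is returned unchanged. A toy sketch with invented values:

  // Toy illustration only; bytes and ordinals are made up.
  byte[][] allMinMax = {"a".getBytes(), "b".getBytes(), "c".getBytes()};
  int[] cachedOrdinals = {2, 0};
  byte[][] cachedMinMax = new byte[cachedOrdinals.length][];
  for (int i = 0; i < cachedOrdinals.length; i++) {
    cachedMinMax[i] = allMinMax[cachedOrdinals[i]];
  }
  // cachedMinMax holds the entries for columns 2 and 0, in cache-column order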