org.apache.hadoop.hive.serde2.ColumnProjectionUtils Java Examples
The following examples show how to use
org.apache.hadoop.hive.serde2.ColumnProjectionUtils.
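Before the project-specific examples, here is a minimal, self-contained sketch of the core API they all rely on. It is an illustration, not code from any of the projects below, and it assumes a Hive release where ColumnProjectionUtils exposes appendReadColumns, getReadColumnIDs, isReadAllColumns and the READ_COLUMN_IDS_CONF_STR / READ_COLUMN_NAMES_CONF_STR / READ_ALL_COLUMNS configuration keys, exactly the members the examples use.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;

public class ColumnProjectionSketch {
  public static void main(String[] args) {
    JobConf jobConf = new JobConf();

    // Planner/serde side: request that readers project only columns 0 and 2.
    // This also marks the job as no longer "read all columns".
    ColumnProjectionUtils.appendReadColumns(jobConf, Arrays.asList(0, 2));

    // Reader side: recover the projected ids, either through the helper or by
    // reading the raw configuration keys, as several examples below do.
    List<Integer> ids = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    String rawIds = jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
    boolean readAll = ColumnProjectionUtils.isReadAllColumns(jobConf);

    System.out.println("ids=" + ids + ", raw=" + rawIds + ", readAll=" + readAll);
  }
}

The examples that follow show how different engines and storage formats (ORC, Parquet, Cassandra, IndexR, Hudi, Flink, Pig) populate or consume these same projection keys.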
Example #1
Source File: HiveVectorizedReaderSetting.java From multiple-dimension-spread with Apache License 2.0 | 6 votes |
public HiveVectorizedReaderSetting( final FileSplit split , final JobConf job , final HiveReaderSetting hiveReaderConfig ) throws IOException{
  this.hiveReaderConfig = hiveReaderConfig;

  rbCtx = Utilities.getVectorizedRowBatchCtx( job );
  partitionValues = new Object[rbCtx.getPartitionColumnCount()];
  if( 0 < partitionValues.length ){
    rbCtx.getPartitionValues( rbCtx, job, split, partitionValues );
  }

  TypeInfo[] typeInfos = rbCtx.getRowColumnTypeInfos();
  columnNames = rbCtx.getRowColumnNames();
  needColumnIds = createNeedColumnId( ColumnProjectionUtils.getReadColumnIDs( job ) );

  projectionColumn = new boolean[columnNames.length];
  assignors = new IColumnVectorAssignor[columnNames.length];
  for( int id : needColumnIds ){
    projectionColumn[id] = true;
    assignors[id] = ColumnVectorAssignorFactory.create( typeInfos[id] );
  }
}
Example #2
Source File: MDSSerde.java From multiple-dimension-spread with Apache License 2.0 | 6 votes |
@Override
public void initialize( final Configuration conf, final Properties table , final Properties part ) throws SerDeException{
  LOG.info( table.toString() );
  if( part != null ){
    LOG.info( part.toString() );
  }
  String columnNameProperty = table.getProperty(serdeConstants.LIST_COLUMNS);
  String columnTypeProperty = table.getProperty(serdeConstants.LIST_COLUMN_TYPES);

  String projectionColumnNames = conf.get( ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR , "" );

  StructTypeInfo rootType;
  if( projectionColumnNames.isEmpty() ){
    rootType = getAllReadTypeInfo( columnNameProperty , columnTypeProperty );
  }
  else{
    rootType = getColumnProjectionTypeInfo( columnNameProperty , columnTypeProperty , projectionColumnNames );
  }

  inspector = MDSObjectInspectorFactory.craeteObjectInspectorFromTypeInfo( rootType );
}
Example #3
Source File: HoodieParquetRealtimeInputFormat.java From hudi with Apache License 2.0 | 6 votes |
/**
 * Add a field to the existing fields projected.
 */
private static Configuration addProjectionField(Configuration conf, String fieldName, int fieldIndex) {
  String readColNames = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "");
  String readColIds = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "");

  String readColNamesPrefix = readColNames + ",";
  if (readColNames == null || readColNames.isEmpty()) {
    readColNamesPrefix = "";
  }
  String readColIdsPrefix = readColIds + ",";
  if (readColIds == null || readColIds.isEmpty()) {
    readColIdsPrefix = "";
  }

  if (!readColNames.contains(fieldName)) {
    // If not already in the list - then add it
    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNamesPrefix + fieldName);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIdsPrefix + fieldIndex);
    if (LOG.isDebugEnabled()) {
      LOG.debug(String.format("Adding extra column " + fieldName + ", to enable log merging cols (%s) ids (%s) ",
          conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR),
          conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)));
    }
  }
  return conf;
}
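A hypothetical internal call of the helper above, for illustration only; the field name and index are assumptions and may not match Hudi's actual metadata column positions.

// Illustrative only: append one Hudi metadata column to the projection.
JobConf jobConf = new JobConf();
addProjectionField(jobConf, "_hoodie_record_key", 2);
// The field name and its index are appended to the comma-separated values of
// READ_COLUMN_NAMES_CONF_STR and READ_COLUMN_IDS_CONF_STR, after any columns
// that were already projected.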
Example #4
Source File: HiveTableInputFormat.java From flink with Apache License 2.0 | 6 votes |
private void addSchemaToConf(JobConf jobConf) {
  // set columns/types -- including partition cols
  List<String> typeStrs = Arrays.stream(fieldTypes)
      .map(t -> HiveTypeUtil.toHiveTypeInfo(t, true).toString())
      .collect(Collectors.toList());
  jobConf.set(IOConstants.COLUMNS, String.join(",", fieldNames));
  jobConf.set(IOConstants.COLUMNS_TYPES, String.join(",", typeStrs));
  // set schema evolution -- excluding partition cols
  int numNonPartCol = fieldNames.length - partitionKeys.size();
  jobConf.set(SCHEMA_EVOLUTION_COLUMNS, String.join(",", Arrays.copyOfRange(fieldNames, 0, numNonPartCol)));
  jobConf.set(SCHEMA_EVOLUTION_COLUMNS_TYPES, String.join(",", typeStrs.subList(0, numNonPartCol)));

  // in older versions, parquet reader also expects the selected col indices in conf, excluding part cols
  String readColIDs = Arrays.stream(selectedFields)
      .filter(i -> i < numNonPartCol)
      .mapToObj(String::valueOf)
      .collect(Collectors.joining(","));
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIDs);
}
Example #5
Source File: InputFormatTestUtil.java From hudi with Apache License 2.0 | 6 votes |
public static void setPropsForInputFormat(JobConf jobConf, Schema schema, String hiveColumnTypes) {
  List<Schema.Field> fields = schema.getFields();
  String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
  String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
  Configuration conf = HoodieTestUtils.getDefaultHadoopConf();

  String hiveColumnNames = fields.stream().filter(field -> !field.name().equalsIgnoreCase("datestr"))
      .map(Schema.Field::name).collect(Collectors.joining(","));
  hiveColumnNames = hiveColumnNames + ",datestr";
  String modifiedHiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(hiveColumnTypes);
  modifiedHiveColumnTypes = modifiedHiveColumnTypes + ",string";
  jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames);
  jobConf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, modifiedHiveColumnTypes);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
  jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
  conf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames);
  conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
  conf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
  conf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, modifiedHiveColumnTypes);
  jobConf.addResource(conf);
}
Example #6
Source File: OrcInputFormat.java From hive-dwrf with Apache License 2.0 | 5 votes |
/**
 * Take the configuration and figure out which columns we need to include.
 * @param types the types of the file
 * @param conf the configuration
 * @return true for each column that should be included
 */
private static boolean[] findIncludedColumns(List<OrcProto.Type> types, Configuration conf) {
  String includedStr = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
  if (includedStr == null || includedStr.trim().length() == 0) {
    return null;
  } else {
    int numColumns = types.size();
    boolean[] result = new boolean[numColumns];
    result[0] = true;
    OrcProto.Type root = types.get(0);
    List<Integer> included = ColumnProjectionUtils.getReadColumnIDs(conf);
    for(int i=0; i < root.getSubtypesCount(); ++i) {
      if (included.contains(i)) {
        includeColumnRecursive(types, result, root.getSubtypes(i));
      }
    }
    // if we are filtering at least one column, return the boolean array
    for(boolean include: result) {
      if (!include) {
        return result;
      }
    }
    return null;
  }
}
Example #7
Source File: OrcStorage.java From spork with Apache License 2.0 | 5 votes |
@Override
public void setLocation(String location, Job job) throws IOException {
  Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
  if (!UDFContext.getUDFContext().isFrontend()) {
    typeInfo = (TypeInfo)ObjectSerializer.deserialize(p.getProperty(signature + SchemaSignatureSuffix));
  } else if (typeInfo == null) {
    typeInfo = getTypeInfo(location, job);
  }
  if (typeInfo != null && oi == null) {
    oi = OrcStruct.createObjectInspector(typeInfo);
  }
  if (!UDFContext.getUDFContext().isFrontend()) {
    if (p.getProperty(signature + RequiredColumnsSuffix) != null) {
      mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p
          .getProperty(signature + RequiredColumnsSuffix));
      job.getConfiguration().setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
      job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,
          getReqiredColumnIdString(mRequiredColumns));
      if (p.getProperty(signature + SearchArgsSuffix) != null) {
        // Bug in setSearchArgument which always expects READ_COLUMN_NAMES_CONF_STR to be set
        job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
            getReqiredColumnNamesString(getSchema(location, job), mRequiredColumns));
      }
    } else if (p.getProperty(signature + SearchArgsSuffix) != null) {
      // Bug in setSearchArgument which always expects READ_COLUMN_NAMES_CONF_STR to be set
      job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
          getReqiredColumnNamesString(getSchema(location, job)));
    }
    if (p.getProperty(signature + SearchArgsSuffix) != null) {
      job.getConfiguration().set(SARG_PUSHDOWN, p.getProperty(signature + SearchArgsSuffix));
    }
  }
  FileInputFormat.setInputPaths(job, location);
}
Example #8
Source File: DataWritableReadSupport.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * It creates the readContext for Parquet side with the requested schema during the init phase.
 *
 * @param configuration needed to get the wanted columns
 * @param keyValueMetaData // unused
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
  final String columns = configuration.get(IOConstants.COLUMNS);
  final Map<String, String> contextMetadata = new HashMap<String, String>();
  if (columns != null) {
    final List<String> listColumns = getColumns(columns);

    final List<Type> typeListTable = new ArrayList<Type>();
    for (final String col : listColumns) {
      // listColumns contains partition columns which are metadata only
      if (fileSchema.containsField(col)) {
        typeListTable.add(fileSchema.getType(col));
      } else {
        // below allows schema evolution
        typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
      }
    }
    MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
    contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

    MessageType requestedSchemaByUser = tableSchema;
    final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);

    final List<Type> typeListWanted = new ArrayList<Type>();
    for (final Integer idx : indexColumnsWanted) {
      typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
    }
    requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(), typeListWanted),
        fileSchema, configuration);

    return new ReadContext(requestedSchemaByUser, contextMetadata);
  } else {
    contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
    return new ReadContext(fileSchema, contextMetadata);
  }
}
Example #9
Source File: IndexRRecordReader.java From indexr with Apache License 2.0 | 5 votes |
private void getIncludeColumns(Configuration conf, Segment segment) {
  List<ColumnSchema> segColSchemas = segment.schema().getColumns();
  String columnNamesStr = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR);
  if (ColumnProjectionUtils.isReadAllColumns(conf) || columnNamesStr == null) {
    projectCols = new ColumnSchema[segColSchemas.size()];
    projectColIds = new int[segColSchemas.size()];
    for (int i = 0; i < segColSchemas.size(); i++) {
      projectCols[i] = segColSchemas.get(i);
      projectColIds[i] = i;
    }
  } else {
    String[] ss = Strings.isEmpty(columnNamesStr.trim()) ? new String[]{} : columnNamesStr.split(",");
    projectCols = new ColumnSchema[ss.length];
    projectColIds = new int[ss.length];
    for (int i = 0; i < ss.length; i++) {
      String col = ss[i];
      int colId = Trick.indexFirst(segColSchemas, c -> c.getName().equalsIgnoreCase(col));
      //Preconditions.checkState(colId >= 0, String.format("Column [%s] not found in segment [%s]", col, segment.name()));
      if (colId < 0) {
        projectCols[i] = null;
        projectColIds[i] = -1;
      } else {
        projectCols[i] = segColSchemas.get(colId);
        projectColIds[i] = colId;
      }
    }
  }
}
Example #10
Source File: TestHoodieRealtimeRecordReader.java From hudi with Apache License 2.0 | 5 votes |
private void setHiveColumnNameProps(List<Schema.Field> fields, JobConf jobConf, boolean isPartitioned) {
  String names = fields.stream().map(Field::name).collect(Collectors.joining(","));
  String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);

  String hiveOrderedColumnNames = fields.stream().filter(field -> !field.name().equalsIgnoreCase(PARTITION_COLUMN))
      .map(Field::name).collect(Collectors.joining(","));
  if (isPartitioned) {
    hiveOrderedColumnNames += "," + PARTITION_COLUMN;
    jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, PARTITION_COLUMN);
  }
  jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveOrderedColumnNames);
}
Example #11
Source File: AbstractRealtimeRecordReader.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Goes through the log files in reverse order and finds the schema from the last available data block. If not, falls
 * back to the schema from the latest parquet file. Finally, sets the partition column and projection fields into the
 * job conf.
 */
private void init() throws IOException {
  Schema schemaFromLogFile = LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogPaths(), jobConf);
  if (schemaFromLogFile == null) {
    writerSchema = HoodieRealtimeRecordReaderUtils.readSchema(jobConf, split.getPath());
    LOG.debug("Writer Schema From Parquet => " + writerSchema.getFields());
  } else {
    writerSchema = schemaFromLogFile;
    LOG.debug("Writer Schema From Log => " + writerSchema.getFields());
  }
  // Add partitioning fields to writer schema for resulting row to contain null values for these fields
  String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "");
  List<String> partitioningFields = partitionFields.length() > 0
      ? Arrays.stream(partitionFields.split("/")).collect(Collectors.toList())
      : new ArrayList<>();
  writerSchema = HoodieRealtimeRecordReaderUtils.addPartitionFields(writerSchema, partitioningFields);
  List<String> projectionFields = HoodieRealtimeRecordReaderUtils.orderFields(
      jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR),
      jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), partitioningFields);

  Map<String, Field> schemaFieldsMap = HoodieRealtimeRecordReaderUtils.getNameToFieldMap(writerSchema);
  hiveSchema = constructHiveOrderedSchema(writerSchema, schemaFieldsMap);
  // TODO(vc): In the future, the reader schema should be updated based on log files & be able
  // to null out fields not present before

  readerSchema = HoodieRealtimeRecordReaderUtils.generateProjectionSchema(writerSchema, schemaFieldsMap, projectionFields);
  LOG.info(String.format("About to read compacted logs %s for base split %s, projecting cols %s",
      split.getDeltaLogPaths(), split.getPath(), projectionFields));
}
Example #12
Source File: AbstractRealtimeRecordReader.java From hudi with Apache License 2.0 | 5 votes |
public AbstractRealtimeRecordReader(HoodieRealtimeFileSplit split, JobConf job) {
  this.split = split;
  this.jobConf = job;
  LOG.info("cfg ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR));
  LOG.info("columnIds ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
  LOG.info("partitioningColumns ==> " + job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""));
  try {
    this.usesCustomPayload = usesCustomPayload();
    LOG.info("usesCustomPayload ==> " + this.usesCustomPayload);
    init();
  } catch (IOException e) {
    throw new HoodieIOException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
  }
}
Example #13
Source File: HoodieParquetRealtimeInputFormat.java From hudi with Apache License 2.0 | 5 votes |
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf jobConf,
    final Reporter reporter) throws IOException {
  // Hive on Spark invokes multiple getRecordReaders from different threads in the same spark task (and hence the
  // same JVM) unlike Hive on MR. Due to this, accesses to JobConf, which is shared across all threads, is at the
  // risk of experiencing race conditions. Hence, we synchronize on the JobConf object here. There is negligible
  // latency incurred here due to the synchronization since getRecordReader is called once per split before the
  // actual heavy lifting of reading the parquet files happens.
  if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) {
    synchronized (jobConf) {
      LOG.info(
          "Before adding Hoodie columns, Projections :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
              + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
      if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) {
        // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table;
        // in this case, the projection fields get removed. Looking at the HiveInputFormat implementation, in some
        // cases the hoodie additional projection columns are reset after calling setConf and only natural projections
        // (ones found in select queries) are set. Things would break because of this: e.g. _hoodie_record_key would
        // be missing and the merge step would throw exceptions.
        // To fix this, hoodie columns are appended late, at the time the record reader gets built, instead of at
        // construction time.
        cleanProjectionColumnIds(jobConf);
        addRequiredProjectionFields(jobConf);

        this.conf = jobConf;
        this.conf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true");
      }
    }
  }

  LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
      + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
  // sanity check
  ValidationUtils.checkArgument(split instanceof HoodieRealtimeFileSplit,
      "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split);

  return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, jobConf,
      super.getRecordReader(split, jobConf, reporter));
}
Example #14
Source File: HoodieParquetRealtimeInputFormat.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Hive will append the read columns' ids to the old columns' ids during getRecordReader. In some cases, e.g.
 * SELECT COUNT(*), the read columns' id list is an empty string, and combining it with the Hoodie required
 * projection ids produces a value such as ",2,0,3", which causes an error. This method is a temporary workaround;
 * the real bug is in Hive and was fixed after 3.0.0, but earlier versions still face this problem. (HIVE-22438)
 */
private static void cleanProjectionColumnIds(Configuration conf) {
  String columnIds = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
  if (!columnIds.isEmpty() && columnIds.charAt(0) == ',') {
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, columnIds.substring(1));
    if (LOG.isDebugEnabled()) {
      LOG.debug("The projection Ids: {" + columnIds + "} start with ','. First comma is removed");
    }
  }
}
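To make the Javadoc concrete, here is a small illustration of the malformed configuration the method repairs; the ",2,0,3" value is taken directly from the comment above.

JobConf jobConf = new JobConf();
// A SELECT COUNT(*) style query leaves the id list empty; Hive then combines it with
// the Hoodie required projection ids, yielding a value with a leading comma.
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ",2,0,3");
// After cleanProjectionColumnIds(jobConf) runs, the stored value is "2,0,3".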
Example #15
Source File: HoodieMergeOnReadTestUtils.java From hudi with Apache License 2.0 | 5 votes |
private static void setPropsForInputFormat(FileInputFormat inputFormat, JobConf jobConf, Schema schema,
    String basePath) {
  List<Schema.Field> fields = schema.getFields();
  String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
  String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
  Configuration conf = HoodieTestUtils.getDefaultHadoopConf();

  String hiveColumnNames = fields.stream().filter(field -> !field.name().equalsIgnoreCase("datestr"))
      .map(Schema.Field::name).collect(Collectors.joining(","));
  hiveColumnNames = hiveColumnNames + ",datestr";
  String hiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES);
  hiveColumnTypes = hiveColumnTypes + ",string";
  jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames);
  jobConf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, hiveColumnTypes);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
  jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
  conf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames);
  conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
  conf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
  conf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, hiveColumnTypes);

  // Hoodie Input formats are also configurable
  Configurable configurable = (Configurable)inputFormat;
  configurable.setConf(conf);
  jobConf.addResource(conf);
}
Example #16
Source File: HdfsSerDeImportService.java From hadoop-etl-udfs with MIT License | 5 votes |
private static void initProperties(
    Properties props,
    Configuration conf,
    List<HCatTableColumn> columns,
    List<OutputColumnSpec> outputColumns) throws Exception {
  String colNames = "";
  String colTypes = "";
  for (HCatTableColumn colInfo : columns) {
    colNames += colInfo.getName() + ",";
    colTypes += colInfo.getDataType() + ",";
  }
  if (colNames.length() > 0)
    colNames = colNames.substring(0, colNames.length() - 1);
  if (colTypes.length() > 0)
    colTypes = colTypes.substring(0, colTypes.length() - 1);
  props.put(serdeConstants.LIST_COLUMNS, colNames);
  props.put(serdeConstants.LIST_COLUMN_TYPES, colTypes);
  props.put(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL");
  // Fix for Avro (NullPointerException if null)
  if (props.getProperty("columns.comments") == null) {
    props.put("columns.comments", "");
  }
  // Pushdown projection if we don't need all columns
  Set<Integer> requiredColumns = new HashSet<>();
  for (OutputColumnSpec spec : outputColumns) {
    if (spec.getColumnPosition() < columns.size()) {
      requiredColumns.add(spec.getColumnPosition());
    }
  }
  if (requiredColumns.size() < columns.size()) {
    ColumnProjectionUtils.appendReadColumns(conf, new ArrayList<>(requiredColumns));
  }
}
Example #17
Source File: HiveCassandraStandardColumnInputFormat.java From Hive-Cassandra with Apache License 2.0 | 4 votes |
@Override
public RecordReader<BytesWritable, MapWritable> getRecordReader(InputSplit split, JobConf jobConf,
    final Reporter reporter) throws IOException {
  HiveCassandraStandardSplit cassandraSplit = (HiveCassandraStandardSplit) split;

  List<String> columns = AbstractColumnSerDe.parseColumnMapping(cassandraSplit.getColumnMapping());
  isTransposed = AbstractColumnSerDe.isTransposed(columns);

  List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);

  if (columns.size() < readColIDs.size()) {
    throw new IOException("Cannot read more columns than the given table contains.");
  }

  org.apache.cassandra.hadoop.ColumnFamilySplit cfSplit = cassandraSplit.getSplit();
  Job job = new Job(jobConf);

  TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {
    @Override
    public void progress() {
      reporter.progress();
    }
  };

  SlicePredicate predicate = new SlicePredicate();

  if (isTransposed || readColIDs.size() == columns.size() || readColIDs.size() == 0) {
    SliceRange range = new SliceRange();
    AbstractType comparator = BytesType.instance;
    String comparatorType = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_COMPARATOR);
    if (comparatorType != null && !comparatorType.equals("")) {
      try {
        comparator = TypeParser.parse(comparatorType);
      } catch (Exception ex) {
        throw new IOException("Comparator class not found.");
      }
    }

    String sliceStart = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_START);
    String sliceEnd = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_FINISH);
    String reversed = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_REVERSED);

    range.setStart(comparator.fromString(sliceStart == null ? "" : sliceStart));
    range.setFinish(comparator.fromString(sliceEnd == null ? "" : sliceEnd));
    range.setReversed(reversed == null ? false : reversed.equals("true"));
    range.setCount(cassandraSplit.getSlicePredicateSize());
    predicate.setSlice_range(range);
  } else {
    int iKey = columns.indexOf(AbstractColumnSerDe.CASSANDRA_KEY_COLUMN);
    predicate.setColumn_names(getColumnNames(iKey, columns, readColIDs));
  }

  try {
    ConfigHelper.setInputColumnFamily(tac.getConfiguration(),
        cassandraSplit.getKeyspace(), cassandraSplit.getColumnFamily());
    ConfigHelper.setInputSlicePredicate(tac.getConfiguration(), predicate);
    ConfigHelper.setRangeBatchSize(tac.getConfiguration(), cassandraSplit.getRangeBatchSize());
    ConfigHelper.setInputRpcPort(tac.getConfiguration(), cassandraSplit.getPort() + "");
    ConfigHelper.setInputInitialAddress(tac.getConfiguration(), cassandraSplit.getHost());
    ConfigHelper.setInputPartitioner(tac.getConfiguration(), cassandraSplit.getPartitioner());
    // Set Split Size
    ConfigHelper.setInputSplitSize(tac.getConfiguration(), cassandraSplit.getSplitSize());

    CassandraHiveRecordReader rr = null;

    if (isTransposed && tac.getConfiguration().getBoolean(AbstractColumnSerDe.CASSANDRA_ENABLE_WIDEROW_ITERATOR, true)) {
      rr = new CassandraHiveRecordReader(new ColumnFamilyWideRowRecordReader(), isTransposed);
    } else {
      rr = new CassandraHiveRecordReader(new ColumnFamilyRecordReader(), isTransposed);
    }
    rr.initialize(cfSplit, tac);

    return rr;
  } catch (Exception ie) {
    throw new IOException(ie);
  }
}
Example #18
Source File: HiveReaderSetting.java From multiple-dimension-spread with Apache License 2.0 | 4 votes |
public HiveReaderSetting( final FileSplit split, final JobConf job ){
  config = new Configuration();

  disableSkipBlock = job.getBoolean( "mds.disable.block.skip" , false );
  disableFilterPushdown = job.getBoolean( "mds.disable.filter.pushdown" , false );

  Set<String> pathNameSet= createPathSet( split.getPath() );
  List<ExprNodeGenericFuncDesc> filterExprs = new ArrayList<ExprNodeGenericFuncDesc>();
  String filterExprSerialized = job.get( TableScanDesc.FILTER_EXPR_CONF_STR );
  if( filterExprSerialized != null ){
    filterExprs.add( Utilities.deserializeExpression(filterExprSerialized) );
  }

  MapWork mapWork;
  try{
    mapWork = Utilities.getMapWork(job);
  }catch( Exception e ){
    mapWork = null;
  }

  if( mapWork == null ){
    node = createExpressionNode( filterExprs );
    isVectorModeFlag = false;
    return;
  }

  node = createExpressionNode( filterExprs );

  for( Map.Entry<String,PartitionDesc> pathsAndParts: mapWork.getPathToPartitionInfo().entrySet() ){
    if( ! pathNameSet.contains( pathsAndParts.getKey() ) ){
      continue;
    }
    Properties props = pathsAndParts.getValue().getTableDesc().getProperties();
    if( props.containsKey( "mds.expand" ) ){
      config.set( "spread.reader.expand.column" , props.getProperty( "mds.expand" ) );
    }
    if( props.containsKey( "mds.flatten" ) ){
      config.set( "spread.reader.flatten.column" , props.getProperty( "mds.flatten" ) );
    }
  }

  config.set( "spread.reader.read.column.names" ,
      createReadColumnNames( job.get( ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR , null ) ) );

  // Next Hive version: Utilities.getUseVectorizedInputFileFormat(job)
  isVectorModeFlag = Utilities.isVectorMode( job );
}