org.apache.hive.hcatalog.data.DefaultHCatRecord Java Examples
The following examples show how to use
org.apache.hive.hcatalog.data.DefaultHCatRecord.
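Before the project examples, here is a minimal standalone sketch of the two ways the examples below populate a DefaultHCatRecord: by position with set(int, Object), and by column name with set(String, HCatSchema, Object). It is not taken from any of the listed projects; the two-column schema and the HCatFieldSchema constructor used to build it are assumptions about the HCatalog API and may differ slightly between Hive versions.

import java.util.Arrays;

import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

public class DefaultHCatRecordSketch {
    public static void main(String[] args) throws HCatException {
        // Fill by position: size the record up front, then set each slot by index.
        DefaultHCatRecord byPosition = new DefaultHCatRecord(2);
        byPosition.set(0, 42);
        byPosition.set(1, "hello");

        // Fill by column name: a schema is required so the name can be resolved to a position.
        // This two-column schema is a hypothetical stand-in for a real table schema.
        HCatSchema schema = new HCatSchema(Arrays.asList(
                new HCatFieldSchema("id", HCatFieldSchema.Type.INT, null),
                new HCatFieldSchema("name", HCatFieldSchema.Type.STRING, null)));
        DefaultHCatRecord byName = new DefaultHCatRecord(schema.size());
        byName.set("id", schema, 42);
        byName.set("name", schema, "hello");

        System.out.println(byPosition.getAll()); // [42, hello]
        System.out.println(byName.getAll());     // [42, hello]
    }
}

The project examples that follow use these same two setters against real table schemas (for instance, Examples #4 and #18 set fields by name, while Example #9 sets them by position).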
Example #1
Source File: HdfsUtil.java From ES-Fastloader with Apache License 2.0
public static Job getHdfsJob(Configuration conf, TaskConfig taskConfig, IndexInfo indexInfo) throws Exception {
    Job job = Job.getInstance(conf, MAIN_CLASS);
    job.setJobName("DidiFastIndex_" + taskConfig.getEsTemplate());
    job.setJarByClass(FastIndex.class);

    job.setMapperClass(FastIndexMapper.class);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DefaultHCatRecord.class);
    HCatInputFormat.setInput(job, taskConfig.getHiveDB(), taskConfig.getHiveTable(), taskConfig.getFilterStr());

    job.setReducerClass(FastIndexReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(indexInfo.getReducerNum());
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(taskConfig.getHdfsMROutputPath()));

    return job;
}
Example #2
Source File: FastIndexMapper.java From ES-Fastloader with Apache License 2.0
@Override
protected void map(Object key, HCatRecord value, Context context) throws IOException, InterruptedException {
    DefaultHCatRecord hCatRecord = (DefaultHCatRecord) value;

    int shardNo;
    List<String> keyList = taskConfig.getKeyList();
    if (keyList == null || keyList.size() == 0) {
        shardNo = (int) (Math.random() * templateConfig.getReducerNum());
    } else {
        String keyStr = getKeyValue(keyList, hCatRecord);
        shardNo = CommonUtils.getShardId(keyStr, templateConfig.getReducerNum());
    }

    // The number of shards is the same as the number of reducers.
    context.write(new IntWritable(shardNo), hCatRecord);
}
Example #3
Source File: FastIndexMapper.java From ES-Fastloader with Apache License 2.0
private String getKeyValue(List<String> keys, DefaultHCatRecord hCatRecord) throws HCatException {
    StringBuilder sb = new StringBuilder();
    for (String key : keys) {
        Object id = hCatRecord.get(key, this.schema);
        if (id == null || StringUtils.isBlank(id.toString())) {
            sb.append("");
        } else {
            sb.append(id.toString());
        }
        sb.append("_");
    }

    if (sb.length() > 1) {
        return sb.substring(0, sb.length() - 1);
    } else {
        return sb.toString();
    }
}
Example #4
Source File: HCatalogTestUtils.java From aliyun-maxcompute-data-collectors with Apache License 2.0
private List<HCatRecord> generateHCatRecords(int numRecords,
    HCatSchema hCatTblSchema, ColumnGenerator... extraCols) throws Exception {
    List<HCatRecord> records = new ArrayList<HCatRecord>();
    List<HCatFieldSchema> hCatTblCols = hCatTblSchema.getFields();
    int size = hCatTblCols.size();
    for (int i = 0; i < numRecords; ++i) {
        DefaultHCatRecord record = new DefaultHCatRecord(size);
        record.set(hCatTblCols.get(0).getName(), hCatTblSchema, i);
        record.set(hCatTblCols.get(1).getName(), hCatTblSchema, "textfield" + i);
        int idx = 0;
        for (int j = 0; j < extraCols.length; ++j) {
            if (extraCols[j].getKeyType() == KeyType.STATIC_KEY) {
                continue;
            }
            record.set(hCatTblCols.get(idx + 2).getName(), hCatTblSchema,
                extraCols[j].getHCatValue(i));
            ++idx;
        }
        records.add(record);
    }
    return records;
}
Example #5
Source File: TableDataInserterTest.java From HiveRunner with Apache License 2.0
@Test
public void insertsRowsIntoExistingTable() {
    Multimap<Map<String, String>, HCatRecord> data = ImmutableMultimap
            .<Map<String, String>, HCatRecord>builder()
            .put(of("local_date", "2015-10-14"), new DefaultHCatRecord(asList((Object) "aa", "bb")))
            .put(of("local_date", "2015-10-14"), new DefaultHCatRecord(asList((Object) "aa2", "bb2")))
            .put(of("local_date", "2015-10-14"), new DefaultHCatRecord(asList((Object) "cc", "dd")))
            .put(of("local_date", "2015-10-15"), new DefaultHCatRecord(asList((Object) "ee", "ff")))
            .build();

    TableDataInserter inserter = new TableDataInserter(TEST_DB, TEST_TABLE, hiveShell.getHiveConf());
    inserter.insert(data);

    List<String> result = hiveShell.executeQuery("select * from testdb.test_table");
    Collections.sort(result);

    assertEquals(4, result.size());

    assertEquals("aa", result.get(0).split("\t")[0]);
    assertEquals("bb", result.get(0).split("\t")[1]);
    assertEquals("2015-10-14", result.get(0).split("\t")[2]);

    assertEquals("aa2", result.get(1).split("\t")[0]);
    assertEquals("bb2", result.get(1).split("\t")[1]);
    assertEquals("2015-10-14", result.get(1).split("\t")[2]);

    assertEquals("cc", result.get(2).split("\t")[0]);
    assertEquals("dd", result.get(2).split("\t")[1]);
    assertEquals("2015-10-14", result.get(2).split("\t")[2]);

    assertEquals("ee", result.get(3).split("\t")[0]);
    assertEquals("ff", result.get(3).split("\t")[1]);
    assertEquals("2015-10-15", result.get(3).split("\t")[2]);
}
Example #6
Source File: HCatInputFormatBase.java From Flink-CEPplus with Apache License 2.0
/**
 * Creates a HCatInputFormat for the given database, table, and
 * {@link org.apache.hadoop.conf.Configuration}.
 * By default, the InputFormat returns {@link org.apache.hive.hcatalog.data.HCatRecord}.
 * The return type of the InputFormat can be changed to Flink-native tuples by calling
 * {@link HCatInputFormatBase#asFlinkTuples()}.
 *
 * @param database The name of the database to read from.
 * @param table The name of the table to read.
 * @param config The Configuration for the InputFormat.
 * @throws java.io.IOException
 */
public HCatInputFormatBase(String database, String table, Configuration config) throws IOException {
    super();
    this.configuration = config;
    HadoopUtils.mergeHadoopConf(this.configuration);

    this.hCatInputFormat = org.apache.hive.hcatalog.mapreduce.HCatInputFormat.setInput(this.configuration, database, table);
    this.outputSchema = org.apache.hive.hcatalog.mapreduce.HCatInputFormat.getTableSchema(this.configuration);

    // configure output schema of HCatFormat
    configuration.set("mapreduce.lib.hcat.output.schema", HCatUtil.serialize(outputSchema));
    // set type information
    this.resultType = new WritableTypeInfo(DefaultHCatRecord.class);
}
Example #7
Source File: TableDataBuilder.java From HiveRunner with Apache License 2.0
TableDataBuilder copyRow() {
    checkState(row != null, "No previous row to copy.");
    HCatRecord copy = new DefaultHCatRecord(new ArrayList<>(row.getAll()));
    flushRow();
    row = copy;
    return this;
}
Example #8
Source File: HCatInputFormatBase.java From flink with Apache License 2.0
/**
 * Creates a HCatInputFormat for the given database, table, and
 * {@link org.apache.hadoop.conf.Configuration}.
 * By default, the InputFormat returns {@link org.apache.hive.hcatalog.data.HCatRecord}.
 * The return type of the InputFormat can be changed to Flink-native tuples by calling
 * {@link HCatInputFormatBase#asFlinkTuples()}.
 *
 * @param database The name of the database to read from.
 * @param table The name of the table to read.
 * @param config The Configuration for the InputFormat.
 * @throws java.io.IOException
 */
public HCatInputFormatBase(String database, String table, Configuration config) throws IOException {
    super();
    this.configuration = config;
    HadoopUtils.mergeHadoopConf(this.configuration);

    this.hCatInputFormat = org.apache.hive.hcatalog.mapreduce.HCatInputFormat.setInput(this.configuration, database, table);
    this.outputSchema = org.apache.hive.hcatalog.mapreduce.HCatInputFormat.getTableSchema(this.configuration);

    // configure output schema of HCatFormat
    configuration.set("mapreduce.lib.hcat.output.schema", HCatUtil.serialize(outputSchema));
    // set type information
    this.resultType = new WritableTypeInfo(DefaultHCatRecord.class);
}
Example #9
Source File: FactDistinctColumnsMapperTest.java From kylin with Apache License 2.0
@Test
public void testMapper() throws IOException {
    Configuration configuration = mapDriver.getConfiguration();
    configuration.set(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT, "100");
    configuration.set(BatchConstants.CFG_CUBE_NAME, "test_kylin_cube_with_slr_1_new_segment");
    configuration.set(BatchConstants.CFG_CUBE_SEGMENT_ID, "198va32a-a33e-4b69-83dd-0bb8b1f8c53b");

    HCatRecord value1 = new DefaultHCatRecord(11);
    value1.set(0, "2012-08-16");
    value1.set(1, "48027");
    value1.set(2, "0");
    value1.set(3, "Home & Garden");
    value1.set(4, "Cheese & Crackers");
    value1.set(5, "Cheese & Crackers");
    value1.set(6, "48027");
    value1.set(7, "16");
    value1.set(8, "10000010");
    value1.set(9, "204.28");
    value1.set(10, "5");
    mapDriver.addInput(new LongWritable(0), value1);

    List<Pair<SelfDefineSortableKey, Text>> result = mapDriver.run();

    int colsNeedDictSize = cubeDesc.getAllColumnsNeedDictionaryBuilt().size();
    int cuboidsCnt = cubeDesc.getAllCuboids().size();

    assertEquals(
            colsNeedDictSize + (cubeDesc.getRowkey().getRowKeyColumns().length - colsNeedDictSize) * 2 + cuboidsCnt,
            result.size());
}
Example #10
Source File: HCatInputFormatBase.java From flink with Apache License 2.0
/**
 * Creates a HCatInputFormat for the given database, table, and
 * {@link org.apache.hadoop.conf.Configuration}.
 * By default, the InputFormat returns {@link org.apache.hive.hcatalog.data.HCatRecord}.
 * The return type of the InputFormat can be changed to Flink-native tuples by calling
 * {@link HCatInputFormatBase#asFlinkTuples()}.
 *
 * @param database The name of the database to read from.
 * @param table The name of the table to read.
 * @param config The Configuration for the InputFormat.
 * @throws java.io.IOException
 */
public HCatInputFormatBase(String database, String table, Configuration config) throws IOException {
    super();
    this.configuration = config;
    HadoopUtils.mergeHadoopConf(this.configuration);

    this.hCatInputFormat = org.apache.hive.hcatalog.mapreduce.HCatInputFormat.setInput(this.configuration, database, table);
    this.outputSchema = org.apache.hive.hcatalog.mapreduce.HCatInputFormat.getTableSchema(this.configuration);

    // configure output schema of HCatFormat
    configuration.set("mapreduce.lib.hcat.output.schema", HCatUtil.serialize(outputSchema));
    // set type information
    this.resultType = new WritableTypeInfo(DefaultHCatRecord.class);
}
Example #11
Source File: FastIndexReducer.java From ES-Fastloader with Apache License 2.0
@Override
protected void reduce(IntWritable key, Iterable<DefaultHCatRecord> values, Context context)
        throws IOException, InterruptedException {
    this.reduceId = key.get();
    LogUtils.info("reduce start, es reduceNo is:" + reduceId);

    Iterator<DefaultHCatRecord> records = values.iterator();
    while (records.hasNext()) {
        DefaultHCatRecord record = records.next();
        if (record != null) {
            JSONObject jsonObject = transformer.tranform(record.getAll());

            String primeKey;
            List<String> keyList = taskConfig.getKeyList();
            if (keyList == null || keyList.size() == 0) {
                primeKey = UUID.randomUUID().toString();
            } else {
                primeKey = getKeyValue(keyList, jsonObject);
            }

            esWriter.bulk(primeKey, jsonObject);
        }
    }

    esWriter.finish();
    context.write(NullWritable.get(), NullWritable.get());
    log.info("reduce finish!");
}
Example #12
Source File: JsonSerdeUtilsTest.java From incubator-hivemall with Apache License 2.0
@Test
public void testRW() throws Exception {
    List<Object> rlist = new ArrayList<Object>(13);
    {
        rlist.add(new Byte("123"));
        rlist.add(new Short("456"));
        rlist.add(new Integer(789));
        rlist.add(new Long(1000L));
        rlist.add(new Double(5.3D));
        rlist.add(new Float(2.39F));
        rlist.add(new String("hcat\nand\nhadoop"));
        rlist.add(null);

        List<Object> innerStruct = new ArrayList<Object>(2);
        innerStruct.add(new String("abc"));
        innerStruct.add(new String("def"));
        rlist.add(innerStruct);

        List<Integer> innerList = new ArrayList<Integer>();
        innerList.add(314);
        innerList.add(007);
        rlist.add(innerList);

        Map<Short, String> map = new HashMap<Short, String>(3);
        map.put(new Short("2"), "hcat is cool");
        map.put(new Short("3"), "is it?");
        map.put(new Short("4"), "or is it not?");
        rlist.add(map);

        rlist.add(new Boolean(true));

        List<Object> c1 = new ArrayList<Object>();
        List<Object> c1_1 = new ArrayList<Object>();
        c1_1.add(new Integer(12));
        List<Object> i2 = new ArrayList<Object>();
        List<Integer> ii1 = new ArrayList<Integer>();
        ii1.add(new Integer(13));
        ii1.add(new Integer(14));
        i2.add(ii1);
        Map<String, List<?>> ii2 = new HashMap<String, List<?>>();
        List<Integer> iii1 = new ArrayList<Integer>();
        iii1.add(new Integer(15));
        ii2.put("phew", iii1);
        i2.add(ii2);
        c1_1.add(i2);
        c1.add(c1_1);
        rlist.add(c1);

        rlist.add(HiveDecimal.create(new BigDecimal("123.45"))); // prec 5, scale 2
        rlist.add(new HiveChar("hive\nchar", 10));
        rlist.add(new HiveVarchar("hive\nvarchar", 20));
        rlist.add(Date.valueOf("2014-01-07"));
        rlist.add(new Timestamp(System.currentTimeMillis()));
        rlist.add("hive\nbinary".getBytes("UTF-8"));
    }

    DefaultHCatRecord r = new DefaultHCatRecord(rlist);

    List<String> columnNames =
            Arrays.asList("ti,si,i,bi,d,f,s,n,r,l,m,b,c1,bd,hc,hvc,dt,ts,bin".split(","));
    List<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(
            "tinyint,smallint,int,bigint,double,float,string,string,"
                + "struct<a:string,b:string>,array<int>,map<smallint,string>,boolean,"
                + "array<struct<i1:int,i2:struct<ii1:array<int>,ii2:map<string,struct<iii1:int>>>>>,"
                + "decimal(5,2),char(10),varchar(20),date,timestamp,binary");

    StructTypeInfo rowTypeInfo =
            (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
    HCatRecordObjectInspector objInspector =
            HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo);

    Text serialized = JsonSerdeUtils.serialize(r, objInspector, columnNames);
    List<Object> deserialized = JsonSerdeUtils.deserialize(serialized, columnNames, columnTypes);

    assertRecordEquals(rlist, deserialized);
}
Example #13
Source File: JsonSerdeUtilsTest.java From incubator-hivemall with Apache License 2.0
@Test
public void testRWNull() throws Exception {
    List<Object> nlist = new ArrayList<Object>(13);
    {
        nlist.add(null); // tinyint
        nlist.add(null); // smallint
        nlist.add(null); // int
        nlist.add(null); // bigint
        nlist.add(null); // double
        nlist.add(null); // float
        nlist.add(null); // string
        nlist.add(null); // string
        nlist.add(null); // struct
        nlist.add(null); // array
        nlist.add(null); // map
        nlist.add(null); // bool
        nlist.add(null); // complex
        nlist.add(null); // decimal(5,2)
        nlist.add(null); // char(10)
        nlist.add(null); // varchar(20)
        nlist.add(null); // date
        nlist.add(null); // timestamp
        nlist.add(null); // binary
    }

    DefaultHCatRecord r = new DefaultHCatRecord(nlist);

    List<String> columnNames =
            Arrays.asList("ti,si,i,bi,d,f,s,n,r,l,m,b,c1,bd,hc,hvc,dt,ts,bin".split(","));
    List<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(
            "tinyint,smallint,int,bigint,double,float,string,string,"
                + "struct<a:string,b:string>,array<int>,map<smallint,string>,boolean,"
                + "array<struct<i1:int,i2:struct<ii1:array<int>,ii2:map<string,struct<iii1:int>>>>>,"
                + "decimal(5,2),char(10),varchar(20),date,timestamp,binary");

    StructTypeInfo rowTypeInfo =
            (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
    HCatRecordObjectInspector objInspector =
            HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo);

    Text serialized = JsonSerdeUtils.serialize(r, objInspector, columnNames);
    List<Object> deserialized = JsonSerdeUtils.deserialize(serialized, columnNames, columnTypes);

    assertRecordEquals(nlist, deserialized);
}
Example #14
Source File: HCatalogIOTestUtils.java From beam with Apache License 2.0
/** Returns a DefaultHCatRecord instance for the passed value. */
private static DefaultHCatRecord toHCatRecord(int value) {
    return new DefaultHCatRecord(Arrays.asList("record " + value, value));
}
Example #15
Source File: HCatalogIO.java From beam with Apache License 2.0
@Override
@SuppressWarnings({"unchecked", "rawtypes"})
public Coder<HCatRecord> getOutputCoder() {
    return (Coder) WritableCoder.of(DefaultHCatRecord.class);
}
Example #16
Source File: HCatalogIOTest.java From beam with Apache License 2.0
/** Perform end-to-end test of Write-then-Read operation. */
@Test
@NeedsEmptyTestTablesForUnboundedReads
public void testWriteThenUnboundedReadSuccess() throws Exception {
    defaultPipeline
        .apply(Create.of(buildHCatRecords(TEST_RECORDS_COUNT)))
        .apply(
            HCatalogIO.write()
                .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
                .withDatabase(TEST_DATABASE)
                .withTable(TEST_TABLE)
                .withPartition(getPartitions())
                .withBatchSize(512L));
    defaultPipeline.run();

    final ImmutableList<String> partitions = ImmutableList.of("load_date", "product_type");
    final PCollection<HCatRecord> data =
        readAfterWritePipeline
            .apply(
                "ReadData",
                HCatalogIO.read()
                    .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
                    .withDatabase(TEST_DATABASE)
                    .withPartitionCols(partitions)
                    .withTable(TEST_TABLE)
                    .withPollingInterval(Duration.millis(15000))
                    .withTerminationCondition(Watch.Growth.afterTotalOf(Duration.millis(60000))))
            .setCoder((Coder) WritableCoder.of(DefaultHCatRecord.class));

    final PCollection<String> output =
        data.apply(
            ParDo.of(
                new DoFn<HCatRecord, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                        c.output(c.element().get(0).toString());
                    }
                }));

    PAssert.that(output).containsInAnyOrder(getExpectedRecords(TEST_RECORDS_COUNT));
    readAfterWritePipeline.run();
}
Example #17
Source File: HCatalogTestUtils.java From aliyun-maxcompute-data-collectors with Apache License 2.0
public List<HCatRecord> loadHCatTable(String dbName, String tableName,
    Map<String, String> partKeyMap, HCatSchema tblSchema,
    List<HCatRecord> records) throws Exception {

    Job job = new Job(conf, "HCat load job");

    job.setJarByClass(this.getClass());
    job.setMapperClass(HCatWriterMapper.class);

    // Just write 10 lines to the file to drive the mapper
    Path path = new Path(fs.getWorkingDirectory(), "mapreduce/HCatTableIndexInput");

    job.getConfiguration().setInt(ConfigurationConstants.PROP_MAPRED_MAP_TASKS, 1);

    int writeCount = records.size();
    recsToLoad.clear();
    recsToLoad.addAll(records);
    createInputFile(path, writeCount);

    // input/output settings
    HCatWriterMapper.setWrittenRecordCount(0);

    FileInputFormat.setInputPaths(job, path);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(HCatOutputFormat.class);

    OutputJobInfo outputJobInfo = OutputJobInfo.create(dbName, tableName, partKeyMap);
    HCatOutputFormat.setOutput(job, outputJobInfo);
    HCatOutputFormat.setSchema(job, tblSchema);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(DefaultHCatRecord.class);

    job.setNumReduceTasks(0);
    SqoopHCatUtilities.addJars(job, new SqoopOptions());
    boolean success = job.waitForCompletion(true);

    if (!success) {
        throw new IOException("Loading HCatalog table with test records failed");
    }

    utils.invokeOutputCommitterForLocalMode(job);

    LOG.info("Loaded " + HCatWriterMapper.writtenRecordCount + " records");

    return recsToLoad;
}
Example #18
Source File: SqoopHCatImportHelper.java From aliyun-maxcompute-data-collectors with Apache License 2.0
public HCatRecord convertToHCatRecord(SqoopRecord sqr) throws IOException, InterruptedException {
    try {
        // Loading of LOBs was delayed until we have a Context.
        sqr.loadLargeObjects(lobLoader);
    } catch (SQLException sqlE) {
        throw new IOException(sqlE);
    }
    if (colCount == -1) {
        colCount = sqr.getFieldMap().size();
    }
    Map<String, Object> fieldMap = sqr.getFieldMap();
    HCatRecord result = new DefaultHCatRecord(fieldCount);

    for (Map.Entry<String, Object> entry : fieldMap.entrySet()) {
        String key = entry.getKey();
        Object val = entry.getValue();
        String hfn = key.toLowerCase();
        boolean skip = false;
        if (staticPartitionKeys != null && staticPartitionKeys.length > 0) {
            for (int i = 0; i < staticPartitionKeys.length; ++i) {
                if (staticPartitionKeys[i].equals(hfn)) {
                    skip = true;
                    break;
                }
            }
        }
        if (skip) {
            continue;
        }
        HCatFieldSchema hfs = null;
        try {
            hfs = hCatFullTableSchema.get(hfn);
        } catch (Exception e) {
            throw new IOException("Unable to lookup " + hfn + " in the hcat schema");
        }
        if (debugHCatImportMapper) {
            LOG.debug("SqoopRecordVal: field = " + key + " Val " + val + " of type "
                + (val == null ? null : val.getClass().getName()) + ", hcattype " + hfs.getTypeString());
        }
        Object hCatVal = toHCat(val, hfs);
        result.set(hfn, hCatFullTableSchema, hCatVal);
    }

    return result;
}
Example #19
Source File: TableDataBuilder.java From HiveRunner with Apache License 2.0
TableDataBuilder newRow() {
    flushRow();
    row = new DefaultHCatRecord(schema.size());
    return this;
}