org.apache.hadoop.io.ArrayWritable Java Examples
The following examples show how to use
org.apache.hadoop.io.ArrayWritable.
Each example is taken from an open-source project; the source file, project, and license are noted above it.
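Before looking at the project examples, a minimal sketch of the core API may help. The snippet below is a standalone illustration (not taken from any of the projects listed here): it constructs an ArrayWritable, serializes it with the standard Writable contract, and reads it back. It also shows the common pattern of subclassing ArrayWritable so that the value class is known when Hadoop instantiates the object reflectively, for example when it is used as a reducer input value.

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class ArrayWritableBasics {

  // ArrayWritable has no no-arg constructor; frameworks that create instances
  // reflectively need a subclass that fixes the value class.
  public static class TextArrayWritable extends ArrayWritable {
    public TextArrayWritable() {
      super(Text.class);
    }
  }

  public static void main(String[] args) throws IOException {
    // Wrap an array of Writables; the value class must match the element type.
    ArrayWritable original = new ArrayWritable(Text.class,
        new Writable[] { new Text("a"), new Text("b"), new Text("c") });

    // Serialize using the Writable contract.
    DataOutputBuffer out = new DataOutputBuffer();
    original.write(out);

    // Deserialize into an instance that already knows its value class.
    TextArrayWritable copy = new TextArrayWritable();
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());
    copy.readFields(in);

    // get() returns the backing Writable[]; toStrings() converts each element via toString().
    System.out.println(Arrays.toString(copy.toStrings())); // [a, b, c]
  }
}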
Example #1
Source File: AbstractParquetMapInspector.java From parquet-mr with Apache License 2.0
@Override
public int getMapSize(final Object data) {
  if (data == null) {
    return -1;
  }

  if (data instanceof ArrayWritable) {
    final Writable[] mapContainer = ((ArrayWritable) data).get();

    if (mapContainer == null || mapContainer.length == 0) {
      return -1;
    } else {
      return ((ArrayWritable) mapContainer[0]).get().length;
    }
  }

  if (data instanceof Map) {
    return ((Map) data).size();
  }

  throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName());
}
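The inspector above assumes the Parquet-Hive map layout: the outer ArrayWritable holds a single element, which is itself an ArrayWritable whose elements are two-element [key, value] ArrayWritables (Example #24 below constructs exactly this shape in a test). The following sketch builds that layout, using IntWritable keys and values purely for illustration:

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;

public class ParquetHiveMapLayout {
  public static void main(String[] args) {
    // Each map entry is a two-element [key, value] ArrayWritable.
    ArrayWritable entry1 = new ArrayWritable(Writable.class,
        new Writable[] { new IntWritable(0), new IntWritable(1) });
    ArrayWritable entry2 = new ArrayWritable(Writable.class,
        new Writable[] { new IntWritable(2), new IntWritable(3) });

    // The entries are wrapped in an inner ArrayWritable ...
    ArrayWritable entries = new ArrayWritable(ArrayWritable.class, new Writable[] { entry1, entry2 });

    // ... and the map itself is an ArrayWritable whose single element is that inner array.
    ArrayWritable map = new ArrayWritable(ArrayWritable.class, new Writable[] { entries });

    // getMapSize(map) in Example #1 would report the number of entries:
    System.out.println(((ArrayWritable) map.get()[0]).get().length); // 2
  }
}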
Example #2
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2003SingleSheetEncryptedNegativeLowFootprint() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2003encrypt.xls";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  // set locale to the one of the test data
  conf.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  conf.set("hadoopoffice.read.lowFootprint", "true");
  // for decryption simply set the password
  conf.set("hadoopoffice.read.security.crypt.password", "test2");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, file);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals(1, splits.size(), "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
  InterruptedException ex = assertThrows(InterruptedException.class,
      () -> reader.initialize(splits.get(0), context),
      "Exception is thrown in case of wrong password");
}
Example #3
Source File: FSEditLogOp.java From hadoop with Apache License 2.0
@Override
public void writeFields(DataOutputStream out) throws IOException {
  FSImageSerialization.writeLong(inodeId, out);
  FSImageSerialization.writeString(path, out);
  FSImageSerialization.writeShort(replication, out);
  FSImageSerialization.writeLong(mtime, out);
  FSImageSerialization.writeLong(atime, out);
  FSImageSerialization.writeLong(blockSize, out);
  new ArrayWritable(Block.class, blocks).write(out);
  permissions.write(out);

  if (this.opCode == OP_ADD) {
    AclEditLogUtil.write(aclEntries, out);
    XAttrEditLogProto.Builder b = XAttrEditLogProto.newBuilder();
    b.addAllXAttrs(PBHelper.convertXAttrProto(xAttrs));
    b.build().writeDelimitedTo(out);
    FSImageSerialization.writeString(clientName, out);
    FSImageSerialization.writeString(clientMachine, out);
    FSImageSerialization.writeBoolean(overwrite, out);
    FSImageSerialization.writeByte(storagePolicyId, out);
    // write clientId and callId
    writeRpcIds(rpcClientId, rpcCallId, out);
  }
}
Example #4
Source File: RealtimeUnmergedRecordReader.java From hudi with Apache License 2.0
/**
 * Construct an unmerged record reader that consumes both parquet and log records in parallel
 * and buffers them for upstream clients to consume.
 *
 * @param split File split
 * @param job Job Configuration
 * @param realReader Parquet Reader
 */
public RealtimeUnmergedRecordReader(HoodieRealtimeFileSplit split, JobConf job,
    RecordReader<NullWritable, ArrayWritable> realReader) {
  super(split, job);
  this.parquetReader = new SafeParquetRecordReaderWrapper(realReader);
  // Iterator for consuming records from parquet file
  this.parquetRecordsIterator = new RecordReaderValueIterator<>(this.parquetReader);
  this.executor = new BoundedInMemoryExecutor<>(getMaxCompactionMemoryInBytes(), getParallelProducers(),
      Option.empty(), x -> x, new DefaultSizeEstimator<>());
  // Consumer of this record reader
  this.iterator = this.executor.getQueue().iterator();
  this.logRecordScanner = new HoodieUnMergedLogRecordScanner(FSUtils.getFs(split.getPath().toString(), jobConf),
      split.getBasePath(), split.getDeltaLogPaths(), getReaderSchema(), split.getMaxCommitTime(),
      Boolean.parseBoolean(jobConf.get(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP,
          HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)),
      false,
      jobConf.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP,
          HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE),
      record -> {
        // convert Hoodie log record to Hadoop AvroWritable and buffer
        GenericRecord rec = (GenericRecord) record.getData().getInsertValue(getReaderSchema()).get();
        ArrayWritable aWritable = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(rec, getHiveSchema());
        this.executor.getQueue().insertRecord(aWritable);
      });
  // Start reading and buffering
  this.executor.startProducers();
}
Example #5
Source File: DiscoveryLogic.java From datawave with Apache License 2.0
/**
 * Takes in a batch scanner and returns an iterator over the DiscoveredThing objects contained in the value.
 *
 * @param scanner
 * @return
 */
public static Iterator<DiscoveredThing> transformScanner(final BatchScanner scanner) {
  return concat(transform(scanner.iterator(), new Function<Entry<Key,Value>,Iterator<DiscoveredThing>>() {
    DataInputBuffer in = new DataInputBuffer();

    @Override
    public Iterator<DiscoveredThing> apply(Entry<Key,Value> from) {
      Value value = from.getValue();
      in.reset(value.get(), value.getSize());
      ArrayWritable aw = new ArrayWritable(DiscoveredThing.class);
      try {
        aw.readFields(in);
      } catch (IOException e) {
        log.error(e);
        return null;
      }
      ArrayList<DiscoveredThing> thangs = Lists.newArrayListWithCapacity(aw.get().length);
      for (Writable w : aw.get()) {
        thangs.add((DiscoveredThing) w);
      }
      return thangs.iterator();
    }
  }));
}
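Example #5 only shows the read side: deserializing an ArrayWritable from the raw bytes of a scanner value. For reference, here is a sketch of the matching write side, under the assumption that the stored bytes are simply the output of ArrayWritable.write(); it uses plain Text elements instead of Datawave's DiscoveredThing, and the helper name serialize is hypothetical.

import java.io.IOException;

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class ArrayWritableRoundTrip {

  // Hypothetical helper: serialize an ArrayWritable into the raw bytes that would later
  // be read back with readFields(), as in Example #5.
  static byte[] serialize(ArrayWritable aw) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    aw.write(out);
    // getData() may be larger than the written content, so copy only getLength() bytes.
    byte[] bytes = new byte[out.getLength()];
    System.arraycopy(out.getData(), 0, bytes, 0, out.getLength());
    return bytes;
  }

  public static void main(String[] args) throws IOException {
    ArrayWritable written = new ArrayWritable(Text.class,
        new Writable[] { new Text("thing-1"), new Text("thing-2") });
    byte[] stored = serialize(written);

    // Read side, mirroring Example #5: reset a DataInputBuffer on the stored bytes and readFields().
    ArrayWritable read = new ArrayWritable(Text.class);
    DataInputBuffer in = new DataInputBuffer();
    in.reset(stored, stored.length);
    read.readFields(in);
    for (Writable w : read.get()) {
      System.out.println(w);
    }
  }
}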
Example #6
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2013encrypt.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  // set locale to the one of the test data
  conf.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  conf.set("hadoopoffice.read.lowFootprint", "true");
  conf.set("hadoopoffice.read.lowFootprint.parser", "stax");
  // for decryption simply set the password
  conf.set("hadoopoffice.read.security.crypt.password", "test2");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, file);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals(1, splits.size(), "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
  InterruptedException ex = assertThrows(InterruptedException.class,
      () -> reader.initialize(splits.get(0), context),
      "Exception is thrown in case of wrong password");
}
Example #7
Source File: StandardParquetHiveMapInspector.java From parquet-mr with Apache License 2.0
@Override
public Object getMapValueElement(final Object data, final Object key) {
  if (data == null || key == null) {
    return null;
  }

  if (data instanceof ArrayWritable) {
    final Writable[] mapContainer = ((ArrayWritable) data).get();

    if (mapContainer == null || mapContainer.length == 0) {
      return null;
    }

    final Writable[] mapArray = ((ArrayWritable) mapContainer[0]).get();

    for (final Writable obj : mapArray) {
      final ArrayWritable mapObj = (ArrayWritable) obj;
      final Writable[] arr = mapObj.get();
      if (key.equals(arr[0])) {
        return arr[1];
      }
    }

    return null;
  }

  if (data instanceof Map) {
    return ((Map) data).get(key);
  }

  throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName());
}
Example #8
Source File: ArrayWritableObjectInspector.java From indexr with Apache License 2.0
@Override
public Object getStructFieldData(final Object data, final StructField fieldRef) {
  if (data == null) {
    return null;
  }

  if (data instanceof ArrayWritable) {
    final ArrayWritable arr = (ArrayWritable) data;
    return arr.get()[((StructFieldImpl) fieldRef).getIndex()];
  }

  // since setStructFieldData and create return a list, getStructFieldData should be able to
  // handle list data. This is required when table serde is ParquetHiveSerDe and partition serde
  // is something else.
  if (data instanceof List) {
    return ((List) data).get(((StructFieldImpl) fieldRef).getIndex());
  }

  throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName());
}
Example #9
Source File: QueryUtils.java From incubator-retired-pirk with Apache License 2.0
/**
 * Pulls the correct selector from the MapWritable data element given the queryType
 * <p>
 * Pulls first element of array if element is an array type
 */
public static String getSelectorByQueryType(MapWritable dataMap, QuerySchema qSchema, DataSchema dSchema) {
  String selector;

  String fieldName = qSchema.getSelectorName();
  if (dSchema.isArrayElement(fieldName)) {
    if (dataMap.get(dSchema.getTextName(fieldName)) instanceof WritableArrayWritable) {
      String[] selectorArray = ((WritableArrayWritable) dataMap.get(dSchema.getTextName(fieldName))).toStrings();
      selector = selectorArray[0];
    } else {
      String[] elementArray = ((ArrayWritable) dataMap.get(dSchema.getTextName(fieldName))).toStrings();
      selector = elementArray[0];
    }
  } else {
    selector = dataMap.get(dSchema.getTextName(fieldName)).toString();
  }

  return selector;
}
Example #10
Source File: FSEditLogOp.java From big-c with Apache License 2.0
@Override
public void writeFields(DataOutputStream out) throws IOException {
  FSImageSerialization.writeLong(inodeId, out);
  FSImageSerialization.writeString(path, out);
  FSImageSerialization.writeShort(replication, out);
  FSImageSerialization.writeLong(mtime, out);
  FSImageSerialization.writeLong(atime, out);
  FSImageSerialization.writeLong(blockSize, out);
  new ArrayWritable(Block.class, blocks).write(out);
  permissions.write(out);

  if (this.opCode == OP_ADD) {
    AclEditLogUtil.write(aclEntries, out);
    XAttrEditLogProto.Builder b = XAttrEditLogProto.newBuilder();
    b.addAllXAttrs(PBHelper.convertXAttrProto(xAttrs));
    b.build().writeDelimitedTo(out);
    FSImageSerialization.writeString(clientName, out);
    FSImageSerialization.writeString(clientMachine, out);
    FSImageSerialization.writeBoolean(overwrite, out);
    FSImageSerialization.writeByte(storagePolicyId, out);
    // write clientId and callId
    writeRpcIds(rpcClientId, rpcCallId, out);
  }
}
Example #11
Source File: HoodieParquetInputFormat.java From hudi with Apache License 2.0
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf job,
    final Reporter reporter) throws IOException {
  // TODO enable automatic predicate pushdown after fixing issues
  // FileSplit fileSplit = (FileSplit) split;
  // HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent());
  // String tableName = metadata.getTableName();
  // String mode = HoodieHiveUtil.readMode(job, tableName);

  // if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) {
  // FilterPredicate predicate = constructHoodiePredicate(job, tableName, split);
  // LOG.info("Setting parquet predicate push down as " + predicate);
  // ParquetInputFormat.setFilterPredicate(job, predicate);
  // clearOutExistingPredicate(job);
  // }
  return super.getRecordReader(split, job, reporter);
}
Example #12
Source File: FSEditLogOp.java From RDFS with Apache License 2.0
@Override
void writeFields(DataOutputStream out) throws IOException {
  out.writeInt(5);
  FSImageSerialization.writeString(path, out);
  FSImageSerialization.writeShortAsString(replication, out);
  FSImageSerialization.writeLongAsString(mtime, out);
  FSImageSerialization.writeLongAsString(atime, out);
  FSImageSerialization.writeLongAsString(blockSize, out);
  new ArrayWritable(Block.class, blocks).write(out);
  permissions.write(out);

  if (this.opCode == OP_ADD) {
    FSImageSerialization.writeString(clientName, out);
    FSImageSerialization.writeString(clientMachine, out);
  }
}
Example #13
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2013encrypt.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "stax");
  // for decryption simply set the password
  job.set("hadoopoffice.read.security.crypt.password", "test2");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNull(reader, "Null record reader implies invalid password");
}
Example #14
Source File: AbstractSpreadSheetDocumentRecordWriter.java From hadoopoffice with Apache License 2.0
/**
 * Write a SpreadSheetDAO into a table document. Note that this does not necessarily mean it has already been
 * written to the OutputStream; usually it only updates the in-memory representation.
 *
 * @param key is ignored
 * @param value is a SpreadSheet Cell to be inserted into the table document
 */
@Override
public synchronized void write(NullWritable key, K value) throws IOException {
  try {
    if (value == null) {
      return;
    }
    if (value instanceof ArrayWritable) {
      ArrayWritable row = (ArrayWritable) value;
      Writable[] rowCellDAO = row.get();
      for (int i = 0; i < rowCellDAO.length; i++) {
        this.officeWriter.write(rowCellDAO[i]);
      }
    } else {
      this.officeWriter.write(value);
    }
  } catch (OfficeWriterException e) {
    LOG.error(e);
  }
}
Example #15
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2003SingleSheetEncryptedNegativeLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2003encrypt.xls";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  job.set("hadoopoffice.read.lowFootprint", "true");
  // for decryption simply set the password
  job.set("hadoopoffice.read.security.crypt.password", "test2");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNull(reader, "Null record reader implies invalid password");
}
Example #16
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2013encrypt.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "sax");
  // for decryption simply set the password
  job.set("hadoopoffice.read.security.crypt.password", "test2");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNull(reader, "Null record reader implies invalid password");
}
Example #17
Source File: TestHoodieParquetInputFormat.java From hudi with Apache License 2.0
private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit,
    int totalExpected) throws IOException {
  int actualCount = 0;
  int totalCount = 0;
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
  for (InputSplit split : splits) {
    RecordReader<NullWritable, ArrayWritable> recordReader = inputFormat.getRecordReader(split, jobConf, null);
    NullWritable key = recordReader.createKey();
    ArrayWritable writable = recordReader.createValue();

    while (recordReader.next(key, writable)) {
      // writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno]
      // Take the commit time and compare with the one we are interested in
      if (commit.equals((writable.get()[2]).toString())) {
        actualCount++;
      }
      totalCount++;
    }
  }
  assertEquals(expectedNumberOfRecordsInCommit, actualCount, msg);
  assertEquals(totalExpected, totalCount, msg);
}
Example #18
Source File: LinkedMapWritable.java From elasticsearch-hadoop with Apache License 2.0
@Override
public String toString() {
  Iterator<Entry<Writable, Writable>> i = entrySet().iterator();
  if (!i.hasNext()) {
    return "{}";
  }

  StringBuilder sb = new StringBuilder();
  sb.append('{');
  for (;;) {
    Entry<Writable, Writable> e = i.next();
    Writable key = e.getKey();
    Writable value = e.getValue();
    sb.append(key == this ? "(this Map)" : key);
    sb.append('=');
    if (value instanceof ArrayWritable) {
      sb.append(Arrays.toString(((ArrayWritable) value).get()));
    } else {
      sb.append(value == this ? "(this Map)" : value);
    }
    if (!i.hasNext()) {
      return sb.append('}').toString();
    }
    sb.append(", ");
  }
}
Example #19
Source File: ArrayWritableObjectInspector.java From parquet-mr with Apache License 2.0
@Override
public Object getStructFieldData(final Object data, final StructField fieldRef) {
  if (data == null) {
    return null;
  }

  if (data instanceof ArrayWritable) {
    final ArrayWritable arr = (ArrayWritable) data;
    return arr.get()[((StructFieldImpl) fieldRef).getIndex()];
  }

  // since setStructFieldData and create return a list, getStructFieldData should be able to
  // handle list data. This is required when table serde is ParquetHiveSerDe and partition serde
  // is something else.
  if (data instanceof List) {
    return ((List) data).get(((StructFieldImpl) fieldRef).getIndex());
  }

  throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName());
}
Example #20
Source File: DataWritableGroupConverter.java From parquet-mr with Apache License 2.0
public final ArrayWritable getCurrentArray() {
  final Writable[] writableArr;
  if (this.rootMap != null) {
    // We're at the root: we can safely re-use the same map to save perf
    writableArr = this.rootMap;
  } else {
    writableArr = new Writable[currentArr.length];
  }

  for (int i = 0; i < currentArr.length; i++) {
    final Object obj = currentArr[i];
    if (obj instanceof List) {
      final List<?> objList = (List<?>) obj;
      final ArrayWritable arr = new ArrayWritable(Writable.class, objList.toArray(new Writable[objList.size()]));
      writableArr[i] = arr;
    } else {
      writableArr[i] = (Writable) obj;
    }
  }
  return new ArrayWritable(Writable.class, writableArr);
}
Example #21
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013MultiSheetHeaderRegExLowFootprint() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "multisheetheader.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  // set locale to the one of the test data
  conf.set("hadoopoffice.read.locale.bcp47", "us");
  conf.set("hadoopoffice.read.header.read", "true");
  conf.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
  conf.set("hadoopoffice.read.header.column.names.regex", "column");
  conf.set("hadoopoffice.read.header.column.names.replace", "spalte");
  conf.set("hadoopoffice.read.lowFootprint", "true");
  conf.set("hadoopoffice.read.lowFootprint.parser", "sax");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, file);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals(1, splits.size(), "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  assertEquals("spalte1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
      "header column 1 correctly read");
  assertEquals("spalte2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
      "header column 2 correctly read");
  assertEquals("spalte3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
      "header column 3 correctly read");
}
Example #22
Source File: ParquetHiveArrayInspector.java From parquet-mr with Apache License 2.0
@Override
public List<?> getList(final Object data) {
  if (data == null) {
    return null;
  }

  if (data instanceof ArrayWritable) {
    final Writable[] listContainer = ((ArrayWritable) data).get();

    if (listContainer == null || listContainer.length == 0) {
      return null;
    }

    final Writable subObj = listContainer[0];

    if (subObj == null) {
      return null;
    }

    final Writable[] array = ((ArrayWritable) subObj).get();
    final List<Writable> list = new ArrayList<Writable>();

    for (final Writable obj : array) {
      list.add(obj);
    }

    return list;
  }

  throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName());
}
Example #23
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedPositiveLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2013encrypt.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "sax");
  // for decryption simply set the password
  job.set("hadoopoffice.read.security.crypt.password", "test");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  Text spreadSheetKey = new Text();
  ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
  assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
  assertEquals("[excel2013encrypt.xlsx]Sheet1!A1", spreadSheetKey.toString(),
      "Input Split for Excel file has keyname == \"[excel2013encrypt.xlsx]Sheet1!A1\"");
  assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
  assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
  assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
      "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
  assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
      "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
  assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
  assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}
Example #24
Source File: TestStandardParquetHiveMapInspector.java From parquet-mr with Apache License 2.0
@Test
public void testRegularMap() {
  final Writable[] entry1 = new Writable[] { new IntWritable(0), new IntWritable(1) };
  final Writable[] entry2 = new Writable[] { new IntWritable(2), new IntWritable(3) };

  final ArrayWritable internalMap = new ArrayWritable(ArrayWritable.class, new Writable[] {
      new ArrayWritable(Writable.class, entry1), new ArrayWritable(Writable.class, entry2) });

  final ArrayWritable map = new ArrayWritable(ArrayWritable.class, new Writable[] { internalMap });

  assertEquals("Wrong result of inspection", new IntWritable(1), inspector.getMapValueElement(map, new IntWritable(0)));
  assertEquals("Wrong result of inspection", new IntWritable(3), inspector.getMapValueElement(map, new IntWritable(2)));
  assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 0)));
  assertNull("Wrong result of inspection", inspector.getMapValueElement(map, new ShortWritable((short) 2)));
}
Example #25
Source File: ArrayWritableObjectInspector.java From parquet-mr with Apache License 2.0
@Override
public List<Object> getStructFieldsDataAsList(final Object data) {
  if (data == null) {
    return null;
  }

  if (data instanceof ArrayWritable) {
    final ArrayWritable arr = (ArrayWritable) data;
    final Object[] arrWritable = arr.get();
    return new ArrayList<Object>(Arrays.asList(arrWritable));
  }

  throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName());
}
Example #26
Source File: ParquetRecordReaderWrapper.java From parquet-mr with Apache License 2.0
public ParquetRecordReaderWrapper(
    final ParquetInputFormat<ArrayWritable> newInputFormat,
    final InputSplit oldSplit,
    final JobConf oldJobConf,
    final Reporter reporter)
    throws IOException, InterruptedException {
  this(newInputFormat, oldSplit, oldJobConf, reporter, (new HiveBindingFactory()).create());
}
Example #27
Source File: TestParquetHiveArrayInspector.java From parquet-mr with Apache License 2.0
@Test
public void testNullContainer() {
  final ArrayWritable list = new ArrayWritable(ArrayWritable.class, null);
  assertEquals("Wrong size", -1, inspector.getListLength(list));
  assertNull("Should be null", inspector.getList(list));
  assertNull("Should be null", inspector.getListElement(list, 0));
}
Example #28
Source File: AbstractParquetMapInspector.java From parquet-mr with Apache License 2.0
@Override
public Map<?, ?> getMap(final Object data) {
  if (data == null) {
    return null;
  }

  if (data instanceof ArrayWritable) {
    final Writable[] mapContainer = ((ArrayWritable) data).get();

    if (mapContainer == null || mapContainer.length == 0) {
      return null;
    }

    final Writable[] mapArray = ((ArrayWritable) mapContainer[0]).get();
    final Map<Writable, Writable> map = new HashMap<Writable, Writable>();

    for (final Writable obj : mapArray) {
      final ArrayWritable mapObj = (ArrayWritable) obj;
      final Writable[] arr = mapObj.get();
      map.put(arr[0], arr[1]);
    }

    return map;
  }

  if (data instanceof Map) {
    return (Map) data;
  }

  throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName());
}
Example #29
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013MultiSheetHeaderRegExLowFootprint() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "multisheetheader.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  // set locale to the one of the test data
  conf.set("hadoopoffice.read.locale.bcp47", "us");
  conf.set("hadoopoffice.read.header.read", "true");
  conf.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
  conf.set("hadoopoffice.read.header.column.names.regex", "column");
  conf.set("hadoopoffice.read.header.column.names.replace", "spalte");
  conf.set("hadoopoffice.read.lowFootprint", "true");
  conf.set("hadoopoffice.read.lowFootprint.parser", "stax");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, file);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals(1, splits.size(), "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  assertEquals("spalte1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
      "header column 1 correctly read");
  assertEquals("spalte2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
      "header column 2 correctly read");
  assertEquals("spalte3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
      "header column 3 correctly read");
}
Example #30
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013MultiSheetHeaderRegExLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "multisheetheader.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "us");
  job.set("hadoopoffice.read.header.read", "true");
  job.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
  job.set("hadoopoffice.read.header.column.names.regex", "column");
  job.set("hadoopoffice.read.header.column.names.replace", "spalte");
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "stax");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  assertEquals("spalte1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
      "header column 1 correctly read");
  assertEquals("spalte2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
      "header column 2 correctly read");
  assertEquals("spalte3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
      "header column 3 correctly read");
}