Java Code Examples for org.apache.pig.impl.util.ObjectSerializer#deserialize()
The following examples show how to use
org.apache.pig.impl.util.ObjectSerializer#deserialize().
Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
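Nearly all of these examples follow the same round trip: on the Pig front end an object is turned into a String with ObjectSerializer.serialize() and stashed in the UDFContext (or the job Configuration), and on the back end that String is read back and passed to ObjectSerializer.deserialize(), with the result cast to its original type and checked for null. The minimal sketch below illustrates that pattern; the class name RequiredFieldsRoundTrip and the property key "my.loader.requiredFields" are hypothetical, chosen only for illustration.

import java.io.IOException;
import java.util.Properties;

import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.UDFContext;

public class RequiredFieldsRoundTrip {

    // Hypothetical property key used only for this sketch.
    private static final String REQUIRED_FIELDS_KEY = "my.loader.requiredFields";

    // Front end: serialize the object to a String and stash it in the UDF context.
    public static void store(boolean[] requiredFields, String signature) throws IOException {
        Properties p = UDFContext.getUDFContext()
                .getUDFProperties(RequiredFieldsRoundTrip.class, new String[] { signature });
        p.setProperty(REQUIRED_FIELDS_KEY, ObjectSerializer.serialize(requiredFields));
    }

    // Back end: read the String back and deserialize it. As in the examples below,
    // the result may be null if the property was never set, so callers check for that.
    public static boolean[] load(String signature) throws IOException {
        Properties p = UDFContext.getUDFContext()
                .getUDFProperties(RequiredFieldsRoundTrip.class, new String[] { signature });
        return (boolean[]) ObjectSerializer.deserialize(p.getProperty(REQUIRED_FIELDS_KEY));
    }
}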
Example 1
Source File: FixedWidthLoader.java From spork with Apache License 2.0 | 6 votes |
@Override
public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
    // Save reader to use in getNext()
    this.reader = reader;

    splitIndex = split.getSplitIndex();

    // Get schema from front-end
    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[] { udfContextSignature });

    String strSchema = p.getProperty(SCHEMA_SIGNATURE);
    if (strSchema == null) {
        throw new IOException("Could not find schema in UDF context");
    }
    schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));

    requiredFields = (boolean[]) ObjectSerializer.deserialize(p.getProperty(REQUIRED_FIELDS_SIGNATURE));
    if (requiredFields != null) {
        numRequiredFields = 0;
        for (int i = 0; i < requiredFields.length; i++) {
            if (requiredFields[i])
                numRequiredFields++;
        }
    }
}
Example 2
Source File: HBaseStorage.java From spork with Apache License 2.0 | 6 votes |
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    if (location.startsWith("hbase://")) {
        job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, location.substring(8));
    } else {
        job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, location);
    }

    String serializedSchema = getUDFProperties().getProperty(contextSignature + "_schema");
    if (serializedSchema != null) {
        schema_ = (ResourceSchema) ObjectSerializer.deserialize(serializedSchema);
    }

    m_conf = initializeLocalJobConfig(job);
    // Not setting a udf property and getting the hbase delegation token
    // only once like in setLocation as setStoreLocation gets different Job
    // objects for each call and the last Job passed is the one that is
    // launched. So we end up getting multiple hbase delegation tokens.
    addHBaseDelegationToken(m_conf, job);
}
Example 3
Source File: PhoenixHBaseStorage.java From phoenix with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * Parse the HBase table name and configure job
 */
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    String prefix = "hbase://";
    if (location.startsWith(prefix)) {
        tableName = location.substring(prefix.length());
    }
    config = new PhoenixPigConfiguration(job.getConfiguration());
    config.configure(server, tableName, batchSize);

    String serializedSchema = getUDFProperties().getProperty(contextSignature + SCHEMA);
    if (serializedSchema != null) {
        schema = (ResourceSchema) ObjectSerializer.deserialize(serializedSchema);
    }
}
Example 4
Source File: PhoenixHBaseLoader.java From phoenix with Apache License 2.0 | 5 votes |
@SuppressWarnings("unchecked") @Override public void prepareToRead(RecordReader reader, PigSplit split) throws IOException { this.reader = reader; final String resourceSchemaAsStr = getValueFromUDFContext(this.contextSignature,RESOURCE_SCHEMA_SIGNATURE); if (resourceSchemaAsStr == null) { throw new IOException("Could not find schema in UDF context"); } schema = (ResourceSchema)ObjectSerializer.deserialize(resourceSchemaAsStr); }
Example 5
Source File: TestPruneColumn.java From spork with Apache License 2.0 | 5 votes |
@Override
public Tuple getNext() throws IOException {
    if (aliases == null) {
        aliases = (String[]) ObjectSerializer.deserialize(
                UDFContext.getUDFContext().getUDFProperties(this.getClass()).getProperty(signature));
        Tuple t = TupleFactory.getInstance().newTuple();
        for (String s : aliases)
            t.append(s);
        return t;
    }
    return null;
}
Example 6
Source File: PigSchemaConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
public static RequiredFieldList deserializeRequiredFieldList(String requiredFieldString) {
    if (requiredFieldString == null) {
        return null;
    }
    try {
        return (RequiredFieldList) ObjectSerializer.deserialize(requiredFieldString);
    } catch (IOException e) {
        throw new RuntimeException("Failed to deserialize pushProjection", e);
    }
}
Example 7
Source File: PigBytesRawComparator.java From spork with Apache License 2.0 | 5 votes |
public void setConf(Configuration conf) {
    try {
        mAsc = (boolean[]) ObjectSerializer.deserialize(conf.get("pig.sortOrder"));
    } catch (IOException ioe) {
        mLog.error("Unable to deserialize pig.sortOrder " + ioe.getMessage());
        throw new RuntimeException(ioe);
    }
    if (mAsc == null) {
        mAsc = new boolean[1];
        mAsc[0] = true;
    }
    ((BinInterSedes.BinInterSedesTupleRawComparator) mWrappedComp).setConf(conf);
}
Example 8
Source File: PigFloatRawComparator.java From spork with Apache License 2.0 | 5 votes |
public void setConf(Configuration conf) {
    try {
        mAsc = (boolean[]) ObjectSerializer.deserialize(conf.get("pig.sortOrder"));
    } catch (IOException ioe) {
        mLog.error("Unable to deserialize pig.sortOrder " + ioe.getMessage());
        throw new RuntimeException(ioe);
    }
    if (mAsc == null) {
        mAsc = new boolean[1];
        mAsc[0] = true;
    }
}
Example 9
Source File: SchemaTuple.java From spork with Apache License 2.0 | 5 votes |
protected static Schema staticSchemaGen(String s) {
    try {
        if (s.equals("")) {
            Log.warn("No Schema present in SchemaTuple generated class");
            return new Schema();
        }
        return (Schema) ObjectSerializer.deserialize(s);
    } catch (IOException e) {
        throw new RuntimeException("Unable to deserialize serialized Schema: " + s, e);
    }
}
Example 10
Source File: OrcStorage.java From spork with Apache License 2.0 | 5 votes |
private TypeInfo getTypeInfo(String location, Job job) throws IOException {
    Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
    TypeInfo typeInfo = (TypeInfo) ObjectSerializer.deserialize(p.getProperty(signature + SchemaSignatureSuffix));
    if (typeInfo == null) {
        typeInfo = getTypeInfoFromLocation(location, job);
    }
    if (typeInfo != null) {
        p.setProperty(signature + SchemaSignatureSuffix, ObjectSerializer.serialize(typeInfo));
    }
    return typeInfo;
}
Example 11
Source File: OrcStorage.java From spork with Apache License 2.0 | 5 votes |
@Override
public void setLocation(String location, Job job) throws IOException {
    Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
    if (!UDFContext.getUDFContext().isFrontend()) {
        typeInfo = (TypeInfo) ObjectSerializer.deserialize(p.getProperty(signature + SchemaSignatureSuffix));
    } else if (typeInfo == null) {
        typeInfo = getTypeInfo(location, job);
    }
    if (typeInfo != null && oi == null) {
        oi = OrcStruct.createObjectInspector(typeInfo);
    }
    if (!UDFContext.getUDFContext().isFrontend()) {
        if (p.getProperty(signature + RequiredColumnsSuffix) != null) {
            mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(
                    p.getProperty(signature + RequiredColumnsSuffix));
            job.getConfiguration().setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
            job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,
                    getReqiredColumnIdString(mRequiredColumns));
            if (p.getProperty(signature + SearchArgsSuffix) != null) {
                // Bug in setSearchArgument which always expects READ_COLUMN_NAMES_CONF_STR to be set
                job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
                        getReqiredColumnNamesString(getSchema(location, job), mRequiredColumns));
            }
        } else if (p.getProperty(signature + SearchArgsSuffix) != null) {
            // Bug in setSearchArgument which always expects READ_COLUMN_NAMES_CONF_STR to be set
            job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
                    getReqiredColumnNamesString(getSchema(location, job)));
        }
        if (p.getProperty(signature + SearchArgsSuffix) != null) {
            job.getConfiguration().set(SARG_PUSHDOWN, p.getProperty(signature + SearchArgsSuffix));
        }
    }
    FileInputFormat.setInputPaths(job, location);
}
Example 12
Source File: PigInputFormat.java From spork with Apache License 2.0 | 4 votes |
@SuppressWarnings({ "rawtypes", "unchecked" }) @Override public org.apache.hadoop.mapreduce.RecordReader<Text, Tuple> createRecordReader( org.apache.hadoop.mapreduce.InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { // We need to create a TaskAttemptContext based on the Configuration which // was used in the getSplits() to produce the split supplied here. For // this, let's find out the input of the script which produced the split // supplied here and then get the corresponding Configuration and setup // TaskAttemptContext based on it and then call the real InputFormat's // createRecordReader() method PigSplit pigSplit = (PigSplit)split; activeSplit = pigSplit; // XXX hadoop 20 new API integration: get around a hadoop 20 bug by // passing total # of splits to each split so it can be retrieved // here and set it to the configuration object. This number is needed // by PoissonSampleLoader to compute the number of samples int n = pigSplit.getTotalSplits(); context.getConfiguration().setInt("pig.mapsplits.count", n); Configuration conf = context.getConfiguration(); PigContext.setPackageImportList((ArrayList<String>) ObjectSerializer .deserialize(conf.get("udf.import.list"))); MapRedUtil.setupUDFContext(conf); LoadFunc loadFunc = getLoadFunc(pigSplit.getInputIndex(), conf); // Pass loader signature to LoadFunc and to InputFormat through // the conf passLoadSignature(loadFunc, pigSplit.getInputIndex(), conf); // merge entries from split specific conf into the conf we got PigInputFormat.mergeSplitSpecificConf(loadFunc, pigSplit, conf); // for backward compatibility PigInputFormat.sJob = conf; InputFormat inputFormat = loadFunc.getInputFormat(); List<Long> inpLimitLists = (ArrayList<Long>)ObjectSerializer.deserialize( conf.get("pig.inpLimits")); return new PigRecordReader(inputFormat, pigSplit, loadFunc, context, inpLimitLists.get(pigSplit.getInputIndex())); }
Example 13
Source File: PigGenericMapReduce.java From spork with Apache License 2.0 | 4 votes |
/**
 * Configures the Reduce plan, the POPackage operator
 * and the reporter thread
 */
@SuppressWarnings("unchecked")
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    inIllustrator = inIllustrator(context);
    if (inIllustrator)
        pack = getPack(context);
    Configuration jConf = context.getConfiguration();
    SpillableMemoryManager.configure(ConfigurationUtil.toProperties(jConf));
    context.getConfiguration().set(PigConstants.TASK_INDEX,
            Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
    sJobContext = context;
    sJobConfInternal.set(context.getConfiguration());
    sJobConf = context.getConfiguration();
    try {
        PigContext.setPackageImportList(
                (ArrayList<String>) ObjectSerializer.deserialize(jConf.get("udf.import.list")));
        pigContext = (PigContext) ObjectSerializer.deserialize(jConf.get("pig.pigContext"));

        // This attempts to fetch all of the generated code from the distributed cache, and resolve it
        SchemaTupleBackend.initialize(jConf, pigContext);

        if (rp == null)
            rp = (PhysicalPlan) ObjectSerializer.deserialize(jConf.get("pig.reducePlan"));
        stores = PlanHelper.getPhysicalOperators(rp, POStore.class);

        if (!inIllustrator)
            pack = (POPackage) ObjectSerializer.deserialize(jConf.get("pig.reduce.package"));
        // To be removed
        if (rp.isEmpty())
            log.debug("Reduce Plan empty!");
        else {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            rp.explain(baos);
            log.debug(baos.toString());
        }
        pigReporter = new ProgressableReporter();
        if (!(rp.isEmpty())) {
            roots = rp.getRoots().toArray(new PhysicalOperator[1]);
            leaf = rp.getLeaves().get(0);
        }

        // Get the UDF specific context
        MapRedUtil.setupUDFContext(jConf);
    } catch (IOException ioe) {
        String msg = "Problem while configuring reduce plan.";
        throw new RuntimeException(msg, ioe);
    }

    log.info("Aliases being processed per job phase (AliasName[line,offset]): "
            + jConf.get("pig.alias.location"));

    Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
}
Example 14
Source File: AvroStorage.java From spork with Apache License 2.0 | 4 votes |
/**
 * Set input location and obtain input schema.
 */
@SuppressWarnings("unchecked")
@Override
public void setLocation(String location, Job job) throws IOException {
    if (inputAvroSchema != null) {
        return;
    }

    if (!UDFContext.getUDFContext().isFrontend()) {
        Properties udfProps = getUDFProperties();
        String mergedSchema = udfProps.getProperty(AVRO_MERGED_SCHEMA_PROPERTY);
        if (mergedSchema != null) {
            HashMap<URI, Map<Integer, Integer>> mergedSchemaMap =
                    (HashMap<URI, Map<Integer, Integer>>) ObjectSerializer.deserialize(mergedSchema);
            schemaToMergedSchemaMap = new HashMap<Path, Map<Integer, Integer>>();
            for (Entry<URI, Map<Integer, Integer>> entry : mergedSchemaMap.entrySet()) {
                schemaToMergedSchemaMap.put(new Path(entry.getKey()), entry.getValue());
            }
        }
        String schema = udfProps.getProperty(AVRO_INPUT_SCHEMA_PROPERTY);
        if (schema != null) {
            try {
                inputAvroSchema = new Schema.Parser().parse(schema);
                return;
            } catch (Exception e) {
                // Cases like testMultipleSchemas2 cause exception while deserializing
                // symbols. In that case, we get it again.
                LOG.warn("Exception while trying to deserialize schema in backend. "
                        + "Will construct again. schema= " + schema, e);
            }
        }
    }

    Configuration conf = job.getConfiguration();
    Set<Path> paths = AvroStorageUtils.getPaths(location, conf, true);
    if (!paths.isEmpty()) {
        // Set top level directories in input format. Adding all files will
        // bloat configuration size
        FileInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
        // Scan all directories including sub directories for schema
        if (inputAvroSchema == null) {
            setInputAvroSchema(paths, conf);
        }
    } else {
        throw new IOException("Input path \'" + location + "\' is not found");
    }
}
Example 15
Source File: PigCombiner.java From spork with Apache License 2.0 | 4 votes |
/**
 * Configures the Reduce plan, the POPackage operator
 * and the reporter thread
 */
@SuppressWarnings("unchecked")
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration jConf = context.getConfiguration();
    try {
        PigContext.setPackageImportList(
                (ArrayList<String>) ObjectSerializer.deserialize(jConf.get("udf.import.list")));
        pigContext = (PigContext) ObjectSerializer.deserialize(jConf.get("pig.pigContext"));
        if (pigContext.getLog4jProperties() != null)
            PropertyConfigurator.configure(pigContext.getLog4jProperties());

        cp = (PhysicalPlan) ObjectSerializer.deserialize(jConf.get("pig.combinePlan"));
        pack = (POPackage) ObjectSerializer.deserialize(jConf.get("pig.combine.package"));
        // To be removed
        if (cp.isEmpty())
            log.debug("Combine Plan empty!");
        else {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            cp.explain(baos);
            log.debug(baos.toString());
        }
        keyType = ((byte[]) ObjectSerializer.deserialize(jConf.get("pig.map.keytype")))[0];
        // till here

        pigReporter = new ProgressableReporter();
        if (!(cp.isEmpty())) {
            roots = cp.getRoots().toArray(new PhysicalOperator[1]);
            leaf = cp.getLeaves().get(0);
        }
    } catch (IOException ioe) {
        String msg = "Problem while configuring combiner's reduce plan.";
        throw new RuntimeException(msg, ioe);
    }

    // Avoid log spamming
    if (firstTime) {
        log.info("Aliases being processed per job phase (AliasName[line,offset]): "
                + jConf.get("pig.alias.location"));
        firstTime = false;
    }
}
Example 16
Source File: PigGenericMapBase.java From spork with Apache License 2.0 | 4 votes |
/**
 * Configures the mapper with the map plan and the
 * reporter thread
 */
@SuppressWarnings("unchecked")
@Override
public void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);

    Configuration job = context.getConfiguration();
    SpillableMemoryManager.configure(ConfigurationUtil.toProperties(job));
    context.getConfiguration().set(PigConstants.TASK_INDEX,
            Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
    PigMapReduce.sJobContext = context;
    PigMapReduce.sJobConfInternal.set(context.getConfiguration());
    PigMapReduce.sJobConf = context.getConfiguration();
    inIllustrator = inIllustrator(context);

    PigContext.setPackageImportList(
            (ArrayList<String>) ObjectSerializer.deserialize(job.get("udf.import.list")));
    pigContext = (PigContext) ObjectSerializer.deserialize(job.get("pig.pigContext"));

    // This attempts to fetch all of the generated code from the distributed cache, and resolve it
    SchemaTupleBackend.initialize(job, pigContext);

    if (pigContext.getLog4jProperties() != null)
        PropertyConfigurator.configure(pigContext.getLog4jProperties());

    if (mp == null)
        mp = (PhysicalPlan) ObjectSerializer.deserialize(job.get("pig.mapPlan"));
    stores = PlanHelper.getPhysicalOperators(mp, POStore.class);

    // To be removed
    if (mp.isEmpty())
        log.debug("Map Plan empty!");
    else {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        mp.explain(baos);
        log.debug(baos.toString());
    }
    keyType = ((byte[]) ObjectSerializer.deserialize(job.get("pig.map.keytype")))[0];
    // till here

    pigReporter = new ProgressableReporter();
    // Get the UDF specific context
    MapRedUtil.setupUDFContext(job);

    if (!(mp.isEmpty())) {
        PigSplit split = (PigSplit) context.getInputSplit();
        List<OperatorKey> targetOpKeys = split.getTargetOps();

        ArrayList<PhysicalOperator> targetOpsAsList = new ArrayList<PhysicalOperator>();
        for (OperatorKey targetKey : targetOpKeys) {
            targetOpsAsList.add(mp.getOperator(targetKey));
        }
        roots = targetOpsAsList.toArray(new PhysicalOperator[1]);
        leaf = mp.getLeaves().get(0);
    }

    PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
    pigStatusReporter.setContext(new MRTaskContext(context));

    log.info("Aliases being processed per job phase (AliasName[line,offset]): "
            + job.get("pig.alias.location"));

    Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
}
Example 17
Source File: AvroStorage.java From Cubert with Apache License 2.0 | 4 votes |
/**
 * Set input location and obtain input schema.
 */
@SuppressWarnings("unchecked")
@Override
public void setLocation(String location, Job job) throws IOException {
    if (inputAvroSchema != null) {
        return;
    }

    if (!UDFContext.getUDFContext().isFrontend()) {
        Properties udfProps = getUDFProperties();
        String mergedSchema = udfProps.getProperty(AVRO_MERGED_SCHEMA_PROPERTY);
        if (mergedSchema != null) {
            HashMap<URI, Map<Integer, Integer>> mergedSchemaMap =
                    (HashMap<URI, Map<Integer, Integer>>) ObjectSerializer.deserialize(mergedSchema);
            schemaToMergedSchemaMap = new HashMap<Path, Map<Integer, Integer>>();
            for (Entry<URI, Map<Integer, Integer>> entry : mergedSchemaMap.entrySet()) {
                schemaToMergedSchemaMap.put(new Path(entry.getKey()), entry.getValue());
            }
        }
        String schema = udfProps.getProperty(AVRO_INPUT_SCHEMA_PROPERTY);
        if (schema != null) {
            try {
                inputAvroSchema = new Schema.Parser().parse(schema);
                return;
            } catch (Exception e) {
                // Cases like testMultipleSchemas2 cause exception while deserializing
                // symbols. In that case, we get it again.
                LOG.warn("Exception while trying to deserialize schema in backend. "
                        + "Will construct again. schema= " + schema, e);
            }
        }
    }

    Configuration conf = job.getConfiguration();
    Set<Path> paths = AvroStorageUtils.getPaths(location, conf, true);
    if (!paths.isEmpty()) {
        // Set top level directories in input format. Adding all files will
        // bloat configuration size
        FileInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
        // Scan all directories including sub directories for schema
        if (inputAvroSchema == null) {
            setInputAvroSchema(paths, conf);
        }
    } else {
        throw new IOException("Input path \'" + location + "\' is not found");
    }
}
Example 18
Source File: CSVLoader.java From spork with Apache License 2.0 | 4 votes |
@Override
public Tuple getNext() throws IOException {
    mProtoTuple = new ArrayList<Object>();

    boolean inField = false;
    boolean inQuotedField = false;
    boolean evenQuotesSeen = true;

    if (!mRequiredColumnsInitialized) {
        if (signature != null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p.getProperty(signature));
        }
        mRequiredColumnsInitialized = true;
    }
    try {
        if (!in.nextKeyValue()) {
            return null;
        }
        Text value = (Text) in.getCurrentValue();
        byte[] buf = value.getBytes();
        int len = value.getLength();
        int fieldID = 0;

        ByteBuffer fieldBuffer = ByteBuffer.allocate(len);

        for (int i = 0; i < len; i++) {
            byte b = buf[i];
            inField = true;
            if (inQuotedField) {
                if (b == DOUBLE_QUOTE) {
                    evenQuotesSeen = !evenQuotesSeen;
                    if (evenQuotesSeen) {
                        fieldBuffer.put(DOUBLE_QUOTE);
                    }
                } else if (!evenQuotesSeen && (b == FIELD_DEL || b == RECORD_DEL)) {
                    inQuotedField = false;
                    inField = false;
                    readField(fieldBuffer, fieldID++);
                } else {
                    fieldBuffer.put(b);
                }
            } else if (b == DOUBLE_QUOTE) {
                inQuotedField = true;
                evenQuotesSeen = true;
            } else if (b == FIELD_DEL) {
                inField = false;
                readField(fieldBuffer, fieldID++); // end of the field
            } else {
                evenQuotesSeen = true;
                fieldBuffer.put(b);
            }
        }
        if (inField)
            readField(fieldBuffer, fieldID++);
    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }

    Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
    return t;
}
Example 19
Source File: IcebergStorage.java From iceberg with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") public <T extends Serializable> T getFromUDFContext(String key, Class<T> clazz) throws IOException { Properties properties = UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{signature}); return (T) ObjectSerializer.deserialize(properties.getProperty(key)); }
Example 20
Source File: IcebergPigInputFormat.java From iceberg with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") private boolean advance() throws IOException { if (reader != null) { reader.close(); } if (!tasks.hasNext()) { return false; } FileScanTask currentTask = tasks.next(); Schema tableSchema = (Schema) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_SCHEMA))); LOG.debug("[{}]: Task table schema: {}", signature, tableSchema); List<String> projectedFields = (List<String>) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_PROJECTED_FIELDS))); LOG.debug("[{}]: Task projected fields: {}", signature, projectedFields); Schema projectedSchema = projectedFields != null ? SchemaUtil.project(tableSchema, projectedFields) : tableSchema; PartitionSpec spec = currentTask.asFileScanTask().spec(); DataFile file = currentTask.file(); InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration()); Set<Integer> idColumns = spec.identitySourceIds(); // schema needed for the projection and filtering boolean hasJoinedPartitionColumns = !idColumns.isEmpty(); switch (file.format()) { case PARQUET: Map<Integer, Object> partitionValueMap = Maps.newHashMap(); if (hasJoinedPartitionColumns) { Schema readSchema = TypeUtil.selectNot(projectedSchema, idColumns); Schema projectedPartitionSchema = TypeUtil.select(projectedSchema, idColumns); Map<String, Integer> partitionSpecFieldIndexMap = Maps.newHashMap(); for (int i = 0; i < spec.fields().size(); i++) { partitionSpecFieldIndexMap.put(spec.fields().get(i).name(), i); } for (Types.NestedField field : projectedPartitionSchema.columns()) { int partitionIndex = partitionSpecFieldIndexMap.get(field.name()); Object partitionValue = file.partition().get(partitionIndex, Object.class); partitionValueMap.put(field.fieldId(), convertPartitionValue(field.type(), partitionValue)); } reader = Parquet.read(inputFile) .project(readSchema) .split(currentTask.start(), currentTask.length()) .filter(currentTask.residual()) .createReaderFunc( fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap)) .build(); } else { reader = Parquet.read(inputFile) .project(projectedSchema) .split(currentTask.start(), currentTask.length()) .filter(currentTask.residual()) .createReaderFunc( fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap)) .build(); } recordIterator = reader.iterator(); break; default: throw new UnsupportedOperationException("Unsupported file format: " + file.format()); } return true; }