org.apache.avro.mapred.Pair Java Examples

Source File: RegressionTestLoglik.java From ml-ease with Apache License 2.0

6 votes

/**
 * Folds every partial result received for {@code key} into a single record.
 *
 * <p>The {@code testLoglik} values are accumulated in double precision and narrowed to
 * {@code float} when written out; the {@code count} values are accumulated as well. The
 * emitted record is labelled {@code "averageTestLoglik"}, but the value stored is the plain
 * sum: no division by the count is performed in this method.
 *
 * @param key       grouping key; reused unchanged as the key of the emitted pair
 * @param values    partial results to combine
 * @param collector receives the single combined pair
 * @param reporter  not used by this method
 * @throws IOException declared in the signature; nothing in this body throws it directly
 */
@Override
public void reduce(Utf8 key,
                   Iterable<RegressionTestLoglikOutput> values,
                   AvroCollector<Pair<Utf8, RegressionTestLoglikOutput>> collector,
                   Reporter reporter) throws IOException
{
  double loglikTotal = 0;
  double countTotal = 0;
  for (RegressionTestLoglikOutput partial : values)
  {
    loglikTotal += partial.testLoglik;
    countTotal += partial.count;
  }

  RegressionTestLoglikOutput combined = new RegressionTestLoglikOutput();
  combined.count = countTotal;
  combined.testLoglik = (float) loglikTotal;
  combined.key = "averageTestLoglik";
  collector.collect(new Pair<Utf8, RegressionTestLoglikOutput>(key, combined));
}

Source File: RegressionAdmmTrain.java From ml-ease with Apache License 2.0

6 votes

/**
 * Emits the incoming record once per entry of the collection returned by
 * {@code _lambdaRhoConsumer.get()}.
 *
 * <p>The record's key is parsed as an integer {@code k}; for each index {@code i} the new key
 * is {@code k * size + i}. The same {@code data} instance is re-keyed in place and collected
 * on every iteration, so its {@code key} field holds the last generated key on return.
 *
 * @param data      input record; its {@code key} field is overwritten
 * @param collector receives one (new key, data) pair per iteration
 * @param reporter  not used by this method
 * @throws IOException declared in the signature; nothing in this body throws it directly
 * @throws NumberFormatException if the record key is not a parsable integer
 */
@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<Integer, RegressionPrepareOutput>> collector,
                Reporter reporter) throws IOException
{
  final int baseKey = Integer.parseInt(data.key.toString());
  int offset = 0;
  while (offset < _lambdaRhoConsumer.get().size())
  {
    final int shardKey = baseKey * _lambdaRhoConsumer.get().size() + offset;
    data.key = String.valueOf(shardKey);
    collector.collect(new Pair<Integer, RegressionPrepareOutput>(shardKey, data));
    offset++;
  }
}

Source File: ItemModelTestLoglik.java From ml-ease with Apache License 2.0

6 votes

/**
 * Combines all partial results received for {@code key} into one record carrying that key.
 *
 * <p>The {@code testLoglik} values are summed in double precision and narrowed to
 * {@code float} on output; the {@code count} values are summed as well.
 *
 * @param key       grouping key; stored in the output record and used as the pair key
 * @param values    partial results to combine
 * @param collector receives the single combined pair
 * @param reporter  not used by this method
 * @throws IOException declared in the signature; nothing in this body throws it directly
 */
@Override
public void reduce(Utf8 key,
                   Iterable<RegressionTestLoglikOutput> values,
                   AvroCollector<Pair<Utf8, RegressionTestLoglikOutput>> collector,
                   Reporter reporter) throws IOException
{
  double loglikTotal = 0;
  double countTotal = 0;
  for (RegressionTestLoglikOutput partial : values)
  {
    loglikTotal += partial.testLoglik;
    countTotal += partial.count;
  }

  RegressionTestLoglikOutput combined = new RegressionTestLoglikOutput();
  combined.count = countTotal;
  combined.testLoglik = (float) loglikTotal;
  combined.key = key;
  collector.collect(new Pair<Utf8, RegressionTestLoglikOutput>(key, combined));
}

Source File: MergeAvroMapper.java From aliyun-maxcompute-data-collectors with Apache License 2.0

6 votes

/**
 * Instantiates the user record class named in the job configuration and indexes its
 * declared fields.
 *
 * <p>The class name is read from {@code MergeJob.MERGE_SQOOP_RECORD_KEY}, loaded through the
 * current thread's context class loader and instantiated into {@code sqoopRecordImpl}. Every
 * field declared directly on that class is then stored in {@code sqoopRecordFields}, keyed by
 * the lower-cased field name and mapped to a (declared name, field type name) pair.
 *
 * @param context task context supplying the job configuration
 * @throws IOException if no class name is configured or the named class cannot be found
 * @throws InterruptedException declared in the signature; not thrown directly by this body
 */
@Override
protected void setup(Context context) throws InterruptedException, IOException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  final String userClassName = conf.get(MergeJob.MERGE_SQOOP_RECORD_KEY);
  if (userClassName == null) {
    // Fail with a descriptive error instead of the NullPointerException Class.forName
    // would raise for a null name.
    throw new IOException("No user record class name is configured under "
        + MergeJob.MERGE_SQOOP_RECORD_KEY);
  }
  try {
    final Class<? extends Object> clazz = Class.forName(userClassName, true,
        Thread.currentThread().getContextClassLoader());
    sqoopRecordImpl = (SqoopRecord) ReflectionUtils.newInstance(clazz, conf);
    for (final Field field : clazz.getDeclaredFields()) {
      final String fieldName = field.getName();
      final String fieldTypeName = field.getType().getName();
      // Keyed by lower-cased name; the original spelling is kept in the pair's key.
      sqoopRecordFields.put(fieldName.toLowerCase(), new Pair<String, String>(fieldName,
          fieldTypeName));
    }
  } catch (ClassNotFoundException e) {
    throw new IOException("Cannot find the user record class with class name "
        + userClassName, e);
  }
}

Source File: PartitionIdAssigner.java From ml-ease with Apache License 2.0

5 votes

/**
 * Emits a count of 1 for every combination of a lambda value and the record's key.
 *
 * <p>For each lambda in {@code _lambdaSet} the key {@code "<lambda>#<original key>"} is
 * built, written back into {@code data.key}, and collected with the value 1. The record's
 * {@code key} field therefore holds the last generated key on return.
 *
 * @param data      input record; its {@code key} field is overwritten
 * @param collector receives one (composite key, 1) pair per lambda
 * @param reporter  not used by this method
 * @throws IOException declared in the signature; nothing in this body throws it directly
 */
@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<String, Integer>> collector,
                Reporter reporter) throws IOException
{
  final String originalKey = data.key.toString();
  for (float lambda : _lambdaSet)
  {
    final String compositeKey = String.valueOf(lambda).concat("#").concat(originalKey);
    data.key = compositeKey;
    collector.collect(new Pair<String, Integer>(compositeKey, 1));
  }
}

Source File: PartitionPreservingSchemas.java From datafu with Apache License 2.0

5 votes

/**
 * Returns the map output schema, building it on first use.
 *
 * <p>The schema is the pair schema of the map output key schema and the map output value
 * schema; once built it is kept in {@code _mapOutputSchema} and returned on later calls.
 *
 * @return the cached pair schema
 */
public Schema getMapOutputSchema()
{
  if (_mapOutputSchema != null)
  {
    return _mapOutputSchema;
  }
  _mapOutputSchema = Pair.getPairSchema(getMapOutputKeySchema(), getMapOutputValueSchema());
  return _mapOutputSchema;
}

Source File: PartitionCollapsingSchemas.java From datafu with Apache License 2.0

5 votes

/**
 * Returns the map output schema, creating it lazily.
 *
 * <p>On the first call the pair schema of the map output key and value schemas is built and
 * stored in {@code _mapOutputSchema}; subsequent calls return the stored instance.
 *
 * @return the cached pair schema
 */
public Schema getMapOutputSchema()
{
  if (_mapOutputSchema != null)
  {
    return _mapOutputSchema;
  }
  _mapOutputSchema = Pair.getPairSchema(getMapOutputKeySchema(), getMapOutputValueSchema());
  return _mapOutputSchema;
}

Source File: ItemModelTrain.java From ml-ease with Apache License 2.0

5 votes

/**
 * Forwards the record unchanged, keyed by the string form of its own {@code key} field.
 *
 * @param data      input record, emitted as the pair value
 * @param collector receives the single (key, data) pair
 * @param reporter  not used by this method
 * @throws IOException declared in the signature; nothing in this body throws it directly
 */
@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<String, RegressionPrepareOutput>> collector,
                Reporter reporter) throws IOException
{
  final String recordKey = data.key.toString();
  collector.collect(new Pair<String, RegressionPrepareOutput>(recordKey, data));
}

Source File: RegressionTestLoglik.java From ml-ease with Apache License 2.0

5 votes

/**
 * Computes the weighted log-likelihood contribution of one scored record.
 *
 * <p>Reads {@code response}, {@code pred} and the optional {@code weight} (1 when absent).
 * For a response of 1 the value is {@code -log1p(exp(-pred)) * weight}; for a response of 0
 * or -1 it is {@code -log1p(exp(pred)) * weight}. The result is emitted under the fixed key
 * {@code "loglik"} with the weight stored as the record's count.
 *
 * @param data      input record providing {@code response}, {@code pred} and optionally
 *                  {@code weight}
 * @param collector receives the single ("loglik", output) pair
 * @param reporter  not used by this method
 * @throws IOException if {@code response} is not 1, 0 or -1
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<String, RegressionTestLoglikOutput>> collector,
                Reporter reporter) throws IOException
{
  final int response = Util.getIntAvro(data, "response");
  final double pred = Util.getDoubleAvro(data, "pred");
  final double weight = (data.get("weight") != null) ? Util.getDoubleAvro(data, "weight") : 1;

  final boolean knownResponse = response == 1 || response == 0 || response == -1;
  if (!knownResponse)
  {
    throw new IOException("response should be 1,0 or -1!");
  }

  // Only a response of 1 flips the sign of the prediction; 0 and -1 share a branch.
  final double exponent = (response == 1) ? -pred : pred;
  final double loglik = -Math.log1p(Math.exp(exponent)) * weight;

  RegressionTestLoglikOutput result = new RegressionTestLoglikOutput();
  result.key = "loglik";
  result.testLoglik = (float) loglik;
  result.count = weight;
  collector.collect(new Pair<String, RegressionTestLoglikOutput>("loglik", result));
}

Source File: PartitionIdAssigner.java From ml-ease with Apache License 2.0

5 votes

/**
 * Emits exactly one pair per key with the constant value 1; the incoming values are not read.
 *
 * @param key       grouping key, reused as the key of the emitted pair
 * @param values    ignored
 * @param collector receives the single (key, 1) pair
 * @param reporter  not used by this method
 * @throws IOException declared in the signature; nothing in this body throws it directly
 */
@Override
public void reduce(Utf8 key,
                   Iterable<Integer> values,
                   AvroCollector<Pair<String, Integer>> collector,
                   Reporter reporter) throws IOException
{
  final Pair<String, Integer> marker = new Pair<String, Integer>(key, 1);
  collector.collect(marker);
}

Source File: PartitionIdAssigner.java From ml-ease with Apache License 2.0

5 votes

/**
 * Assigns the current value of {@code _partitionId} to the key and then advances the counter.
 *
 * <p>The incoming values are not read. The counter is incremented only after the pair has
 * been handed to the collector.
 *
 * @param key       grouping key, reused as the key of the emitted pair
 * @param values    ignored
 * @param collector receives the single (key, partition id) pair
 * @param reporter  not used by this method
 * @throws IOException declared in the signature; nothing in this body throws it directly
 */
@Override
public void reduce(Utf8 key,
                   Iterable<Integer> values,
                   AvroCollector<Pair<String, Integer>> collector,
                   Reporter reporter) throws IOException
{
  final Pair<String, Integer> assignment = new Pair<String, Integer>(key, _partitionId);
  collector.collect(assignment);
  _partitionId++;
}

Source File: ItemModelTestLoglik.java From ml-ease with Apache License 2.0

5 votes

/**
 * Emits one weighted log-likelihood record per entry of the record's {@code pred} map.
 *
 * <p>Reads {@code response}, the {@code pred} map and the optional {@code weight} (1 when
 * absent). For each map entry the value is {@code -log1p(exp(-p)) * weight} when the response
 * is 1 and {@code -log1p(exp(p)) * weight} when it is 0 or -1, where {@code p} is the entry's
 * prediction. Each result is emitted under the entry's key, with the weight as its count.
 *
 * @param data      input record providing {@code response}, {@code pred} and optionally
 *                  {@code weight}
 * @param collector receives one (entry key, output) pair per map entry
 * @param reporter  not used by this method
 * @throws IOException if {@code response} is not 1, 0 or -1, or if {@code pred} is absent
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<String, RegressionTestLoglikOutput>> collector,
                Reporter reporter) throws IOException
{
  int response = Util.getIntAvro(data, "response");
  // Unchecked: the cast relies on the "pred" field holding a map from Utf8 to Float.
  @SuppressWarnings("unchecked")
  Map<Utf8, Float> pred = (Map<Utf8, Float>) data.get("pred");
  double weight = 1;
  if (data.get("weight") != null)
  {
    weight = Util.getDoubleAvro(data, "weight");
  }
  if (response != 1 && response != 0 && response != -1)
  {
    throw new IOException("response should be 1,0 or -1!");
  }
  if (pred == null)
  {
    // Report the missing column explicitly rather than failing with a NullPointerException.
    throw new IOException("data does not contain the column pred");
  }
  // Iterate entries so each prediction is read once instead of looked up again by key.
  for (Map.Entry<Utf8, Float> entry : pred.entrySet())
  {
    Utf8 k = entry.getKey();
    float score = entry.getValue();
    // Only a response of 1 flips the sign of the prediction; 0 and -1 share a branch.
    double exponent = (response == 1) ? -score : score;
    double loglik = -Math.log1p(Math.exp(exponent)) * weight;
    RegressionTestLoglikOutput output = new RegressionTestLoglikOutput();
    output.key = k;
    output.testLoglik = (float) loglik;
    output.count = weight;
    collector.collect(new Pair<String, RegressionTestLoglikOutput>(k.toString(), output));
  }
}

Source File: ItemModelTestLoglik.java From ml-ease with Apache License 2.0

5 votes

/**
 * Builds the job configuration for the item-model test log-likelihood job and runs it.
 *
 * <p>The job is wired with the mapper, reducer and combiner classes of this job, a pair
 * schema of (string, {@code RegressionTestLoglikOutput}) and the
 * {@code RegressionTestLoglikOutput} schema, then handed to {@code AvroUtils.runAvroJob}.
 *
 * @throws Exception propagated from job configuration or execution
 */
@Override
public void run() throws Exception
{
  // The returned config is not read anywhere in this method.
  JobConfig props = super.getJobConfig();

  final Schema pairSchema =
      Pair.getPairSchema(Schema.create(Type.STRING), RegressionTestLoglikOutput.SCHEMA$);
  final JobConf conf = super.createJobConf(ItemModelTestLoglikMapper.class,
                                           ItemModelTestLoglikReducer.class,
                                           ItemModelTestLoglikCombiner.class,
                                           pairSchema,
                                           RegressionTestLoglikOutput.SCHEMA$);
  AvroUtils.runAvroJob(conf);
}

Source File: RegressionNaiveTrain.java From ml-ease with Apache License 2.0

5 votes

@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<String, RegressionPrepareOutput>> collector,
                Reporter reporter) throws IOException
{
  String key = data.key.toString();
  for (float lambda : _lambdaSet)
  {
    String newkey = String.valueOf(lambda) + "#" + key;
    data.key = newkey;
    Pair<String, RegressionPrepareOutput> outPair =
        new Pair<String, RegressionPrepareOutput>(newkey, data);
    collector.collect(outPair);
  }
}

Source File: RegressionTest.java From ml-ease with Apache License 2.0

5 votes

/**
 * Creates a job configuration whose output schema is the input schema plus a float
 * {@code pred} field.
 *
 * <p>The input schema is read from the base configuration with any union removed. Every
 * input field is copied (name, schema and doc, with a null default) into a new record schema
 * named {@code AdmmTestOutput}, and {@code pred} is appended last. That record schema becomes
 * the job output schema; the map output schema is a pair of (float, output record).
 *
 * @param mapperClass  mapper to register on the job
 * @param reducerClass reducer to register on the job
 * @return the configured job
 * @throws IllegalStateException if no input schema could be determined
 * @throws IOException declared in the signature; presumably raised by the helper calls
 * @throws URISyntaxException declared in the signature; presumably raised by the helper calls
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass) throws IOException, URISyntaxException
{
  final JobConf conf = createJobConf();
  final Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(conf));
  if (inputSchema == null)
  {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema);

  // Copy the input fields first so that "pred" ends up as the last output field.
  final List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field source : inputSchema.getFields())
  {
    outputFields.add(new Schema.Field(source.name(), source.schema(), source.doc(), null));
  }
  outputFields.add(new Schema.Field("pred", Schema.create(Type.FLOAT), "", null));

  final Schema outputSchema = Schema.createRecord("AdmmTestOutput",
                                                  "Test output for AdmmTest",
                                                  "com.linkedin.lab.regression.avro",
                                                  false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(conf, outputSchema);
  final Schema mapOutputSchema = Pair.getPairSchema(Schema.create(Type.FLOAT), outputSchema);
  AvroJob.setMapOutputSchema(conf, mapOutputSchema);
  AvroJob.setMapperClass(conf, mapperClass);
  AvroJob.setReducerClass(conf, reducerClass);
  return conf;
}

Source File: RegressionTest.java From ml-ease with Apache License 2.0

5 votes

/**
 * Scores one record with the selected model and emits a copy of it extended with the score.
 *
 * <p>When {@code _lambda} is non-negative the model stored under its string form is used;
 * otherwise the first model returned by the consumer's value collection is used. Every input
 * field is copied to the output record and logged at info level, then {@code pred} is added.
 * The prediction also serves as the key of the emitted pair.
 *
 * @param data      input record to score
 * @param collector receives the single (prediction, output record) pair
 * @param reporter  not used by this method
 * @throws IOException declared in the signature; nothing in this body throws it directly
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<Float, GenericData.Record>> collector,
                Reporter reporter) throws IOException
{
  // Otherwise lambda should be -1, and the consumer should hold only one model: the
  // best model found in train.
  final LinearModel model = (_lambda >= 0)
      ? _modelConsumer.get().get(String.valueOf(_lambda))
      : _modelConsumer.get().values().iterator().next();

  final float pred = (float) model.evalInstanceAvro(data, false, _ignoreValue);

  final GenericData.Record scored = new GenericData.Record(_outputSchema);
  for (Schema.Field field : data.getSchema().getFields())
  {
    final String fieldName = field.name();
    final Object fieldValue = data.get(fieldName);
    scored.put(fieldName, fieldValue);
    _logger.info(fieldName + ": " + fieldValue);
  }
  scored.put("pred", pred);

  collector.collect(new Pair<Float, GenericData.Record>(pred, scored));
}

Source File: ItemModelTest.java From ml-ease with Apache License 2.0

5 votes

/**
 * Creates a job configuration whose output schema is the input schema plus a float
 * {@code pred} field.
 *
 * <p>The input schema is read from the base configuration with any union removed. Every
 * input field is copied (name, schema and doc, with a null default) into a new record schema
 * named {@code PerItemTestOutput}, and {@code pred} is appended last. That record schema
 * becomes the job output schema; the map output schema is a pair of (string, input schema).
 *
 * @param mapperClass  mapper to register on the job
 * @param reducerClass reducer to register on the job
 * @return the configured job
 * @throws IllegalStateException if no input schema could be determined
 * @throws IOException declared in the signature; presumably raised by the helper calls
 * @throws URISyntaxException declared in the signature; presumably raised by the helper calls
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass) throws IOException, URISyntaxException
{
  final JobConf conf = createJobConf();
  final Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(conf));
  if (inputSchema == null)
  {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema);

  // Copy the input fields first so that "pred" ends up as the last output field.
  final List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field source : inputSchema.getFields())
  {
    outputFields.add(new Schema.Field(source.name(), source.schema(), source.doc(), null));
  }
  outputFields.add(new Schema.Field("pred", Schema.create(Type.FLOAT), "", null));

  final Schema outputSchema = Schema.createRecord("PerItemTestOutput",
                                                  "Test output for PerItemTest",
                                                  "com.linkedin.lab.regression.avro",
                                                  false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(conf, outputSchema);
  // The map stage emits the unmodified input records, keyed by a string.
  final Schema mapOutputSchema = Pair.getPairSchema(Schema.create(Type.STRING), inputSchema);
  AvroJob.setMapOutputSchema(conf, mapOutputSchema);
  AvroJob.setMapperClass(conf, mapperClass);
  AvroJob.setReducerClass(conf, reducerClass);
  return conf;
}

Source File: ItemModelTest.java From ml-ease with Apache License 2.0

5 votes

/**
 * Forwards the record unchanged, keyed by the string form of its {@code _itemKey} field.
 *
 * @param data      input record, emitted as the pair value
 * @param collector receives the single (item key, data) pair
 * @param reporter  not used by this method
 * @throws IOException if the record has no value for the {@code _itemKey} field
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<String, GenericData.Record>> collector,
                Reporter reporter) throws IOException
{
  // Look the field up once and reuse the value for both the check and the key.
  final Object itemValue = data.get(_itemKey);
  if (itemValue == null)
  {
    throw new IOException("data does not contain the column " + _itemKey);
  }
  String itemKey = itemValue.toString();
  collector.collect(new Pair<String, GenericData.Record>(itemKey, data));
}

Source File: MergeAvroMapper.java From aliyun-maxcompute-data-collectors with Apache License 2.0

5 votes

/**
 * Copies every field of an Avro record into the shared {@code sqoopRecordImpl} instance.
 *
 * <p>Each Avro field is matched against {@code sqoopRecordFields} by lower-cased name. The
 * Avro value is converted with {@code AvroUtil.fromAvro} using the recorded field type name
 * and assigned under the recorded field name. The same {@code sqoopRecordImpl} object is
 * returned on every call, so a later call overwrites the fields set by an earlier one.
 *
 * @param genericRecord Avro record to convert
 * @return {@code sqoopRecordImpl}, populated from {@code genericRecord}
 * @throws IOException if an Avro field has no counterpart in the user record class
 */
private SqoopRecord toSqoopRecord(GenericRecord genericRecord) throws IOException {
  Schema avroSchema = genericRecord.getSchema();
  for (Schema.Field field : avroSchema.getFields()) {
    Pair<String, String> sqoopRecordField = sqoopRecordFields.get(field.name().toLowerCase());
    if (null == sqoopRecordField) {
      throw new IOException("Cannot find field '" + field.name() + "' in fields of user class "
          + sqoopRecordImpl.getClass().getName() + ". Fields are: "
          + Arrays.deepToString(sqoopRecordFields.values().toArray()));
    }
    Object avroObject = genericRecord.get(field.name());
    // The pair holds (declared field name, field type name).
    Object fieldVal = AvroUtil.fromAvro(avroObject, field.schema(), sqoopRecordField.value());
    sqoopRecordImpl.setField(sqoopRecordField.key(), fieldVal);
  }
  return sqoopRecordImpl;
}

Source File: ItemModelTrain.java From ml-ease with Apache License 2.0

4 votes

/**
 * Configures and runs the per-key model training job, then optionally removes temporary data.
 *
 * <p>Models are written under {@code <OUTPUT_MODEL_PATH>/models}, which is stored in the job
 * config as {@code output.path} before the job configuration is created. The optional
 * intercept-prior-mean and lambda map files are registered as Avro cache files only when
 * their paths are non-empty. The remaining training options are copied from the job config
 * into the Hadoop configuration, using the defaults shown below where one is given. After the
 * job has run, {@code <OUTPUT_MODEL_PATH>/tmp-data} is deleted recursively unless
 * {@code REMOVE_TMP_DIR} is set to false.
 *
 * @throws Exception propagated from configuration, job execution or file-system access
 */
@Override
public void run() throws Exception
{
  final JobConfig props = super.getJobConfig();
  _logger.info("Start training per-key naive logistic regression model...");

  final String outBasePath = props.getString(OUTPUT_MODEL_PATH);
  props.put("output.path", outBasePath + "/models");

  final JobConf conf = createJobConf(ItemModelTrainMapper.class,
                                     ItemModelTrainReducer.class,
                                     Pair.getPairSchema(Schema.create(Type.STRING),
                                                        RegressionPrepareOutput.SCHEMA$),
                                     LinearModelWithVarAvro.SCHEMA$);

  // Optional side inputs: only registered when a path was supplied.
  final String priorMeanMapPath = props.getString(INTERCEPT_PRIOR_MEAN_MAP, "");
  if (!priorMeanMapPath.isEmpty())
  {
    AvroUtils.addAvroCacheFilesAndSetTheProperty(conf,
                                                 new Path(priorMeanMapPath),
                                                 INTERCEPT_PRIOR_MEAN_MAP);
  }
  final String lambdaMapPath = props.getString(LAMBDA_MAP, "");
  if (!lambdaMapPath.isEmpty())
  {
    AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(lambdaMapPath), LAMBDA_MAP);
  }

  // Training options copied from the job config into the Hadoop configuration.
  conf.setFloat(INTERCEPT_DEFAULT_PRIOR_MEAN,
                (float) props.getDouble(INTERCEPT_DEFAULT_PRIOR_MEAN, 0));
  conf.set(INTERCEPT_LAMBDAS, props.get(INTERCEPT_LAMBDAS));
  conf.set(DEFAULT_LAMBDAS, props.get(DEFAULT_LAMBDAS));
  conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000));
  conf.setFloat(LIBLINEAR_EPSILON, (float) props.getDouble(LIBLINEAR_EPSILON, 0.001f));
  conf.setBoolean(COMPUTE_VAR, props.getBoolean(COMPUTE_VAR, false));
  conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
  conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false));

  AvroUtils.runAvroJob(conf);

  // Clean up the temporary data directory unless the caller opted out.
  if (props.getBoolean(REMOVE_TMP_DIR, true))
  {
    FileSystem.get(conf).delete(new Path(outBasePath + "/tmp-data"), true);
  }
}

Source File: ItemModelTrain.java From ml-ease with Apache License 2.0

4 votes

/**
 * Stores one key/value pair in {@code _result}.
 *
 * <p>The object is cast to {@code Pair}; the string form of its key becomes the map key and
 * the string form of its value is parsed as a double.
 *
 * @param object the pair to record
 * @throws ClassCastException if {@code object} is not a {@code Pair}
 * @throws NumberFormatException if the pair's value does not parse as a double
 */
@Override
public void consume(Object object)
{
  final Pair entry = (Pair) object;
  final String name = entry.key().toString();
  final double parsed = Double.parseDouble(entry.value().toString());
  _result.put(name, parsed);
}

org.apache.avro.mapred.Pair Java Examples