org.apache.avro.mapred.Pair Java Examples
The following examples show how to use org.apache.avro.mapred.Pair.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: RegressionTestLoglik.java From ml-ease with Apache License 2.0 | 6 votes |
/**
 * Folds every partial log-likelihood record received for {@code key} into one output record.
 *
 * <p>The emitted record carries the summed {@code testLoglik} (narrowed to float) and the
 * summed {@code count}. Its own {@code key} field is the literal "averageTestLoglik"; note
 * that no division by the count happens in this method, so the value written is the sum.
 *
 * @param key       grouping key; reused as the key of the emitted pair
 * @param values    partial outputs to accumulate
 * @param collector sink that receives the single aggregated pair
 * @param reporter  unused here
 * @throws IOException declared by the reducer contract
 */
@Override
public void reduce(Utf8 key,
                   Iterable<RegressionTestLoglikOutput> values,
                   AvroCollector<Pair<Utf8, RegressionTestLoglikOutput>> collector,
                   Reporter reporter) throws IOException {
  double totalLoglik = 0;
  double totalCount = 0;
  for (RegressionTestLoglikOutput partial : values) {
    totalLoglik += partial.testLoglik;
    totalCount += partial.count;
  }

  RegressionTestLoglikOutput aggregated = new RegressionTestLoglikOutput();
  aggregated.key = "averageTestLoglik";
  aggregated.testLoglik = (float) totalLoglik;
  aggregated.count = totalCount;

  Pair<Utf8, RegressionTestLoglikOutput> result =
      new Pair<Utf8, RegressionTestLoglikOutput>(key, aggregated);
  collector.collect(result);
}
Example #2
Source File: RegressionAdmmTrain.java From ml-ease with Apache License 2.0 | 6 votes |
@Override public void map(RegressionPrepareOutput data, AvroCollector<Pair<Integer, RegressionPrepareOutput>> collector, Reporter reporter) throws IOException { Integer key = Integer.parseInt(data.key.toString()); for (int i = 0; i < _lambdaRhoConsumer.get().size(); i++) { int newkey = key * _lambdaRhoConsumer.get().size() + i; // String newkey = String.valueOf(lambda)+"#"+key; data.key = String.valueOf(newkey); Pair<Integer, RegressionPrepareOutput> outPair = new Pair<Integer, RegressionPrepareOutput>(newkey, data); collector.collect(outPair); } }
Example #3
Source File: ItemModelTestLoglik.java From ml-ease with Apache License 2.0 | 6 votes |
/**
 * Combines all log-likelihood records sharing {@code key} into a single record.
 *
 * <p>Both {@code testLoglik} and {@code count} are summed across the group. The resulting
 * record is tagged with the group key itself and emitted under that same key.
 *
 * @param key       grouping key; stored in the output record and used as the pair key
 * @param values    records to accumulate
 * @param collector sink that receives the single combined pair
 * @param reporter  unused here
 * @throws IOException declared by the reducer contract
 */
@Override
public void reduce(Utf8 key,
                   Iterable<RegressionTestLoglikOutput> values,
                   AvroCollector<Pair<Utf8, RegressionTestLoglikOutput>> collector,
                   Reporter reporter) throws IOException {
  double loglikTotal = 0;
  double countTotal = 0;
  for (RegressionTestLoglikOutput item : values) {
    loglikTotal += item.testLoglik;
    countTotal += item.count;
  }

  RegressionTestLoglikOutput combined = new RegressionTestLoglikOutput();
  combined.key = key;
  combined.testLoglik = (float) loglikTotal;
  combined.count = countTotal;

  Pair<Utf8, RegressionTestLoglikOutput> outPair =
      new Pair<Utf8, RegressionTestLoglikOutput>(key, combined);
  collector.collect(outPair);
}
Example #4
Source File: MergeAvroMapper.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 6 votes |
/**
 * Loads the user-supplied record class named in the job configuration and indexes its
 * declared fields.
 *
 * <p>The class name is read from {@code MergeJob.MERGE_SQOOP_RECORD_KEY}, loaded through the
 * current thread's context class loader, and instantiated into {@code sqoopRecordImpl}.
 * Each declared field is then registered in {@code sqoopRecordFields} under its lower-cased
 * name, mapped to a pair of (declared field name, field type name).
 *
 * @param context task context supplying the configuration
 * @throws IOException          if the configured class cannot be found
 * @throws InterruptedException propagated from the superclass setup
 */
@Override
protected void setup(Context context) throws InterruptedException, IOException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  final String userClassName = conf.get(MergeJob.MERGE_SQOOP_RECORD_KEY);
  try {
    final Class<?> clazz =
        Class.forName(userClassName, true, Thread.currentThread().getContextClassLoader());
    sqoopRecordImpl = (SqoopRecord) ReflectionUtils.newInstance(clazz, conf);
    for (final Field field : clazz.getDeclaredFields()) {
      final String fieldName = field.getName();
      final String fieldTypeName = field.getType().getName();
      // Keyed by lower-cased name; lookups elsewhere must lower-case the same way.
      sqoopRecordFields.put(fieldName.toLowerCase(),
          new Pair<String, String>(fieldName, fieldTypeName));
    }
  } catch (ClassNotFoundException e) {
    // Trailing space added so the class name is not fused onto the preceding word.
    throw new IOException(
        "Cannot find the user record class with class name " + userClassName, e);
  }
}
Example #5
Source File: PartitionIdAssigner.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Emits one {@code (lambda#key, 1)} pair for every lambda in {@code _lambdaSet}.
 *
 * <p>The composite key is the lambda's string form, a '#', and the record's original key.
 * The record's {@code key} field is also overwritten with each composite key, although the
 * record itself is not emitted by this method.
 *
 * @param data      input record; its {@code key} field is mutated
 * @param collector sink receiving one pair per lambda
 * @param reporter  unused here
 * @throws IOException declared by the mapper contract
 */
@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<String, Integer>> collector,
                Reporter reporter) throws IOException {
  final String originalKey = data.key.toString();
  for (float lambda : _lambdaSet) {
    final String compositeKey = lambda + "#" + originalKey;
    data.key = compositeKey;
    collector.collect(new Pair<String, Integer>(compositeKey, 1));
  }
}
Example #6
Source File: PartitionPreservingSchemas.java From datafu with Apache License 2.0 | 5 votes |
/**
 * Returns the map output schema, building and caching it on first use.
 *
 * <p>The schema is the Avro pair schema formed from the map output key schema and the map
 * output value schema.
 *
 * @return the cached pair schema
 */
public Schema getMapOutputSchema() {
  if (_mapOutputSchema == null) {
    Schema keySchema = getMapOutputKeySchema();
    Schema valueSchema = getMapOutputValueSchema();
    _mapOutputSchema = Pair.getPairSchema(keySchema, valueSchema);
  }
  return _mapOutputSchema;
}
Example #7
Source File: PartitionCollapsingSchemas.java From datafu with Apache License 2.0 | 5 votes |
/**
 * Lazily creates the schema used for map output and returns it.
 *
 * <p>On the first call the key and value schemas are combined into an Avro pair schema and
 * stored in {@code _mapOutputSchema}; later calls return the stored instance.
 *
 * @return the pair schema of (map output key schema, map output value schema)
 */
public Schema getMapOutputSchema() {
  if (_mapOutputSchema == null) {
    Schema mapKeySchema = getMapOutputKeySchema();
    Schema mapValueSchema = getMapOutputValueSchema();
    _mapOutputSchema = Pair.getPairSchema(mapKeySchema, mapValueSchema);
  }
  return _mapOutputSchema;
}
Example #8
Source File: ItemModelTrain.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Emits the record unchanged, keyed by the string form of its own {@code key} field.
 *
 * @param data      input record, forwarded as the pair value
 * @param collector sink receiving the single pair
 * @param reporter  unused here
 * @throws IOException declared by the mapper contract
 */
@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<String, RegressionPrepareOutput>> collector,
                Reporter reporter) throws IOException {
  collector.collect(
      new Pair<String, RegressionPrepareOutput>(data.key.toString(), data));
}
Example #9
Source File: RegressionTestLoglik.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Computes the weighted log-likelihood contribution of one scored record and emits it under
 * the fixed key "loglik".
 *
 * <p>The record must provide "response" (one of 1, 0, -1) and "pred"; "weight" is optional
 * and defaults to 1. For response 1 the contribution is {@code -log1p(exp(-pred)) * weight};
 * for 0 or -1 it is {@code -log1p(exp(pred)) * weight}. The weight is emitted as the count.
 *
 * @param data      scored input record
 * @param collector sink receiving the single pair
 * @param reporter  unused here
 * @throws IOException if the response is not 1, 0 or -1
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<String, RegressionTestLoglikOutput>> collector,
                Reporter reporter) throws IOException {
  final int response = Util.getIntAvro(data, "response");
  final double pred = Util.getDoubleAvro(data, "pred");
  final double weight = (data.get("weight") != null) ? Util.getDoubleAvro(data, "weight") : 1;

  final boolean validResponse = response == 1 || response == 0 || response == -1;
  if (!validResponse) {
    throw new IOException("response should be 1,0 or -1!");
  }

  // Positive responses use exp(-pred); every other accepted response uses exp(pred).
  final double exponent = (response == 1) ? -pred : pred;
  final double loglik = -Math.log1p(Math.exp(exponent)) * weight;

  RegressionTestLoglikOutput result = new RegressionTestLoglikOutput();
  result.key = "loglik";
  result.testLoglik = (float) loglik;
  result.count = weight;
  collector.collect(new Pair<String, RegressionTestLoglikOutput>("loglik", result));
}
Example #10
Source File: PartitionIdAssigner.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Emits exactly one {@code (key, 1)} pair per distinct key, ignoring the grouped values.
 *
 * @param key       grouping key, forwarded as the pair key
 * @param values    not read
 * @param collector sink receiving the single pair
 * @param reporter  unused here
 * @throws IOException declared by the reducer contract
 */
@Override
public void reduce(Utf8 key,
                   Iterable<Integer> values,
                   AvroCollector<Pair<String, Integer>> collector,
                   Reporter reporter) throws IOException {
  Pair<String, Integer> marker = new Pair<String, Integer>(key, 1);
  collector.collect(marker);
}
Example #11
Source File: PartitionIdAssigner.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Assigns the current partition id to {@code key}, then advances the id for the next key.
 *
 * <p>The grouped values are not read. The counter is only advanced after the pair has been
 * handed to the collector.
 *
 * @param key       grouping key, forwarded as the pair key
 * @param values    not read
 * @param collector sink receiving the single {@code (key, partitionId)} pair
 * @param reporter  unused here
 * @throws IOException declared by the reducer contract
 */
@Override
public void reduce(Utf8 key,
                   Iterable<Integer> values,
                   AvroCollector<Pair<String, Integer>> collector,
                   Reporter reporter) throws IOException {
  Pair<String, Integer> assignment = new Pair<String, Integer>(key, _partitionId);
  collector.collect(assignment);
  _partitionId++;
}
Example #12
Source File: ItemModelTestLoglik.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Emits one weighted log-likelihood record per entry of the record's "pred" map.
 *
 * <p>The record must provide "response" (one of 1, 0, -1) and a "pred" map; "weight" is
 * optional and defaults to 1. For each map entry, response 1 yields
 * {@code -log1p(exp(-p)) * weight} and any other accepted response yields
 * {@code -log1p(exp(p)) * weight}, where {@code p} is the entry's value. Each result is
 * emitted under the string form of the entry's key, with the weight as its count.
 *
 * @param data      scored input record
 * @param collector sink receiving one pair per prediction entry
 * @param reporter  unused here
 * @throws IOException if the response is not 1, 0 or -1
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<String, RegressionTestLoglikOutput>> collector,
                Reporter reporter) throws IOException {
  final int response = Util.getIntAvro(data, "response");
  @SuppressWarnings("unchecked")
  final Map<Utf8, Float> pred = (Map<Utf8, Float>) data.get("pred");
  final double weight = (data.get("weight") != null) ? Util.getDoubleAvro(data, "weight") : 1;

  final boolean validResponse = response == 1 || response == 0 || response == -1;
  if (!validResponse) {
    throw new IOException("response should be 1,0 or -1!");
  }

  for (Map.Entry<Utf8, Float> entry : pred.entrySet()) {
    final Utf8 itemKey = entry.getKey();
    final float p = entry.getValue();
    // Sign flip is done in float, then widened for Math.exp.
    final double exponent = (response == 1) ? -p : p;
    final double loglik = -Math.log1p(Math.exp(exponent)) * weight;

    RegressionTestLoglikOutput result = new RegressionTestLoglikOutput();
    result.key = itemKey;
    result.testLoglik = (float) loglik;
    result.count = weight;
    collector.collect(
        new Pair<String, RegressionTestLoglikOutput>(itemKey.toString(), result));
  }
}
Example #13
Source File: ItemModelTestLoglik.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Configures and runs the per-item test log-likelihood job.
 *
 * <p>The job uses the item-model mapper, reducer and combiner classes, a map output schema
 * of (string, {@code RegressionTestLoglikOutput}) pairs, and
 * {@code RegressionTestLoglikOutput} as the final output schema.
 *
 * @throws Exception propagated from job configuration or execution
 */
@Override
public void run() throws Exception {
  // Result is not used below; the call is retained as-is in case it has side effects.
  super.getJobConfig();

  Schema mapOutputSchema =
      Pair.getPairSchema(Schema.create(Type.STRING), RegressionTestLoglikOutput.SCHEMA$);
  JobConf jobConf = super.createJobConf(
      ItemModelTestLoglikMapper.class,
      ItemModelTestLoglikReducer.class,
      ItemModelTestLoglikCombiner.class,
      mapOutputSchema,
      RegressionTestLoglikOutput.SCHEMA$);
  AvroUtils.runAvroJob(jobConf);
}
Example #14
Source File: RegressionNaiveTrain.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Emits the record once per lambda in {@code _lambdaSet}, keyed by {@code lambda#key}.
 *
 * <p>Before each emit the record's {@code key} field is overwritten with the composite key
 * built from the lambda's string form, a '#', and the record's original key.
 *
 * @param data      input record; its {@code key} field is mutated on every iteration
 * @param collector sink receiving one pair per lambda
 * @param reporter  unused here
 * @throws IOException declared by the mapper contract
 */
@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<String, RegressionPrepareOutput>> collector,
                Reporter reporter) throws IOException {
  final String originalKey = data.key.toString();
  for (float lambda : _lambdaSet) {
    final String compositeKey = lambda + "#" + originalKey;
    data.key = compositeKey;
    collector.collect(new Pair<String, RegressionPrepareOutput>(compositeKey, data));
  }
}
Example #15
Source File: RegressionTest.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Builds the job configuration for the regression test job.
 *
 * <p>The output schema is a record named "AdmmTestOutput" holding a copy of every input
 * field followed by a float "pred" field. The map output schema is a pair of
 * (float, output record).
 *
 * @param mapperClass  mapper to register on the job
 * @param reducerClass reducer to register on the job
 * @return the configured job
 * @throws IllegalStateException if no input schema can be determined
 * @throws IOException           propagated from configuration helpers
 * @throws URISyntaxException    propagated from configuration helpers
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass)
    throws IOException, URISyntaxException {
  JobConf conf = createJobConf();

  Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(conf));
  if (inputSchema == null) {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema.toString());

  // Fields are re-created rather than reused; presumably because a Field instance cannot
  // be attached to a second record schema. TODO confirm against the Avro version in use.
  List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field source : inputSchema.getFields()) {
    Schema.Field copy = new Schema.Field(source.name(), source.schema(), source.doc(), null);
    outputFields.add(copy);
  }
  outputFields.add(new Schema.Field("pred", Schema.create(Type.FLOAT), "", null));

  Schema outputSchema = Schema.createRecord(
      "AdmmTestOutput", "Test output for AdmmTest", "com.linkedin.lab.regression.avro", false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(conf, outputSchema);
  AvroJob.setMapOutputSchema(
      conf, Pair.getPairSchema(Schema.create(Type.FLOAT), outputSchema));
  AvroJob.setMapperClass(conf, mapperClass);
  AvroJob.setReducerClass(conf, reducerClass);
  return conf;
}
Example #16
Source File: RegressionTest.java From ml-ease with Apache License 2.0 | 5 votes |
@Override public void map(GenericData.Record data, AvroCollector<Pair<Float, GenericData.Record>> collector, Reporter reporter) throws IOException { LinearModel model; if (_lambda >= 0) { model = _modelConsumer.get().get(String.valueOf(_lambda)); } else { // lambda should be -1 and it should include only 1 model which is the best-model // found in train Iterator<LinearModel> iter = _modelConsumer.get().values().iterator(); model = iter.next(); } float pred = (float) model.evalInstanceAvro(data, false, _ignoreValue); GenericData.Record output = new GenericData.Record(_outputSchema); List<Schema.Field> inputFields = data.getSchema().getFields(); for (Schema.Field field : inputFields) { output.put(field.name(), data.get(field.name())); _logger.info(field.name() + ": " + data.get(field.name())); } output.put("pred", pred); Pair<Float, GenericData.Record> outPair = new Pair<Float, GenericData.Record>(pred, output); collector.collect(outPair); }
Example #17
Source File: ItemModelTest.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Builds the job configuration for the per-item test job.
 *
 * <p>The output schema is a record named "PerItemTestOutput" holding a copy of every input
 * field followed by a float "pred" field. The map output schema is a pair of
 * (string, input record), i.e. the mapper forwards records in their input shape.
 *
 * @param mapperClass  mapper to register on the job
 * @param reducerClass reducer to register on the job
 * @return the configured job
 * @throws IllegalStateException if no input schema can be determined
 * @throws IOException           propagated from configuration helpers
 * @throws URISyntaxException    propagated from configuration helpers
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass)
    throws IOException, URISyntaxException {
  JobConf conf = createJobConf();

  Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(conf));
  if (inputSchema == null) {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema.toString());

  // Each input field is rebuilt as a fresh Field before being added to the output record.
  List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field original : inputSchema.getFields()) {
    Schema.Field duplicate =
        new Schema.Field(original.name(), original.schema(), original.doc(), null);
    outputFields.add(duplicate);
  }
  outputFields.add(new Schema.Field("pred", Schema.create(Type.FLOAT), "", null));

  Schema outputSchema = Schema.createRecord(
      "PerItemTestOutput", "Test output for PerItemTest", "com.linkedin.lab.regression.avro",
      false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(conf, outputSchema);
  AvroJob.setMapOutputSchema(
      conf, Pair.getPairSchema(Schema.create(Type.STRING), inputSchema));
  AvroJob.setMapperClass(conf, mapperClass);
  AvroJob.setReducerClass(conf, reducerClass);
  return conf;
}
Example #18
Source File: ItemModelTest.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Emits the record unchanged, keyed by the string form of its item-key column.
 *
 * @param data      input record; must contain a non-null value for {@code _itemKey}
 * @param collector sink receiving the single pair
 * @param reporter  unused here
 * @throws IOException if the record has no value for the item-key column
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<String, GenericData.Record>> collector,
                Reporter reporter) throws IOException {
  // Look the column up once and reuse the value for both the check and the key.
  final Object itemValue = data.get(_itemKey);
  if (itemValue == null) {
    // Trailing space added so the column name is not fused onto the word "column".
    throw new IOException("data does not contain the column " + _itemKey);
  }
  collector.collect(new Pair<String, GenericData.Record>(itemValue.toString(), data));
}
Example #19
Source File: MergeAvroMapper.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
/**
 * Copies every field of an Avro record into the shared {@code sqoopRecordImpl} instance.
 *
 * <p>Each Avro field is matched, by lower-cased name, against {@code sqoopRecordFields}.
 * The matched entry supplies the target field name (pair key) and the target type name
 * (pair value) used to convert the Avro value before it is set on the record.
 *
 * @param genericRecord Avro record to convert
 * @return {@code sqoopRecordImpl}, the same instance on every call, populated from the input
 * @throws IOException if an Avro field has no counterpart in the user record class
 */
private SqoopRecord toSqoopRecord(GenericRecord genericRecord) throws IOException {
  Schema avroSchema = genericRecord.getSchema();
  for (Schema.Field field : avroSchema.getFields()) {
    Pair<String, String> sqoopRecordField = sqoopRecordFields.get(field.name().toLowerCase());
    if (null == sqoopRecordField) {
      // Space added after "user class" so the class name is readable in the message.
      throw new IOException("Cannot find field '" + field.name()
          + "' in fields of user class " + sqoopRecordImpl.getClass().getName()
          + ". Fields are: " + Arrays.deepToString(sqoopRecordFields.values().toArray()));
    }
    Object avroObject = genericRecord.get(field.name());
    Object fieldVal = AvroUtil.fromAvro(avroObject, field.schema(), sqoopRecordField.value());
    sqoopRecordImpl.setField(sqoopRecordField.key(), fieldVal);
  }
  return sqoopRecordImpl;
}
Example #20
Source File: ItemModelTrain.java From ml-ease with Apache License 2.0 | 4 votes |
@Override public void run() throws Exception { JobConfig props = super.getJobConfig(); _logger.info("Start training per-key naive logistic regression model..."); String outBasePath = props.getString(OUTPUT_MODEL_PATH); String outpath = outBasePath + "/models"; props.put("output.path", outpath); JobConf conf = createJobConf(ItemModelTrainMapper.class, ItemModelTrainReducer.class, Pair.getPairSchema(Schema.create(Type.STRING), RegressionPrepareOutput.SCHEMA$), LinearModelWithVarAvro.SCHEMA$); // set up conf String interceptPriorMeanMap = props.getString(INTERCEPT_PRIOR_MEAN_MAP,""); if (!interceptPriorMeanMap.equals("")) { AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(interceptPriorMeanMap), INTERCEPT_PRIOR_MEAN_MAP); } String lambdaMap = props.getString(LAMBDA_MAP,""); if (!lambdaMap.equals("")) { AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(lambdaMap), LAMBDA_MAP); } conf.setFloat(INTERCEPT_DEFAULT_PRIOR_MEAN, (float)props.getDouble(INTERCEPT_DEFAULT_PRIOR_MEAN,0)); conf.set(INTERCEPT_LAMBDAS,props.get(INTERCEPT_LAMBDAS)); conf.set(DEFAULT_LAMBDAS,props.get(DEFAULT_LAMBDAS)); conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000)); conf.setFloat(LIBLINEAR_EPSILON, (float) props.getDouble(LIBLINEAR_EPSILON, 0.001f)); conf.setBoolean(COMPUTE_VAR, props.getBoolean(COMPUTE_VAR,false)); conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false)); conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false)); // run job AvroUtils.runAvroJob(conf); boolean removeTmpDir = props.getBoolean(REMOVE_TMP_DIR, true); if (removeTmpDir) { FileSystem fs = FileSystem.get(conf); fs.delete(new Path(outBasePath + "/tmp-data"), true); } }
Example #21
Source File: ItemModelTrain.java From ml-ease with Apache License 2.0 | 4 votes |
/**
 * Records one key/value pair into {@code _result}.
 *
 * <p>The pair's key is stored by its string form; the pair's value is converted to a string
 * and parsed as a double.
 *
 * @param object the incoming item; must be a {@code Pair}
 * @throws ClassCastException    if {@code object} is not a {@code Pair}
 * @throws NumberFormatException if the value's string form is not a parsable double
 */
@Override
public void consume(Object object) {
  // Wildcard type instead of the raw Pair: only toString() is needed on key and value.
  Pair<?, ?> record = (Pair<?, ?>) object;
  String mapKey = record.key().toString();
  double mapValue = Double.parseDouble(record.value().toString());
  _result.put(mapKey, mapValue);
}