org.elasticsearch.hadoop.mr.EsInputFormat Java Examples
The following examples show how to use
org.elasticsearch.hadoop.mr.EsInputFormat.
You can vote up the examples you find useful or vote down those you don't, and you can open the original project or source file by following the link above each example. Related API usage is listed on the sidebar.
Example #1
Source File: AbstractMRNewApiSearchTest.java From elasticsearch-hadoop with Apache License 2.0 | 6 votes |
private Configuration createConf() throws IOException { Configuration conf = HdpBootstrap.hadoopConfig(); HadoopCfgUtils.setGenericOptions(conf); Job job = new Job(conf); job.setInputFormatClass(EsInputFormat.class); job.setOutputFormatClass(PrintStreamOutputFormat.class); job.setOutputKeyClass(Text.class); boolean type = random.nextBoolean(); Class<?> mapType = (type ? MapWritable.class : LinkedMapWritable.class); job.setOutputValueClass(mapType); conf.set(ConfigurationOptions.ES_QUERY, query); conf.set(ConfigurationOptions.ES_READ_METADATA, String.valueOf(readMetadata)); conf.set(ConfigurationOptions.ES_OUTPUT_JSON, String.valueOf(readAsJson)); new QueryTestParams(tempFolder).provisionQueries(conf); job.setNumReduceTasks(0); //PrintStreamOutputFormat.stream(conf, Stream.OUT); Configuration cfg = job.getConfiguration(); HdpBootstrap.addProperties(cfg, TestSettings.TESTING_PROPS, false); return cfg; }
Example #2
Source File: AbstractExtraMRTests.java From elasticsearch-hadoop with Apache License 2.0 | 6 votes |
/**
 * Builds an old-API (mapred) {@link JobConf} for reading the gibberish test
 * data set back out of Elasticsearch as JSON via {@link EsInputFormat}.
 *
 * @return a fully provisioned, map-only job configuration
 * @throws IOException if the Hadoop bootstrap fails
 */
private JobConf createReadJobConf() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();
    conf.setInputFormat(EsInputFormat.class);
    conf.setOutputFormat(PrintStreamOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    boolean type = random.nextBoolean();
    Class<?> mapType = (type ? MapWritable.class : LinkedMapWritable.class);
    // FIX: use the randomly chosen value class. Previously this hard-coded
    // MapWritable.class, leaving mapType unused and never exercising
    // LinkedMapWritable — inconsistent with the sibling createConf()/
    // createJobConf() methods, which do use the random choice.
    conf.setOutputValueClass(mapType);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumReduceTasks(0); // map-only job
    conf.set(ConfigurationOptions.ES_READ_METADATA, String.valueOf(random.nextBoolean()));
    conf.set(ConfigurationOptions.ES_READ_METADATA_VERSION, String.valueOf(true));
    conf.set(ConfigurationOptions.ES_OUTPUT_JSON, "true");
    FileInputFormat.setInputPaths(conf, new Path(MRSuite.testData.gibberishDat(conf)));
    return conf;
}
Example #3
Source File: AbstractMROldApiSearchTest.java From elasticsearch-hadoop with Apache License 2.0 | 6 votes |
/**
 * Assembles the old-API (mapred) {@link JobConf} used by the search tests:
 * reads from Elasticsearch with {@link EsInputFormat} and dumps results
 * through {@link PrintStreamOutputFormat}.
 *
 * @return the fully provisioned job configuration
 * @throws IOException if the Hadoop bootstrap fails
 */
private JobConf createJobConf() throws IOException {
    JobConf jobConf = HdpBootstrap.hadoopConfig();
    jobConf.setInputFormat(EsInputFormat.class);
    jobConf.setOutputFormat(PrintStreamOutputFormat.class);
    jobConf.setOutputKeyClass(Text.class);

    // Randomly exercise both supported writable-map value types.
    boolean pickPlainMap = random.nextBoolean();
    Class<?> valueClass = pickPlainMap ? MapWritable.class : LinkedMapWritable.class;
    jobConf.setOutputValueClass(valueClass);

    HadoopCfgUtils.setGenericOptions(jobConf);
    jobConf.set(ConfigurationOptions.ES_QUERY, query);
    jobConf.setNumReduceTasks(0); // map-only job
    jobConf.set(ConfigurationOptions.ES_READ_METADATA, String.valueOf(readMetadata));
    jobConf.set(ConfigurationOptions.ES_READ_METADATA_VERSION, String.valueOf(true));
    jobConf.set(ConfigurationOptions.ES_OUTPUT_JSON, String.valueOf(readAsJson));
    new QueryTestParams(tempFolder).provisionQueries(jobConf);
    FileInputFormat.setInputPaths(jobConf, new Path(MRSuite.testData.sampleArtistsDatUri()));
    HdpBootstrap.addProperties(jobConf, TestSettings.TESTING_PROPS, false);
    return jobConf;
}
Example #4
Source File: ComputeResponse.java From incubator-retired-pirk with Apache License 2.0 | 5 votes |
/** * Method to read in the data from elasticsearch, filter, and return a RDD of MapWritable data elements */ @SuppressWarnings("unchecked") public JavaRDD<MapWritable> readDataES() throws IOException, PIRException { logger.info("Reading data "); JavaRDD<MapWritable> jsonRDD; Job job = Job.getInstance(); String jobName = "pirSpark_ES_" + esQuery + "_" + System.currentTimeMillis(); job.setJobName(jobName); job.getConfiguration().set("es.nodes", SystemConfiguration.getProperty("es.nodes")); job.getConfiguration().set("es.port", SystemConfiguration.getProperty("es.port")); job.getConfiguration().set("es.resource", esResource); job.getConfiguration().set("es.query", esQuery); jsonRDD = sc.newAPIHadoopRDD(job.getConfiguration(), EsInputFormat.class, Text.class, MapWritable.class).values().coalesce(numDataPartitions); // Filter out by the provided stopListFile entries if (qSchema.getFilter() != null) { return jsonRDD.filter(new FilterData(accum, bVars)); } else { logger.info("qSchema.getFilter() is null"); return jsonRDD; } }
Example #5
Source File: HadoopFormatIOElasticTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Builds the Hadoop {@link Configuration} pointing at the in-memory
 * Elasticsearch node. Registers the InputFormat class, key class and value
 * class, plus the fields mandatory for ESInputFormat: es.resource, es.nodes,
 * es.port and es.internal.es.version. See <a
 * href="https://www.elastic.co/guide/en/elasticsearch/hadoop/current/configuration.html"
 * >Elasticsearch Configuration</a> for the full option list.
 */
private Configuration getConfiguration() {
    Configuration config = new Configuration();

    // Connection and target-resource settings.
    config.set(ConfigurationOptions.ES_NODES, ELASTIC_IN_MEM_HOSTNAME);
    config.set(ConfigurationOptions.ES_PORT, String.format("%s", port));
    config.set(ConfigurationOptions.ES_RESOURCE, ELASTIC_RESOURCE);
    config.set("es.internal.es.version", ELASTIC_INTERNAL_VERSION);
    config.set(ConfigurationOptions.ES_NODES_DISCOVERY, TRUE);
    config.set(ConfigurationOptions.ES_INDEX_AUTO_CREATE, TRUE);

    // Hadoop input wiring: format plus key/value classes.
    config.setClass("mapreduce.job.inputformat.class", EsInputFormat.class, InputFormat.class);
    config.setClass("key.class", Text.class, Object.class);
    config.setClass("value.class", LinkedMapWritable.class, Object.class);
    return config;
}
Example #6
Source File: ESEntityExtractor.java From deep-spark with Apache License 2.0 | 5 votes |
/**
 * Creates an extractor for the given entity type, wiring up the deep job
 * configuration together with the Elasticsearch input/output formats.
 *
 * @param entityClass the entity class handled by this extractor
 */
public ESEntityExtractor(Class<T> entityClass) {
    super();
    this.deepJobConfig = new ESDeepJobConfig(entityClass);
    this.inputFormat = new EsInputFormat<>();
    this.outputFormat = new EsOutputFormat();
}
Example #7
Source File: ReadFromES.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf(), "ReadFromES"); // DO NOT SET JAR BY CLASS HERE // // job.setJarByClass(getClass()); EsMapReduceUtil.initCredentials(job); job.getConfiguration().set("es.output.json", "true"); job.setInputFormatClass(EsInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); TextOutputFormat.setOutputPath(job, new Path(args[0])); job.setMapperClass(MapperImpl.class); // Secure Hadoop CANNOT perform shuffle phases without native libraries job.setNumReduceTasks(0); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); if (!job.waitForCompletion(true)) { return 1; } return 0; }
Example #8
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 4 votes |
/**
 * Reads the input data from Elasticsearch into a {@link JavaDStream} of
 * {@link MapWritable} records and applies the stoplist filter when one is
 * configured.
 *
 * <p>Two ingestion modes are supported: when {@code useQueueStream} is set,
 * the whole ES result is pulled once into a queue-backed stream; otherwise
 * the stream is fed from {@code inputData} via {@code fileStream}.
 *
 * @return the (optionally filtered) stream of records
 * @throws IOException if the Hadoop job cannot be created
 */
@SuppressWarnings("unchecked")
public JavaDStream<MapWritable> readDataES() throws IOException
{
  logger.info("Reading data ");

  Job job = Job.getInstance();
  String jobName = "pirSpark_ES_" + esQuery + "_" + System.currentTimeMillis();
  job.setJobName(jobName);
  // ES-Hadoop connector settings: cluster location plus the index/query to pull.
  job.getConfiguration().set("es.nodes", SystemConfiguration.getProperty("es.nodes"));
  job.getConfiguration().set("es.port", SystemConfiguration.getProperty("es.port"));
  job.getConfiguration().set("es.resource", esResource);
  job.getConfiguration().set("es.query", esQuery);

  // Read data from hdfs
  JavaDStream<MapWritable> mwStream;
  if (useQueueStream)
  {
    // One-shot mode: load the full ES result set into a single RDD and wrap
    // it in a queue-backed stream.
    Queue<JavaRDD<MapWritable>> rddQueue = new LinkedList<>();
    JavaRDD<MapWritable> rddIn = jssc.sparkContext().newAPIHadoopRDD(job.getConfiguration(), EsInputFormat.class, Text.class, MapWritable.class).values()
        .coalesce(numDataPartitions);
    rddQueue.add(rddIn);
    mwStream = jssc.queueStream(rddQueue);
  }
  else
  {
    // Streaming mode: watch inputData for new files and keep only the values
    // side of each (Text, MapWritable) pair.
    JavaPairInputDStream<Text,MapWritable> inputRDD = jssc.fileStream(inputData, Text.class, MapWritable.class, EsInputFormat.class);
    mwStream = inputRDD.transform(new Function<JavaPairRDD<Text,MapWritable>,JavaRDD<MapWritable>>()
    {
      private static final long serialVersionUID = 1L;

      @Override
      public JavaRDD<MapWritable> call(JavaPairRDD<Text,MapWritable> pair) throws Exception
      {
        return pair.values();
      }
    }).repartition(numDataPartitions);
  }

  // Filter out by the provided stopListFile entries
  if (qSchema.getFilter() != null)
  {
    return mwStream.filter(new FilterData(accum, bVars));
  }
  else
  {
    return mwStream;
  }
}
Example #9
Source File: ESCellExtractor.java From deep-spark with Apache License 2.0 | 4 votes |
/**
 * Creates a cell-oriented extractor, wiring up the deep job configuration
 * together with the Elasticsearch input/output formats.
 *
 * @param cellsClass the {@code Cells} class handled by this extractor
 */
public ESCellExtractor(Class<Cells> cellsClass) {
    super();
    this.deepJobConfig = new ESDeepJobConfig(cellsClass);
    this.inputFormat = new EsInputFormat<>();
    this.outputFormat = new EsOutputFormat();
}