Java Code Examples for org.apache.spark.api.java.JavaRDD#saveAsTextFile()

The following examples show how to use org.apache.spark.api.java.JavaRDD#saveAsTextFile(). Each example is taken from an open-source project; the source file, project, and license are listed above each snippet.
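Before the project examples, here is a minimal, self-contained sketch of the basic call, assuming local mode; the class name and the /tmp output path are placeholders chosen for illustration. saveAsTextFile() writes each element of the RDD as one line of text into an output directory (one part file per partition), and the directory must not already exist. The project examples below apply the same call to HDFS, object storage, and test fixtures.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SaveAsTextFileSketch {
    public static void main(String[] args) {
        // Local master and the /tmp output path are illustrative placeholders.
        SparkConf conf = new SparkConf().setAppName("saveAsTextFile sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("alpha", "beta", "gamma"));
            // Each element becomes one line of text; Spark writes one part-NNNNN file per partition.
            lines.saveAsTextFile("file:///tmp/saveAsTextFile-sketch");
        }
    }
}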
Example 1
Source File: Readonly.java    From flink-perf with Apache License 2.0
public static void main(String[] args) {
	String master = args[0];
	String inFile = args[1];
	System.err.println("Starting spark with master="+master+" in="+inFile);
	
	SparkConf conf = new SparkConf().setAppName("Read only job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
	JavaSparkContext sc = new JavaSparkContext(conf);

	JavaRDD<String> file = sc.textFile(inFile);
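	// The filter below deliberately rejects every record, so this job only measures read performance;
	// the saved output contains no data.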
	JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
		private static final long serialVersionUID = 1L;

		@Override
		public Boolean call(String arg0) throws Exception {
			return false;
		}
	});
	res.saveAsTextFile("file:///tmp/out");
}
 
Example 2
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test4(SparkSession spark, String outText1) throws Exception {
  try {
    System.out.println("*********************************");
    System.out.println("T4: Create collection and store it as text file in " + outText1);
    List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
    JavaRDD<Integer> distData = new JavaSparkContext(spark.sparkContext()).parallelize(data);
    distData.saveAsTextFile(outText1);
    JavaRDD<String> txtRes = spark.read().textFile(outText1).javaRDD();
    long baseCount = txtRes.count();
    countAndCompare(baseCount, distData.count(), "T4", baseCount);
  } catch (Exception e) {
    throw e;
  } finally {
    deleteData(outText1, spark.sparkContext().hadoopConfiguration(), true);
  }

}
 
Example 3
Source File: BaseTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected String export(JavaRDD<DataSet> trainingData) {
    String baseDir = getBaseDirForRDD(trainingData);
    String dataDir = baseDir + "data/";
    String pathsDir = baseDir + "paths/";

    log.info("Initiating RDD<DataSet> export at {}", baseDir);
    JavaRDD<String> paths = trainingData
                    .mapPartitionsWithIndex(new BatchAndExportDataSetsFunction(batchSizePerWorker, dataDir), true);
    paths.saveAsTextFile(pathsDir);
    log.info("RDD<DataSet> export complete at {}", baseDir);

    lastExportedRDDId = trainingData.id();
    lastRDDExportPath = baseDir;
    return baseDir;
}
 
Example 4
Source File: HdfsModelExporter.java    From deeplearning4j with Apache License 2.0
@Override
public void export(JavaRDD<ExportContainer<T>> rdd) {
    if (codec == null)
        rdd.saveAsTextFile(path);
    else
        rdd.saveAsTextFile(path, codec.getClass());
}
 
Example 5
Source File: SparkExport.java    From deeplearning4j with Apache License 2.0
public static void exportCSVSpark(String directory, String delimiter, String quote, int outputSplits,
                JavaRDD<List<Writable>> data) {

    //NOTE: Order is probably not random here...
    JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
    //coalesce returns a new RDD; it must be reassigned, otherwise the requested number of output splits is ignored
    lines = lines.coalesce(outputSplits);

    lines.saveAsTextFile(directory);
}
 
Example 6
Source File: Grep.java    From flink-perf with Apache License 2.0
public static void main(String[] args) {
	String master = args[0];
	String inFile = args[1];
	String outFile = args[2];

	String patterns[] = new String[args.length-3];
	System.arraycopy(args,3,patterns,0,args.length-3);
	System.err.println("Starting spark with master="+master+" in="+inFile);
	System.err.println("Using patterns: "+ Arrays.toString(patterns));

	SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
	JavaSparkContext sc = new JavaSparkContext(conf);

	JavaRDD<String> file = sc.textFile(inFile);
	for(int p = 0; p < patterns.length; p++) {
		final String pattern = patterns[p];
		JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
			private static final long serialVersionUID = 1L;
			Pattern p = Pattern.compile(pattern);

			@Override
			public Boolean call(String value) throws Exception {
				if (value == null || value.length() == 0) {
					return false;
				}
				final Matcher m = p.matcher(value);
				if (m.find()) {
					return true;
				}
				return false;
			}
		});
		res.saveAsTextFile(outFile+"_"+pattern);
	}
}
 
Example 7
Source File: PileupSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void processAlignments(JavaRDD<LocusWalkerContext> rdd, JavaSparkContext ctx) {
    JavaRDD<String> lines = rdd.map(pileupFunction(metadata, outputInsertLength, showVerbose));
    if (numReducers != 0) {
        lines = lines.coalesce(numReducers);
    }
    lines.saveAsTextFile(outputFile);
}
 
Example 8
Source File: CopybookSparkExample.java    From CopybookInputFormat with Apache License 2.0
public static void main(String[] args) {
	if (args.length < 4) {
		System.out
				.println("CopybookSparkExample {master} {copybookInputPath} {dataFileInputPath} {outputFolder}");
		return;
	}

	String master = args[0];
	String copybookInputPath = args[1];
	String dataFileInputPath = args[2];
	String outputPath = args[3];

	JavaSparkContext jsc = new JavaSparkContext(master,
			"UniqueSeqGenerator", null, "SparkCopybookExample.jar");

	Configuration config = new Configuration();
	config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
	config.addResource(new Path("/etc/hadoop/conf/mapred-site.xml"));
	config.addResource(new Path("/etc/hadoop/conf/yarn-site.xml"));
	config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
	CopybookInputFormat.setCopybookHdfsPath(config, copybookInputPath);
	
	JavaPairRDD<LongWritable, Text> rdd = jsc.newAPIHadoopFile(dataFileInputPath, CopybookInputFormat.class, LongWritable.class, Text.class, config);
	JavaRDD<String> pipeDelimiter = rdd.map(new MapFunction());

	pipeDelimiter.saveAsTextFile(outputPath);
}
 
Example 9
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test8(SparkSession spark, String outText1, boolean isTimeOutTest) throws Exception {
  try {
    System.out.println("*********************************");
    System.out.println("T8: Timeout retry test. Please wait with patience");
    System.out.println("T8:Create collection and store it as text file in " + outText1);
    List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
    JavaRDD<Integer> distData = new JavaSparkContext(spark.sparkContext()).parallelize(data);
    distData.saveAsTextFile(outText1);
    JavaRDD<String> txtRes = spark.read().textFile(outText1).javaRDD();
    long baseCount = txtRes.count();
    countAndCompare(baseCount, distData.count(), "T8", baseCount);
    if (isTimeOutTest) {
      System.out.println("T8: Sleep for 10 minutes ");
      Thread.sleep(10 * 60 * 1000);
      System.out.println("T8: About to wake up");
      System.out.println("T8: Re-define data source");
    }
    txtRes = spark.read().textFile(outText1).javaRDD();
    baseCount = txtRes.count();
    countAndCompare(baseCount, distData.count(), "T8", baseCount);
    System.out.println("T8: Sleep for 10 minutes ");

  } catch (Exception e) {
    throw e;
  } finally {
    deleteData(outText1, spark.sparkContext().hadoopConfiguration(), true);
  }

}
 
Example 10
Source File: BaseTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected String exportMDS(JavaRDD<MultiDataSet> trainingData) {
    String baseDir = getBaseDirForRDD(trainingData);
    String dataDir = baseDir + "data/";
    String pathsDir = baseDir + "paths/";

    log.info("Initiating RDD<MultiDataSet> export at {}", baseDir);
    JavaRDD<String> paths = trainingData.mapPartitionsWithIndex(
                    new BatchAndExportMultiDataSetsFunction(batchSizePerWorker, dataDir), true);
    paths.saveAsTextFile(pathsDir);
    log.info("RDD<MultiDataSet> export complete at {}", baseDir);

    lastExportedRDDId = trainingData.id();
    lastRDDExportPath = baseDir;
    return baseDir;
}
 
Example 11
Source File: SparkExport.java    From DataVec with Apache License 2.0
public static void exportCSVSpark(String directory, String delimiter, String quote, int outputSplits,
                JavaRDD<List<Writable>> data) {

    //NOTE: Order is probably not random here...
    JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
    //coalesce returns a new RDD; it must be reassigned, otherwise the requested number of output splits is ignored
    lines = lines.coalesce(outputSplits);

    lines.saveAsTextFile(directory);
}
 
Example 12
Source File: ParallelValidator.java    From metadata-qa-marc with GNU General Public License v3.0
public static void main(String[] args) throws ParseException {

    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);

    logger.info("Input file is " + params.getDetailsFileName());
    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);

    System.err.println(validator.getParameters().formatParameters());

    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

    JavaRDD<String> baseCountsRDD = inputFile
      .flatMap(content -> {
        MarcReader reader = ReadMarc.getMarcStringReader(content);
        Record marc4jRecord = reader.next();
        MarcRecord marcRecord = MarcFactory.createFromMarc4j(
          marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
        validator.processRecord(marcRecord, 1);
        return ValidationErrorFormatter
          .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
          .iterator();
      }
    );
    baseCountsRDD.saveAsTextFile(validator.getParameters().getDetailsFileName());
  }
 
Example 13
Source File: SplitFasta.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption( new Option( "partitions", true, "Divide or merge to n partitions" ) );
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        // parse the command line arguments
        cmd = parser.parse( options, args );

    }
    catch( ParseException exp ) {
        // oops, something went wrong
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    }

    String out = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
    String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
    String partitions = (cmd.hasOption("partitions")==true)? cmd.getOptionValue("partitions"):null;

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd = sc.textFile(in);
    JavaRDD<String> crdd = rdd.map(v->">"+v.trim()).repartition(Integer.valueOf(partitions));

    crdd.saveAsTextFile(out);
    sc.stop();
}
 
Example 14
Source File: MultiReturnParameterizedBuiltinSPInstruction.java    From systemds with Apache License 2.0
@Override 
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;
	
	try
	{
		//get input RDD and meta data
		FrameObject fo = sec.getFrameObject(input1.getName());
		FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
		JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>)
			sec.getRDDHandleForFrameObject(fo, FileFormat.BINARY);
		String spec = ec.getScalarInput(input2).getStringValue();
		DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
		DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
		String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
			in.lookup(1L).get(0).getColumnNames() : null; 
		
		//step 1: build transform meta data
		Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), null);
		
		MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext()); 
		JavaRDD<String> rcMaps = in
			.mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild))
			.distinct().groupByKey()
			.flatMap(new TransformEncodeGroupFunction(accMax));
		if( containsMVImputeEncoder(encoderBuild) ) {
			EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
			rcMaps = rcMaps.union(
				in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva))
				  .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) );
		}
		rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval
		
		//consolidate meta data frame (reuse multi-threaded reader, special handling missing values) 
		FrameReader reader = FrameReaderFactory.createFrameReader(FileFormat.TEXT);
		FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
		meta.recomputeColumnCardinality(); //recompute num distinct items per column
		meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames());
		
		//step 2: transform apply (similar to spark transformapply)
		//compute omit offset map for block shifts
		TfOffsetMap omap = null;
		if( TfMetaUtils.containsOmitSpec(spec, colnames) ) {
			omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(
				new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
		}
		
		//create encoder broadcast (avoiding replication per task) 
		Encoder encoder = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), meta);
		mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); 
		Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
		Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null;
		
		//execute transform apply
		JavaPairRDD<Long,FrameBlock> tmp = in
			.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
		JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils
			.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
		
		//set output and maintain lineage/output characteristics
		sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
		sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
		sec.setFrameOutput(_outputs.get(1).getName(), meta);
	}
	catch(IOException ex) {
		throw new RuntimeException(ex);
	}
}
 
Example 15
Source File: RenameContigsUniq.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption(  new Option( "partitions", true,"Divide or merge to n partitions" ) );
    options.addOption(new Option( "fa", true, "Include only files with extension given " ));
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse( options, args );
    }
    catch( ParseException exp ) {
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    }

    String out = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
    String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
    String fastaonly = (cmd.hasOption("fa")==true)? cmd.getOptionValue("fa"):null;
    String partitions = (cmd.hasOption("partitions")==true)? cmd.getOptionValue("partitions"):null;

    SparkConf conf = new SparkConf().setAppName("RenameContigsUniq");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd;
    if(fastaonly!=null)
        rdd = sc.textFile(in+"/*."+fastaonly);
    else
        rdd = sc.textFile(in); //take whole directory as input

    JavaRDD<String> crdd = rdd.filter(f -> f.trim().split("\n")[0].length()!=0).map(fasta->{

        String[] fseq = fasta.trim().split("\n");
        String id = fseq[0].split(" ")[0];

        //Give unique id for sequence
        String seq_id = id+"_"+UUID.randomUUID().toString();
        String seq = Arrays.toString(Arrays.copyOfRange(fseq, 1, fseq.length)).replace(", ","").replace("[","").replace("]","");

        return ">"+seq_id+"\n"+seq;
    });

    if(partitions!=null)
        crdd.repartition(Integer.valueOf(partitions)).saveAsTextFile(out);
    else
        crdd.saveAsTextFile(out);

    sc.stop();
}
 
Example 16
Source File: GrepCaching.java    From flink-perf with Apache License 2.0
public static void main(String[] args) {
	String master = args[0];
	String inFile = args[1];
	String outFile = args[2];
	String storageLevel = args[3];

	String patterns[] = new String[args.length-4];
	System.arraycopy(args, 4, patterns, 0, args.length - 4);
	System.err.println("Starting spark with master="+master+" in="+inFile);
	System.err.println("Using patterns: "+ Arrays.toString(patterns));

	SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
	JavaSparkContext sc = new JavaSparkContext(conf);

	StorageLevel sl;
	switch(storageLevel) {
		case "MEMORY_ONLY":
			sl = StorageLevel.MEMORY_ONLY(); break;
		case "MEMORY_AND_DISK":
			sl = StorageLevel.MEMORY_AND_DISK(); break;
		case "MEMORY_ONLY_SER":
			sl = StorageLevel.MEMORY_ONLY_SER(); break;
		case "MEMORY_AND_DISK_SER":
			sl = StorageLevel.MEMORY_AND_DISK_SER(); break;
		case "NONE":
			sl = StorageLevel.NONE(); break;
		default:
			throw new RuntimeException("Unknown storage level "+storageLevel);
	}

	JavaRDD<String> file = sc.textFile(inFile).persist(sl);
	for(int p = 0; p < patterns.length; p++) {
		final String pattern = patterns[p];
		JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
			private static final long serialVersionUID = 1L;
			Pattern p = Pattern.compile(pattern);

			@Override
			public Boolean call(String value) throws Exception {
				if (value == null || value.length() == 0) {
					return false;
				}
				final Matcher m = p.matcher(value);
				if (m.find()) {
					return true;
				}
				return false;
			}
		});
		res.saveAsTextFile(outFile+"_"+pattern);
	}
}
 
Example 17
Source File: ActionRDDTest.java    From hui-bigdata-spark with Apache License 2.0
@Test
public void saveAsTxtFile() throws Exception{
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    stringJavaRDD.saveAsTextFile(OUTPUT_TXT_PATH);
}
 
Example 18
Source File: MultiReturnParameterizedBuiltinSPInstruction.java    From systemds with Apache License 2.0
@Override 
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;
	
	try
	{
		//get input RDD and meta data
		FrameObject fo = sec.getFrameObject(input1.getName());
		FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName());
		JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>)
			sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
		String spec = ec.getScalarInput(input2).getStringValue();
		DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
		DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
		String[] colnames = !TfMetaUtils.isIDSpec(spec) ?
			in.lookup(1L).get(0).getColumnNames() : null; 
		
		//step 1: build transform meta data
		Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), null);
		
		MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext()); 
		JavaRDD<String> rcMaps = in
			.mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild))
			.distinct().groupByKey()
			.flatMap(new TransformEncodeGroupFunction(accMax));
		if( containsMVImputeEncoder(encoderBuild) ) {
			EncoderMVImpute mva = getMVImputeEncoder(encoderBuild);
			rcMaps = rcMaps.union(
				in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva))
				  .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) );
		}
		rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval
		
		//consolidate meta data frame (reuse multi-threaded reader, special handling missing values) 
		FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo);
		FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns());
		meta.recomputeColumnCardinality(); //recompute num distinct items per column
		meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames());
		
		//step 2: transform apply (similar to spark transformapply)
		//compute omit offset map for block shifts
		TfOffsetMap omap = null;
		if( TfMetaUtils.containsOmitSpec(spec, colnames) ) {
			omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(
				new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
		}
		
		//create encoder broadcast (avoiding replication per task) 
		Encoder encoder = EncoderFactory.createEncoder(spec, colnames,
			fo.getSchema(), (int)fo.getNumColumns(), meta);
		mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); 
		Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
		Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null;
		
		//execute transform apply
		JavaPairRDD<Long,FrameBlock> tmp = in
			.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
		JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils
			.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
		
		//set output and maintain lineage/output characteristics
		sec.setRDDHandleForVariable(_outputs.get(0).getName(), out);
		sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
		sec.setFrameOutput(_outputs.get(1).getName(), meta);
	}
	catch(IOException ex) {
		throw new RuntimeException(ex);
	}
}
 
Example 19
Source File: Data2CoNLL.java    From ambiverse-nlu with Apache License 2.0
@Override
protected int run() throws Exception {

  SparkConf sparkConf = new SparkConf()
      .setAppName("Data2CoNLL")
      .set("spark.hadoop.validateOutputSpecs", "false")
      .set("spark.yarn.executor.memoryOverhead", "3072")
      .set("spark.rdd.compress", "true")
      .set("spark.core.connection.ack.wait.timeout", "600")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      //.set("spark.kryo.registrationRequired", "true")
      .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
          InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
          VectorIndexer.class})
      ;//.setMaster("local[4]"); //Remove this if you run it on the server.


  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
      * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

  FileSystem fs = FileSystem.get(new Configuration());

  int partitionNumber = 3 * totalCores;
  if(partitions != null) {
    partitionNumber = partitions;
  }

  //Read training documents serialized as SCAS
  JavaRDD<SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber).values();

  JavaRDD<String> docStrings = documents.map( s -> {
    JCas jCas = s.getJCas();
    NYTArticleMetaData metadata = JCasUtil.selectSingle(jCas, NYTArticleMetaData.class);

    StringJoiner docBuilder = new StringJoiner("\n");

    docBuilder.add("-DOCSTART- (" +  metadata.getGuid() + ")");
    docBuilder.add("");

    Collection<Sentence> sentences = JCasUtil.select(jCas, Sentence.class);
    for(Sentence sentence: sentences) {
      List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
      for(Token token: tokens) {
        CoreLabel taggedWord = CoreNlpUtils.tokenToWord(token);
        StringJoiner lineBuilder = new StringJoiner("\t");
        lineBuilder.add(taggedWord.word().toLowerCase());
        docBuilder.add(lineBuilder.toString());
      }
      docBuilder.add("");
    }
    return docBuilder.toString();
  });

  docStrings.saveAsTextFile(output);
  sc.stop();
  return 0;
}
 
Example 20
Source File: SaprkFile.java    From sparkResearch with Apache License 2.0
public static void jsonFile(JavaSparkContext sparkContext) {
    //read and write a JSON file
    JavaRDD<String> rdd = sparkContext.textFile("url");
    JavaRDD<Person> result = rdd.mapPartitions(new ParseJson()).filter(new filterData());
    result.saveAsTextFile("url");
}