Java Code Examples for org.apache.spark.api.java.JavaRDD#saveAsTextFile()
The following examples show how to use org.apache.spark.api.java.JavaRDD#saveAsTextFile().
Each example is drawn from an open-source project; the header above each example names the original project, source file, and license, and the vote count reflects how useful readers found it.
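For orientation, here is a minimal, self-contained sketch of the call itself; the application name, sample data, and output path are illustrative and not taken from any of the projects below. saveAsTextFile() writes one part file per partition into the target directory, which must not already exist.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SaveAsTextFileSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("saveAsTextFile sketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Build a small RDD and write it out as plain text.
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("first line", "second line", "third line"));
        lines.saveAsTextFile("/tmp/save-as-text-file-demo"); // one part-NNNNN file per partition

        sc.stop();
    }
}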
Example 1
Source File: Readonly.java From flink-perf with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
    String master = args[0];
    String inFile = args[1];
    System.err.println("Starting spark with master="+master+" in="+inFile);

    SparkConf conf = new SparkConf().setAppName("Read only job").setMaster(master)
            .set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> file = sc.textFile(inFile);
    JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Boolean call(String arg0) throws Exception {
            return false;
        }
    });
    res.saveAsTextFile("file:///tmp/out");
}
Example 2
Source File: TestSuite.java From stocator with Apache License 2.0 | 6 votes |
public void test4(SparkSession spark, String outText1) throws Exception {
    try {
        System.out.println("*********************************");
        System.out.println("T4: Create collection and store it as text file in " + outText1);
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> distData = new JavaSparkContext(spark.sparkContext()).parallelize(data);
        distData.saveAsTextFile(outText1);
        JavaRDD<String> txtRes = spark.read().textFile(outText1).javaRDD();
        long baseCount = txtRes.count();
        countAndCompare(baseCount, distData.count(), "T4", baseCount);
    } catch (Exception e) {
        throw e;
    } finally {
        deleteData(outText1, spark.sparkContext().hadoopConfiguration(), true);
    }
}
Example 3
Source File: BaseTrainingMaster.java From deeplearning4j with Apache License 2.0 | 5 votes |
protected String export(JavaRDD<DataSet> trainingData) {
    String baseDir = getBaseDirForRDD(trainingData);
    String dataDir = baseDir + "data/";
    String pathsDir = baseDir + "paths/";

    log.info("Initiating RDD<DataSet> export at {}", baseDir);
    JavaRDD<String> paths = trainingData
            .mapPartitionsWithIndex(new BatchAndExportDataSetsFunction(batchSizePerWorker, dataDir), true);
    paths.saveAsTextFile(pathsDir);
    log.info("RDD<DataSet> export complete at {}", baseDir);

    lastExportedRDDId = trainingData.id();
    lastRDDExportPath = baseDir;
    return baseDir;
}
Example 4
Source File: HdfsModelExporter.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Override
public void export(JavaRDD<ExportContainer<T>> rdd) {
    if (codec == null)
        rdd.saveAsTextFile(path);
    else
        rdd.saveAsTextFile(path, codec.getClass());
}
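The two-argument overload used above compresses each part file with the given Hadoop CompressionCodec. A minimal sketch, assuming lines is an existing JavaRDD<String> and Hadoop's GzipCodec is on the classpath (the output path is illustrative):

import org.apache.hadoop.io.compress.GzipCodec;

// Each partition is written as a gzip-compressed part file under the target directory.
lines.saveAsTextFile("/tmp/compressed-out", GzipCodec.class);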
Example 5
Source File: SparkExport.java From deeplearning4j with Apache License 2.0 | 5 votes |
public static void exportCSVSpark(String directory, String delimiter, String quote, int outputSplits,
                JavaRDD<List<Writable>> data) {

    //NOTE: Order is probably not random here...
    JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
    lines.coalesce(outputSplits);

    lines.saveAsTextFile(directory);
}
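Note that coalesce() returns a new RDD rather than repartitioning lines in place, so the call above has no effect and outputSplits does not change the number of output files. A corrected sketch of the presumed intent (an assumption, not the project's actual code):

JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
// coalesce() returns a new RDD; reassign it so outputSplits actually controls the part-file count
lines = lines.coalesce(outputSplits);
lines.saveAsTextFile(directory);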
Example 6
Source File: Grep.java From flink-perf with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
    String master = args[0];
    String inFile = args[1];
    String outFile = args[2];

    String patterns[] = new String[args.length-3];
    System.arraycopy(args,3,patterns,0,args.length-3);
    System.err.println("Starting spark with master="+master+" in="+inFile);
    System.err.println("Using patterns: "+ Arrays.toString(patterns));

    SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master)
            .set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> file = sc.textFile(inFile);

    for (int p = 0; p < patterns.length; p++) {
        final String pattern = patterns[p];
        JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
            private static final long serialVersionUID = 1L;
            Pattern p = Pattern.compile(pattern);

            @Override
            public Boolean call(String value) throws Exception {
                if (value == null || value.length() == 0) {
                    return false;
                }
                final Matcher m = p.matcher(value);
                if (m.find()) {
                    return true;
                }
                return false;
            }
        });
        res.saveAsTextFile(outFile+"_"+pattern);
    }
}
Example 7
Source File: PileupSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
@Override
protected void processAlignments(JavaRDD<LocusWalkerContext> rdd, JavaSparkContext ctx) {
    JavaRDD<String> lines = rdd.map(pileupFunction(metadata, outputInsertLength, showVerbose));
    if (numReducers != 0) {
        lines = lines.coalesce(numReducers);
    }
    lines.saveAsTextFile(outputFile);
}
Example 8
Source File: CopybookSparkExample.java From CopybookInputFormat with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
    if (args.length == 0) {
        System.out.println("CopybookSparkExample {master} {copybookInputPath} {dataFileInputPath} {outputFolder}");
        return;
    }

    String master = args[0];
    String copybookInputPath = args[1];
    String dataFileInputPath = args[2];
    String outputPath = args[3];

    JavaSparkContext jsc = new JavaSparkContext(master, "UniqueSeqGenerator", null, "SparkCopybookExample.jar");

    Configuration config = new Configuration();
    config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/mapred-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/yarn-site.xml"));
    config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));

    CopybookInputFormat.setCopybookHdfsPath(config, copybookInputPath);

    JavaPairRDD<LongWritable, Text> rdd = jsc.newAPIHadoopFile(dataFileInputPath, CopybookInputFormat.class,
            LongWritable.class, Text.class, config);
    JavaRDD<String> pipeDelimiter = rdd.map(new MapFunction());
    pipeDelimiter.saveAsTextFile(outputPath);
}
Example 9
Source File: TestSuite.java From stocator with Apache License 2.0 | 5 votes |
public void test8(SparkSession spark, String outText1, boolean isTimeOutTest) throws Exception {
    try {
        System.out.println("*********************************");
        System.out.println("T8: Timeout retry test. Please wait with patience");
        System.out.println("T8:Create collection and store it as text file in " + outText1);
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> distData = new JavaSparkContext(spark.sparkContext()).parallelize(data);
        distData.saveAsTextFile(outText1);
        JavaRDD<String> txtRes = spark.read().textFile(outText1).javaRDD();
        long baseCount = txtRes.count();
        countAndCompare(baseCount, distData.count(), "T8", baseCount);
        if (isTimeOutTest) {
            System.out.println("T8: Sleep for 10 minutes ");
            Thread.sleep(10 * 60 * 1000);
            System.out.println("T8: About to wake up");
            System.out.println("T8: Re-define data source");
        }
        txtRes = spark.read().textFile(outText1).javaRDD();
        baseCount = txtRes.count();
        countAndCompare(baseCount, distData.count(), "T8", baseCount);
        System.out.println("T8: Sleep for 10 minutes ");
    } catch (Exception e) {
        throw e;
    } finally {
        deleteData(outText1, spark.sparkContext().hadoopConfiguration(), true);
    }
}
Example 10
Source File: BaseTrainingMaster.java From deeplearning4j with Apache License 2.0 | 5 votes |
protected String exportMDS(JavaRDD<MultiDataSet> trainingData) {
    String baseDir = getBaseDirForRDD(trainingData);
    String dataDir = baseDir + "data/";
    String pathsDir = baseDir + "paths/";

    log.info("Initiating RDD<MultiDataSet> export at {}", baseDir);
    JavaRDD<String> paths = trainingData.mapPartitionsWithIndex(
            new BatchAndExportMultiDataSetsFunction(batchSizePerWorker, dataDir), true);
    paths.saveAsTextFile(pathsDir);
    log.info("RDD<MultiDataSet> export complete at {}", baseDir);

    lastExportedRDDId = trainingData.id();
    lastRDDExportPath = baseDir;
    return baseDir;
}
Example 11
Source File: SparkExport.java From DataVec with Apache License 2.0 | 5 votes |
public static void exportCSVSpark(String directory, String delimiter, String quote, int outputSplits,
                JavaRDD<List<Writable>> data) {

    //NOTE: Order is probably not random here...
    JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
    lines.coalesce(outputSplits);

    lines.saveAsTextFile(directory);
}
Example 12
Source File: ParallelValidator.java From metadata-qa-marc with GNU General Public License v3.0 | 5 votes |
public static void main(String[] args) throws ParseException {
    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);

    logger.info("Input file is " + params.getDetailsFileName());
    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);

    System.err.println(validator.getParameters().formatParameters());

    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

    JavaRDD<String> baseCountsRDD = inputFile
        .flatMap(content -> {
            MarcReader reader = ReadMarc.getMarcStringReader(content);
            Record marc4jRecord = reader.next();
            MarcRecord marcRecord = MarcFactory.createFromMarc4j(
                marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
            validator.processRecord(marcRecord, 1);
            return ValidationErrorFormatter
                .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
                .iterator();
        });

    baseCountsRDD.saveAsTextFile(validator.getParameters().getDetailsFileName());
}
Example 13
Source File: SplitFasta.java From ViraPipe with MIT License | 5 votes |
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption( new Option( "partitions", "Divide or merge to n partitions" ) );
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        // parse the command line arguments
        cmd = parser.parse( options, args );
    } catch( ParseException exp ) {
        // oops, something went wrong
        System.err.println( "Parsing failed. Reason: " + exp.getMessage() );
    }

    String out = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
    String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
    String partitions = (cmd.hasOption("partitions")==true)? cmd.getOptionValue("partitions"):null;

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd = sc.textFile(in);
    JavaRDD<String> crdd = rdd.map(v -> ">" + v.trim()).repartition(Integer.valueOf(partitions));

    crdd.saveAsTextFile(out);
    sc.stop();
}
Example 14
Source File: MultiReturnParameterizedBuiltinSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override @SuppressWarnings("unchecked") public void processInstruction(ExecutionContext ec) { SparkExecutionContext sec = (SparkExecutionContext) ec; try { //get input RDD and meta data FrameObject fo = sec.getFrameObject(input1.getName()); FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName()); JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>) sec.getRDDHandleForFrameObject(fo, FileFormat.BINARY); String spec = ec.getScalarInput(input2).getStringValue(); DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName()); DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName()); String[] colnames = !TfMetaUtils.isIDSpec(spec) ? in.lookup(1L).get(0).getColumnNames() : null; //step 1: build transform meta data Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int)fo.getNumColumns(), null); MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext()); JavaRDD<String> rcMaps = in .mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild)) .distinct().groupByKey() .flatMap(new TransformEncodeGroupFunction(accMax)); if( containsMVImputeEncoder(encoderBuild) ) { EncoderMVImpute mva = getMVImputeEncoder(encoderBuild); rcMaps = rcMaps.union( in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva)) .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) ); } rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval //consolidate meta data frame (reuse multi-threaded reader, special handling missing values) FrameReader reader = FrameReaderFactory.createFrameReader(FileFormat.TEXT); FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns()); meta.recomputeColumnCardinality(); //recompute num distinct items per column meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames()); //step 2: transform apply (similar to spark transformapply) //compute omit offset map for block shifts TfOffsetMap omap = null; if( TfMetaUtils.containsOmitSpec(spec, colnames) ) { omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair( new RDDTransformApplyOffsetFunction(spec, colnames)).collect())); } //create encoder broadcast (avoiding replication per task) Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int)fo.getNumColumns(), meta); mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder); Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null; //execute transform apply JavaPairRDD<Long,FrameBlock> tmp = in .mapToPair(new RDDTransformApplyFunction(bmeta, bomap)); JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils .binaryBlockToMatrixBlock(tmp, mcOut, mcOut); //set output and maintain lineage/output characteristics sec.setRDDHandleForVariable(_outputs.get(0).getName(), out); sec.addLineageRDD(_outputs.get(0).getName(), input1.getName()); sec.setFrameOutput(_outputs.get(1).getName(), meta); } catch(IOException ex) { throw new RuntimeException(ex); } }
Example 15
Source File: RenameContigsUniq.java From ViraPipe with MIT License | 4 votes |
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption( new Option( "partitions", true, "Divide or merge to n partitions" ) );
    options.addOption( new Option( "fa", true, "Include only files with extension given " ) );
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        cmd = parser.parse( options, args );
    } catch( ParseException exp ) {
        System.err.println( "Parsing failed. Reason: " + exp.getMessage() );
    }

    String out = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
    String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
    String fastaonly = (cmd.hasOption("fa")==true)? cmd.getOptionValue("fa"):null;
    String partitions = (cmd.hasOption("partitions")==true)? cmd.getOptionValue("partitions"):null;

    SparkConf conf = new SparkConf().setAppName("RenameContigsUniq");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd;
    if (fastaonly != null)
        rdd = sc.textFile(in + "/*." + fastaonly);
    else
        rdd = sc.textFile(in); //take whole directory as input

    JavaRDD<String> crdd = rdd.filter(f -> f.trim().split("\n")[0].length() != 0).map(fasta -> {
        String[] fseq = fasta.trim().split("\n");
        String id = fseq[0].split(" ")[0];

        //Give unique id for sequence
        String seq_id = id + "_" + UUID.randomUUID().toString();
        String seq = Arrays.toString(Arrays.copyOfRange(fseq, 1, fseq.length))
                .replace(", ", "").replace("[", "").replace("]", "");

        return ">" + seq_id + "\n" + seq;
    });

    if (partitions != null)
        crdd.repartition(Integer.valueOf(partitions)).saveAsTextFile(out);
    else
        crdd.saveAsTextFile(out);

    sc.stop();
}
Example 16
Source File: GrepCaching.java From flink-perf with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
    String master = args[0];
    String inFile = args[1];
    String outFile = args[2];
    String storageLevel = args[3];

    String patterns[] = new String[args.length-4];
    System.arraycopy(args, 4, patterns, 0, args.length - 4);
    System.err.println("Starting spark with master="+master+" in="+inFile);
    System.err.println("Using patterns: "+ Arrays.toString(patterns));

    SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master)
            .set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sc = new JavaSparkContext(conf);

    StorageLevel sl;
    switch(storageLevel) {
        case "MEMORY_ONLY":
            sl = StorageLevel.MEMORY_ONLY(); break;
        case "MEMORY_AND_DISK":
            sl = StorageLevel.MEMORY_AND_DISK(); break;
        case "MEMORY_ONLY_SER":
            sl = StorageLevel.MEMORY_ONLY_SER(); break;
        case "MEMORY_AND_DISK_SER":
            sl = StorageLevel.MEMORY_AND_DISK_SER(); break;
        case "NONE":
            sl = StorageLevel.NONE(); break;
        default:
            throw new RuntimeException("Unknown storage level "+storageLevel);
    }

    JavaRDD<String> file = sc.textFile(inFile).persist(sl);
    for (int p = 0; p < patterns.length; p++) {
        final String pattern = patterns[p];
        JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
            private static final long serialVersionUID = 1L;
            Pattern p = Pattern.compile(pattern);

            @Override
            public Boolean call(String value) throws Exception {
                if (value == null || value.length() == 0) {
                    return false;
                }
                final Matcher m = p.matcher(value);
                if (m.find()) {
                    return true;
                }
                return false;
            }
        });
        res.saveAsTextFile(outFile+"_"+pattern);
    }
}
Example 17
Source File: ActionRDDTest.java From hui-bigdata-spark with Apache License 2.0 | 4 votes |
@Test
public void saveAsTxtFile() throws Exception {
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    stringJavaRDD.saveAsTextFile(OUTPUT_TXT_PATH);
}
Example 18
Source File: MultiReturnParameterizedBuiltinSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override @SuppressWarnings("unchecked") public void processInstruction(ExecutionContext ec) { SparkExecutionContext sec = (SparkExecutionContext) ec; try { //get input RDD and meta data FrameObject fo = sec.getFrameObject(input1.getName()); FrameObject fometa = sec.getFrameObject(_outputs.get(1).getName()); JavaPairRDD<Long,FrameBlock> in = (JavaPairRDD<Long,FrameBlock>) sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo); String spec = ec.getScalarInput(input2).getStringValue(); DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName()); DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName()); String[] colnames = !TfMetaUtils.isIDSpec(spec) ? in.lookup(1L).get(0).getColumnNames() : null; //step 1: build transform meta data Encoder encoderBuild = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int)fo.getNumColumns(), null); MaxLongAccumulator accMax = registerMaxLongAccumulator(sec.getSparkContext()); JavaRDD<String> rcMaps = in .mapPartitionsToPair(new TransformEncodeBuildFunction(encoderBuild)) .distinct().groupByKey() .flatMap(new TransformEncodeGroupFunction(accMax)); if( containsMVImputeEncoder(encoderBuild) ) { EncoderMVImpute mva = getMVImputeEncoder(encoderBuild); rcMaps = rcMaps.union( in.mapPartitionsToPair(new TransformEncodeBuild2Function(mva)) .groupByKey().flatMap(new TransformEncodeGroup2Function(mva)) ); } rcMaps.saveAsTextFile(fometa.getFileName()); //trigger eval //consolidate meta data frame (reuse multi-threaded reader, special handling missing values) FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo); FrameBlock meta = reader.readFrameFromHDFS(fometa.getFileName(), accMax.value(), fo.getNumColumns()); meta.recomputeColumnCardinality(); //recompute num distinct items per column meta.setColumnNames((colnames!=null)?colnames:meta.getColumnNames()); //step 2: transform apply (similar to spark transformapply) //compute omit offset map for block shifts TfOffsetMap omap = null; if( TfMetaUtils.containsOmitSpec(spec, colnames) ) { omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair( new RDDTransformApplyOffsetFunction(spec, colnames)).collect())); } //create encoder broadcast (avoiding replication per task) Encoder encoder = EncoderFactory.createEncoder(spec, colnames, fo.getSchema(), (int)fo.getNumColumns(), meta); mcOut.setDimension(mcIn.getRows()-((omap!=null)?omap.getNumRmRows():0), encoder.getNumCols()); Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder); Broadcast<TfOffsetMap> bomap = (omap!=null) ? sec.getSparkContext().broadcast(omap) : null; //execute transform apply JavaPairRDD<Long,FrameBlock> tmp = in .mapToPair(new RDDTransformApplyFunction(bmeta, bomap)); JavaPairRDD<MatrixIndexes,MatrixBlock> out = FrameRDDConverterUtils .binaryBlockToMatrixBlock(tmp, mcOut, mcOut); //set output and maintain lineage/output characteristics sec.setRDDHandleForVariable(_outputs.get(0).getName(), out); sec.addLineageRDD(_outputs.get(0).getName(), input1.getName()); sec.setFrameOutput(_outputs.get(1).getName(), meta); } catch(IOException ex) { throw new RuntimeException(ex); } }
Example 19
Source File: Data2CoNLL.java From ambiverse-nlu with Apache License 2.0 | 4 votes |
@Override
protected int run() throws Exception {
    SparkConf sparkConf = new SparkConf()
        .setAppName("Data2CoNLL")
        .set("spark.hadoop.validateOutputSpecs", "false")
        .set("spark.yarn.executor.memoryOverhead", "3072")
        .set("spark.rdd.compress", "true")
        .set("spark.core.connection.ack.wait.timeout", "600")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        //.set("spark.kryo.registrationRequired", "true")
        .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class,
            double[].class, InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
            VectorIndexer.class});
        //.setMaster("local[4]"); //Remove this if you run it on the server.

    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
        * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

    FileSystem fs = FileSystem.get(new Configuration());
    int partitionNumber = 3 * totalCores;
    if (partitions != null) {
        partitionNumber = partitions;
    }

    //Read training documents serialized as SCAS
    JavaRDD<SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber).values();

    JavaRDD<String> docStrings = documents.map(s -> {
        JCas jCas = s.getJCas();
        NYTArticleMetaData metadata = JCasUtil.selectSingle(jCas, NYTArticleMetaData.class);

        StringJoiner docBuilder = new StringJoiner("\n");
        docBuilder.add("-DOCSTART- (" + metadata.getGuid() + ")");
        docBuilder.add("");

        Collection<Sentence> sentences = JCasUtil.select(jCas, Sentence.class);
        for (Sentence sentence : sentences) {
            List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
            for (Token token : tokens) {
                CoreLabel taggedWord = CoreNlpUtils.tokenToWord(token);
                StringJoiner lineBuilder = new StringJoiner("\t");
                lineBuilder.add(taggedWord.word().toLowerCase());
                docBuilder.add(lineBuilder.toString());
            }
            docBuilder.add("");
        }
        return docBuilder.toString();
    });

    docStrings.saveAsTextFile(output);
    sc.stop();
    return 0;
}
Example 20
Source File: SaprkFile.java From sparkResearch with Apache License 2.0 | 4 votes |
public static void jsonFile(JavaSparkContext sparkContext) {
    // read and write a JSON file
    JavaRDD<String> rdd = sparkContext.textFile("url");
    JavaRDD<Person> result = rdd.mapPartitions(new ParseJson()).filter(new filterData());
    result.saveAsTextFile("url");
}