Java Code Examples for org.apache.spark.api.java.JavaSparkContext#addFile()
The following examples show how to use org.apache.spark.api.java.JavaSparkContext#addFile(). This method ships a file from the driver to every node in the cluster; a Spark task can later resolve the local copy of that file with SparkFiles.get(fileName).
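Before the project-specific examples, here is a minimal, self-contained sketch of the addFile()/SparkFiles.get() pattern that all of them rely on. The file path /tmp/lookup.txt, the class name, and the local[*] master are placeholders chosen for illustration, not part of any project below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaSparkContext;

public class AddFileSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("addFile-sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Driver side: register the file so Spark copies it to every node.
            sc.addFile("/tmp/lookup.txt");

            // Executor side: tasks resolve the local copy by file name, not by the original path.
            sc.parallelize(Arrays.asList(1, 2, 3))
              .map(x -> "record " + x + " resolved " + SparkFiles.get("lookup.txt"))
              .collect()
              .forEach(System.out::println);
        }
    }
}

Note that SparkFiles.get() takes only the base file name; addFile() keeps the base name of whatever local or remote path was registered, which is why several examples below return just the file name to their callers.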
Example 1
Source File: FileIO.java From RP-DBSCAN with Apache License 2.0
public static List<String> broadCastData(JavaSparkContext sc, Configuration conf, String dirPath) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(dirPath));
    List<String> metaPaths = new ArrayList<String>();
    long size = 0;
    for (int i = 0; i < status.length; i++) {
        String path = status[i].getPath().toString();
        String fileName = status[i].getPath().getName();
        sc.addFile(path);
        metaPaths.add(fileName);
        size += status[i].getLen();
    }
    System.out.println("size : " + size);
    return metaPaths;
}
Example 2
Source File: GATKSparkTool.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Register the reference file (and associated dictionary and index) to be downloaded to every node using Spark's
 * copying mechanism ({@code SparkContext#addFile()}).
 * @param ctx the Spark context
 * @param referencePath the reference file, can be a local file or a remote path
 * @return the reference file name; the absolute path of the file can be found by a Spark task using {@code SparkFiles#get()}
 */
protected static String addReferenceFilesForSpark(JavaSparkContext ctx, Path referencePath) {
    if (referencePath == null) {
        return null;
    }
    Path indexPath = ReferenceSequenceFileFactory.getFastaIndexFileName(referencePath);
    Path dictPath = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(referencePath);
    Path gziPath = GZIIndex.resolveIndexNameForBgzipFile(referencePath);

    ctx.addFile(referencePath.toUri().toString());
    if (Files.exists(indexPath)) {
        ctx.addFile(indexPath.toUri().toString());
    }
    if (Files.exists(dictPath)) {
        ctx.addFile(dictPath.toUri().toString());
    }
    if (Files.exists(gziPath)) {
        ctx.addFile(gziPath.toUri().toString());
    }

    return referencePath.getFileName().toString();
}
Example 3
Source File: GATKSparkTool.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Register the VCF files (and associated indexes) to be downloaded to every node using Spark's copying mechanism
 * ({@code SparkContext#addFile()}).
 * @param ctx the Spark context
 * @param vcfFileNames the VCF files, can be local files or remote paths
 * @return the VCF file names; the absolute path of each file can be found by a Spark task using {@code SparkFiles#get()}
 */
protected static List<String> addVCFsForSpark(JavaSparkContext ctx, List<String> vcfFileNames) {
    for (String vcfFileName : vcfFileNames) {
        String vcfIndexFileName;
        if (vcfFileName.endsWith(FileExtensions.VCF)) {
            vcfIndexFileName = vcfFileName + FileExtensions.VCF_INDEX;
        } else if (vcfFileName.endsWith(FileExtensions.COMPRESSED_VCF)) {
            vcfIndexFileName = vcfFileName + FileExtensions.COMPRESSED_VCF_INDEX;
        } else {
            throw new IllegalArgumentException("Unrecognized known sites file extension. Must be .vcf or .vcf.gz");
        }
        ctx.addFile(vcfFileName);
        if (Files.exists(IOUtils.getPath(vcfIndexFileName))) {
            ctx.addFile(vcfIndexFileName);
        }
    }
    return vcfFileNames.stream().map(name -> IOUtils.getPath(name).getFileName().toString()).collect(Collectors.toList());
}
Example 4
Source File: BwaSparkEngine.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * @param ctx the Spark context
 * @param referenceFile the path to the reference file named <i>_prefix_.fa</i>, which is used to find the image file with name <i>_prefix_.fa.img</i>.
 *                      Can be <code>null</code> if the indexFileName is provided.
 * @param indexFileName the index image file name that already exists, or <code>null</code> to have the image file automatically distributed.
 * @param inputHeader the SAM file header to use for reads
 * @param refDictionary the sequence dictionary to use for reads if the SAM file header doesn't have one (or it's empty)
 */
public BwaSparkEngine(final JavaSparkContext ctx,
                      final String referenceFile,
                      final String indexFileName,
                      SAMFileHeader inputHeader,
                      final SAMSequenceDictionary refDictionary) {
    Utils.nonNull(referenceFile);
    Utils.nonNull(inputHeader);
    this.ctx = ctx;
    if (indexFileName != null) {
        this.indexFileName = indexFileName;
        this.resolveIndexFileName = false;
    } else {
        String indexFile = referenceFile + REFERENCE_INDEX_IMAGE_FILE_SUFFIX;
        ctx.addFile(indexFile); // distribute index file to all executors
        this.indexFileName = IOUtils.getPath(indexFile).getFileName().toString();
        this.resolveIndexFileName = true;
    }
    if (inputHeader.getSequenceDictionary() == null || inputHeader.getSequenceDictionary().isEmpty()) {
        Utils.nonNull(refDictionary);
        inputHeader = inputHeader.clone();
        inputHeader.setSequenceDictionary(refDictionary);
    }
    broadcastHeader = ctx.broadcast(inputHeader);
}
Example 5
Source File: SparkSegmentGenerationJobRunner.java From incubator-pinot with Apache License 2.0
protected void packPluginsToDistributedCache(JavaSparkContext sparkContext) {
    String pluginsRootDir = PluginManager.get().getPluginsRootDir();
    if (pluginsRootDir == null) {
        LOGGER.warn("Local Pinot plugins directory is null, skip packaging...");
        return;
    }
    if (new File(pluginsRootDir).exists()) {
        File pluginsTarGzFile = new File(PINOT_PLUGINS_TAR_GZ);
        try {
            TarGzCompressionUtils.createTarGzOfDirectory(pluginsRootDir, pluginsTarGzFile.getPath());
        } catch (IOException e) {
            LOGGER.error("Failed to tar plugins directory", e);
        }
        sparkContext.addFile(pluginsTarGzFile.getAbsolutePath());

        String pluginsIncludes = System.getProperty(PLUGINS_INCLUDE_PROPERTY_NAME);
        if (pluginsIncludes != null) {
            sparkContext.getConf().set(PLUGINS_INCLUDE_PROPERTY_NAME, pluginsIncludes);
        }
    } else {
        LOGGER.warn("Cannot find local Pinot plugins directory at [{}]", pluginsRootDir);
    }
}
Example 6
Source File: EntitySalienceTrainingSparkRunner.java From ambiverse-nlu with Apache License 2.0
@Override
protected int run() throws Exception {
    SparkConf sparkConf = new SparkConf()
            .setAppName("EntitySalienceTrainingSparkRunner")
            .set("spark.hadoop.validateOutputSpecs", "false")
            .set("spark.yarn.executor.memoryOverhead", "3072")
            .set("spark.rdd.compress", "true")
            .set("spark.core.connection.ack.wait.timeout", "600")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            //.set("spark.kryo.registrationRequired", "true")
            .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class,
                    double[].class, InternalRow[].class, GenericInternalRow.class, Object[].class,
                    GenericArrayData.class, VectorIndexer.class})
            ;//.setMaster("local[4]"); //Remove this if you run it on the server.

    TrainingSettings trainingSettings = new TrainingSettings();

    if (folds != null) {
        trainingSettings.setNumFolds(folds);
    }
    if (method != null) {
        trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.valueOf(method));
    }
    if (defaultConf != null) {
        trainingSettings.setAidaDefaultConf(defaultConf);
    }
    if (scalingFactor != null) {
        trainingSettings.setPositiveInstanceScalingFactor(scalingFactor);
    }

    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
            * Integer.parseInt(sc.getConf().get("spark.executor.cores"));
//    int totalCores = 4;
////    trainingSettings.setFeatureExtractor(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE);
////    trainingSettings.setAidaDefaultConf("db");
//    //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);
//    trainingSettings.setPositiveInstanceScalingFactor(1);

    //Add the cache files to each node only if annotation is required.
    //The input documents could already be annotated, and in this case no caches are needed.
    if (trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
        sc.addFile(trainingSettings.getBigramCountCache());
        sc.addFile(trainingSettings.getKeywordCountCache());
        sc.addFile(trainingSettings.getWordContractionsCache());
        sc.addFile(trainingSettings.getWordExpansionsCache());
        if (trainingSettings.getAidaDefaultConf().equals("db")) {
            sc.addFile(trainingSettings.getDatabaseAida());
        } else {
            sc.addFile(trainingSettings.getCassandraConfig());
        }
    }

    SQLContext sqlContext = new SQLContext(sc);
    FileSystem fs = FileSystem.get(new Configuration());

    int partitionNumber = 3 * totalCores;
    if (partitions != null) {
        partitionNumber = partitions;
    }

    //Read training documents serialized as SCAS
    JavaRDD<SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber).values();

    //Instantiate a training spark runner
    TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();

    //Train a model
    CrossValidatorModel model = trainingSparkRunner.crossValidate(sc, sqlContext, documents, trainingSettings);

    //Create the model path
    String modelPath = output + "/" + sc.getConf().getAppId() + "/model_" + trainingSettings.getClassificationMethod();

    //Delete the old model if there is one
    fs.delete(new Path(modelPath), true);

    //Save the new model
    List<Model> models = new ArrayList<>();
    models.add(model.bestModel());
    sc.parallelize(models, 1).saveAsObjectFile(modelPath);

    //Save the model stats
    SparkClassificationModel.saveStats(model, trainingSettings, output + "/" + sc.getConf().getAppId() + "/");

    return 0;
}
Example 7
Source File: EntitySalienceTestingSparkRunner.java From ambiverse-nlu with Apache License 2.0
@Override
protected int run() throws Exception {
    SparkConf sparkConf = new SparkConf()
            .setAppName("EntitySalienceTrainingSparkRunner")
            .set("spark.hadoop.validateOutputSpecs", "false")
            //.set("spark.yarn.executor.memoryOverhead", "4096")
            .set("spark.rdd.compress", "true")
            .set("spark.core.connection.ack.wait.timeout", "600")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            //.set("spark.kryo.registrationRequired", "true")
            .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class,
                    double[].class, InternalRow[].class, GenericInternalRow.class, Object[].class,
                    GenericArrayData.class, VectorIndexer.class})
            ;//setMaster("local"); //Remove this if you run it on the server.

    TrainingSettings trainingSettings = new TrainingSettings();

    if (defaultConf != null) {
        trainingSettings.setAidaDefaultConf(defaultConf);
    }

    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
            * Integer.parseInt(sc.getConf().get("spark.executor.cores"));
//    int totalCores = 2;

    //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);
    trainingSettings.setPositiveInstanceScalingFactor(1);

    if (trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
        sc.addFile(trainingSettings.getBigramCountCache());
        sc.addFile(trainingSettings.getKeywordCountCache());
        sc.addFile(trainingSettings.getWordContractionsCache());
        sc.addFile(trainingSettings.getWordExpansionsCache());
        if (trainingSettings.getAidaDefaultConf().equals("db")) {
            sc.addFile(trainingSettings.getDatabaseAida());
        } else {
            sc.addFile(trainingSettings.getCassandraConfig());
        }
    }

    SQLContext sqlContext = new SQLContext(sc);

    int partitionNumber = 3 * totalCores;

    //Read training documents serialized as SCAS
    JavaPairRDD<Text, SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber);

    //Instantiate a training spark runner
    TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();

    PipelineModel trainingModel = (PipelineModel) sc.objectFile(model).first();

    //Evaluate the model and write down the evaluation metrics.
    trainingSparkRunner.evaluate(sc, sqlContext, documents, trainingModel, trainingSettings,
            output + "/" + sc.getConf().getAppId() + "/");

    return 0;
}