Java Code Examples for org.apache.spark.SparkContext#newAPIHadoopRDD()
The following examples show how to use org.apache.spark.SparkContext#newAPIHadoopRDD(). You can go to the original project or source file by following the link above each example.
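Before the project-specific examples, here is a minimal, self-contained sketch of the call pattern they all share: build a Hadoop Configuration (often through a Job, so an InputFormat's static setters can populate it), then pass the Configuration, the InputFormat class, and the matching key/value classes to SparkContext#newAPIHadoopRDD and wrap the returned Scala RDD for the Java API. The input path, application name, and master URL below are placeholders, not taken from any of the projects.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.rdd.RDD;

import scala.Tuple2;

public class NewAPIHadoopRDDSketch {
    public static void main(String[] args) throws IOException {
        SparkContext sc = new SparkContext(
                new SparkConf().setAppName("newAPIHadoopRDD-sketch").setMaster("local[*]"));

        // A Job is a convenient way to build the Configuration that newAPIHadoopRDD consumes;
        // the InputFormat's static helpers write their settings into job.getConfiguration().
        Job job = Job.getInstance(sc.hadoopConfiguration(), sc.appName());
        FileInputFormat.addInputPath(job, new Path("hdfs:///tmp/input")); // placeholder path

        // The key/value classes must match what the InputFormat emits:
        // TextInputFormat produces (LongWritable byte offset, Text line).
        RDD<Tuple2<LongWritable, Text>> rdd = sc.newAPIHadoopRDD(
                job.getConfiguration(), TextInputFormat.class, LongWritable.class, Text.class);

        // Wrap the Scala RDD as a JavaPairRDD for use from the Java API.
        JavaPairRDD<LongWritable, Text> lines = JavaPairRDD.fromJavaRDD(rdd.toJavaRDD());
        System.out.println("lines: " + lines.count());

        sc.stop();
    }
}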
Example 1
Source File: GraphXGraphGenerator.java From rya with Apache License 2.0
public RDD<Tuple2<Object, RyaTypeWritable>> getVertexRDD(SparkContext sc, Configuration conf)
        throws IOException, AccumuloSecurityException {
    // Load configuration parameters
    zk = MRUtils.getACZK(conf);
    instance = MRUtils.getACInstance(conf);
    userName = MRUtils.getACUserName(conf);
    pwd = MRUtils.getACPwd(conf);
    mock = MRUtils.getACMock(conf, false);
    tablePrefix = MRUtils.getTablePrefix(conf);

    // Set authorizations if specified
    String authString = conf.get(MRUtils.AC_AUTH_PROP);
    if (authString != null && !authString.isEmpty()) {
        authorizations = new Authorizations(authString.split(","));
        conf.set(ConfigUtils.CLOUDBASE_AUTHS, authString); // for consistency
    }
    else {
        authorizations = AccumuloRdfConstants.ALL_AUTHORIZATIONS;
    }

    // Set table prefix to the default if not set
    if (tablePrefix == null) {
        tablePrefix = RdfCloudTripleStoreConstants.TBL_PRFX_DEF;
        MRUtils.setTablePrefix(conf, tablePrefix);
    }

    // Check for required configuration parameters
    Preconditions.checkNotNull(instance, "Accumulo instance name [" + MRUtils.AC_INSTANCE_PROP + "] not set.");
    Preconditions.checkNotNull(userName, "Accumulo username [" + MRUtils.AC_USERNAME_PROP + "] not set.");
    Preconditions.checkNotNull(pwd, "Accumulo password [" + MRUtils.AC_PWD_PROP + "] not set.");
    Preconditions.checkNotNull(tablePrefix, "Table prefix [" + MRUtils.TABLE_PREFIX_PROPERTY + "] not set.");
    RdfCloudTripleStoreConstants.prefixTables(tablePrefix);

    // If connecting to real accumulo, set additional parameters and require zookeepers
    if (!mock) conf.set(ConfigUtils.CLOUDBASE_ZOOKEEPERS, zk); // for consistency

    // Ensure consistency between alternative configuration properties
    conf.set(ConfigUtils.CLOUDBASE_INSTANCE, instance);
    conf.set(ConfigUtils.CLOUDBASE_USER, userName);
    conf.set(ConfigUtils.CLOUDBASE_PASSWORD, pwd);
    conf.setBoolean(ConfigUtils.USE_MOCK_INSTANCE, mock);
    conf.set(RdfCloudTripleStoreConfiguration.CONF_TBL_PREFIX, tablePrefix);

    Job job = Job.getInstance(conf, sc.appName());

    ClientConfiguration clientConfig = new ClientConfiguration()
            .with(ClientProperty.INSTANCE_NAME, instance)
            .with(ClientProperty.INSTANCE_ZK_HOST, zk);

    GraphXInputFormat.setInputTableName(job, EntityCentricIndex.getTableName(conf));
    GraphXInputFormat.setConnectorInfo(job, userName, new PasswordToken(pwd));
    GraphXInputFormat.setZooKeeperInstance(job, clientConfig);
    GraphXInputFormat.setScanAuthorizations(job, authorizations);

    return sc.newAPIHadoopRDD(job.getConfiguration(), GraphXInputFormat.class, Object.class, RyaTypeWritable.class);
}
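Note how the loader never talks to Accumulo directly: it copies the Rya connection settings into a Hadoop Job, lets the GraphXInputFormat static setters (setInputTableName, setConnectorInfo, setZooKeeperInstance, setScanAuthorizations) record those details in the job's Configuration, and then hands that Configuration, the InputFormat class, and the key/value classes to newAPIHadoopRDD. The edge loader in the next example repeats the same pattern with a different InputFormat and table layout.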
Example 2
Source File: GraphXGraphGenerator.java From rya with Apache License 2.0
public RDD<Tuple2<Object, Edge>> getEdgeRDD(SparkContext sc, Configuration conf)
        throws IOException, AccumuloSecurityException {
    // Load configuration parameters
    zk = MRUtils.getACZK(conf);
    instance = MRUtils.getACInstance(conf);
    userName = MRUtils.getACUserName(conf);
    pwd = MRUtils.getACPwd(conf);
    mock = MRUtils.getACMock(conf, false);
    tablePrefix = MRUtils.getTablePrefix(conf);

    // Set authorizations if specified
    String authString = conf.get(MRUtils.AC_AUTH_PROP);
    if (authString != null && !authString.isEmpty()) {
        authorizations = new Authorizations(authString.split(","));
        conf.set(ConfigUtils.CLOUDBASE_AUTHS, authString); // for consistency
    }
    else {
        authorizations = AccumuloRdfConstants.ALL_AUTHORIZATIONS;
    }

    // Set table prefix to the default if not set
    if (tablePrefix == null) {
        tablePrefix = RdfCloudTripleStoreConstants.TBL_PRFX_DEF;
        MRUtils.setTablePrefix(conf, tablePrefix);
    }

    // Check for required configuration parameters
    Preconditions.checkNotNull(instance, "Accumulo instance name [" + MRUtils.AC_INSTANCE_PROP + "] not set.");
    Preconditions.checkNotNull(userName, "Accumulo username [" + MRUtils.AC_USERNAME_PROP + "] not set.");
    Preconditions.checkNotNull(pwd, "Accumulo password [" + MRUtils.AC_PWD_PROP + "] not set.");
    Preconditions.checkNotNull(tablePrefix, "Table prefix [" + MRUtils.TABLE_PREFIX_PROPERTY + "] not set.");
    RdfCloudTripleStoreConstants.prefixTables(tablePrefix);

    // If connecting to real accumulo, set additional parameters and require zookeepers
    if (!mock) conf.set(ConfigUtils.CLOUDBASE_ZOOKEEPERS, zk); // for consistency

    // Ensure consistency between alternative configuration properties
    conf.set(ConfigUtils.CLOUDBASE_INSTANCE, instance);
    conf.set(ConfigUtils.CLOUDBASE_USER, userName);
    conf.set(ConfigUtils.CLOUDBASE_PASSWORD, pwd);
    conf.setBoolean(ConfigUtils.USE_MOCK_INSTANCE, mock);
    conf.set(RdfCloudTripleStoreConfiguration.CONF_TBL_PREFIX, tablePrefix);

    Job job = Job.getInstance(conf, sc.appName());

    ClientConfiguration clientConfig = new ClientConfiguration()
            .with(ClientProperty.INSTANCE_NAME, instance)
            .with(ClientProperty.INSTANCE_ZK_HOST, zk);

    RyaInputFormat.setTableLayout(job, TABLE_LAYOUT.SPO);
    RyaInputFormat.setConnectorInfo(job, userName, new PasswordToken(pwd));
    RyaInputFormat.setZooKeeperInstance(job, clientConfig);
    RyaInputFormat.setScanAuthorizations(job, authorizations);

    String tableName = RdfCloudTripleStoreUtils.layoutPrefixToTable(TABLE_LAYOUT.SPO, tablePrefix);
    InputFormatBase.setInputTableName(job, tableName);

    return sc.newAPIHadoopRDD(job.getConfiguration(), GraphXEdgeInputFormat.class, Object.class, Edge.class);
}
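A minimal caller-side sketch of how the two loaders above might be combined. It assumes an already-constructed GraphXGraphGenerator instance; the helper method name is hypothetical, and the wrapping into JavaPairRDDs mirrors what the GeoWave examples below do.

// Hypothetical helper; GraphXGraphGenerator, RyaTypeWritable and Edge come from Rya/GraphX.
static void countRyaGraph(GraphXGraphGenerator generator, SparkContext sc, Configuration conf)
        throws IOException, AccumuloSecurityException {
    RDD<Tuple2<Object, RyaTypeWritable>> vertexRdd = generator.getVertexRDD(sc, conf);
    RDD<Tuple2<Object, Edge>> edgeRdd = generator.getEdgeRDD(sc, conf);

    // Wrap the Scala RDDs for the Java API, as the GeoWave loaders below do.
    JavaPairRDD<Object, RyaTypeWritable> vertices = JavaPairRDD.fromJavaRDD(vertexRdd.toJavaRDD());
    JavaPairRDD<Object, Edge> edges = JavaPairRDD.fromJavaRDD(edgeRdd.toJavaRDD());
    System.out.println("vertices=" + vertices.count() + ", edges=" + edges.count());
}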
Example 3
Source File: GeoWaveRDDLoader.java From geowave with Apache License 2.0
public static JavaPairRDD<GeoWaveInputKey, SimpleFeature> loadRawRDD(
        final SparkContext sc,
        final DataStorePluginOptions storeOptions,
        final RDDOptions rddOpts) throws IOException {
    if (sc == null) {
        LOGGER.error("Must supply a valid Spark Context. Please set SparkContext and try again.");
        return null;
    }
    if (storeOptions == null) {
        LOGGER.error("Must supply input store to load. Please set storeOptions and try again.");
        return null;
    }
    if (rddOpts == null) {
        LOGGER.error("Must supply valid RDDOptions to load a rdd.");
        return null;
    }

    final Configuration conf = new Configuration(sc.hadoopConfiguration());

    GeoWaveInputFormat.setStoreOptions(conf, storeOptions);

    if (rddOpts.getQuery() != null) {
        GeoWaveInputFormat.setQuery(
            conf,
            rddOpts.getQuery(),
            storeOptions.createAdapterStore(),
            storeOptions.createInternalAdapterStore(),
            storeOptions.createIndexStore());
    }

    if ((rddOpts.getMinSplits() > -1) || (rddOpts.getMaxSplits() > -1)) {
        GeoWaveInputFormat.setMinimumSplitCount(conf, rddOpts.getMinSplits());
        GeoWaveInputFormat.setMaximumSplitCount(conf, rddOpts.getMaxSplits());
    } else {
        final int defaultSplitsSpark = sc.getConf().getInt("spark.default.parallelism", -1);
        // Attempt to grab default partition count for spark and split data
        // along that.
        // Otherwise just fallback to default according to index strategy
        if (defaultSplitsSpark != -1) {
            GeoWaveInputFormat.setMinimumSplitCount(conf, defaultSplitsSpark);
            GeoWaveInputFormat.setMaximumSplitCount(conf, defaultSplitsSpark);
        }
    }

    final RDD<Tuple2<GeoWaveInputKey, SimpleFeature>> rdd = sc.newAPIHadoopRDD(
        conf,
        GeoWaveInputFormat.class,
        GeoWaveInputKey.class,
        SimpleFeature.class);

    final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd =
        JavaPairRDD.fromJavaRDD(rdd.toJavaRDD());

    return javaRdd;
}
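The fallback branch above reads spark.default.parallelism and, when it is set, uses it as both the minimum and maximum split count. A small sketch of how a caller could take advantage of that; the application name and the value 48 are arbitrary illustrations.

SparkConf sparkConf = new SparkConf()
        .setAppName("geowave-rdd-load")              // placeholder name
        .set("spark.default.parallelism", "48");     // picked up by loadRawRDD when no explicit splits are given
SparkContext sc = new SparkContext(sparkConf);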
Example 4
Source File: GeoWaveRDDLoader.java From geowave with Apache License 2.0
public static JavaPairRDD<GeoWaveInputKey, GridCoverage> loadRawRasterRDD(
        final SparkContext sc,
        final DataStorePluginOptions storeOptions,
        final String indexName,
        final Integer minSplits,
        final Integer maxSplits) throws IOException {
    if (sc == null) {
        LOGGER.error("Must supply a valid Spark Context. Please set SparkContext and try again.");
        return null;
    }
    if (storeOptions == null) {
        LOGGER.error("Must supply input store to load. Please set storeOptions and try again.");
        return null;
    }

    final Configuration conf = new Configuration(sc.hadoopConfiguration());

    GeoWaveInputFormat.setStoreOptions(conf, storeOptions);

    if (indexName != null) {
        GeoWaveInputFormat.setQuery(
            conf,
            QueryBuilder.newBuilder().indexName(indexName).build(),
            storeOptions.createAdapterStore(),
            storeOptions.createInternalAdapterStore(),
            storeOptions.createIndexStore());
    }

    if (((minSplits != null) && (minSplits > -1)) || ((maxSplits != null) && (maxSplits > -1))) {
        GeoWaveInputFormat.setMinimumSplitCount(conf, minSplits);
        GeoWaveInputFormat.setMaximumSplitCount(conf, maxSplits);
    } else {
        final int defaultSplitsSpark = sc.getConf().getInt("spark.default.parallelism", -1);
        // Attempt to grab default partition count for spark and split data
        // along that.
        // Otherwise just fallback to default according to index strategy
        if (defaultSplitsSpark != -1) {
            GeoWaveInputFormat.setMinimumSplitCount(conf, defaultSplitsSpark);
            GeoWaveInputFormat.setMaximumSplitCount(conf, defaultSplitsSpark);
        }
    }

    final RDD<Tuple2<GeoWaveInputKey, GridCoverage>> rdd = sc.newAPIHadoopRDD(
        conf,
        GeoWaveInputFormat.class,
        GeoWaveInputKey.class,
        GridCoverage.class);

    final JavaPairRDD<GeoWaveInputKey, GridCoverage> javaRdd =
        JavaPairRDD.fromJavaRDD(rdd.toJavaRDD());

    return javaRdd;
}
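A minimal caller-side sketch of the raster loader above. The storeOptions variable and the index name "spatial-idx" are placeholders for an existing GeoWave store configuration; passing null for both split counts falls through to the spark.default.parallelism branch shown in the method.

// Hypothetical usage; 'sc' and 'storeOptions' are assumed to be set up elsewhere.
static void countCoverages(SparkContext sc, DataStorePluginOptions storeOptions) throws IOException {
    JavaPairRDD<GeoWaveInputKey, GridCoverage> coverages =
            GeoWaveRDDLoader.loadRawRasterRDD(sc, storeOptions, "spatial-idx", null, null);
    System.out.println("grid coverages: " + coverages.count());
}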