org.apache.log4j.LogManager Scala Examples
The following examples show how to use org.apache.log4j.LogManager.
Follow the links above each example to go to the original project or source file.
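Before diving into the examples, note the small pattern they all share: obtain a Logger from LogManager (the root logger, a named logger, or a class-based logger) and, optionally, set its level. The following minimal sketch illustrates that pattern with the log4j 1.x API; the object name, logger name "examples", and the messages are arbitrary placeholders, not taken from any of the projects below.

import org.apache.log4j.{Level, LogManager, Logger}

object LogManagerBasics {
  def main(args: Array[String]): Unit = {
    // Root logger, as used by several of the Spark examples below
    val rootLog = LogManager.getRootLogger
    rootLog.setLevel(Level.WARN)

    // Named logger ("examples" is a placeholder name)
    val namedLog: Logger = LogManager.getLogger("examples")

    // Class-based logger, as used by the CM-Well command-line tools below
    val classLog = LogManager.getLogger(LogManagerBasics.getClass)

    rootLog.warn("root logger at WARN")
    namedLog.info("named logger message")
    classLog.warn("class-based logger message")
  }
}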
Example 1
Source File: Parquet.scala From scylla-migrator with Apache License 2.0 | 5 votes |
package com.scylladb.migrator.readers

import com.scylladb.migrator.config.SourceSettings
import org.apache.log4j.LogManager
import org.apache.spark.sql.{ DataFrame, SparkSession }

object Parquet {
  val log = LogManager.getLogger("com.scylladb.migrator.readers.Parquet")

  def readDataFrame(spark: SparkSession, source: SourceSettings.Parquet): SourceDataFrame = {
    source.credentials.foreach { credentials =>
      log.info("Loaded AWS credentials from config file")
      spark.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", credentials.accessKey)
      spark.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", credentials.secretKey)
    }
    SourceDataFrame(spark.read.parquet(source.path), None, false)
  }
}
Example 2
Source File: CheckInfotonDataIntegrity.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main

import cmwell.analytics.data.InfotonDataIntegrity
import cmwell.analytics.util.{CmwellConnector, DatasetFilter}
import cmwell.analytics.util.TimestampConversion.timestampConverter
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object CheckInfotonDataIntegrity {

  def main(args: Array[String]): Unit = {
    val logger = LogManager.getLogger(CheckInfotonDataIntegrity.getClass)

    try {
      object Opts extends ScallopConf(args) {
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Check infoton data integrity"
      ).withSparkSessionDo { spark =>
        val datasetFilter = DatasetFilter(
          lastModifiedGte = Opts.lastModifiedGteFilter.toOption,
          pathPrefix = Opts.pathPrefixFilter.toOption)

        val ds = InfotonDataIntegrity(Some(datasetFilter))(spark)

        val damagedInfotons = ds.filter(infoton =>
          infoton.hasIncorrectUuid ||
            infoton.hasDuplicatedSystemFields ||
            infoton.hasInvalidContent ||
            infoton.hasMissingOrIllFormedSystemFields
        )

        damagedInfotons.select("uuid", "lastModified", "path",
          "hasIncorrectUuid", "hasMissingOrIllFormedSystemFields", "hasDuplicatedSystemFields", "hasInvalidContent", "hasUnknownSystemField")
          .write.csv(Opts.out())
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
}
Example 3
Source File: DumpCompleteDocumentFromEs.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpCompleteDocumentFromEs {

  def main(args: Array[String]): Unit = {
    val logger = LogManager.getLogger(DumpCompleteDocumentFromEs.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("dump-complete-document-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      object Opts extends ScallopConf(args) {
        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-filter", short = 'c', descr = "Filter on current status", default = None)
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.
      val objectExtractor = IndexWithCompleteDocument
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,
        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
}
Example 4
Source File: CopyIndex.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object CopyIndex {

  def main(args: Array[String]): Unit = {
    val logger = LogManager.getLogger(CopyIndex.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("copy-index")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      object Opts extends ScallopConf(args) {
        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from", required = true)
        val writeIndex: ScallopOption[String] = opt[String]("write-index", short = 'w', descr = "The name of the index to write to", required = true)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      val dataWriterFactory = DataWriterFactory.index[IndexWithCompleteDocument](
        indexName = Opts.writeIndex(),
        esEndpoint = esContactPoint)

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithCompleteDocument,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
}
Example 5
Source File: DumpKeyFieldsFromEs.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithKeyFields}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpKeyFieldsFromEs {

  def main(args: Array[String]): Unit = {
    val logger = LogManager.getLogger(DumpKeyFieldsFromEs.getClass)

    implicit val system: ActorSystem = ActorSystem("dump-key-fields-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
      // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
      // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
      val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

      object Opts extends ScallopConf(args) {
        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default = Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.
      val objectExtractor = IndexWithKeyFields
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,
        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
}
Example 6
Source File: DumpUuidOnlyFromEs.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithUuidOnly}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpUuidOnlyFromEs {

  def main(args: Array[String]): Unit = {
    val logger = LogManager.getLogger(DumpUuidOnlyFromEs.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("dump-uuid-only-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      object Opts extends ScallopConf(args) {
        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default = Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.
      val objectExtractor = IndexWithUuidOnly
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,
        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
}
Example 7
Source File: DumpSystemFieldsFromEs.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithSystemFields}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpSystemFieldsFromEs {

  def main(args: Array[String]): Unit = {
    val logger = LogManager.getLogger(DumpSystemFieldsFromEs.getClass)

    implicit val system: ActorSystem = ActorSystem("dump-system-fields-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
      // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
      // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
      val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

      object Opts extends ScallopConf(args) {
        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default = Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.
      val objectExtractor = IndexWithSystemFields
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,
        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
}
Example 8
Source File: CopyIndexesWithMapping.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.collection.JavaConverters._
import scala.concurrent.ExecutionContextExecutor

object CopyIndexesWithMapping {

  def main(args: Array[String]): Unit = {
    val logger = LogManager.getLogger(CopyIndexesWithMapping.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("copy-index-with-mapping")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      object Opts extends ScallopConf(args) {
        val indexMap: ScallopOption[String] = opt[String]("index-map", short = 'i', descr = "A map from source to target index names, in JSON format", required = true)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())

      // Expect a map in the form: { "sourceIndex1": "targetIndex1", "sourceIndex2": "targetIndex2", ... }
      val indexMap: Map[String, String] = new ObjectMapper().readTree(Opts.indexMap()).fields.asScala.map { entry =>
        entry.getKey -> entry.getValue.asText
      }.toMap

      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexMap.keys.toSeq)

      // Validate that the index-map parameter specified valid index names, and not aliases.
      for (indexName <- indexMap.keys)
        if (!esTopology.allIndexNames.contains(indexName))
          throw new RuntimeException(s"index-map parameter included $indexName as a source, which is not a valid index name.")

      for (indexName <- indexMap.values)
        if (!esTopology.allIndexNames.contains(indexName))
          throw new RuntimeException(s"index-map parameter included $indexName as a target, which is not a valid index name.")

      val dataWriterFactory = DataWriterFactory.index[IndexWithCompleteDocument](
        indexMap = indexMap,
        esEndpoint = esContactPoint)

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithCompleteDocument,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
}
Example 9
Source File: CalculateXORSummary.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{IndexWithSourceHash, XORSummary, XORSummaryFactory}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.codec.binary.Hex
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object CalculateXORSummary {

  def main(args: Array[String]): Unit = {
    val logger = LogManager.getLogger(CalculateXORSummary.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("xor-summary")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      object Opts extends ScallopConf(args) {
        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from", required = false)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      val dataWriterFactory = new XORSummaryFactory()

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithSourceHash,
        dataWriterFactory = dataWriterFactory.apply,
        sourceFilter = false)

      // Summarize the summaries down to the index level.
      val summaryByIndex: Map[String, XORSummary] = dataWriterFactory.shardSummaries
        .groupBy { case (shard, _) => shard.indexName }
        .map { case (indexName, summaryMap) => indexName -> summaryMap.values.reduce(XORSummary.combine) }

      // TODO: Fix questionable JSON generation
      val r = "{" + summaryByIndex.map { case (index, summary) =>
        val x = Hex.encodeHexString(summary.summary)
        s""" { "index": "$index", "summary": "$x" } """
      }.mkString("\n") + "}"

      println(r)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
}
Example 10
Source File: IndexWithKeyFields.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.data

import com.fasterxml.jackson.databind.JsonNode
import com.typesafe.config.ConfigFactory
import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder}
import org.apache.avro.generic.GenericRecord
import org.apache.log4j.LogManager
import org.joda.time.format.ISODateTimeFormat

import scala.util.control.NonFatal

case class IndexWithKeyFields(uuid: String,
                              lastModified: java.sql.Timestamp,
                              path: String) extends GenericRecord with CsvGenerator {

  override def put(key: String, v: scala.Any): Unit = ???

  override def get(key: String): AnyRef = key match {
    case "uuid" => uuid
    case "lastModified" => java.lang.Long.valueOf(lastModified.getTime)
    case "path" => path
  }

  override def put(i: Int, v: scala.Any): Unit = ???

  override def get(i: Int): AnyRef = i match {
    case 0 => uuid
    case 1 => java.lang.Long.valueOf(lastModified.getTime)
    case 2 => path
    case _ => throw new IllegalArgumentException
  }

  override def getSchema: Schema = IndexWithSystemFields.schema

  override def csv: String =
    (if (uuid == null) "" else uuid) + "," +
      (if (lastModified == null) "" else ISODateTimeFormat.dateTime.print(lastModified.getTime)) + "," +
      (if (path == null) "" else path)
}

object IndexWithKeyFields extends ObjectExtractor[IndexWithKeyFields] {

  private val logger = LogManager.getLogger(IndexWithSystemFields.getClass)

  // AVRO-2065 - doesn't allow union over logical type, so we can't make timestamp column nullable.
  val timestampMilliType: Schema = LogicalTypes.timestampMillis.addToSchema(Schema.create(Schema.Type.LONG))

  val schema: Schema = SchemaBuilder
    .record("IndexWithSystemFields").namespace("cmwell.analytics")
    .fields
    .name("uuid").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .name("lastModified").`type`(timestampMilliType).noDefault
    .name("path").`type`.unionOf.stringType.and.nullType.endUnion.noDefault
    .endRecord

  private val config = ConfigFactory.load
  val infotonSize: Int = config.getInt("extract-index-from-es.fetch-size-index-with-uuid-lastModified-path")

  def includeFields: String = {
    // Note that 'quad' is not included in this list
    val fields = "uuid,lastModified,path"
      .split(",")
      .map(name => s""""system.$name"""")
      .mkString(",")

    s""""_source": [$fields]"""
  }

  def extractFromJson(hit: JsonNode): IndexWithKeyFields = {

    val system = hit.findValue("_source").findValue("system")

    def extractString(name: String): String = system.findValue(name) match {
      case x: JsonNode => x.asText
      case _ => null
    }

    // Extracting date values as Long - as a java.sql.Date might be better
    def extractDate(name: String): java.sql.Timestamp = system.findValue(name) match {
      case x: JsonNode =>
        try {
          new java.sql.Timestamp(ISODateTimeFormat.dateTime.parseDateTime(x.asText).getMillis)
        } catch {
          case NonFatal(ex) =>
            logger.warn(s"Failed conversion of date value: $x", ex)
            throw ex
        }
      case _ => null
    }

    IndexWithKeyFields(
      uuid = extractString("uuid"),
      lastModified = extractDate("lastModified"),
      path = extractString("path"))
  }
}
Example 11
Source File: Main.scala From example-spark-scala-read-and-write-from-hdfs with Apache License 2.0 | 5 votes |
package io.saagie.example.hdfs

import org.apache.log4j.LogManager
import org.apache.spark.sql.{SaveMode, SparkSession}

object Main {

  case class HelloWorld(message: String)

  def main(args: Array[String]): Unit = {

    val log = LogManager.getRootLogger

    // Creation of Spark Session
    val sparkSession = SparkSession.builder().appName("example-spark-scala-read-and-write-from-hdfs").getOrCreate()

    import sparkSession.implicits._
    val hdfs_master = args(0)

    // ====== Creating a dataframe with 1 partition
    val df = Seq(HelloWorld("helloworld")).toDF().coalesce(1)

    // ======= Writing files
    // Writing file as parquet
    df.write.mode(SaveMode.Overwrite).parquet(hdfs_master + "user/hdfs/wiki/testwiki")
    // Writing file as csv
    df.write.mode(SaveMode.Overwrite).csv(hdfs_master + "user/hdfs/wiki/testwiki.csv")

    // ======= Reading files
    // Reading parquet files
    val df_parquet = sparkSession.read.parquet(hdfs_master + "user/hdfs/wiki/testwiki")
    log.info(df_parquet.show())
    // Reading csv files
    val df_csv = sparkSession.read.option("inferSchema", "true").csv(hdfs_master + "user/hdfs/wiki/testwiki.csv")
    log.info(df_csv.show())
  }
}
Example 12
Source File: RequestSetup.scala From mist with Apache License 2.0 | 5 votes |
package io.hydrosphere.mist.worker

import io.hydrosphere.mist.core.CommonData.RunJobRequest
import io.hydrosphere.mist.worker.logging.{LogsWriter, RemoteAppender, RemoteLogsWriter}
import org.apache.log4j.{LogManager, Logger}

object RequestSetup {

  type ReqSetup = RunJobRequest => RunJobRequest => Unit

  val NOOP: ReqSetup = (_: RunJobRequest) => (_: RunJobRequest) => ()

  def loggingSetup(logger: Logger, writer: LogsWriter): ReqSetup = {
    (req: RunJobRequest) => {
      val app = RemoteAppender.create(req.id, writer)
      logger.addAppender(app)
      (req: RunJobRequest) => logger.removeAppender(req.id)
    }
  }

  def loggingSetup(writer: LogsWriter): ReqSetup = loggingSetup(LogManager.getRootLogger, writer)
}
Example 13
Source File: RequestSetupSpec.scala From mist with Apache License 2.0 | 5 votes |
package io.hydrosphere.mist.worker

import io.hydrosphere.mist.core.CommonData.{Action, JobParams, RunJobRequest}
import io.hydrosphere.mist.core.logging.LogEvent
import io.hydrosphere.mist.worker.logging.LogsWriter
import mist.api.data.JsMap
import org.apache.log4j.LogManager
import org.scalatest.{FunSpec, Matchers}

class RequestSetupSpec extends FunSpec with Matchers {

  it("should add/remove appender") {
    val logger = LogManager.getLogger("test")
    val setup = RequestSetup.loggingSetup(logger, new LogsWriter {
      override def close(): Unit = ()
      override def write(e: LogEvent): Unit = ()
    })
    val req = RunJobRequest("id", JobParams("path", "MyClass", JsMap.empty, action = Action.Execute))
    val cleanUp = setup(req)
    logger.getAppender("id") should not be null
    cleanUp(req)
    logger.getAppender("id") shouldBe null
  }
}
Example 14
Source File: Writer.scala From scylla-migrator with Apache License 2.0 | 5 votes |
package com.scylladb.migrator.writer

import com.datastax.spark.connector.writer._
import com.datastax.spark.connector._
import com.scylladb.migrator.Connectors
import com.scylladb.migrator.config.{ CopyType, Rename, TargetSettings }
import org.apache.log4j.LogManager
import org.apache.spark.sql.{ DataFrame, SparkSession }

object Writer {
  case class TimestampColumns(ttl: String, writeTime: String)

  val log = LogManager.getLogger("com.scylladb.migrator.writer")

  def writeDataframe(
    target: TargetSettings,
    renames: List[Rename],
    df: DataFrame,
    timestampColumns: Option[TimestampColumns],
    tokenRangeAccumulator: Option[TokenRangeAccumulator])(implicit spark: SparkSession): Unit = {
    val connector = Connectors.targetConnector(spark.sparkContext.getConf, target)
    val writeConf = WriteConf
      .fromSparkConf(spark.sparkContext.getConf)
      .copy(
        ttl = timestampColumns.map(_.ttl).fold(TTLOption.defaultValue)(TTLOption.perRow),
        timestamp = timestampColumns
          .map(_.writeTime)
          .fold(TimestampOption.defaultValue)(TimestampOption.perRow)
      )

    // Similarly to createDataFrame, when using withColumnRenamed, Spark tries
    // to re-encode the dataset. Instead we just use the modified schema from this
    // DataFrame; the access to the rows is positional anyway and the field names
    // are only used to construct the columns part of the INSERT statement.
    val renamedSchema = renames
      .foldLeft(df) {
        case (acc, Rename(from, to)) => acc.withColumnRenamed(from, to)
      }
      .schema

    log.info("Schema after renames:")
    log.info(renamedSchema.treeString)

    val columnSelector = timestampColumns match {
      case None => SomeColumns(renamedSchema.fields.map(_.name: ColumnRef): _*)
      case Some(TimestampColumns(ttl, writeTime)) =>
        SomeColumns(
          renamedSchema.fields
            .map(x => x.name: ColumnRef)
            .filterNot(ref => ref.columnName == ttl || ref.columnName == writeTime): _*)
    }

    df.rdd.saveToCassandra(
      target.keyspace,
      target.table,
      columnSelector,
      writeConf,
      tokenRangeAccumulator = tokenRangeAccumulator
    )(connector, SqlRowWriter.Factory)
  }
}
Example 15
Source File: DumpInfotonWithKeyFields.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main

import cmwell.analytics.data.InfotonWithKeyFields
import cmwell.analytics.util.{CmwellConnector, DatasetFilter}
import cmwell.analytics.util.DatasetFilter._
import cmwell.analytics.util.TimestampConversion.timestampConverter
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object DumpInfotonWithKeyFields {

  def main(args: Array[String]): Unit = {
    val logger = LogManager.getLogger(DumpInfotonWithKeyFields.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {
      object Opts extends ScallopConf(args) {
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to ", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump infoton table - uuid, lastModified, path",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>
        val datasetFilter = DatasetFilter(
          lastModifiedGte = Opts.lastModifiedGteFilter.toOption,
          pathPrefix = Opts.pathPrefixFilter.toOption)

        val ds = InfotonWithKeyFields(Some(datasetFilter))(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
}
Example 16
Source File: SparkSQLDriver.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.{Arrays, ArrayList => JArrayList, List => JList}

import org.apache.log4j.LogManager
import org.apache.spark.sql.AnalysisException

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.Logging
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] class SparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }
      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
      case ae: AnalysisException =>
        logDebug(s"Failed in [$command]", ae)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
      case cause: Throwable =>
        logError(s"Failed in [$command]", cause)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}
Example 17
Source File: IntervalTreeJoinOptimImpl.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.IntervalTree

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.bdgenomics.utils.instrumentation.Metrics
import org.bdgenomics.utils.instrumentation.{MetricsListener, RecordedMetrics}
import org.apache.spark.rdd.MetricsContext._
import org.biodatageeks.sequila.rangejoins.common.performance.timers.IntervalTreeTimer._
import scala.collection.JavaConversions._
import htsjdk.samtools.util.IntervalTree
import org.apache.log4j.{LogManager, Logger}
import org.biodatageeks.sequila.rangejoins.methods.IntervalTree.IntervalTreeHTS
import org.biodatageeks.sequila.rangejoins.optimizer.{JoinOptimizer, RangeJoinMethod}

object IntervalTreeJoinOptimImpl extends Serializable {

  val logger = Logger.getLogger(this.getClass.getCanonicalName)

  // NOTE: the body below is an excerpt; the source page truncated the enclosing method, so
  // localIntervals, sc, rdd2 and intervalsWithId are defined earlier in the original file.

  // Build an interval tree from the local intervals and broadcast it to the executors.
  val intervalTree = IntervalTreeHTSBuild.time {
    val tree = new IntervalTreeHTS[Long]()
    localIntervals.foreach(r => tree.put(r._1._1, r._1._2, r._2))
    sc.broadcast(tree)
  }

  // Look up overlapping intervals for each row of the second relation.
  val kvrdd2: RDD[(Long, Iterable[InternalRow])] = rdd2
    .instrument()
    .mapPartitions(p => {
      p.map(r => {
        IntervalTreeHTSLookup.time {
          val record = intervalTree.value.overlappers(r.start, r.end)
          record.flatMap(k => (k.getValue.map(s => (s, Iterable(r.row)))))
        }
      })
    })
    .flatMap(r => r)
    .reduceByKey((a, b) => a ++ b)

  // Join the interval ids back to the first relation and emit the matching row pairs.
  intervalsWithId
    .map(_.swap)
    .join(kvrdd2)
    .flatMap(l => l._2._2.map(r => (l._2._1.row, r)))
}
Example 18
Source File: myCustomLogwithClosure.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter14.Serilazition

import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession

object myCustomLogwithClosure extends Serializable {
  def main(args: Array[String]): Unit = {
    val log = LogManager.getRootLogger

    // Everything is printed as INFO once the log level is set to INFO, until you set the level to a new level, for example WARN.
    log.setLevel(Level.INFO)
    log.info("Let's get started!")

    // Setting logger level as WARN: after that nothing prints other than WARN
    log.setLevel(Level.WARN)

    // Creating Spark Session
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Logging")
      .getOrCreate()

    // These will not be printed!
    log.info("Get prepared!")
    log.trace("Show if there is any ERROR!")

    // Started the computation and printing the logging information
    log.warn("Started")
    val data = spark.sparkContext.parallelize(0 to 100000)
    data.foreach(i => log.info("My number" + i))
    data.collect()
    log.warn("Finished")
  }
}
Example 19
Source File: MakingTaskSerilazible.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter16.SparkTesting

import org.apache.spark.sql.SparkSession
import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.log4j.Logger

class MultiplicaitonOfTwoNumber {
  def multiply(a: Int, b: Int): Int = {
    val product = a * b
    product
  }
}

object MakingTaskSerilazible {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val myRDD = spark.sparkContext.parallelize(0 to 100)
    myRDD.foreachPartition(s => {
      val notSerializable = new MultiplicaitonOfTwoNumber
      println(notSerializable.multiply(s.next(), s.next()))
    })
  }
}
Example 20
Source File: myCustomLog.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter16.SparkTesting

import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.spark.sql.SparkSession

object myCustomLogwithoutSerializable {
  def main(args: Array[String]): Unit = {
    val log = LogManager.getRootLogger

    // Everything is printed as INFO once the log level is set to INFO, until you set the level to a new level, for example WARN.
    log.setLevel(Level.INFO)
    log.info("Let's get started!")

    // Setting logger level as WARN: after that nothing prints other than WARN
    log.setLevel(Level.WARN)

    // Creating Spark Session
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Logging")
      .getOrCreate()

    // These will not be printed!
    log.info("Get prepared!")
    log.trace("Show if there is any ERROR!")

    // Started the computation and printing the logging information
    log.warn("Started")
    spark.sparkContext.parallelize(1 to 5).foreach(println)
    log.warn("Finished")
  }
}
Example 21
Source File: myCustomLogwithClosureSerializable.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter16.SparkTesting

import org.apache.log4j.{ Level, LogManager }
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

class MyMapper(n: Int) extends Serializable {
  @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger")

  def logMapper(rdd: RDD[Int]): RDD[String] =
    rdd.map { i =>
      log.warn("mapping: " + i)
      (i + n).toString
    }
}

// Companion object
object MyMapper {
  def apply(n: Int): MyMapper = new MyMapper(n)
}

// Main object
object myCustomLogwithClosureSerializable {
  def main(args: Array[String]) {
    val log = LogManager.getRootLogger
    log.setLevel(Level.WARN)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Testing")
      .getOrCreate()

    log.warn("Started")
    val data = spark.sparkContext.parallelize(1 to 100000)
    val mapper = MyMapper(1)
    val other = mapper.logMapper(data)
    other.collect()
    log.warn("Finished")
  }
}
Example 22
Source File: KyroRegistrationDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter14.Serilazition

import org.apache.log4j.{ Level, LogManager, PropertyConfigurator }
import org.apache.spark._
import org.apache.spark.rdd.RDD

class MyMapper2(n: Int) {
  @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger")

  def MyMapperDosomething(rdd: RDD[Int]): RDD[String] =
    rdd.map { i =>
      log.warn("mapping: " + i)
      (i + n).toString
    }
}

// Companion object
object MyMapper2 {
  def apply(n: Int): MyMapper2 = new MyMapper2(n)
}

// Main object
object KyroRegistrationDemo {
  def main(args: Array[String]) {
    val log = LogManager.getRootLogger
    log.setLevel(Level.WARN)

    val conf = new SparkConf()
      .setAppName("My App")
      .setMaster("local[*]")
    conf.registerKryoClasses(Array(classOf[MyMapper2]))
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    val sc = new SparkContext(conf)

    log.warn("Started")
    val data = sc.parallelize(1 to 100000)
    val mapper = MyMapper2(10)
    val other = mapper.MyMapperDosomething(data)
    other.collect()
    log.warn("Finished")
  }
}
Example 23
Source File: MyLog.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter14.Serilazition

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.LogManager
import org.apache.log4j.Level
import org.apache.log4j.Logger

object MyLog1 extends Serializable {
  def main(args: Array[String]): Unit = {
    // Setting logger level as WARN
    val log = LogManager.getRootLogger
    log.setLevel(Level.WARN)
    @transient lazy val log2 = org.apache.log4j.LogManager.getLogger("myLogger")

    // Creating Spark Context
    val conf = new SparkConf().setAppName("My App").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Started the computation and printing the logging information
    //log.warn("Started")
    //val i = 0
    val data = sc.parallelize(0 to 100000)
    data.foreach(i => log.info("My number" + i))
    data.collect()
    log.warn("Finished")
  }
}
Example 24
Source File: MyLogCompleteDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter14.Serilazition

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}
import org.apache.spark._
import org.apache.spark.rdd.RDD

class MyMapper(n: Int) extends Serializable {
  @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger")

  def MyMapperDosomething(rdd: RDD[Int]): RDD[String] =
    rdd.map { i =>
      log.warn("mapping: " + i)
      (i + n).toString
    }
}

// Companion object
object MyMapper {
  def apply(n: Int): MyMapper = new MyMapper(n)
}

// Main object
object MyLog {
  def main(args: Array[String]) {
    val log = LogManager.getRootLogger
    log.setLevel(Level.WARN)

    val conf = new SparkConf()
      .setAppName("My App")
      .setMaster("local[*]")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)

    log.warn("Started")
    val data = sc.parallelize(1 to 100000)
    val mapper = MyMapper(1)
    val other = mapper.MyMapperDosomething(data)
    other.collect()
    log.warn("Finished")
  }
}
Example 25
Source File: SparkPredictionTrainer.scala From smart-meter with MIT License | 5 votes |
package com.logimethods.nats.connector.spark.app

import java.util.Properties
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._

import io.nats.client.ConnectionFactory._
import java.nio.ByteBuffer

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}

import com.logimethods.connector.nats.to_spark._
import com.logimethods.scala.connector.spark.to_nats._

import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel

import java.util.function._
import java.time.{LocalDateTime, ZoneOffset}
import java.time.DayOfWeek._

object SparkPredictionTrainer extends App with SparkPredictionProcessor {
  log.setLevel(Level.WARN)

  val (properties, targets, logLevel, sc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl) = setup(args)

  val streamingDuration = scala.util.Properties.envOrElse("STREAMING_DURATION", "2000").toInt
  println("STREAMING_DURATION = " + streamingDuration)

  new Thread(new Runnable {
    def run() {
      while (true) {
        try {
          val data = SparkPredictionProcessor.getData(sc, THRESHOLD)
          val model = trainer.fit(data)
          model.write.overwrite.save(PREDICTION_MODEL_PATH)
          println("New model of size " + data.count() + " trained: " + model.uid)
          Thread.sleep(streamingDuration)
        } catch {
          case e: Throwable => log.error(e)
        }
      }
    }
  }).start()
}
Example 26
Source File: SparkProcessor.scala From smart-meter with MIT License | 5 votes |
package com.logimethods.nats.connector.spark.app

import java.util.Properties
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._

import io.nats.client.Nats._
import io.nats.client.ConnectionFactory._
import java.nio.ByteBuffer

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}

import com.logimethods.connector.nats.to_spark._
import com.logimethods.scala.connector.spark.to_nats._

import java.util.function._
import java.time.{LocalDateTime, ZoneOffset}

trait SparkProcessor {
  def setup(args: Array[String]) = {
    val inputSubject = args(0)
    // val inputNatsStreaming = inputSubject.toUpperCase.contains("STREAMING")
    val outputSubject = args(1)
    // val outputNatsStreaming = outputSubject.toUpperCase.contains("STREAMING")
    println("Will process messages from '" + inputSubject + "' to '" + outputSubject + "'")

    val logLevel = scala.util.Properties.envOrElse("LOG_LEVEL", "INFO")
    println("LOG_LEVEL = " + logLevel)

    val targets = scala.util.Properties.envOrElse("TARGETS", "ALL")
    println("TARGETS = " + targets)

    val cassandraUrl = System.getenv("CASSANDRA_URL")
    println("CASSANDRA_URL = " + cassandraUrl)

    val sparkMasterUrl = System.getenv("SPARK_MASTER_URL")
    println("SPARK_MASTER_URL = " + sparkMasterUrl)

    val sparkCoresMax = System.getenv("SPARK_CORES_MAX")
    println("SPARK_CORES_MAX = " + sparkCoresMax)

    val conf = new SparkConf()
      .setAppName(args(2))
      .setMaster(sparkMasterUrl)
      .set("spark.cores.max", sparkCoresMax)
      .set("spark.cassandra.connection.host", cassandraUrl)
    val sc = new SparkContext(conf)

    // val streamingDuration = scala.util.Properties.envOrElse("STREAMING_DURATION", "2000").toInt
    // val ssc = new StreamingContext(sc, new Duration(streamingDuration));
    /// ssc.checkpoint("/spark/storage")

    val properties = new Properties()
    val natsUrl = System.getenv("NATS_URI")
    println("NATS_URI = " + natsUrl)
    properties.put("servers", natsUrl)
    properties.put(PROP_URL, natsUrl)

    val clusterId = System.getenv("NATS_CLUSTER_ID")

    val inputNatsStreaming = inputSubject.toUpperCase.contains("STREAMING")
    val outputNatsStreaming = outputSubject.toUpperCase.contains("STREAMING")

    (properties, targets, logLevel, sc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl)
  }

  def dataDecoder: Array[Byte] => Tuple2[Long, Float] = bytes => {
    val buffer = ByteBuffer.wrap(bytes)
    val epoch = buffer.getLong()
    val value = buffer.getFloat()
    (epoch, value)
  }
}

trait SparkStreamingProcessor extends SparkProcessor {
  def setupStreaming(args: Array[String]) = {
    val (properties, target, logLevel, sc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl) = setup(args)

    val streamingDuration = scala.util.Properties.envOrElse("STREAMING_DURATION", "2000").toInt
    println("STREAMING_DURATION = " + streamingDuration)

    val ssc = new StreamingContext(sc, new Duration(streamingDuration))
    // ssc.checkpoint("/spark/storage")

    (properties, target, logLevel, sc, ssc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl, streamingDuration)
  }
}
Example 27
Source File: SparkTemperatureProcessor.scala From smart-meter with MIT License | 5 votes |
package com.logimethods.nats.connector.spark.app

import java.util.Properties
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming._

import com.datastax.spark.connector.streaming._
import com.datastax.spark.connector.SomeColumns

import io.nats.client.ConnectionFactory._
import java.nio.ByteBuffer

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}

import com.logimethods.connector.nats.to_spark._
import com.logimethods.scala.connector.spark.to_nats._

import java.util.function._
import java.time.{LocalDateTime, ZoneOffset}

object SparkTemperatureProcessor extends App with SparkStreamingProcessor {
  val log = LogManager.getRootLogger
  log.setLevel(Level.WARN)

  val (properties, target, logLevel, sc, ssc, inputNatsStreaming, inputSubject, outputSubject, clusterId, outputNatsStreaming, natsUrl, streamingDuration) =
    setupStreaming(args)

  // Temperatures //

  val temperatures =
    if (inputNatsStreaming) {
      NatsToSparkConnector
        .receiveFromNatsStreaming(classOf[Tuple2[Long, Float]], StorageLevel.MEMORY_ONLY, clusterId)
        .withNatsURL(natsUrl)
        .withSubjects(inputSubject)
        .withDataDecoder(dataDecoder)
        .asStreamOf(ssc)
    } else {
      NatsToSparkConnector
        .receiveFromNats(classOf[Tuple2[Long, Float]], StorageLevel.MEMORY_ONLY)
        .withProperties(properties)
        .withSubjects(inputSubject)
        .withDataDecoder(dataDecoder)
        .asStreamOf(ssc)
    }

  // Ideally, should be the AVG
  val singleTemperature = temperatures.reduceByKey(Math.max(_, _))

  if (logLevel.contains("TEMPERATURE")) {
    singleTemperature.print()
  }

  singleTemperature.saveToCassandra("smartmeter", "temperature")

  val temperatureReport = singleTemperature.map { case (epoch, temperature) =>
    (s"""{"epoch": $epoch, "temperature": $temperature}""")
  }
  SparkToNatsConnectorPool.newPool()
    .withProperties(properties)
    .withSubjects(outputSubject) // "smartmeter.extract.temperature"
    .publishToNats(temperatureReport)

  // Start //

  ssc.start()
  ssc.awaitTermination()
}
Example 28
Source File: SparkBatch.scala From smart-meter with MIT License | 5 votes |
package com.logimethods.nats.connector.spark.app

import java.util.Properties
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}
import org.apache.log4j.Logger

import org.apache.spark.sql.SparkSession

//import com.datastax.spark.connector._
//import com.datastax.spark.connector.cql.CassandraConnector

// @see http://stackoverflow.com/questions/39423131/how-to-use-cassandra-context-in-spark-2-0
// @see https://databricks.com/blog/2016/08/15/how-to-use-sparksession-in-apache-spark-2-0.html
// @see https://dzone.com/articles/cassandra-with-spark-20-building-rest-api
object SparkBatch extends App {
  val logLevel = System.getenv("APP_BATCH_LOG_LEVEL")
  println("APP_BATCH_LOG_LEVEL = " + logLevel)
  if ("DEBUG" != logLevel) {
    Logger.getLogger("org").setLevel(Level.OFF)
  }

  val cassandraUrl = System.getenv("CASSANDRA_URL")
  println("CASSANDRA_URL = " + cassandraUrl)

  val sparkMasterUrl = System.getenv("SPARK_MASTER_URL")
  println("SPARK_MASTER_URL = " + sparkMasterUrl)

  val spark = SparkSession
    .builder()
    .master(sparkMasterUrl)
    .appName("Smartmeter Batch")
    .config("spark.cassandra.connection.host", cassandraUrl)
    // .config("spark.sql.warehouse.dir", warehouseLocation)
    //.enableHiveSupport()
    .getOrCreate()

  spark
    .read
    .format("org.apache.spark.sql.cassandra")
    .options(Map("keyspace" -> "smartmeter", "table" -> "raw_data"))
    .load
    .createOrReplaceTempView("raw_data")

  val rawVoltageData = spark.sql("select * from raw_data")
  rawVoltageData.show(10)

  // @see http://stackoverflow.com/questions/40324153/what-is-the-best-way-to-insert-update-rows-in-cassandra-table-via-java-spark
  // Save data to Cassandra
  import org.apache.spark.sql.SaveMode
  // avgByTransformer is computed from rawVoltageData in the full source file; the aggregation step is not shown in this excerpt.
  avgByTransformer.write.format("org.apache.spark.sql.cassandra").options(Map("keyspace" -> "smartmeter", "table" -> "avg_voltage_by_transformer")).mode(SaveMode.Overwrite).save()
}
Example 29
Source File: BigQueryPartitionUtils.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery.utils import com.google.api.client.googleapis.json.GoogleJsonResponseException import com.google.api.services.bigquery.Bigquery import com.google.api.services.bigquery.model.{Table, TableReference, TableSchema, TimePartitioning} import com.google.cloud.hadoop.io.bigquery.BigQueryStrings import org.apache.log4j.LogManager import scala.util.control.NonFatal class BigQueryPartitionUtils(bqService: Bigquery) { private val logger = LogManager.getRootLogger() val DEFAULT_TABLE_EXPIRATION_MS = 259200000L def createBigQueryPartitionedTable(targetTable: TableReference, timePartitionExpiration: Long = 0, tableSchema: TableSchema = null, timePartitioningField:String = null): Any = { val fullyQualifiedOutputTableId = BigQueryStrings.toString(targetTable) val decoratorsRegex = ".+?(?=\\$)".r val cleanTableName = BigQueryStrings .parseTableReference(decoratorsRegex.findFirstIn(fullyQualifiedOutputTableId) .getOrElse(fullyQualifiedOutputTableId)) val projectId = cleanTableName.getProjectId val datasetId = cleanTableName.getDatasetId val tableId = cleanTableName.getTableId if(doesTableAlreadyExist(projectId,datasetId,tableId)) { return } else { logger.info(s"Creating Table $tableId") val table = new Table() table.setTableReference(cleanTableName) val timePartitioning = new TimePartitioning() timePartitioning.setType("DAY") if(timePartitioningField != null) { timePartitioning.setField(timePartitioningField) } table.setTimePartitioning(timePartitioning) if (timePartitionExpiration > 0) { table.setExpirationTime(timePartitionExpiration) } table.setSchema(tableSchema) bqService.tables().insert(cleanTableName.getProjectId, cleanTableName.getDatasetId, table).execute() logger.info(s"Table $tableId created") } } def doesTableAlreadyExist(projectId: String, datasetId: String, tableId: String): Boolean = { try { bqService.tables().get(projectId,datasetId,tableId).execute() return true } catch { case e: GoogleJsonResponseException if e.getStatusCode == 404 => logger.info(s"$projectId:$datasetId.$tableId does not exist") return false case NonFatal(e) => throw e } } }
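For context, a hypothetical call site might look like the following; the bq client, the table coordinates, and the partitioning column are all assumptions, since the original repository wires these up elsewhere.

// Hypothetical usage only. `bq` is assumed to be an already-authenticated
// com.google.api.services.bigquery.Bigquery client built elsewhere.
val target = BigQueryStrings.parseTableReference("my-project:my_dataset.events")
new BigQueryPartitionUtils(bq).createBigQueryPartitionedTable(
  target,
  timePartitionExpiration = 259200000L,   // three days, mirroring DEFAULT_TABLE_EXPIRATION_MS
  tableSchema = null,                     // create the table without a schema for now
  timePartitioningField = "event_time")   // assumed name of the partitioning column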
Example 30
Source File: MySqlPool.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark

import java.sql.{Connection, DriverManager}
import java.util

import org.apache.log4j.{LogManager, Logger}

/**
 * Obtains connections from a simple MySQL connection pool.
 */
class MySqlPool(url: String, user: String, pwd: String) extends Serializable {
  // Maximum number of connections in the pool
  private val max = 3
  // Number of connections created each time more are needed
  private val connectionNum = 1
  // Number of connections the pool has created so far
  private var conNum = 0
  private val pool = new util.LinkedList[Connection]() // the pool itself

  val LOGGER: Logger = LogManager.getLogger("vita")

  // Obtain a connection
  def getJdbcConn(): Connection = {
    LOGGER.info("getJdbcConn")
    // Synchronized block; AnyRef is the base of all reference types, AnyVal of all value types
    AnyRef.synchronized({
      if (pool.isEmpty) {
        // Load the driver
        preGetConn()
        for (i <- 1 to connectionNum) {
          val conn = DriverManager.getConnection(url, user, pwd)
          pool.push(conn)
          conNum += 1
        }
      }
      pool.poll()
    })
  }

  // Return a connection to the pool
  def releaseConn(conn: Connection): Unit = {
    pool.push(conn)
  }

  // Load the driver
  private def preGetConn(): Unit = {
    // Throttle connection creation
    if (conNum < max && !pool.isEmpty) {
      LOGGER.info("Jdbc Pool has no connection now, please wait a moments!")
      Thread.sleep(2000)
      preGetConn()
    } else {
      Class.forName("com.mysql.jdbc.Driver")
    }
  }
}
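A minimal usage sketch of the pool; the JDBC URL and credentials below are placeholders for a reachable MySQL instance.

// Illustrative only: borrow a connection, run a query, and hand the connection back.
val mysqlPool = new MySqlPool("jdbc:mysql://localhost:3306/test?useSSL=false", "root", "secret")
val conn = mysqlPool.getJdbcConn()
try {
  val stmt = conn.createStatement()
  val rs = stmt.executeQuery("SELECT 1")
  while (rs.next()) println(rs.getInt(1))
} finally {
  mysqlPool.releaseConn(conn) // return the connection to the pool rather than closing it
}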
Example 31
Source File: StructuredStreamingOffset.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark.streaming

import com.vita.Constants
import com.vita.redies.RedisSingle
import com.vita.spark.streaming.writer.RedisWriteKafkaOffset
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}

object StructuredStreamingOffset {

  val LOGGER: Logger = LogManager.getLogger("StructuredStreamingOffset")

  // topic
  val SUBSCRIBE = "log"

  case class readLogs(context: String, offset: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("StructuredStreamingOffset")
      .getOrCreate()

    // starting offset
    var startOffset = -1

    // init
    val redisSingle: RedisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)

    // read the previously committed offset back from Redis
    if (redisSingle.exists(Constants.REDIDS_KEY) && redisSingle.getTime(Constants.REDIDS_KEY) != -1) {
      startOffset = redisSingle.get(Constants.REDIDS_KEY).toInt
    }

    // source
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", SUBSCRIBE)
      .option("startingOffsets", "{\"" + SUBSCRIBE + "\":{\"0\":" + startOffset + "}}")
      .load()

    import spark.implicits._

    // each row contains: key, value, topic, partition, offset, timestamp, timestampType
    val lines = df.selectExpr("CAST(value AS STRING)", "CAST(offset AS LONG)").as[(String, Long)]
    val content = lines.map(x => readLogs(x._1, x._2.toString))
    val count = content.toDF("context", "offset")

    // sink: the foreach writer records the Kafka offset
    val query = count
      .writeStream
      .foreach(new RedisWriteKafkaOffset)
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .format("console")
      .start()

    query.awaitTermination()
  }
}
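RedisWriteKafkaOffset is project code that is not included in this excerpt. A hypothetical equivalent built on a plain Jedis client could look like this; the key name and connection details are assumptions.

import org.apache.spark.sql.{ForeachWriter, Row}
import redis.clients.jedis.Jedis

// Hypothetical stand-in for RedisWriteKafkaOffset: persist the latest Kafka offset seen in
// each row so the next run can resume from it. Host, port and key are placeholders.
class RedisOffsetWriter(host: String, port: Int, key: String) extends ForeachWriter[Row] {
  private var jedis: Jedis = _

  override def open(partitionId: Long, version: Long): Boolean = {
    jedis = new Jedis(host, port)
    true
  }

  override def process(row: Row): Unit = {
    val offset = row.getAs[String]("offset")
    jedis.set(key, offset)
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (jedis != null) jedis.close()
  }
}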
Example 32
Source File: Logging.scala From spark-distcp with Apache License 2.0 | 5 votes |
package com.coxautodata.objects import org.apache.log4j.{Level, LogManager, Logger} trait Logging { // Method to get the logger name for this object protected def logName: String = { // Ignore trailing $'s in the class names for Scala objects this.getClass.getName.stripSuffix("$") } private val log: Logger = LogManager.getLogger(logName) // Set logger level protected def setLogLevel(level: Level): Unit = log.setLevel(level) // Log methods that take only a String protected def logInfo(msg: => String) { if (log.isInfoEnabled) log.info(msg) } protected def logDebug(msg: => String) { if (log.isDebugEnabled) log.debug(msg) } protected def logTrace(msg: => String) { if (log.isTraceEnabled) log.trace(msg) } protected def logWarning(msg: => String) { log.warn(msg) } protected def logError(msg: => String) { log.error(msg) } // Log methods that take Throwables (Exceptions/Errors) too protected def logInfo(msg: => String, throwable: Throwable) { if (log.isInfoEnabled) log.info(msg, throwable) } protected def logDebug(msg: => String, throwable: Throwable) { if (log.isDebugEnabled) log.debug(msg, throwable) } protected def logTrace(msg: => String, throwable: Throwable) { if (log.isTraceEnabled) log.trace(msg, throwable) } protected def logWarning(msg: => String, throwable: Throwable) { log.warn(msg, throwable) } protected def logError(msg: => String, throwable: Throwable) { log.error(msg, throwable) } protected def isTraceEnabled: Boolean = { log.isTraceEnabled } }
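A minimal usage sketch of the trait; the class below is illustrative and assumes the Logging trait and log4j's Level are in scope.

import org.apache.log4j.Level

// Illustrative only: mixing in Logging gives level-aware helpers whose messages are
// evaluated lazily (only when the corresponding level is enabled).
class CopyJob extends Logging {
  def run(): Unit = {
    setLogLevel(Level.INFO)
    logInfo(s"starting copy at ${System.currentTimeMillis()}")
    try {
      // ... perform the copy ...
    } catch {
      case e: Exception => logError("copy failed", e)
    }
  }
}

new CopyJob().run()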
Example 33
Source File: L3-DStreamMapping.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date object RedditMappingApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditMappingApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val sdf = new SimpleDateFormat("yyyy-MM-dd") val tsKey = "created_utc" val secs = 1000L val keyedByDay = comments.map(rec => { val ts = (parse(rec) \ tsKey).values (sdf.format(new Date(ts.toString.toLong * secs)), rec) }) val keyedByDayPart = comments.mapPartitions(iter => { var ret = List[(String, String)]() while (iter.hasNext) { val rec = iter.next val ts = (parse(rec) \ tsKey).values ret.::=(sdf.format(new Date(ts.toString.toLong * secs)), rec) } ret.iterator }) val wordTokens = comments.map(rec => { ((parse(rec) \ "body")).values.toString.split(" ") }) val wordTokensFlat = comments.flatMap(rec => { ((parse(rec) \ "body")).values.toString.split(" ") }) val filterSubreddit = comments.filter(rec => (parse(rec) \ "subreddit").values.toString.equals("AskReddit")) val sortedByAuthor = comments.transform(rdd => (rdd.sortBy(rec => (parse(rec) \ "author").values.toString))) ssc.start() ssc.awaitTermination() } }
Example 34
Source File: L3-DStreamKeyValue.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date import org.apache.spark.HashPartitioner object RedditKeyValueApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: RedditKeyValueApp <appname> <input_path> <input_path_popular>") System.exit(1) } val Seq(appName, inputPath, inputPathPopular) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val popular = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPathPopular, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val topAuthors = comments.map(rec => ((parse(rec) \ "author").values.toString, 1)) .groupByKey() .map(r => (r._2.sum, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val topAuthors2 = comments.map(rec => ((parse(rec) \ "author").values.toString, 1)) .reduceByKey(_ + _) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val topAuthorsByAvgContent = comments.map(rec => ((parse(rec) \ "author").values.toString, (parse(rec) \ "body").values.toString.split(" ").length)) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val keyedBySubreddit = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec)) val keyedBySubreddit2 = popular.map(rec => ({ val t = rec.split(",") (t(1).split("/")(4), t(0)) })) val commentsWithIndustry = keyedBySubreddit.join(keyedBySubreddit2) val keyedBySubredditCo = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec)) val keyedBySubredditCo2 = popular.map(rec => ({ val t = rec.split(",") (t(1).split("/")(4), t(0)) })) val commentsWithIndustryCo = keyedBySubreddit.cogroup(keyedBySubreddit2) val checkpointPath = "/tmp" ssc.checkpoint(checkpointPath) val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) ssc.start() ssc.awaitTermination() } }
Example 35
Source File: L3-DStreamVariation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date object RedditVariationApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditVariationApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val merged = comments.union(comments) val repartitionedComments = comments.repartition(4) val rddMin = comments.glom().map(arr => arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt))) ssc.start() ssc.awaitTermination() } }
Example 36
Source File: L3-DStreamWindowAndAction.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date import org.apache.spark.HashPartitioner object RedditWindowAndActionApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditWindowAndActionApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val checkpointPath = "/tmp" ssc.checkpoint(checkpointPath) val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString) val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5)) val windowedCounts = windowedRecs.countByValue() windowedCounts.print(10) windowedCounts.saveAsObjectFiles("subreddit", "obj") windowedCounts.saveAsTextFiles("subreddit", "txt") globalCount.saveAsHadoopFiles("subreddit", "hadoop", classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]]) globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop", classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]]) comments.foreachRDD(rdd => { LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count())) }) ssc.start() ssc.awaitTermination() } }
Example 37
Source File: L3-DStreamAggregation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date object RedditAggregationApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditAggregationApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val recCount = comments.count() val recCountValue = comments.countByValue() val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString)) .flatMap(body => body.split(" ")) .map(word => 1) .reduce(_ + _) ssc.start() ssc.awaitTermination() } }
Example 38
Source File: BlockFiltering.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.BlockRefinementMethods

import SparkER.DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import SparkER.Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD

// Note: this excerpt only shows blockFilteringAdvanced. The original file also contains a
// blockFiltering method (which presumably uses the imported BoundedPriorityQueue); the
// enclosing object name below is inferred from the file name.
object BlockFiltering {

  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map { profileWithBlocks =>
      val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons)
      val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt
      val threshold = blocksSortedByComparisons(blocksToKeep - 1).comparisons
      ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
    }
  }
}
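The blockFiltering method mentioned above is not part of this excerpt. Purely as an illustration, and not the project's actual code, a top-k variant that keeps the round(r * n) cheapest blocks for each profile could be written as follows.

// Hypothetical sketch only: keep the round(r * n) least-expensive blocks per profile,
// without the threshold-based tie handling used by blockFilteringAdvanced.
def blockFiltering(profilesWithBlocks: RDD[ProfileBlocks], r: Double): RDD[ProfileBlocks] = {
  profilesWithBlocks map { profileWithBlocks =>
    val sorted = profileWithBlocks.blocks.toList.sortBy(_.comparisons)
    val blocksToKeep = Math.round(sorted.size * r).toInt
    ProfileBlocks(profileWithBlocks.profileID, sorted.take(blocksToKeep).toSet)
  }
}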
Example 39
Source File: CustomPartitioner2.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Utilities import org.apache.log4j.LogManager class CustomPartitioner2(override val numPartitions : Int) extends MyPartitioner { val partitions = Array.ofDim[Double](numPartitions) override def getPartition(key: Any): Int = { val num = key.asInstanceOf[Double] val partition = partitions.indexOf(partitions.min) partitions.update(partition, partitions(partition)+num) //val log = LogManager.getRootLogger //log.info("SPARKER - situazione carico partizioni "+partitions.toList) return partition } def getName(): String ={ return "Custom partitioner 2" } }
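A hypothetical usage of the partitioner, assuming MyPartitioner ultimately extends org.apache.spark.Partitioner; the local SparkContext and the sample data are only for illustration.

import org.apache.spark.{SparkConf, SparkContext}

// Illustrative only: keys are Double weights and each key is routed to the partition that
// currently carries the least accumulated weight.
val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("partitioner-demo"))
val weighted = sc.parallelize(Seq((3.0, "block-a"), (1.0, "block-b"), (2.0, "block-c")))
val balanced = weighted.partitionBy(new CustomPartitioner2(2))
println(balanced.glom().map(_.length).collect().toList) // records per partition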
Example 40
Source File: BlockFiltering.scala From sparker with GNU General Public License v3.0 | 5 votes |
package BlockRefinementMethods

import DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD

// As in Example 38, only blockFilteringAdvanced is shown; the enclosing object name is
// inferred from the file name and the original blockFiltering method is elided.
object BlockFiltering {

  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map { profileWithBlocks =>
      val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons)
      val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt
      val threshold = blocksSortedByComparisons(blocksToKeep - 1).comparisons
      ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
    }
  }
}
Example 43
Source File: JDBCSink.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark

import java.sql.{Connection, ResultSet, SQLException, Statement}

import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.{ForeachWriter, Row}

/**
 * Writes rows produced by Structured Streaming into MySQL (update if the title already exists,
 * otherwise insert).
 */
class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] {
  var statement: Statement = _
  var resultSet: ResultSet = _
  var connection: Connection = _

  override def open(partitionId: Long, version: Long): Boolean = {
    connection = new MySqlPool(url, username, password).getJdbcConn()
    statement = connection.createStatement()
    print("open")
    true
  }

  override def process(value: Row): Unit = {
    println("process step one")
    val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "")
    val count = value.getAs[Long]("count")

    val querySql = "select 1 from webCount where titleName = '" + titleName + "'"
    val insertSql = "insert into webCount(titleName,count) values('" + titleName + "' , '" + count + "')"
    val updateSql = "update webCount set count = " + count + " where titleName = '" + titleName + "'"
    println("process step two")

    try {
      // Check whether the row already exists
      val resultSet = statement.executeQuery(querySql)
      if (resultSet.next()) {
        println("updateSql")
        statement.executeUpdate(updateSql)
      } else {
        println("insertSql")
        statement.execute(insertSql)
      }
    } catch {
      // Cases ordered from most to least specific; in the original, RuntimeException appeared
      // after Exception and was therefore unreachable.
      case ex: SQLException => println("SQLException")
      case ex: RuntimeException => println("RuntimeException")
      case ex: Exception => println("Exception")
      case ex: Throwable => println("Throwable")
    }
  }

  override def close(errorOrNull: Throwable): Unit = {
    // The original tested for == null before closing, which would never close an open
    // resource (or would throw a NullPointerException); the intent is to close non-null ones.
    if (statement != null) {
      statement.close()
    }
    if (connection != null) {
      connection.close()
    }
  }
}
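A hypothetical way to wire the sink into a streaming query; the upstream aggregation below is a stand-in built from Spark's rate source, and the JDBC settings are placeholders.

import org.apache.spark.sql.SparkSession

// Illustrative only: count values from the built-in rate source and upsert the counts into MySQL.
val spark = SparkSession.builder().master("local[*]").appName("jdbc-sink-demo").getOrCreate()

val counts = spark.readStream
  .format("rate").option("rowsPerSecond", "5").load()
  .selectExpr("CAST(value % 10 AS STRING) AS titleName")
  .groupBy("titleName").count()
  .toDF("titleName", "count")

val query = counts.writeStream
  .outputMode("update")
  .foreach(new JDBCSink("jdbc:mysql://localhost:3306/test", "root", "secret"))
  .start()

query.awaitTermination()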
Example 44
Source File: Main.scala From stellar-random-walk with Apache License 2.0 | 5 votes |
package au.csiro.data61.randomwalk

import au.csiro.data61.randomwalk.algorithm.{UniformRandomWalk, VCutRandomWalk}
import au.csiro.data61.randomwalk.common.CommandParser.TaskName
import au.csiro.data61.randomwalk.common.{CommandParser, Params, Property}
import com.typesafe.config.Config
import org.apache.log4j.LogManager
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.scalactic.{Bad, Every, Good, One, Or}
import spark.jobserver.SparkJobInvalid
import spark.jobserver.api._

object Main extends SparkJob {
  lazy val logger = LogManager.getLogger("myLogger")

  def main(args: Array[String]) {
    CommandParser.parse(args) match {
      case Some(params) =>
        val conf = new SparkConf().setAppName("stellar-random-walk")
        val context: SparkContext = new SparkContext(conf)
        runJob(context, null, params)

      case None => sys.exit(1)
    }
  }

  // Note: the SparkJob members not shown here (the job data/output types and runJob itself)
  // are defined in the original source but elided from this excerpt.

  override def validate(sc: SparkContext, runtime: JobEnvironment, config: Config):
  JobData Or Every[SparkJobInvalid] = {
    val args = config.getString("rw.input").split("\\s+")
    CommandParser.parse(args) match {
      case Some(params) => Good(params)
      // The rejection branch is not part of this excerpt; a reconstruction along these lines
      // (not verbatim from the project) keeps the match exhaustive:
      case None => Bad(One(SparkJobInvalid("unable to parse the rw.input arguments")))
    }
  }
}
Example 45
Source File: Logging.scala From hail with MIT License | 5 votes |
package is.hail.utils import org.apache.log4j.{LogManager, Logger} trait Logging { @transient private var logger: Logger = _ @transient private var consoleLogger: Logger = _ def log: Logger = { if (logger == null) logger = LogManager.getRootLogger logger } def consoleLog: Logger = { if (consoleLogger == null) consoleLogger = LogManager.getLogger("Hail") consoleLogger } def info(msg: String) { consoleLog.info(msg) } def info(msg: String, t: Truncatable) { val (screen, logged) = t.strings if (screen == logged) consoleLog.info(format(msg, screen)) else { // writes twice to the log file, but this isn't a big problem consoleLog.info(format(msg, screen)) log.info(format(msg, logged)) } } def warn(msg: String) { consoleLog.warn(msg) } def warn(msg: String, t: Truncatable) { val (screen, logged) = t.strings if (screen == logged) consoleLog.warn(format(msg, screen)) else { // writes twice to the log file, but this isn't a big problem consoleLog.warn(format(msg, screen)) log.warn(format(msg, logged)) } } def error(msg: String) { consoleLog.error(msg) } }
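A minimal usage sketch; the object below is illustrative and simply shows which helper routes to which logger.

// Illustrative only: `info`/`warn` go through the "Hail" console logger, while `log`
// exposes the root logger directly.
object ExamplePipelineStep extends Logging {
  def run(): Unit = {
    info("starting step")
    log.debug("detailed state for the log file")
    warn("this is a warning")
  }
}

ExamplePipelineStep.run()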
Example 46
Source File: package.scala From hail with MIT License | 5 votes |
package is.hail.services import is.hail.utils._ import org.json4s.{DefaultFormats, Formats} import java.io.{File, FileInputStream} import java.security.KeyStore import javax.net.ssl.{KeyManagerFactory, SSLContext, TrustManagerFactory} import org.apache.log4j.{LogManager, Logger} import org.json4s.jackson.JsonMethods class NoSSLConfigFound( message: String, cause: Throwable ) extends Exception(message, cause) { def this() = this(null, null) def this(message: String) = this(message, null) } case class SSLConfig( outgoing_trust: String, outgoing_trust_store: String, incoming_trust: String, incoming_trust_store: String, key: String, cert: String, key_store: String) package object tls { lazy val log: Logger = LogManager.getLogger("is.hail.tls") private[this] lazy val _getSSLConfig: SSLConfig = { var configFile = System.getenv("HAIL_SSL_CONFIG_FILE") if (configFile == null) configFile = "/ssl-config/ssl-config.json" if (!new File(configFile).isFile) throw new NoSSLConfigFound(s"no ssl config file found at $configFile") log.info(s"ssl config file found at $configFile") using(new FileInputStream(configFile)) { is => implicit val formats: Formats = DefaultFormats JsonMethods.parse(is).extract[SSLConfig] } } lazy val getSSLContext: SSLContext = { val sslConfig = _getSSLConfig val pw = "dummypw".toCharArray val ks = KeyStore.getInstance("PKCS12") using(new FileInputStream(sslConfig.key_store)) { is => ks.load(is, pw) } val kmf = KeyManagerFactory.getInstance("SunX509") kmf.init(ks, pw) val ts = KeyStore.getInstance("JKS") using(new FileInputStream(sslConfig.outgoing_trust_store)) { is => ts.load(is, pw) } val tmf = TrustManagerFactory.getInstance("SunX509") tmf.init(ts) val ctx = SSLContext.getInstance("TLS") ctx.init(kmf.getKeyManagers, tmf.getTrustManagers, null) ctx } }
Example 47
Source File: Tokens.scala From hail with MIT License | 5 votes |
package is.hail.services import is.hail.utils._ import java.io.{File, FileInputStream} import org.apache.http.client.methods.HttpUriRequest import org.apache.log4j.{LogManager, Logger} import org.json4s.{DefaultFormats, Formats} import org.json4s.jackson.JsonMethods object Tokens { lazy val log: Logger = LogManager.getLogger("Tokens") def get: Tokens = { val file = getTokensFile() if (new File(file).isFile) { using(new FileInputStream(file)) { is => implicit val formats: Formats = DefaultFormats new Tokens(JsonMethods.parse(is).extract[Map[String, String]]) } } else { log.info(s"tokens file not found: $file") new Tokens(Map()) } } def getTokensFile(): String = { if (DeployConfig.get.location == "external") s"${ System.getenv("HOME") }/.hail/tokens.json" else "/user-tokens/tokens.json" } } class Tokens( tokens: Map[String, String] ) { def namespaceToken(ns: String): String = tokens(ns) def addNamespaceAuthHeaders(ns: String, req: HttpUriRequest): Unit = { val token = namespaceToken(ns) req.addHeader("Authorization", s"Bearer $token") val location = DeployConfig.get.location if (location == "external" && ns != "default") req.addHeader("X-Hail-Internal-Authorization", s"Bearer ${ namespaceToken("default") }") } def addServiceAuthHeaders(service: String, req: HttpUriRequest): Unit = { addNamespaceAuthHeaders(DeployConfig.get.getServiceNamespace(service), req) } }
Example 48
Source File: package.scala From hail with MIT License | 5 votes |
package is.hail import is.hail.services.batch_client.ClientResponseException import org.apache.http.conn.HttpHostConnectException import org.apache.log4j.{LogManager, Logger} import scala.util.Random package object services { lazy val log: Logger = LogManager.getLogger("is.hail.services") val RETRYABLE_HTTP_STATUS_CODES: Set[Int] = { val s = Set(408, 500, 502, 503, 504) if (System.getenv("HAIL_DONT_RETRY_500") == "1") s - 500 else s } def sleepAndBackoff(delay: Double): Double = { val t = delay * Random.nextDouble() Thread.sleep((t * 1000).toInt) // in ms math.min(delay * 2, 60.0) } def isTransientError(e: Exception): Boolean = { e match { case e: ClientResponseException => RETRYABLE_HTTP_STATUS_CODES.contains(e.status) case e: HttpHostConnectException => true case _ => false } } def retryTransientErrors[T](f: => T): T = { var delay = 0.1 var errors = 0 while (true) { try { return f } catch { case e: Exception => if (!isTransientError(e)) throw e errors += 1 if (errors % 10 == 0) log.warn(s"encountered $errors transient errors, most recent one was $e") } delay = sleepAndBackoff(delay) } throw new AssertionError("unreachable") } }
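A minimal usage sketch, assuming the members of is.hail.services are in scope; the block is retried with jittered exponential backoff whenever it throws a transient error.

// Illustrative only: in real code the block would be an HTTP call to a Hail service that
// may fail transiently (retryable status code or connection error).
val response: String = retryTransientErrors {
  "ok"
}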
Example 49
Source File: DumpIndexWithSystemFields.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.IndexWithSystemFields import cmwell.analytics.util.CmwellConnector import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object DumpIndexWithSystemFields { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpIndexWithSystemFields.getClass) // Here, the parallelism defines how many partitions are produced. // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) try { object Opts extends ScallopConf(args) { val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet")) validateOpt(format) { case Some("parquet") | Some("csv") => Right(Unit) case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.") } verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Dump system fields from Elasticsearch indexes", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds = IndexWithSystemFields()(spark) .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier) Opts.format() match { case "parquet" => ds.write.parquet(Opts.out()) case "csv" => ds.write.csv(Opts.out()) } } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 50
Source File: DumpInfotonWithUuidOnly.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.InfotonWithUuidOnly import cmwell.analytics.util.CmwellConnector import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object DumpInfotonWithUuidOnly { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpInfotonWithUuidOnly.getClass) // Here, the parallelism defines how many partitions are produced. // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) try { object Opts extends ScallopConf(args) { val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet")) validateOpt(format) { case Some("parquet") | Some("csv") => Right(Unit) case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.") } verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Dump infoton table - uuid only", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds = InfotonWithUuidOnly()(spark) .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier) Opts.format() match { case "parquet" => ds.write.parquet(Opts.out()) case "csv" => ds.write.csv(Opts.out()) } } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 51
Source File: FindInfotonIndexInconsistencies.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.InfotonAndIndexWithSystemFields import cmwell.analytics.data.InfotonAndIndexWithSystemFields.{isConsistent, isWellFormed} import cmwell.analytics.util.CmwellConnector import cmwell.analytics.util.ConsistencyThreshold.defaultConsistencyThreshold import cmwell.analytics.util.ISO8601.{instantToMillis, instantToText} import org.apache.log4j.LogManager import org.apache.spark.sql.Row import org.apache.spark.sql.functions._ import org.joda.time.format.ISODateTimeFormat import org.rogach.scallop.{ScallopConf, ScallopOption, ValueConverter, singleArgConverter} import scala.util.control.NonFatal object FindInfotonIndexInconsistencies { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(FindInfotonIndexInconsistencies.getClass) try { object Opts extends ScallopConf(args) { private val instantConverter: ValueConverter[Long] = singleArgConverter[Long](instantToMillis) // If this parameter is not supplied, the (unreliable) ES Spark connector is used to extract the data from the es index. val esExtract: ScallopOption[String] = opt[String]("es", short = 'e', descr = "The path where the (parquet) extract of system fields the es index are stored", required = false) val consistencyThreshold: ScallopOption[Long] = opt[Long]("consistency-threshold", short = 'c', descr = "Ignore any inconsistencies at or after this instant", default = Some(defaultConsistencyThreshold))(instantConverter) val outParquet: ScallopOption[String] = opt[String]("out-parquet", short = 'p', descr = "The path to save the output to (in parquet format)", required = false) val outCsv: ScallopOption[String] = opt[String]("out-csv", short = 'v', descr = "The path to save the output to (in CSV format)", required = false) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Find inconsistencies between system fields in Infoton and Index", sparkShell = Opts.shell() ).withSparkSessionDo { spark => logger.info(s"Using a consistency threshold of ${instantToText(Opts.consistencyThreshold())}.") val ds = InfotonAndIndexWithSystemFields(esExtractPath = Opts.esExtract.toOption)(spark) // Filter out any inconsistencies found if more current than this point in time. val i = ds.schema.indexWhere(_.name == "infoton_lastModified") val filterCurrent: Row => Boolean = { row: Row => val parser = ISODateTimeFormat.dateTimeParser if (row.isNullAt(i)) true // Shouldn't be null, but don't filter out if we can't get a lastModified else try { parser.parseMillis(row.getAs[String](i)) < Opts.consistencyThreshold() } catch { case NonFatal(_) => true // Don't filter out if lastModified couldn't be converted } } val inconsistentData = ds.filter(not(isConsistent(ds) && isWellFormed(ds))) .filter(filterCurrent) .cache() // Save the inconsistent data in Parquet format suitable for additional analysis if (Opts.outParquet.isDefined) inconsistentData .write .parquet(Opts.outParquet()) // Save the inconsistent data to a single CSV file suitable for reporting. if (Opts.outCsv.isDefined) inconsistentData .coalesce(1) .write .option("header", value = true) .csv(Opts.outCsv()) } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 52
Source File: DumpPathWithKeyFields.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.PathWithKeyFields import cmwell.analytics.util.CmwellConnector import cmwell.analytics.util.DatasetFilter import cmwell.analytics.util.TimestampConversion.timestampConverter import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object DumpPathWithKeyFields { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpPathWithKeyFields.getClass) // Here, the parallelism defines how many partitions are produced. // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) try { object Opts extends ScallopConf(args) { val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter) val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet")) validateOpt(format) { case Some("parquet") | Some("csv") => Right(Unit) case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.") } verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Dump path table - key fields", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val datasetFilter = DatasetFilter( lastModifiedGte = Opts.lastModifiedGteFilter.toOption, pathPrefix = Opts.pathPrefixFilter.toOption) val ds = PathWithKeyFields(Some(datasetFilter))(spark) .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier) Opts.format() match { case "parquet" => ds.write.parquet(Opts.out()) case "csv" => ds.write.csv(Opts.out()) } } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 53
Source File: DumpPathWithUuidOnly.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.PathWithUuidOnly import cmwell.analytics.util.CmwellConnector import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object DumpPathWithUuidOnly { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpPathWithUuidOnly.getClass) // Here, the parallelism defines how many partitions are produced. // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) try { object Opts extends ScallopConf(args) { val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet")) validateOpt(format) { case Some("parquet") | Some("csv") => Right(Unit) case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.") } verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Dump path table - uuid only", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds = PathWithUuidOnly()(spark) .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier) Opts.format() match { case "parquet" => ds.write.parquet(Opts.out()) case "csv" => ds.write.csv(Opts.out()) } } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 54
Source File: FindDuplicatedSystemFields.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.InfotonWithDuplicatedSystemFields import cmwell.analytics.util.CmwellConnector import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object FindDuplicatedSystemFields { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(FindDuplicatedSystemFields.getClass) try { object Opts extends ScallopConf(args) { val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Find infotons with duplicated system fields" ).withSparkSessionDo { spark => import spark.implicits._ InfotonWithDuplicatedSystemFields()(spark) .toDF .write.csv(Opts.out()) } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 55
Source File: DumpIndexWithUuidOnly.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.IndexWithUuidsOnly import cmwell.analytics.util.CmwellConnector import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object DumpIndexWithUuidOnly { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpIndexWithUuidOnly.getClass) // Here, the parallelism defines how many partitions are produced. // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) try { object Opts extends ScallopConf(args) { val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only include current", required = false, default = Some(true)) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet")) validateOpt(format) { case Some("parquet") | Some("csv") => Right(Unit) case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.") } verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Dump UUIDs from Elasticsearch indexes", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds = IndexWithUuidsOnly(currentOnly = Opts.currentOnly())(spark) .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier) Opts.format() match { case "parquet" => ds.write.parquet(Opts.out()) case "csv" => ds.write.csv(Opts.out()) } } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 56
Source File: AnalyzeInconsistenciesResult.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import java.io.File import java.nio.charset.StandardCharsets.UTF_8 import cmwell.analytics.data.InfotonAndIndexWithSystemFields import cmwell.analytics.util.Connector import org.apache.commons.io.FileUtils import org.apache.log4j.LogManager import org.apache.spark.sql.{Column, DataFrame, Row} import org.rogach.scallop.{ScallopConf, ScallopOption} import scala.collection.breakOut object AnalyzeInconsistenciesResult { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(AnalyzeInconsistenciesResult.getClass) try { object Opts extends ScallopConf(args) { val in: ScallopOption[String] = opt[String]("in", short = 'i', descr = "The path to read the (parquet) inconsistencies dataset from", required = true) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the (csv) output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) verify() } Connector( appName = "Analyze InfotonAndIndexWithSystemFields Output", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds: DataFrame = spark.read.parquet(Opts.in()) import org.apache.spark.sql.functions._ // A column expression that counts the number of failures for each constraint. // This will also include null counts, needed to interpret the results. val constraints: Seq[(String, Column)] = InfotonAndIndexWithSystemFields.constraints(ds).map { case (name, predicate) => name -> sum(when(predicate, 0L).otherwise(1L)).as(name) }(breakOut) // Compute the failure counts val failureCounts: Row = ds.agg(constraints.head._2, constraints.tail.map(_._2): _*).head val results = for { i <- constraints.indices constraintName = constraints(i)._1 failureCount = if (failureCounts.isNullAt(i)) 0 else failureCounts.getAs[Long](i) } yield s"$constraintName,$failureCount" FileUtils.write(new File(Opts.out()), "constraint,failures\n" + results.mkString("\n"), UTF_8) } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }