org.rogach.scallop.ScallopConf Scala Examples
The following examples show how to use org.rogach.scallop.ScallopConf.
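Before diving into the project examples, here is a minimal, self-contained sketch of the usual ScallopConf pattern: declare options with opt, positional arguments with trailArg, call verify(), then read values by applying the option (e.g. conf.count()). The class and option names below are made up for illustration and are not taken from any of the projects listed here.

import org.rogach.scallop.{ScallopConf, ScallopOption}

// Hypothetical configuration: --count, --verbose and one trailing "name" argument.
class HelloConf(arguments: Seq[String]) extends ScallopConf(arguments) {
  val count: ScallopOption[Int] = opt[Int](descr = "number of greetings", default = Some(1))
  val verbose: ScallopOption[Boolean] = opt[Boolean](descr = "print extra information")
  val name: ScallopOption[String] = trailArg[String](descr = "who to greet", required = true)
  verify()
}

object Hello {
  def main(args: Array[String]): Unit = {
    val conf = new HelloConf(args.toIndexedSeq)
    if (conf.verbose()) println(s"count = ${conf.count()}")
    (1 to conf.count()).foreach(_ => println(s"Hello, ${conf.name()}!"))
  }
}

Running it as "Hello --count 2 World" would print the greeting twice; the examples below follow the same declare/verify/read pattern at larger scale.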
Example 1
Source File: FeatureCounts.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.apps

import htsjdk.samtools.ValidationStringency
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.SparkSession
import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim
import org.biodatageeks.sequila.utils.Columns
import org.rogach.scallop.ScallopConf
import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable}
import org.seqdoop.hadoop_bam.util.SAMHeaderReader

object FeatureCounts {

  case class Region(contig: String, pos_start: Int, pos_end: Int)

  class RunConf(args: Array[String]) extends ScallopConf(args) {
    val output = opt[String](required = true)
    val annotations = opt[String](required = true)
    val readsFile = trailArg[String](required = true)
    val Format = trailArg[String](required = false)
    verify()
  }

  def main(args: Array[String]): Unit = {
    val runConf = new RunConf(args)
    val spark = SparkSession
      .builder()
      .appName("SeQuiLa-FC")
      .getOrCreate()

    spark.sqlContext.setConf("spark.biodatageeks.rangejoin.useJoinOrder", "true")
    //spark.sqlContext.setConf("spark.biodatageeks.rangejoin.maxBroadcastSize", (1024).toString)
    spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim(spark) :: Nil

    val query =
      """SELECT targets.GeneId AS GeneId, targets.Chr AS Chr, targets.Start AS Start, targets.End AS End, targets.Strand AS Strand, CAST(targets.End AS INTEGER)-CAST(targets.Start AS INTEGER) + 1 AS Length, count(*) AS Counts FROM reads JOIN targets
        |ON (
        |  targets.Chr=reads.contigName
        |  AND
        |  reads.end >= CAST(targets.Start AS INTEGER)
        |  AND
        |  reads.start <= CAST(targets.End AS INTEGER)
        |)
        |GROUP BY targets.GeneId,targets.Chr,targets.Start,targets.End,targets.Strand""".stripMargin

    spark
      .sparkContext
      .setLogLevel("ERROR")

    spark
      .sparkContext
      .hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString)

    val alignments = spark
      .sparkContext.newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat](runConf.readsFile())
      .map(_._2.get)
      .map(r => Region(r.getContig, r.getStart, r.getEnd))

    val readsTable = spark.sqlContext.createDataFrame(alignments)
    readsTable.createOrReplaceTempView("reads")

    val targets = spark
      .read
      .option("header", "true")
      .option("delimiter", "\t")
      .csv(runConf.annotations())

    targets
      .withColumnRenamed("contigName", Columns.CONTIG)
      .createOrReplaceTempView("targets")

    spark.sql(query)
      .orderBy("GeneId")
      .coalesce(1)
      .write
      .option("header", "true")
      .option("delimiter", "\t")
      .csv(runConf.output())
  }
}
Example 2
Source File: DumpInfotonWithKeyFields.scala From CM-Well with Apache License 2.0
package cmwell.analytics.main

import cmwell.analytics.data.InfotonWithKeyFields
import cmwell.analytics.util.{CmwellConnector, DatasetFilter}
import cmwell.analytics.util.DatasetFilter._
import cmwell.analytics.util.TimestampConversion.timestampConverter
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object DumpInfotonWithKeyFields {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpInfotonWithKeyFields.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {
      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to ", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump infoton table - uuid, lastModified, path",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val datasetFilter = DatasetFilter(
          lastModifiedGte = Opts.lastModifiedGteFilter.toOption,
          pathPrefix = Opts.pathPrefixFilter.toOption)

        val ds = InfotonWithKeyFields(Some(datasetFilter))(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    } catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
}
Example 3
Source File: CheckInfotonDataIntegrity.scala From CM-Well with Apache License 2.0
package cmwell.analytics.main

import cmwell.analytics.data.InfotonDataIntegrity
import cmwell.analytics.util.{CmwellConnector, DatasetFilter}
import cmwell.analytics.util.TimestampConversion.timestampConverter
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object CheckInfotonDataIntegrity {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CheckInfotonDataIntegrity.getClass)

    try {
      object Opts extends ScallopConf(args) {

        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Check infoton data integrity"
      ).withSparkSessionDo { spark =>

        val datasetFilter = DatasetFilter(
          lastModifiedGte = Opts.lastModifiedGteFilter.toOption,
          pathPrefix = Opts.pathPrefixFilter.toOption)

        val ds = InfotonDataIntegrity(Some(datasetFilter))(spark)

        val damagedInfotons = ds.filter(infoton =>
          infoton.hasIncorrectUuid ||
            infoton.hasDuplicatedSystemFields ||
            infoton.hasInvalidContent ||
            infoton.hasMissingOrIllFormedSystemFields
        )

        damagedInfotons.select("uuid", "lastModified", "path",
          "hasIncorrectUuid", "hasMissingOrIllFormedSystemFields", "hasDuplicatedSystemFields", "hasInvalidContent", "hasUnknownSystemField")
          .write.csv(Opts.out())
      }
    } catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
}
Example 4
Source File: ExtractFromParquet.scala From CM-Well with Apache License 2.0
package cmwell.analytics.data

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import cmwell.analytics.util.Connector
import cmwell.analytics.util.StringUtil._
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.rogach.scallop.{ScallopConf, ScallopOption}

object ExtractFromParquet {

  def main(args: Array[String]): Unit = {

    object Opts extends ScallopConf(args) {

      val pathsToFind: ScallopOption[String] = opt[String]("paths-to-find", short = 'f', descr = "A file containing the list of paths to look for", required = true)
      val parquetData: ScallopOption[String] = opt[String]("parquet-file", short = 'p', descr = "A Parquet file containing the data; single string column rdfStatement", required = true)
      val extractedData: ScallopOption[String] = opt[String]("extracted-data", short = 'd', descr = "The file that extracted data will be written to (in nquads format)", required = true)
      val pathsNotFound: ScallopOption[String] = opt[String]("paths-not-found", short = 'n', descr = "The output file that any paths that were not found are written to", required = true)
      val pathsFound: ScallopOption[String] = opt[String]("paths-found", short = 'a', descr = "The output file containing the paths that we found are written to", required = true)

      verify()
    }

    Connector(sparkShell = true, appName = "Extract from parquet").withSparkSessionDo { spark: SparkSession =>

      val pathsToFind = Set(splitLines(FileUtils.readFileToString(new File(Opts.pathsToFind()), UTF_8)): _*)

      val ds: DataFrame = spark.read.parquet(Opts.parquetData())

      // Cheesy parsing of path from an RDF nquad, but sufficient for this purpose
      def extractPath(rdfStatement: String): String = rdfStatement.substring(7, rdfStatement.indexOf(">"))

      val statementsFound = ds.rdd.filter { row: Row =>
        val statement = row.getAs[String]("rdfStatement")
        val path = extractPath(statement)
        pathsToFind.contains(path)
      }.collect() // expect the result to be small, so collect is OK

      // Save all the paths that were not found to file - look for them in other files.
      val pathsFound: Set[String] = Set(statementsFound.map(row => extractPath(row.getString(0))): _*)
      println(s"There were ${pathsFound.size} paths found (out of ${pathsToFind.size}).")
      FileUtils.writeStringToFile(new File(Opts.pathsFound()), pathsFound.mkString("\n"), UTF_8, false)

      val pathsNotFound = pathsToFind.diff(pathsFound)
      println(s"There were ${pathsNotFound.size} paths not found.")
      FileUtils.writeStringToFile(new File(Opts.pathsNotFound()), pathsNotFound.mkString("\n"), UTF_8, false)

      // Save the RDF statements for the paths that were found
      val x = statementsFound.map(row => row.getString(0)).mkString("\n")
      FileUtils.writeStringToFile(new File(Opts.extractedData()), x, UTF_8, false)
    }
  }
}
Example 5
Source File: DumpCompleteDocumentFromEs.scala From CM-Well with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpCompleteDocumentFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpCompleteDocumentFromEs.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("dump-complete-document-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-filter", short = 'c', descr = "Filter on current status", default = None)
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithCompleteDocument
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    } catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    } finally {
      system.terminate()
    }
  }
}
Example 6
Source File: CopyIndex.scala From CM-Well with Apache License 2.0
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object CopyIndex {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CopyIndex.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("copy-index")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from", required = true)
        val writeIndex: ScallopOption[String] = opt[String]("write-index", short = 'w', descr = "The name of the index to write to", required = true)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      val dataWriterFactory = DataWriterFactory.index[IndexWithCompleteDocument](
        indexName = Opts.writeIndex(),
        esEndpoint = esContactPoint)

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithCompleteDocument,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)
    } catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    } finally {
      system.terminate()
    }
  }
}
Example 7
Source File: DumpKeyFieldsFromEs.scala From CM-Well with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithKeyFields}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpKeyFieldsFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpKeyFieldsFromEs.getClass)

    implicit val system: ActorSystem = ActorSystem("dump-key-fields-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
      // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
      // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
      val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default = Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithKeyFields
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    } catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    } finally {
      system.terminate()
    }
  }
}
Example 8
Source File: DumpUuidOnlyFromEs.scala From CM-Well with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithUuidOnly}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpUuidOnlyFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpUuidOnlyFromEs.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("dump-uuid-only-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default = Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithUuidOnly
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    } catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    } finally {
      system.terminate()
    }
  }
}
Example 9
Source File: DumpSystemFieldsFromEs.scala From CM-Well with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithSystemFields}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpSystemFieldsFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpSystemFieldsFromEs.getClass)

    implicit val system: ActorSystem = ActorSystem("dump-system-fields-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
      // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
      // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
      val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default = Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithSystemFields
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    } catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    } finally {
      system.terminate()
    }
  }
}
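Examples 7-9 above all build a --source-filter / --no-source-filter switch with Scallop's toggle, which generates the positive flag and the prefixed negative flag from a single declaration. Below is a stripped-down sketch of just that pattern; the surrounding object is hypothetical, while the option name and descriptions mirror the CM-Well examples.

import org.rogach.scallop.{ScallopConf, ScallopOption}

object ToggleDemo {
  def main(args: Array[String]): Unit = {
    object Opts extends ScallopConf(args.toIndexedSeq) {
      // toggle creates both --source-filter and --no-source-filter; the default here is "on".
      val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default = Some(true),
        prefix = "no-",
        descrYes = "Use source filtering to reduce network traffic",
        descrNo = "Do not filter _source fields (workaround for bad index)")
      verify()
    }
    println(s"source filtering enabled: ${Opts.sourceFilter()}")
  }
}

Passing --no-source-filter on the command line would make Opts.sourceFilter() return false, which the examples above feed straight into PartitionedDownloader.runDownload.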
Example 10
Source File: CopyIndexesWithMapping.scala From CM-Well with Apache License 2.0
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.collection.JavaConverters._
import scala.concurrent.ExecutionContextExecutor

object CopyIndexesWithMapping {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CopyIndexesWithMapping.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("copy-index-with-mapping")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      object Opts extends ScallopConf(args) {

        val indexMap: ScallopOption[String] = opt[String]("index-map", short = 'i', descr = "A map from source to target index names, in JSON format", required = true)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())

      // Expect a map in the form: { "sourceIndex1": "targetIndex1", "sourceIndex2": "targetIndex2", ... }
      val indexMap: Map[String, String] = new ObjectMapper().readTree(Opts.indexMap()).fields.asScala.map { entry =>
        entry.getKey -> entry.getValue.asText
      }.toMap

      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexMap.keys.toSeq)

      // Validate that the index-map parameter specified valid index names, and not aliases.
      for (indexName <- indexMap.keys)
        if (!esTopology.allIndexNames.contains(indexName))
          throw new RuntimeException(s"index-map parameter included $indexName as a source, which is not a valid index name.")

      for (indexName <- indexMap.values)
        if (!esTopology.allIndexNames.contains(indexName))
          throw new RuntimeException(s"index-map parameter included $indexName as a target, which is not a valid index name.")

      val dataWriterFactory = DataWriterFactory.index[IndexWithCompleteDocument](
        indexMap = indexMap,
        esEndpoint = esContactPoint)

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithCompleteDocument,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)
    } catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    } finally {
      system.terminate()
    }
  }
}
Example 11
Source File: CalculateXORSummary.scala From CM-Well with Apache License 2.0
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{IndexWithSourceHash, XORSummary, XORSummaryFactory}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.codec.binary.Hex
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object CalculateXORSummary {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CalculateXORSummary.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node from processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("xor-summary")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from", required = false)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      val dataWriterFactory = new XORSummaryFactory()

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithSourceHash,
        dataWriterFactory = dataWriterFactory.apply,
        sourceFilter = false)

      // Summarize the summaries down to the index level.
      val summaryByIndex: Map[String, XORSummary] = dataWriterFactory.shardSummaries
        .groupBy { case (shard, _) => shard.indexName }
        .map { case (indexName, summaryMap) => indexName -> summaryMap.values.reduce(XORSummary.combine) }

      // TODO: Fix questionable JSON generation
      val r = "{" + summaryByIndex.map { case (index, summary) =>
        val x = Hex.encodeHexString(summary.summary)
        s""" { "index": "$index", "summary": "$x" } """
      }.mkString("\n") + "}"

      println(r)
    } catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    } finally {
      system.terminate()
    }
  }
}
Example 12
Source File: TestResults.scala From mimir with Apache License 2.0
package mimir.util

import java.nio.file.Files
import java.nio.file.Paths
import java.nio.charset.Charset
import java.nio.charset.StandardCharsets
import java.io.ByteArrayOutputStream
import java.io.PrintWriter

import org.rogach.scallop.ScallopConf

object TestResults {

  def main(args: Array[String]) {
    val config = new TestResultConfig(args)
    println("running tests....")
    parseTestResults(config.sbtPath(), config.sbtCmd())
  }

  def parseTestResults(sbtPath: String = "/opt/local/bin/sbt", sbtCmd: String = "test") = {
    val procOutput = runCommand(Seq(sbtPath, sbtCmd))._2.replaceAll("""\x1b\[[0-9;]*[a-zA-Z]""", "")
    val pattern = """(?m)^.*\[info\] Total.*$|^.*\[info\] Finished.*$|^.*\[info\] [\d]+ examp.*$""".r

    val header = "test_name,seconds,examples,expectations,failures,errors,skipped\n"

    val pattern2 = """\[info\] Total for specification (\w+)\s+\[info\] Finished in (.+)\R\[info\] (.+)\R""".r
    val pattern3 = """([a-zA-Z]+): (?:(\d+) minutes? )?(?:(\d+) seconds?[,:] )?(?:(\d+) ms[,:] )?(\d+) examples?, (?:(\d+) expectations?, )?(\d+) failures?, (\d+) errors?(?:, (\d+) skipped)?""".r

    val string = pattern2.findAllMatchIn(procOutput).map(mat => s"${mat.group(1)}: ${mat.group(2)}: ${mat.group(3)}")
      .map(nline => nline match {
        case pattern3(test_name, minutes, seconds, ms, examples, expectations, failures, errors, skipped) => {
          val allseconds = (minutes match {
            case "" => 0
            case null => 0
            case x => x.toInt * 60
          }) + (seconds match {
            case "" => 0
            case null => 0
            case x => x.toInt
          }) + (ms match {
            case "" => 0.0
            case null => 0.0
            case x => x.toDouble / 1000.0
          })
          s"$test_name,$allseconds,$examples,$expectations,$failures,$errors,$skipped"
        }
      }).mkString("\n")

    val outStr = header + string
    println(outStr)
    Files.write(Paths.get("test_output.csv"), outStr.getBytes(StandardCharsets.UTF_8))
  }

  import sys.process._

  def runCommand(cmd: Seq[String]): (Int, String, String) = {
    val stdoutStream = new ByteArrayOutputStream
    val stderrStream = new ByteArrayOutputStream
    val stdoutWriter = new PrintWriter(stdoutStream)
    val stderrWriter = new PrintWriter(stderrStream)
    val exitValue = cmd.!(ProcessLogger(stdoutWriter.println, stderrWriter.println))
    stdoutWriter.close()
    stderrWriter.close()
    (exitValue, stdoutStream.toString, stderrStream.toString)
  }
}

class TestResultConfig(arguments: Seq[String]) extends ScallopConf(arguments) {
  val experimental = opt[List[String]]("X", default = Some(List[String]()))
  val sparkHost = opt[String]("sparkHost", descr = "The IP or hostname of the spark master", default = Some("spark-master.local"))
  val sparkPort = opt[String]("sparkPort", descr = "The port of the spark master", default = Some("7077"))
  val sbtPath = opt[String]("sbtPath", descr = "The path to sbt binary", default = Some("/opt/local/bin/sbt"))
  val sbtCmd = opt[String]("sbtCmd", descr = "The sbt command to run", default = Some("test"))
}
Example 13
Source File: AnalyzeInconsistenciesResult.scala From CM-Well with Apache License 2.0
package cmwell.analytics.main

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import cmwell.analytics.data.InfotonAndIndexWithSystemFields
import cmwell.analytics.util.Connector
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.collection.breakOut

object AnalyzeInconsistenciesResult {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(AnalyzeInconsistenciesResult.getClass)

    try {
      object Opts extends ScallopConf(args) {

        val in: ScallopOption[String] = opt[String]("in", short = 'i', descr = "The path to read the (parquet) inconsistencies dataset from", required = true)
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the (csv) output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))

        verify()
      }

      Connector(
        appName = "Analyze InfotonAndIndexWithSystemFields Output",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds: DataFrame = spark.read.parquet(Opts.in())

        import org.apache.spark.sql.functions._

        // A column expression that counts the number of failures for each constraint.
        // This will also include null counts, needed to interpret the results.
        val constraints: Seq[(String, Column)] = InfotonAndIndexWithSystemFields.constraints(ds).map { case (name, predicate) =>
          name -> sum(when(predicate, 0L).otherwise(1L)).as(name)
        }(breakOut)

        // Compute the failure counts
        val failureCounts: Row = ds.agg(constraints.head._2, constraints.tail.map(_._2): _*).head

        val results = for {
          i <- constraints.indices
          constraintName = constraints(i)._1
          failureCount = if (failureCounts.isNullAt(i)) 0 else failureCounts.getAs[Long](i)
        } yield s"$constraintName,$failureCount"

        FileUtils.write(new File(Opts.out()), "constraint,failures\n" + results.mkString("\n"), UTF_8)
      }
    } catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
}
Example 14
Source File: Options.scala From ai.vitk.ner with GNU General Public License v3.0
package ai.vitk.ner

import org.rogach.scallop.ScallopConf

class Options(arguments: Seq[String]) extends ScallopConf(arguments) {
  val master = opt[String](default = Some("local[*]"), descr = "the Spark master URL")
  val memory = opt[String](default = Some("8g"), descr = "executor memory")
  val mode = opt[String](default = Some("tag"), descr = "mode of the tagger, either 'train', 'tag' or 'eval'")
  val verbose = opt[Boolean](default = Some(false), descr = "verbose mode")
  val language = opt[String](default = Some("vi"), descr = "natural language in use, either 'vi', 'en' or 'ja'")
  val dimension = opt[Int](default = Some(32768), descr = "domain dimension for feature hashing")
  val iteration = opt[Int](default = Some(600), descr = "max number of iterations in training")
  val independent = opt[Boolean](default = Some(false), descr = "use only independent features")
  val reversed = opt[Boolean](default = Some(false), descr = "backward model")
  val input = opt[String](default = Some("test.txt"), descr = "input file for tagging")
  verify()
}
Example 15
Source File: Run.scala From flamy with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.run.GraphRunner
import com.flaminem.flamy.exec.utils.{ReturnFailure, ReturnStatus}
import com.flaminem.flamy.model.ItemArgs
import com.flaminem.flamy.model.names.ItemName
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls

class Run extends Subcommand("run") with FlamySubcommand {

  banner("Perform a run on the specified environment")

  private val environment: ScallopOption[Environment] =
    opt(name = "on", default = None, descr = "Specifies environment to run on.", required = false, noshort = true)
  private val dryRun: ScallopOption[Boolean] =
    opt(name = "dry", default = Some(false), descr = "Perform a dry-run", noshort = true)

  validateOpt(environment, dryRun) {
    case (None, Some(false)) =>
      Left("Please specify an environment to run on (with the --on option), or use the --dry option to perform a local dry-run")
    case _ => Right(())
  }

  private val from: ScallopOption[List[ItemName]] =
    opt[List[ItemName]](name = "from", default = Some(Nil), descr = "start from the given schemas/tables.", noshort = true, argName = "items")
  private val to: ScallopOption[List[ItemName]] =
    opt[List[ItemName]](name = "to", default = Some(Nil), descr = "stop at the given schemas/tables.", noshort = true, argName = "items")

  codependent(from, to)

  private val items: ScallopOption[List[ItemName]] = trailArg[List[ItemName]](default = Some(Nil), required = false)

  lazy val itemArgs = ItemArgs(items(), from(), to())

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    val context = new FlamyContext(globalOptions, this.environment.get)
    context.dryRun = this.dryRun()
    if (itemArgs.isEmpty) {
      System.err.println("Please specify items to run on")
      ReturnFailure
    }
    else {
      val graphRunner = GraphRunner(itemArgs, context)
      graphRunner.run()
    }
  }

}
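Example 15 also exercises Scallop's cross-option validation hooks (validateOpt and codependent), and Example 19 further down uses requireOne. A minimal sketch of these helpers follows, with made-up option names; it only illustrates the mechanism and is not taken from flamy.

import org.rogach.scallop.{ScallopConf, ScallopOption}

class ValidationConf(arguments: Seq[String]) extends ScallopConf(arguments) {
  val from: ScallopOption[String] = opt[String](noshort = true)
  val to: ScallopOption[String] = opt[String](noshort = true)
  val json: ScallopOption[Boolean] = opt[Boolean](noshort = true)
  val csv: ScallopOption[Boolean] = opt[Boolean](noshort = true)

  // --from and --to must be given together, or not at all.
  codependent(from, to)
  // Exactly one of --json / --csv must be supplied.
  requireOne(json, csv)
  // Arbitrary cross-option checks can be expressed with validateOpt.
  validateOpt(from, to) {
    case (Some(f), Some(t)) if f == t => Left("--from and --to must differ")
    case _ => Right(())
  }
  verify()
}

verify() runs these checks after parsing, so an invalid combination is reported before any option value is read.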
Example 16
Source File: Check.scala From flamy with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.spark.ModelSparkContext
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.files.{FileRunner, ItemFileAction}
import com.flaminem.flamy.exec.hive.{HivePartitionFetcher, ModelHivePartitionFetcher}
import com.flaminem.flamy.exec.utils._
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.graph.TableGraph
import com.flaminem.flamy.model._
import com.flaminem.flamy.model.core.Model
import com.flaminem.flamy.model.files.FilePath
import com.flaminem.flamy.model.names.ItemName
import org.apache.spark.sql.SQLContext
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls

    val runGraph: TableGraph = baseGraph.subGraph(items())

    val dryRunner: FlamyRunner = FlamyRunner(context)
    println("Creating schemas and tables ...")
    try {
      dryRunner.checkAll(baseGraph)
    } finally {
      //TODO: For some strange reason, closing the connection here will result in ClassNotFoundErrors for udfs in the RunActions...
      // dryRunner.close()
    }
    FlamyOutput.out.info("Running Populates ...")
    dryRunner.populateAll(runGraph.model, context)
    dryRunner.close()
    ReturnStatus(success = dryRunner.getStats.getFailCount == 0)
  }
}

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case (command: FlamySubcommand) :: Nil => command.doCommand(globalOptions, Nil)
      case Nil => throw new IllegalArgumentException("A subcommand is expected")
      case _ =>
        printHelp()
        ReturnFailure
    }
  }
}
Example 17
Source File: Drop.scala From flamy with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.actions.{DropSchemaAction, DropTableAction}
import com.flaminem.flamy.exec.hive.HiveTableFetcher
import com.flaminem.flamy.exec.utils.{Action, _}
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.model.{ItemFilter, TableInfo}
import com.flaminem.flamy.model.exceptions.FlamyException
import com.flaminem.flamy.model.names.{ItemName, SchemaName, TableName}
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

class Drop extends Subcommand("drop") with FlamySubcommand {

  val schemas: Subcommand = new Subcommand("schemas") with FlamySubcommand {
    banner("Drop the specified schemas on the specified environment")
    val environment: ScallopOption[Environment] = opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
    val dryRun: ScallopOption[Boolean] = opt(name = "dry", default = Some(false), descr = "Perform a dry-run", required = false, noshort = true)
    val all: ScallopOption[Boolean] = opt(
      name = "all",
      default = Some(false),
      descr = "Unlike other commands, not providing any schema name will not do anything. Unless you use this option.",
      noshort = true
    )
    val items: ScallopOption[List[String]] = trailArg[List[String]](default = Some(List()), required = false)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      if (all() && items().nonEmpty) {
        throw new FlamyException("Using the --all option will drop all schemas, and no schema name should be specified.")
      }
      val context = new FlamyContext(globalOptions, environment.get)
      context.dryRun = dryRun()
      val itemFilter = ItemFilter(items(), acceptIfEmpty = all())
      val fetcher = HiveTableFetcher(context)
      val schemaNames: Iterable[SchemaName] = fetcher.listSchemaNames.filter{itemFilter}.filterNot{_.fullName == "default"}

      val flamyRunner: FlamyRunner = FlamyRunner(context)
      val actionRunner = new ActionRunner(silentOnSuccess = false, silentOnFailure = false)
      val dropActions = schemaNames.map{schemaName => new DropSchemaAction(schemaName, flamyRunner)}
      actionRunner.run(dropActions)
      ReturnStatus(success = actionRunner.getStats.getFailCount == 0)
    }
  }

  val tables: Subcommand = new Subcommand("tables") with FlamySubcommand {
    banner("Drop the specified tables on the specified environment")
    val environment: ScallopOption[Environment] = opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
    val dryRun: ScallopOption[Boolean] = opt(name = "dry", default = Some(false), descr = "Perform a dry-run", required = false, noshort = true)
    val all: ScallopOption[Boolean] = opt(
      name = "all",
      default = Some(false),
      descr = "Unlike other commands, not providing any table name will not do anything. Unless you use this option.",
      noshort = true
    )
    val items: ScallopOption[List[ItemName]] = trailArg[List[ItemName]](default = Some(List()), required = false)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      if (all() && items().nonEmpty) {
        throw new FlamyException("Using the --all option will drop all tables, and no table name should be specified.")
      }
      if (!all() && items().isEmpty) {
        throw new FlamyException("If you really want to drop all the tables, you should add the --all option.")
      }
      val context = new FlamyContext(globalOptions, environment.get)
      context.dryRun = dryRun()
      val itemFilter = ItemFilter(items(), acceptIfEmpty = all())
      val fetcher = HiveTableFetcher(context)
      val tables: Iterable[TableInfo] = fetcher.listTables(itemFilter)

      val flamyRunner: FlamyRunner = FlamyRunner(context)
      val actionRunner = new ActionRunner(silentOnSuccess = false, silentOnFailure = false)
      val dropActions = tables.map{table => new DropTableAction(table, flamyRunner)}
      actionRunner.run(dropActions)
      ReturnStatus(success = actionRunner.getStats.getFailCount == 0)
    }
  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case (command: FlamySubcommand) :: Nil => command.doCommand(globalOptions, Nil)
      case _ => printHelp()
    }
    ReturnSuccess
  }

}
Example 18
Source File: GatherInfo.scala From flamy with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.hive.HivePartitionFetcher
import com.flaminem.flamy.exec.utils.{ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.model.ItemFilter
import com.flaminem.flamy.model.names.ItemName
import com.flaminem.flamy.utils.AutoClose
import com.flaminem.flamy.utils.time.TimeUtils
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls

class GatherInfo extends Subcommand("gather-info") with FlamySubcommand {

  banner("Gather all partitioning information on specified items (everything if no argument is given) and output this as csv on stdout.")

  val environment: ScallopOption[Environment] = opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
  val items: ScallopOption[List[ItemName]] = trailArg[List[ItemName]](default = Some(List()), required = false)

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    val context = new FlamyContext(globalOptions, this.environment.get)
    for {
      fetcher: HivePartitionFetcher <- AutoClose(HivePartitionFetcher(context))
    } {
      val itemFilter = new ItemFilter(this.items(), true)
      for {
        tpInfo <- fetcher.listTableNames.filter{itemFilter}.map{fetcher.getTablePartitioningInfo}
        partition <- tpInfo.sortedTablePartitions
      } {
        println(
          Seq(
            tpInfo.tableName.schemaName,
            tpInfo.tableName.name,
            partition.partitionName,
            partition.getFileSize.getOrElse("\\N"),
            partition.getModificationTime(context, refresh = false).map{TimeUtils.timestampToUniversalTime}.getOrElse("\\N")
          ).mkString("\t")
        )
      }
    }
    ReturnSuccess
  }

}
Example 19
Source File: Export.scala From flamy with Apache License 2.0
package com.flaminem.flamy.commands.tools

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyContextFormatter, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.utils.{ReturnFailure, ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.graph.TableGraph
import com.flaminem.flamy.model.core.Model
import com.flaminem.flamy.model.exceptions.UnexpectedBehaviorException
import com.flaminem.flamy.model.names.ItemName
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

class Export extends Subcommand("export") with FlamySubcommand {

  val conf = new Subcommand("conf") with FlamySubcommand {
    banner("Automatically generate a configuration template or doc")

    private lazy val template: ScallopOption[Boolean] = toggle(name = "template", default = Some(false), noshort = true)
    private lazy val markdown: ScallopOption[Boolean] = toggle(name = "markdown", default = Some(false), noshort = true)
    private lazy val rst: ScallopOption[Boolean] = toggle(name = "rst", default = Some(false), noshort = true)

    requireOne(template, markdown, rst)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      val context = new FlamyContext(globalOptions, Some(Environment("<ENV>")))
      if (template()) {
        FlamyOutput.out.println(new FlamyContextFormatter(context).toTemplate)
      }
      else if (markdown()) {
        FlamyOutput.out.println(new FlamyContextFormatter(context).toMarkdown)
      }
      else if (rst()) {
        FlamyOutput.out.println(new FlamyContextFormatter(context).toRST)
      }
      else {
        throw new UnexpectedBehaviorException("Either --template or --markdown option should be used")
      }
      ReturnSuccess
    }
  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case (command: FlamySubcommand) :: Nil => command.doCommand(globalOptions, Nil)
      case Nil => throw new IllegalArgumentException("A subcommand is expected")
      case _ =>
        printHelp()
        ReturnFailure
    }
  }

}
Example 20
Source File: Repair.scala From flamy with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.hive.HiveTableFetcher
import com.flaminem.flamy.exec.utils.{Action, ActionRunner, ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.model.ItemFilter
import com.flaminem.flamy.model.names.{ItemName, TableName}
import com.flaminem.flamy.utils.AutoClose
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls

class Repair extends Subcommand("repair") with FlamySubcommand {

  val tables = new Subcommand("tables") {
    banner("Execute a msck repair table on every specified table. " +
      "This will automatically add to the metastore the partitions that exists on hdfs but not yet in the metastore.")
    val environment: ScallopOption[Environment] = opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
    val dryRun: ScallopOption[Boolean] = opt(name = "dry", default = Some(false), descr = "Perform a dry-run", required = false, noshort = true)
    val items: ScallopOption[List[ItemName]] = trailArg[List[ItemName]](default = Some(List()), required = false)
  }

  private class RepairTableAction(runner: FlamyRunner, tableName: TableName) extends Action {

    @throws(classOf[Exception])
    override def run(): Unit = {
      runner.runText(f"use ${tableName.schemaName} ; MSCK REPAIR TABLE ${tableName.name}")
    }

    override val name: String = tableName.fullName
    override val logPath: String = f"${tableName.schemaName}.db/${tableName.name}/REPAIR.hql"
  }

  private def repairTables(context: FlamyContext, items: ItemName*): Unit = {
    val itemFilter = new ItemFilter(items, acceptIfEmpty = true)
    val fetcher = HiveTableFetcher(context)
    val tables: Iterable[TableName] = fetcher.listTables(itemFilter).filterNot{_.isView}.filter{_.isPartitioned}.map{_.tableName}

    val actionRunner: ActionRunner = new ActionRunner(silentOnSuccess = false)
    for {
      flamyRunner: FlamyRunner <- AutoClose(FlamyRunner(context))
    } {
      val actions = tables.map{tableName => new RepairTableAction(flamyRunner, tableName)}
      actionRunner.run(actions)
    }
  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case (command@this.tables) :: Nil =>
        val context = new FlamyContext(globalOptions, command.environment.get)
        context.dryRun = command.dryRun()
        repairTables(context, command.items(): _*)
      case _ => printHelp()
    }
    ReturnSuccess
  }

}
Example 21
Source File: Count.scala From flamy with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.hive.{HivePartitionFetcher, HiveTableFetcher, RemoteHiveRunner}
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.exec.utils.{ReturnFailure, ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.model.ItemFilter
import com.flaminem.flamy.model.names.{ItemName, TableName, TablePartitionName}
import com.flaminem.flamy.utils.AutoClose
import com.flaminem.flamy.utils.prettyprint.Tabulator
import com.flaminem.flamy.utils.sql.hive.StreamedResultSet
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls

class Count extends Subcommand("count") with FlamySubcommand {

  val tables = new Subcommand("tables") with FlamySubcommand {
    banner("Execute a select count(1) on every specified table.")

    val environment: ScallopOption[Environment] = opt(name = "on", descr = "Specifies environment to run on", required = false, noshort = true)
    val items: ScallopOption[List[ItemName]] = trailArg[List[ItemName]](default = Some(List()), required = false)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      val context = new FlamyContext(globalOptions, environment.get)
      val itemFilter = ItemFilter(items(), acceptIfEmpty = true)
      val fetcher = HiveTableFetcher(context)
      val tables: Iterable[TableName] = fetcher.listTableNames.filter{itemFilter}

      val hiveRunner: RemoteHiveRunner = new RemoteHiveRunner(context)
      try {
        for {
          tableName <- tables
          if !Thread.currentThread().isInterrupted
        } try {
          val res: StreamedResultSet = hiveRunner.executeQuery(f"SELECT COUNT(1) FROM $tableName")
          val row = res.next()
          FlamyOutput.out.success(f"ok: $tableName : ${row(0)}")
        } catch {
          case e: Throwable =>
            e.printStackTrace()
            FlamyOutput.err.failure(f"not ok: $tableName : ${e.getMessage}")
        }
      } finally {
        hiveRunner.close()
      }
      ReturnSuccess
    }
  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case (command: FlamySubcommand) :: Nil => command.doCommand(globalOptions, Nil)
      case Nil => throw new IllegalArgumentException("A subcommand is expected")
      case _ =>
        printHelp()
        ReturnFailure
    }
  }

}
Example 22
Source File: WaitForPartition.scala From flamy with Apache License 2.0 | 5 votes |
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.hive.PartitionWaiter
import com.flaminem.flamy.exec.utils.ReturnStatus
import com.flaminem.flamy.model.names.ItemName
import com.flaminem.flamy.utils.AutoClose
import com.flaminem.flamy.utils.time.TimeUtils
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls

class WaitForPartition extends Subcommand("wait-for-partition") with FlamySubcommand {

  banner("Wait for a partition to be created.")

  val environment: ScallopOption[Environment] =
    opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
  val timeout: ScallopOption[Long] = opt(
    name = "timeout",
    descr = "Number of seconds after which flamy will fail if the partition still does not exist",
    default = Some(12 * 3600),
    noshort = true
  )
  val after: ScallopOption[String] = opt(
    name = "after",
    argName = "yyyy-MM-dd HH:mm:ss",
    descr = """Wait for the partition to be created or refreshed after this time. Expected format is "yyyy-MM-dd HH:mm:ss"""",
    default = None,
    noshort = true
  )
  val retryInterval: ScallopOption[Long] = opt(
    name = "retry-interval",
    argName = "INTERVAL",
    descr = "When a partition is not found, retry after INTERVAL seconds",
    default = Some(60),
    noshort = true
  )
  val items: ScallopOption[List[ItemName]] = trailArg[List[ItemName]](required = true)

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    val context = new FlamyContext(globalOptions, environment.get)
    val waiter = new PartitionWaiter(context)
    for {
      waiter: PartitionWaiter <- AutoClose(new PartitionWaiter(context))
    } yield {
      waiter.waitForPartition(items(), timeout(), after.get.map{TimeUtils.universalTimeToTimeStamp}, retryInterval())
    }
  }

}
Example 23
Source File: CliArgs.scala From flamy with Apache License 2.0 | 5 votes |
package com.flaminem.flamy.exec.shell

import com.flaminem.flamy.Launcher
import com.flaminem.flamy.Launcher.Options
import org.rogach.scallop.{CliOption, Scallop, ScallopConf}

    val lastOptionArgs: Seq[String] =
      if(lastOptionAndArgs.isEmpty) {
        Nil
      }
      else {
        lastOptionAndArgs.tail
      }

    val trailArgs: Seq[String] =
      if(lastOption.isDefined) {
        Nil
      }
      else {
        subCommands.foldLeft(args){
          case (args, command) => args.dropWhile(_ != command).drop(1)
        }
      }

    new CliArgs(
      builder = builder,
      args = args,
      lastOption = lastOption,
      lastOptionArgs = lastOptionArgs,
      previousOptions = previousOptions,
      trailArgs = trailArgs,
      lastWord = lastWord
    )
  }
}
Example 24
Source File: VariablesTest.scala From flamy with Apache License 2.0 | 5 votes |
package com.flaminem.flamy.model import org.rogach.scallop.{ScallopConf, Subcommand} import org.scalatest.{FreeSpec, Matchers} class VariablesTest extends FreeSpec with Matchers{ "replaceInText should work" in { val variables = new Variables variables += ("TO_REPLACE" -> "REPLACED") val text = "TO_REPLACE has been ${TO_REPLACE}" val expected = "TO_REPLACE has been REPLACED" assert(variables.replaceInText(text)===expected) } "subsetInText should work" in { val variables = new Variables variables += ("IN_KEY" -> "IN_VALUE") variables += ("OUT_KEY" -> "OUT_VALUE") val text = "this text contains ${IN_KEY} but does not contains OUT_KEY" val expectedVariables = new Variables expectedVariables += ("IN_KEY" -> "IN_VALUE") assert(variables.subsetInText(text, Nil) === expectedVariables) } "replaceInText should preserve partition variables" in { val text: String = """INSERT OVERWRITE TABLE db1.dest PARTITION(part=${partition:toto}) SELECT ${partition:toto} as num FROM db2.source""" val vars = new Variables() vars += "partition:toto" -> "${partition:toto}0" val expected: String = """INSERT OVERWRITE TABLE db1.dest PARTITION(part="${partition:toto}") SELECT "${partition:toto}" as num FROM db2.source""" assert(vars.replaceInText(text) == expected) } "the scallopConverter should work" in { object Conf extends ScallopConf(Seq("--variables", "HELLO=yes", "LIST=(1,2,3)", "sub")) { val variables = opt[Variables](name = "variables")(Variables.scallopConverter) val sub = new Subcommand("sub") { banner("Print version information of this program") override def toString = "sub" } } assert(Conf.variables() === Variables("HELLO" -> "yes", "LIST" -> "(1,2,3)")) assert(Conf.subcommand.get.toString === "sub") } }
Example 25
Source File: Main.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.dc.stream

import akka.actor.ActorSystem
import cmwell.ctrl.hc.HealthControl
import cmwell.dc.{LazyLogging, Settings}
import cmwell.tools.data.sparql.{SparqlProcessorManager, SparqlProcessorManagerSettings}
import cmwell.tracking.ResurrectorActor
import k.grid.service.ServiceTypes
import k.grid.{Grid, GridConnection}
import org.rogach.scallop.ScallopConf
import uk.org.lidalia.sysoutslf4j.context.SysOutOverSLF4J

object Main extends App with LazyLogging {
  import Settings._

  logger.info("Starting Dc-Sync using stream")

  // SLF4J initialization is not thread safe, so it is "initialized" by writing a log line and only then calling sendSystemOutAndErrToSLF4J.
  // Without this, an error is printed to stderr and some log lines at the beginning are lost.
  SysOutOverSLF4J.sendSystemOutAndErrToSLF4J()

  Grid.setGridConnection(GridConnection(memberName = "dc"))
  Grid.declareServices(
    ServiceTypes()
      .add("DataCenterSyncManager", classOf[DataCenterSyncManager], destinationHostsAndPorts(rawTarget), None)
      .add(HealthControl.services)
      .add(SparqlProcessorManager.name, classOf[SparqlProcessorManager], new SparqlProcessorManagerSettings)
      .add("Resurrector", classOf[ResurrectorActor])
  )

  Grid.joinClient
  HealthControl.init
  Thread.sleep(10000)
}

object MainStandAlone extends App with LazyLogging {
  import Settings._

  implicit val sys = ActorSystem("ExtrenalSystem")

  val conf = new Conf(args)

  val ar = sys.actorOf(
    DataCenterSyncManager.props(destinationHostsAndPorts(conf.destinationHosts()), Some(conf.syncJson()))
  )
}

class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
  // val syncJson = opt[String](required = true)
  val syncJson = trailArg[String]()
  val destinationHosts = trailArg[String]()
  verify()
}
Example 26
Source File: CollectionRollerCliParser.scala From pulse with Apache License 2.0 | 5 votes |
package io.phdata.pulse.collectionroller

import org.rogach.scallop.ScallopConf

class CollectionRollerCliArgsParser(args: Seq[String]) extends ScallopConf(args) {
  lazy val conf = opt[String]("conf",
                              's',
                              required = true,
                              descr = "Path to the collection roller yaml configuration")
  lazy val daemonize = opt[Boolean](
    "daemonize",
    required = false,
    default = Some(false),
    descr = "Daemonize the process and run the CollectionRoller on a schedule")
  lazy val zkHosts = opt[String]("zk-hosts", required = true, descr = "Zookeeper hosts")
  lazy val deleteApplications =
    opt[String]("delete-applications", required = false, descr = "Delete applications (operation)")
  lazy val listApplications =
    opt[Boolean]("list-applications", required = false, descr = "List all applications (operation)")
  lazy val verbose = opt[Boolean](
    "verbose",
    required = false,
    descr = "List additional info (aliases and collections) for all applications (operation)")

  verify()
}
Example 27
Source File: AlertEngineCliParser.scala From pulse with Apache License 2.0 | 5 votes |
package io.phdata.pulse.alertengine

import org.rogach.scallop.{ScallopConf, ScallopOption}

class AlertEngineCliParser(args: Seq[String]) extends ScallopConf(args) {
  lazy val conf: ScallopOption[String] = opt[String](
    "conf",
    's',
    required = true,
    descr = "Alert Engine config yaml file. See https://github.com/phdata/pulse/blob/master/alert-engine/README.md for schema")

  lazy val daemonize: ScallopOption[Boolean] = opt[Boolean](
    "daemonize",
    required = false,
    default = Some(false),
    descr = "Daemonize the process and run alerting on an interval")

  lazy val smtpServer: ScallopOption[String] =
    opt[String]("smtp-server", required = false, descr = "SMTP server hostname")

  lazy val smtpUser: ScallopOption[String] = opt[String](
    "smtp-user",
    required = false,
    descr = "SMTP username (from address), like '[email protected]'")

  // default smtpPassword is "", this will turn Some("") into None
  lazy val smtpPassword: Option[String] = sys.env.get("SMTP_PASSWORD").filter(_ != "")

  lazy val smtpPort: ScallopOption[Long] =
    opt[Long]("smtp-port", required = false, descr = "SMTP server port. Defaults to 25")

  lazy val smtp_tls: ScallopOption[Boolean] = opt[Boolean](
    "smtp-tls",
    required = false,
    descr = "Whether to use START_TLS. Defaults to false")

  lazy val silencedApplicationsFile: ScallopOption[String] = opt[String](
    "silenced-application-file",
    required = false,
    descr = "File containing applications to ignore when alerting, one application per line")

  lazy val zkHost: ScallopOption[String] = opt[String](
    "zk-hosts",
    required = false,
    descr = "Zookeeper hosts. Used to connect to Solr Cloud")

  lazy val dbUrl: ScallopOption[String] =
    opt[String]("db-url", required = false, descr = "URL to connect to the database")

  lazy val dbUser: ScallopOption[String] =
    opt[String]("db-user", required = false, descr = "User to connect to the database as")

  lazy val dbPassword: ScallopOption[String] =
    opt[String]("db-password", required = false, descr = "Password to connect to the database with")

  lazy val dbOptions: ScallopOption[String] = opt[String](
    "db-options",
    required = false,
    descr = "Database connection options in the form `key1=value1;key2=value2`")

  verify()
}
Example 28
Source File: Conf.scala From osstracker with Apache License 2.0 | 5 votes |
package com.netflix.oss.tools.osstrackerscraper

import org.rogach.scallop.ScallopConf

class Conf(args: Seq[String]) extends ScallopConf(args) {
  val action = opt[String](required = true)
  verify()
}

object Conf {
  val ACTION_UPDATE_CASSANDRA = "updatecassandra"
  val ACTION_UPDATE_ELASTICSEARCH = "updateelasticsearch"

  val OSSTRACKER_KEYSPACE = "osstracker"

  val SENTINAL_DEV_LEAD_ID = "111111" // Assign to valid emp id
  val SENTINAL_MGR_LEAD_ID = "222222" // Assign to valid emp id
  val SENTINAL_ORG = "UNKNOWN"        // Assign to unknown org until edited in console
}
Example 29
Source File: DataLoader.scala From variantsdwh with Apache License 2.0 | 5 votes |
package pl.edu.pw.ii.zsibio.dwh.benchmark import com.typesafe.config.ConfigFactory import org.apache.kudu.spark.kudu.KuduContext import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} import org.rogach.scallop.ScallopConf import org.apache.kudu.spark.kudu._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{DataType, StructField, StructType} object DataLoader { class RunConf(args:Array[String]) extends ScallopConf(args){ val csvFile =opt[String]("csvFile",required = true, descr = "A CSV file to load" ) val tableName =opt[String]("tableName",required = true, descr = "A table to load" ) val storageType = opt[String]("storageType",required = true, descr = "Storage type parquet|orc|kudu|carbon") val dbName =opt[String]("dbName",required = true, descr = "Database name" ) verify() } def main(args: Array[String]): Unit = { val runConf = new RunConf(args) val scConf = new SparkConf() .setAppName("DataLoader") val sc = new SparkContext(scConf) val sqlContext = new HiveContext(sc) if(runConf.storageType().toLowerCase() == "orc" || runConf.storageType().toLowerCase() == "parquet") { val df = sqlContext.read .format("com.databricks.spark.csv") .option("delimiter", "|") .option("nullValue","\\N") .option("inferSchema", "true") // Automatically infer data types .load(runConf.csvFile()) .repartition(10) df.registerTempTable("temp_csv") sqlContext.sql( s""" |INSERT OVERWRITE TABLE ${runConf.dbName()}.${runConf.tableName()} |SELECT * FROM temp_csv """.stripMargin) } if(runConf.storageType().toLowerCase() == "kudu"){ val confFile = ConfigFactory.load() val kuduMaster = confFile.getString("kudu.master.server") val kuduContext = new KuduContext(kuduMaster) val dfTarget = sqlContext.read.options(Map("kudu.master" -> kuduMaster,"kudu.table" -> runConf.tableName())).kudu val df = sqlContext.read .format("com.databricks.spark.csv") .option("delimiter", "|") .option("nullValue","\\N") .schema(dfTarget.schema) .load(runConf.csvFile()) .repartition(10) kuduContext.upsertRows(df,runConf.tableName()) } } private def synSchemas(inSchema:StructType, outSchema:StructType) = { val size = inSchema.fields.length val structFields = (0 to size - 1).map{ i => StructField(outSchema.fields(i).name,inSchema.fields(i).dataType,outSchema.fields(i).nullable) } new StructType(structFields.toArray) } }
Example 30
Source File: Arguments.scala From ZparkIO with MIT License | 5 votes |
package com.leobenkel.zparkioProjectExample import com.leobenkel.zparkio.Services.CommandLineArguments import com.leobenkel.zparkio.Services.CommandLineArguments.CommandLineArguments import org.rogach.scallop.{ScallopConf, ScallopOption} import zio.ZIO case class Arguments(input: List[String]) extends ScallopConf(input) with CommandLineArguments.Service { val inputId: ScallopOption[Int] = opt[Int]( default = Some(10), required = false, noshort = true ) val sparkFoo: ScallopOption[String] = opt[String]( default = Some("hello"), required = false, noshort = true ) } object Arguments { def apply[A](f: Arguments => A): ZIO[CommandLineArguments[Arguments], Throwable, A] = { CommandLineArguments.get[Arguments].apply(f) } }
Example 31
Source File: Arguments.scala From ZparkIO with MIT License | 5 votes |
package com.leobenkel.zparkioProfileExampleMoreComplex import com.leobenkel.zparkio.Services.CommandLineArguments import com.leobenkel.zparkio.Services.CommandLineArguments.CommandLineArguments import com.leobenkel.zparkioProfileExampleMoreComplex.Services.Database import org.rogach.scallop.{ScallopConf, ScallopOption} import zio.ZIO case class Arguments(input: List[String]) extends ScallopConf(input) with CommandLineArguments.Service { val databaseUsername: ScallopOption[String] = opt[String]( default = Some("admin"), required = false, noshort = true ) val databasePassword: ScallopOption[String] = opt[String]( default = Some("123456"), required = false, noshort = true ) val databaseHost: ScallopOption[String] = opt[String]( default = Some("database://host.com/database"), required = false, noshort = true ) val generatedInputSize: ScallopOption[Int] = opt[Int]( default = Some(100), required = false, noshort = true, descr = "The size of the sample data generated" ) val sparkConfig: ScallopOption[String] = opt[String]( default = Some("foo"), required = false, noshort = true ) lazy val credentials: Database.Credentials = Database.Credentials( user = databaseUsername(), psw = databasePassword(), host = databaseHost() ) } object Arguments { def apply[A](f: Arguments => A): ZIO[CommandLineArguments[Arguments], Throwable, A] = { CommandLineArguments.get[Arguments].apply(f) } }
Example 32
Source File: CommandLineArgumentsTest.scala From ZparkIO with MIT License | 5 votes |
package com.leobenkel.zparkio.Services import com.leobenkel.zparkio.Services.CommandLineArguments.CommandLineArguments import org.rogach.scallop.exceptions.{RequiredOptionNotFound, UnknownOption} import org.rogach.scallop.{ScallopConf, ScallopOption} import org.scalatest._ import zio.Exit.{Failure, Success} import zio.{BootstrapRuntime, Layer, Task, ZIO, ZLayer} class CommandLineArgumentsTest extends FreeSpec { "CommandLineService" - { case class ArgumentsService(input: Seq[String]) extends ScallopConf(input) with CommandLineArguments.Service { val test: ScallopOption[String] = opt[String]( default = None, required = true, noshort = true ) } object Arguments { def get[A]( f: ArgumentsService => A ): ZIO[CommandLineArguments[ArgumentsService], Throwable, A] = { CommandLineArguments.get[ArgumentsService].apply(f) } def apply(input: Seq[String]): Layer[Nothing, CommandLineArguments[ArgumentsService]] = { ZLayer.succeed(ArgumentsService(input)) } } val runtime = new BootstrapRuntime {} "should work" in { val test: String = "qwe-asd-asd-zxc" runtime.unsafeRunSync { Arguments.get(_.test.toOption).provideLayer(Arguments(Seq("--test", test))) } match { case Success(Some(value)) => assertResult(value)(test) case Success(None) => fail("Did not found any value") case Failure(ex) => fail(ex.prettyPrint) } } "should fail - missing required" in { runtime.unsafeRunSync(for { arg <- Task(Arguments(Nil)) a <- Arguments.get(_.test.toOption).provideLayer(arg) } yield { a }) match { case Success(_) => fail("Should have failed") case Failure(ex) => assertThrows[RequiredOptionNotFound](throw ex.squash) } } "should fail - unknonw option" in { runtime.unsafeRunSync(for { arg <- Task(Arguments(Seq("--abc", "foo"))) a <- Arguments.get(_.test.toOption).provideLayer(arg) } yield { a }) match { case Success(_) => fail("Should have failed") case Failure(ex) => assertThrows[UnknownOption](throw ex.squash) } } } }
Example 33
Source File: ParametricFaceImageGeneratorOptions.scala From parametric-face-image-generator with Apache License 2.0 | 5 votes |
package faces.utils import org.rogach.scallop.{ScallopConf, ScallopOption} import org.rogach.scallop.exceptions.ScallopException class ParametricFaceImageGeneratorOptions(args: Seq[String]) extends ScallopConf(args) { banner( """|parametric-face-image-generator |© University of Basel |License: http://www.apache.org/licenses/LICENSE-2.0 | |Options:""".stripMargin) val configurationFile: ScallopOption[String] = opt[String](required = true,descr = "configuration file with the parameters") footer( """""".stripMargin ) override def onError(e: Throwable): Unit = e match { case ScallopException(message) => printHelp println("You provided the arguments: "+args.mkString(" ")) println(message) sys.exit(1) case ex => super.onError(ex) } }
Example 34
Source File: Conf.scala From ncdbg with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.programmaticallyspeaking.ncd.config import org.rogach.scallop.{ScallopConf, ValueConverter} case class Address(host: String, port: Int) { override def toString = host + ":" + port } class AddressConverter extends ValueConverter[Address] { val valueRegexp = "([^:]+(?::))?([0-9]+)".r override def parse(s: List[(String, List[String])]): Either[String, Option[Address]] = { s match { case (_, valueRegexp(host, port) :: Nil) :: Nil => // I tried getting rid of the trailing : using a non-capturing group, but it didn't work. val theHost = Option(host).map(h => h.dropRight(1)).getOrElse("localhost") Right(Some(Address(theHost, port.toInt))) case Nil => Right(None) case _ => Left("address must have format <host>:<port> or only <port>") } } override val tag = scala.reflect.runtime.universe.typeTag[Address] override val argType = org.rogach.scallop.ArgType.SINGLE } class Conf(arguments: Seq[String]) extends ScallopConf(arguments) { implicit val addressConverter = new AddressConverter private val defaultListen = Address("localhost", 7778) private val defaultConnect = Address("localhost", 7777) banner( """Usage: ncdbg [OPTION]... | |Ncdbg (Nashorn-Chrome-debugger) connects to a debuggable Java process running Nashorn scripts, |while acting as a server for Chrome Developer Tools. This makes it possible to debug Nashorn scripts |using Chrome. | |Options: """.stripMargin) val listen = opt[Address](default = Some(defaultListen), descr = s"address to listen on, on <host>:<port> format or port only. Defaults to $defaultListen.") val connect = opt[Address](default = Some(defaultConnect), descr = s"address to connect to, on <host>:<port> format or port only. Defaults to $defaultConnect.") val isLazy = toggle(name = "lazy", default = Some(false), descrYes = "defer connection until DevTools connects, and stay alive when the debug target dies.", descrNo = "connect right away and require the debug target to live. This is the default.") verify() }
Example 35
Source File: AddressConverterTest.scala From ncdbg with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.programmaticallyspeaking.ncd.boot import com.programmaticallyspeaking.ncd.config.{Address, AddressConverter} import com.programmaticallyspeaking.ncd.testing.UnitTest import org.rogach.scallop.exceptions.WrongOptionFormat import org.rogach.scallop.{ScallopConf, throwError} class AddressConverterTest extends UnitTest { // Throw error instead of exiting on option error. throwError.value = true def conf(args: String*) = new ScallopConf(args.toSeq) { val address = opt[Address]()(new AddressConverter) verify() } "should parse host:port" in { val c = conf("--address", "foo:1234") c.address.toOption should be (Some(Address("foo", 1234))) } "should parse only port" in { val c = conf("--address", "1234") c.address.toOption should be (Some(Address("localhost", 1234))) } "should handle no address" in { val c = conf() c.address.toOption should be (None) } "should reject non-integer port" in { intercept[WrongOptionFormat](conf("--address", "foo")) } }
Example 36
Source File: IngesterMain.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.tools.data.ingester import java.io.FileInputStream import java.util.zip.GZIPInputStream import akka.stream.scaladsl.Sink import cmwell.tools.data.utils.akka.stats.IngesterStats //import cmwell.tools.data.sparql.SparqlProcessorMain.Opts.opt import cmwell.tools.data.utils.ArgsManipulations._ import cmwell.tools.data.utils.akka.Implicits._ import cmwell.tools.data.utils.akka._ import cmwell.tools.data.utils.ops._ import com.typesafe.scalalogging.LazyLogging import org.rogach.scallop.ScallopConf import scala.concurrent.ExecutionContext.Implicits.global object IngesterMain extends App with LazyLogging { object Opts extends ScallopConf(args) { version(s"cm-well ingester ${getVersionFromManifest()} (c) 2015") val host = opt[String]("host", descr = "cm-well host name", required = true) val format = opt[String]("format", descr = "input format (e.g. ntriples, nquads, jsonld)", required = true) val file = opt[String]("file", descr = "input file path", default = None) val gzip = opt[Boolean]("gzip", descr = "is input file gzipped", default = Some(false)) val token = opt[String]("token", descr = "cm-well write permission token", default = None) val replaceMode = opt[Boolean]("with-replace-mode", descr = "replace-mode parameter in cm-well", default = Some(false)) val force = opt[Boolean]("force", descr = "force parameter in cm-well", default = Some(false)) val priority = opt[Boolean]("priority", default = Some(false), descr = "ingest data in priority mode") val numConnections = opt[Int]("num-connections", descr = "number of http connections to open") dependsOnAll(gzip, List(file)) verify() } val start = System.currentTimeMillis() var totalIngestedBytes = 0L var ingestedBytesInWindow = 0 var ingestedInfotonsInWindow = 0 var totalIngestedInfotons = 0L var totalFailedInfotons = 0L var lastTime = start var nextPrint = 0L var lastMessageSize = 0 val windowSizeMillis = 1000 val formatter = java.text.NumberFormat.getNumberInstance // resize akka http connection pool Opts.numConnections.toOption.map { numConnections => System.setProperty("akka.http.host-connection-pool.max-connections", numConnections.toString) } val inputStream = if (Opts.file.isSupplied) { val inputFile = new FileInputStream(Opts.file()) if (Opts.gzip()) { new GZIPInputStream(inputFile) } else { inputFile } } else { System.in } val result = Ingester .fromInputStream( baseUrl = formatHost(Opts.host()), format = Opts.format(), writeToken = Opts.token.toOption, replaceMode = Opts.replaceMode(), force = Opts.force(), isPriority = Opts.priority(), in = inputStream ) .via(IngesterStats(isStderr = true)) .runWith(Sink.ignore) // actor system is still alive, will be destroyed when finished result.onComplete { x => System.err.println("\n") System.err.println(s"finished: $x") cleanup() } }
Example 37
Source File: LogCollectorCliParser.scala From pulse with Apache License 2.0 | 5 votes |
package io.phdata.pulse.logcollector import org.rogach.scallop.ScallopConf class LogCollectorCliParser(args: Seq[String]) extends ScallopConf(args) { lazy val port = opt[Int]("port", required = false, descr = "HTTP Server Listening port") lazy val zkHosts = opt[String]("zk-hosts", required = true, descr = "Zookeeper hosts") lazy val kuduMasters = opt[String]("kudu-masters", required = false, descr = "Kudu masters") lazy val mode = opt[String]("consume-mode", required = false, descr = "'http' or 'kafka'", default = Some("http")) lazy val kafkaProps = opt[String]("kafka-properties", required = false, descr = "Kafka properties file") lazy val topic = opt[String]("topic", required = false, descr = "Kafka Topic") validateOpt(mode, port) { case (Some("http") | None, None) => Left("Need a port if running http mode") case _ => Right(Unit) } validateOpt(mode, kafkaProps, topic) { case (Some("kafka"), None, Some(_)) => Left("--kafka-properties argument needed if --consume-mode=kafka") case (Some("kafka"), Some(_), None) => Left("--topic argument needed if --consume-mode=kafka") case (Some("kafka"), None, None) => Left("--topic and --kafka-properties arguments needed if --consume-mode=kafka") case _ => Right(Unit) } verify() }
Example 38
Source File: Main2.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.tools.file.export

import cmwell.tools.neptune.export.ExportToNeptuneManager
import org.rogach.scallop.ScallopConf

class Conf1(arguments: Seq[String]) extends ScallopConf(arguments) {
  val sourceCluster = opt[String]("source-cluster", required = true,
    descr = "the source cluster which data is being exported from")
  val lengthHint = opt[Int]("length-hint", default = Some(16000), validate = 300000.>=,
    descr = "number of infotons that should be consumed in each bulk-consume call")
  val qp = opt[String](name = "qp-param", default = None, descr = "cm well qp param")
  val directory = opt[String](name = "directory", required = true, default = Some("./"),
    descr = "s3 directory which neptune reads data from")
  verify()
}

object Main2 {

  def main(args: Array[String]) {
    val conf = new Conf1(args)
    println("Source cluster is: " + conf.sourceCluster())
    println("length-hint: " + conf.lengthHint())
    println("qp: " + conf.qp.getOrElse("(not provided)"))
    println("s3 bucket: " + conf.directory())
    val qpParam: Option[String] = conf.qp.toOption.map(s => s",$s")
    println("About to Export..")
    val exportToNeptuneManager = new ExportToNeptuneManager(1)
    exportToNeptuneManager.exportToNeptune(conf.sourceCluster(), "", conf.lengthHint(), false, qpParam, false,
      None, None, "", Some(conf.directory()))
  }
}
Example 39
Source File: Main.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.tools.neptune.export import org.rogach.scallop.ScallopConf class Conf(arguments: Seq[String]) extends ScallopConf(arguments) { val sourceCluster = opt[String]("source-cluster", required = true, descr = "the source cluster which data is being exported from") val neptuneCluster = opt[String]("neptune-cluster", required = true, descr="neptune cluster which data is being exported to") val ingestConnectionPoolSize = opt[Int]("ingest-connection-pool-size", default = Some(5), validate = 50.>=, descr="number of connection pool that should be created by the tool in order to ingest to neptune") val lengthHint = opt[Int]("length-hint", default = Some(16000), validate = 300000.>=, descr="number of infotons that should be consumed in each bulk-consume call") val qp = opt[String](name="qp-param", default=None, descr = "cm well qp param") val updateInfotons = opt[Boolean]("update-infotons", descr = "enable this parameter when you use an update mode or delete of infotons") val bulkLoader = opt[Boolean]("bulk-loader", descr = "enable this parameter in order to export by using s3-bulk loader api. bulk loader is only for initial load") val proxyHost = opt[String]("proxy-host", default=None, descr = "proxy host is provided when you use bulk loader and your machine use proxy") val proxyPort = opt[Int]("proxy-port", default=None, descr = "proxy port is provided when you use bulk loader and your machine use proxy") val s3Bucket = opt[String](name="s3-bucket", default=Some("cm-well/sync"), descr = "s3 directory which neptune read data from") verify() } object Main { def main(args: Array[String]) { val conf = new Conf(args) println("Source cluster is: " + conf.sourceCluster()) println("Neptune cluster is: " + conf.neptuneCluster()) println("Connection pool size is: " + conf.ingestConnectionPoolSize()) println("length-hint: " + conf.lengthHint()) println("update infotons: " + conf.updateInfotons()) println("qp: " + conf.qp.getOrElse("(not provided)")) println("bulk loader: " + conf.bulkLoader()) println("proxy host: " + conf.proxyHost.getOrElse("not provided")) println("proxy port: " + conf.proxyPort.getOrElse(-1)) println("s3 bucket:" + conf.s3Bucket()) val qpParam :Option[String]= conf.qp.toOption.map(s => s",$s") val proxyHost :Option[String]= conf.proxyHost.toOption val proxyPort :Option[Int]= conf.proxyPort.toOption println("About to Export..") val exportToNeptuneManager = new ExportToNeptuneManager(conf.ingestConnectionPoolSize()) exportToNeptuneManager.exportToNeptune(conf.sourceCluster(), conf.neptuneCluster(), conf.lengthHint(), conf.updateInfotons(), qpParam, conf.bulkLoader(), proxyHost, proxyPort, conf.s3Bucket(), None) } }
Example 40
Source File: Main.scala From CM-Well with Apache License 2.0 | 5 votes |
import akka.actor.{ActorSystem, Props}
import org.rogach.scallop.ScallopConf

class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
  val sourceUrl = opt[String]("source-url", required = true, descr = "the source URL from which the RDF file is downloaded")
  val format = opt[String]("format", required = true, descr = "the output file format")
  var cluster = opt[String]("cluster", required = true, descr = "the target cluster to which content is ingested")
  verify()
}

object Main {

  def main(args: Array[String]) {
    val conf = new Conf(args) // Note: This line also works for "object Main extends App"
    println("source file is: " + conf.sourceUrl())
    println("output format is: " + conf.format())
    val system = ActorSystem("MySystem")
    println("About to Start import tool flow...")
    val mainActor = system.actorOf(Props(new AkkaFileReaderWithActor(conf.sourceUrl(), conf.format(), conf.cluster())), name = "myactor")
    mainActor ! ActorInput
  }
}
Example 41
Source File: DumpIndexWithSystemFields.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.IndexWithSystemFields import cmwell.analytics.util.CmwellConnector import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object DumpIndexWithSystemFields { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpIndexWithSystemFields.getClass) // Here, the parallelism defines how many partitions are produced. // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) try { object Opts extends ScallopConf(args) { val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet")) validateOpt(format) { case Some("parquet") | Some("csv") => Right(Unit) case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.") } verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Dump system fields from Elasticsearch indexes", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds = IndexWithSystemFields()(spark) .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier) Opts.format() match { case "parquet" => ds.write.parquet(Opts.out()) case "csv" => ds.write.csv(Opts.out()) } } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 42
Source File: DumpInfotonWithUuidOnly.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.InfotonWithUuidOnly import cmwell.analytics.util.CmwellConnector import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object DumpInfotonWithUuidOnly { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpInfotonWithUuidOnly.getClass) // Here, the parallelism defines how many partitions are produced. // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) try { object Opts extends ScallopConf(args) { val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet")) validateOpt(format) { case Some("parquet") | Some("csv") => Right(Unit) case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.") } verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Dump infoton table - uuid only", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds = InfotonWithUuidOnly()(spark) .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier) Opts.format() match { case "parquet" => ds.write.parquet(Opts.out()) case "csv" => ds.write.csv(Opts.out()) } } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 43
Source File: FindInfotonIndexInconsistencies.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.InfotonAndIndexWithSystemFields import cmwell.analytics.data.InfotonAndIndexWithSystemFields.{isConsistent, isWellFormed} import cmwell.analytics.util.CmwellConnector import cmwell.analytics.util.ConsistencyThreshold.defaultConsistencyThreshold import cmwell.analytics.util.ISO8601.{instantToMillis, instantToText} import org.apache.log4j.LogManager import org.apache.spark.sql.Row import org.apache.spark.sql.functions._ import org.joda.time.format.ISODateTimeFormat import org.rogach.scallop.{ScallopConf, ScallopOption, ValueConverter, singleArgConverter} import scala.util.control.NonFatal object FindInfotonIndexInconsistencies { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(FindInfotonIndexInconsistencies.getClass) try { object Opts extends ScallopConf(args) { private val instantConverter: ValueConverter[Long] = singleArgConverter[Long](instantToMillis) // If this parameter is not supplied, the (unreliable) ES Spark connector is used to extract the data from the es index. val esExtract: ScallopOption[String] = opt[String]("es", short = 'e', descr = "The path where the (parquet) extract of system fields the es index are stored", required = false) val consistencyThreshold: ScallopOption[Long] = opt[Long]("consistency-threshold", short = 'c', descr = "Ignore any inconsistencies at or after this instant", default = Some(defaultConsistencyThreshold))(instantConverter) val outParquet: ScallopOption[String] = opt[String]("out-parquet", short = 'p', descr = "The path to save the output to (in parquet format)", required = false) val outCsv: ScallopOption[String] = opt[String]("out-csv", short = 'v', descr = "The path to save the output to (in CSV format)", required = false) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Find inconsistencies between system fields in Infoton and Index", sparkShell = Opts.shell() ).withSparkSessionDo { spark => logger.info(s"Using a consistency threshold of ${instantToText(Opts.consistencyThreshold())}.") val ds = InfotonAndIndexWithSystemFields(esExtractPath = Opts.esExtract.toOption)(spark) // Filter out any inconsistencies found if more current than this point in time. val i = ds.schema.indexWhere(_.name == "infoton_lastModified") val filterCurrent: Row => Boolean = { row: Row => val parser = ISODateTimeFormat.dateTimeParser if (row.isNullAt(i)) true // Shouldn't be null, but don't filter out if we can't get a lastModified else try { parser.parseMillis(row.getAs[String](i)) < Opts.consistencyThreshold() } catch { case NonFatal(_) => true // Don't filter out if lastModified couldn't be converted } } val inconsistentData = ds.filter(not(isConsistent(ds) && isWellFormed(ds))) .filter(filterCurrent) .cache() // Save the inconsistent data in Parquet format suitable for additional analysis if (Opts.outParquet.isDefined) inconsistentData .write .parquet(Opts.outParquet()) // Save the inconsistent data to a single CSV file suitable for reporting. if (Opts.outCsv.isDefined) inconsistentData .coalesce(1) .write .option("header", value = true) .csv(Opts.outCsv()) } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 44
Source File: DumpPathWithKeyFields.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.PathWithKeyFields import cmwell.analytics.util.CmwellConnector import cmwell.analytics.util.DatasetFilter import cmwell.analytics.util.TimestampConversion.timestampConverter import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object DumpPathWithKeyFields { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpPathWithKeyFields.getClass) // Here, the parallelism defines how many partitions are produced. // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) try { object Opts extends ScallopConf(args) { val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter) val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet")) validateOpt(format) { case Some("parquet") | Some("csv") => Right(Unit) case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.") } verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Dump path table - key fields", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val datasetFilter = DatasetFilter( lastModifiedGte = Opts.lastModifiedGteFilter.toOption, pathPrefix = Opts.pathPrefixFilter.toOption) val ds = PathWithKeyFields(Some(datasetFilter))(spark) .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier) Opts.format() match { case "parquet" => ds.write.parquet(Opts.out()) case "csv" => ds.write.csv(Opts.out()) } } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 45
Source File: DumpPathWithUuidOnly.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.PathWithUuidOnly import cmwell.analytics.util.CmwellConnector import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object DumpPathWithUuidOnly { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpPathWithUuidOnly.getClass) // Here, the parallelism defines how many partitions are produced. // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) try { object Opts extends ScallopConf(args) { val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet")) validateOpt(format) { case Some("parquet") | Some("csv") => Right(Unit) case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.") } verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Dump path table - uuid only", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds = PathWithUuidOnly()(spark) .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier) Opts.format() match { case "parquet" => ds.write.parquet(Opts.out()) case "csv" => ds.write.csv(Opts.out()) } } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 46
Source File: FindDuplicatedSystemFields.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.InfotonWithDuplicatedSystemFields import cmwell.analytics.util.CmwellConnector import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object FindDuplicatedSystemFields { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(FindDuplicatedSystemFields.getClass) try { object Opts extends ScallopConf(args) { val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Find infotons with duplicated system fields" ).withSparkSessionDo { spark => import spark.implicits._ InfotonWithDuplicatedSystemFields()(spark) .toDF .write.csv(Opts.out()) } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }
Example 47
Source File: DumpIndexWithUuidOnly.scala From CM-Well with Apache License 2.0 | 5 votes |
package cmwell.analytics.main import cmwell.analytics.data.IndexWithUuidsOnly import cmwell.analytics.util.CmwellConnector import org.apache.log4j.LogManager import org.rogach.scallop.{ScallopConf, ScallopOption} object DumpIndexWithUuidOnly { def main(args: Array[String]): Unit = { val logger = LogManager.getLogger(DumpIndexWithUuidOnly.getClass) // Here, the parallelism defines how many partitions are produced. // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns. val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2) try { object Opts extends ScallopConf(args) { val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism)) val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true) val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only include current", required = false, default = Some(true)) val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false)) val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true) val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet")) validateOpt(format) { case Some("parquet") | Some("csv") => Right(Unit) case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.") } verify() } CmwellConnector( cmwellUrl = Opts.url(), appName = "Dump UUIDs from Elasticsearch indexes", sparkShell = Opts.shell() ).withSparkSessionDo { spark => val ds = IndexWithUuidsOnly(currentOnly = Opts.currentOnly())(spark) .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier) Opts.format() match { case "parquet" => ds.write.parquet(Opts.out()) case "csv" => ds.write.csv(Opts.out()) } } } catch { case ex: Throwable => logger.error(ex.getMessage, ex) System.exit(1) } } }