org.rogach.scallop.ScallopConf Scala Examples

The following examples show how to use org.rogach.scallop.ScallopConf. You can go to the original project or source file by following the link above each example.
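Before diving into the full examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the pattern they all share: declare options and trailing arguments as vals inside a ScallopConf subclass, call verify(), then read the parsed values by applying each option.

import org.rogach.scallop.{ScallopConf, ScallopOption}

// Minimal sketch of the common ScallopConf pattern: declare options, verify(), read values.
class ExampleConf(arguments: Seq[String]) extends ScallopConf(arguments) {
  val verbose: ScallopOption[Boolean] = opt[Boolean]("verbose", descr = "Enable verbose output")
  val count: ScallopOption[Int] = opt[Int]("count", descr = "How many times to run", default = Some(1))
  val input: ScallopOption[String] = trailArg[String]("input", descr = "Input file", required = true)
  verify()
}

object ExampleConfDemo extends App {
  val conf = new ExampleConf(Seq("--verbose", "--count", "3", "data.txt"))
  println(conf.verbose())  // true
  println(conf.count())    // 3
  println(conf.input())    // data.txt
}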
Example 1
Source File: FeatureCounts.scala    From bdg-sequila   with Apache License 2.0
package org.biodatageeks.sequila.apps

import htsjdk.samtools.ValidationStringency
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.SparkSession
import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim
import org.biodatageeks.sequila.utils.Columns
import org.rogach.scallop.ScallopConf
import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable}
import org.seqdoop.hadoop_bam.util.SAMHeaderReader

object FeatureCounts {
  case class Region(contig:String, pos_start:Int, pos_end:Int)
  class RunConf(args:Array[String]) extends ScallopConf(args){

    val output = opt[String](required = true)
    val annotations = opt[String](required = true)
    val readsFile = trailArg[String](required = true)
    val format = trailArg[String](required = false)
    verify()
  }

  def main(args: Array[String]): Unit = {
    val runConf = new RunConf(args)
    val spark = SparkSession
      .builder()
      .appName("SeQuiLa-FC")
      .getOrCreate()

    spark.sqlContext.setConf("spark.biodatageeks.rangejoin.useJoinOrder","true")
    //spark.sqlContext.setConf("spark.biodatageeks.rangejoin.maxBroadcastSize", (1024).toString)
    spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim(spark) :: Nil



    val query =
      """SELECT targets.GeneId AS GeneId,
        |       targets.Chr AS Chr,
        |       targets.Start AS Start,
        |       targets.End AS End,
        |       targets.Strand AS Strand,
        |       CAST(targets.End AS INTEGER) - CAST(targets.Start AS INTEGER) + 1 AS Length,
        |       count(*) AS Counts
        |FROM reads JOIN targets
        |ON (
        |  targets.Chr = reads.contigName
        |  AND
        |  reads.end >= CAST(targets.Start AS INTEGER)
        |  AND
        |  reads.start <= CAST(targets.End AS INTEGER)
        |)
        |GROUP BY targets.GeneId, targets.Chr, targets.Start, targets.End, targets.Strand""".stripMargin
      spark
        .sparkContext
        .setLogLevel("ERROR")

      spark
        .sparkContext
        .hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString)

      val alignments = spark
        .sparkContext.newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat](runConf.readsFile())
        .map(_._2.get)
        .map(r => Region(r.getContig, r.getStart, r.getEnd))

      val readsTable = spark.sqlContext.createDataFrame(alignments)
      readsTable.createOrReplaceTempView("reads")

      val targets = spark
        .read
        .option("header", "true")
        .option("delimiter", "\t")
        .csv(runConf.annotations())
      targets
        .withColumnRenamed("contigName", Columns.CONTIG)
        .createOrReplaceTempView("targets")

      spark.sql(query)
        .orderBy("GeneId")
        .coalesce(1)
        .write
        .option("header", "true")
        .option("delimiter", "\t")
        .csv(runConf.output())
  }

} 
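Scallop derives the command-line names from the val names in RunConf, so the program above is driven with --output and --annotations options plus a trailing reads file. A small sketch (hypothetical paths, not part of the project) of constructing RunConf directly:

import org.biodatageeks.sequila.apps.FeatureCounts

// Hypothetical invocation of the RunConf defined above; the paths are placeholders.
object RunConfDemo extends App {
  val demoConf = new FeatureCounts.RunConf(Array(
    "--output", "/tmp/feature_counts",
    "--annotations", "/data/targets.tsv",
    "/data/reads.bam"            // trailing argument: readsFile
  ))
  println(demoConf.output())     // /tmp/feature_counts
  println(demoConf.readsFile())  // /data/reads.bam
}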
Example 2
Source File: DumpInfotonWithKeyFields.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import cmwell.analytics.data.InfotonWithKeyFields
import cmwell.analytics.util.{CmwellConnector, DatasetFilter}
import cmwell.analytics.util.DatasetFilter._
import cmwell.analytics.util.TimestampConversion.timestampConverter
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}


object DumpInfotonWithKeyFields {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpInfotonWithKeyFields.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to ", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(())
          case _ => Left("Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump infoton table - uuid, lastModified, path",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val datasetFilter = DatasetFilter(
          lastModifiedGte = Opts.lastModifiedGteFilter.toOption,
          pathPrefix = Opts.pathPrefixFilter.toOption)

        val ds = InfotonWithKeyFields(Some(datasetFilter))(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
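Several of these CM-Well tools parse java.sql.Timestamp options through a shared timestampConverter. That converter's implementation is not shown in this listing; the following is a hedged sketch of how such a converter could be built with Scallop's singleArgConverter (the real cmwell.analytics.util version may accept more timestamp formats):

import java.sql.Timestamp
import java.time.Instant
import org.rogach.scallop.{ScallopConf, ScallopOption, singleArgConverter}

// Assumed stand-in for TimestampConversion.timestampConverter, built from Instant.parse.
object TimestampConversionSketch {
  val timestampConverter =
    singleArgConverter[Timestamp](s => Timestamp.from(Instant.parse(s)))
}

class FilterOpts(args: Seq[String]) extends ScallopConf(args) {
  val lastModifiedGteFilter: ScallopOption[Timestamp] =
    opt[Timestamp]("lastmodified-gte-filter", default = None)(TimestampConversionSketch.timestampConverter)
  verify()
}
// new FilterOpts(Seq("--lastmodified-gte-filter", "2019-01-01T00:00:00Z")).lastModifiedGteFilter()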
Example 3
Source File: CheckInfotonDataIntegrity.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import cmwell.analytics.data.InfotonDataIntegrity
import cmwell.analytics.util.{CmwellConnector, DatasetFilter}
import cmwell.analytics.util.TimestampConversion.timestampConverter
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object CheckInfotonDataIntegrity {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CheckInfotonDataIntegrity.getClass)

    try {

      object Opts extends ScallopConf(args) {

        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Check infoton data integrity"
      ).withSparkSessionDo { spark =>

        val datasetFilter = DatasetFilter(
          lastModifiedGte = Opts.lastModifiedGteFilter.toOption,
          pathPrefix = Opts.pathPrefixFilter.toOption)

        val ds = InfotonDataIntegrity(Some(datasetFilter))(spark)

        val damagedInfotons = ds.filter(infoton =>
          infoton.hasIncorrectUuid ||
            infoton.hasDuplicatedSystemFields ||
            infoton.hasInvalidContent ||
            infoton.hasMissingOrIllFormedSystemFields
        )

        damagedInfotons.select("uuid", "lastModified", "path",
          "hasIncorrectUuid", "hasMissingOrIllFormedSystemFields", "hasDuplicatedSystemFields", "hasInvalidContent", "hasUnknownSystemField")
          .write.csv(Opts.out())
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 4
Source File: ExtractFromParquet.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.data

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import cmwell.analytics.util.Connector
import cmwell.analytics.util.StringUtil._
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.rogach.scallop.{ScallopConf, ScallopOption}

object ExtractFromParquet {

  def main(args: Array[String]): Unit = {

    object Opts extends ScallopConf(args) {

      val pathsToFind: ScallopOption[String] = opt[String]("paths-to-find", short = 'f', descr = "A file containing the list of paths to look for", required = true)
      val parquetData: ScallopOption[String] = opt[String]("parquet-file", short = 'p', descr = "A Parquet file containing the data; single string column rdfStatement", required = true)
      val extractedData: ScallopOption[String] = opt[String]("extracted-data", short = 'd', descr = "The file that extracted data will be written to (in nquads format)", required = true)
      val pathsNotFound: ScallopOption[String] = opt[String]("paths-not-found", short = 'n', descr = "The output file that any paths that were not found are written to", required = true)
      val pathsFound: ScallopOption[String] = opt[String]("paths-found", short = 'a', descr = "The output file containing the paths that we found are written to", required = true)

      verify()
    }

    Connector(sparkShell = true, appName = "Extract from parquet").withSparkSessionDo {
      spark: SparkSession =>

        val pathsToFind = Set(splitLines(FileUtils.readFileToString(new File(Opts.pathsToFind()), UTF_8)): _*)

        val ds: DataFrame = spark.read.parquet(Opts.parquetData())

        // Cheesy parsing of path from an RDF nquad, but sufficient for this purpose
        def extractPath(rdfStatement: String): String = rdfStatement.substring(7, rdfStatement.indexOf(">"))

        val statementsFound = ds.rdd.filter { row: Row =>

          val statement = row.getAs[String]("rdfStatement")
          val path = extractPath(statement)

          pathsToFind.contains(path)
        }.collect() // expect the result to be small, so collect is OK

        // Save all the paths that were not found to file - look for them in other files.
        val pathsFound: Set[String] = Set(statementsFound.map(row => extractPath(row.getString(0))): _*)
        println(s"There were ${pathsFound.size} paths found (out of ${pathsToFind.size}).")
        FileUtils.writeStringToFile(new File(Opts.pathsFound()), pathsFound.mkString("\n"), UTF_8, false)

        val pathsNotFound = pathsToFind.diff(pathsFound)
        println(s"There were ${pathsNotFound.size} paths not found.")
        FileUtils.writeStringToFile(new File(Opts.pathsNotFound()), pathsNotFound.mkString("\n"), UTF_8, false)

        // Save the RDF statements for the paths that were found
        val x = statementsFound.map(row => row.getString(0)).mkString("\n")
        FileUtils.writeStringToFile(new File(Opts.extractedData()), x, UTF_8, false)
    }
  }
} 
Example 5
Source File: DumpCompleteDocumentFromEs.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpCompleteDocumentFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpCompleteDocumentFromEs.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("dump-complete-document-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-filter", short = 'c', descr = "Filter on current status", default = None)
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithCompleteDocument
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 6
Source File: CopyIndex.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object CopyIndex {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CopyIndex.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("copy-index")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from", required = true)
        val writeIndex: ScallopOption[String] = opt[String]("write-index", short = 'w', descr = "The name of the index to write to", required = true)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      val dataWriterFactory = DataWriterFactory.index[IndexWithCompleteDocument](
        indexName = Opts.writeIndex(),
        esEndpoint = esContactPoint)

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithCompleteDocument,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 7
Source File: DumpKeyFieldsFromEs.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithKeyFields}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpKeyFieldsFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpKeyFieldsFromEs.getClass)

    implicit val system: ActorSystem = ActorSystem("dump-key-fields-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
      // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
      // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
      val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default = Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithKeyFields
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
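The source-filter option above uses Scallop's toggle, which generates a complementary pair of flags: --source-filter to enable and, because of prefix = "no-", --no-source-filter to disable. A minimal sketch (hypothetical class name) of that declaration in isolation:

import org.rogach.scallop.{ScallopConf, ScallopOption}

// Hedged sketch of a toggle with a "no-" prefix, defaulting to true.
class ToggleConf(args: Seq[String]) extends ScallopConf(args) {
  val sourceFilter: ScallopOption[Boolean] =
    toggle("source-filter", noshort = true, default = Some(true), prefix = "no-",
      descrYes = "Use source filtering to reduce network traffic",
      descrNo = "Do not filter _source fields")
  verify()
}
// new ToggleConf(Nil).sourceFilter()                       // true (default)
// new ToggleConf(Seq("--no-source-filter")).sourceFilter() // false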
Example 8
Source File: DumpUuidOnlyFromEs.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithUuidOnly}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpUuidOnlyFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpUuidOnlyFromEs.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("dump-uuid-only-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default=Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithUuidOnly
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 9
Source File: DumpSystemFieldsFromEs.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithSystemFields}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpSystemFieldsFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpSystemFieldsFromEs.getClass)

    implicit val system: ActorSystem = ActorSystem("dump-system-fields-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
      // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
      // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
      val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default=Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithSystemFields
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 10
Source File: CopyIndexesWithMapping.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.collection.JavaConverters._
import scala.concurrent.ExecutionContextExecutor

object CopyIndexesWithMapping {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CopyIndexesWithMapping.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("copy-index-with-mapping")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {

      object Opts extends ScallopConf(args) {

        val indexMap: ScallopOption[String] = opt[String]("index-map", short = 'i', descr = "A map from source to target index names, in JSON format", required = true)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())

      // Expect a map in the form: { "sourceIndex1": "targetIndex1", "sourceIndex2": "targetIndex2", ... }
      val indexMap: Map[String, String] = new ObjectMapper().readTree(Opts.indexMap()).fields.asScala.map { entry =>
        entry.getKey -> entry.getValue.asText
      }.toMap

      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexMap.keys.toSeq)

      // Validate that the index-map parameter specified valid index names, and not aliases.
      for (indexName <- indexMap.keys)
        if (!esTopology.allIndexNames.contains(indexName))
          throw new RuntimeException(s"index-map parameter included $indexName as a source, which is not a valid index name.")

      for (indexName <- indexMap.values)
        if (!esTopology.allIndexNames.contains(indexName))
          throw new RuntimeException(s"index-map parameter included $indexName as a target, which is not a valid index name.")

      val dataWriterFactory = DataWriterFactory.index[IndexWithCompleteDocument](
        indexMap = indexMap,
        esEndpoint = esContactPoint)

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithCompleteDocument,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 11
Source File: CalculateXORSummary.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{IndexWithSourceHash, XORSummary, XORSummaryFactory}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.codec.binary.Hex
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object CalculateXORSummary {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CalculateXORSummary.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("xor-summary")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from", required = false)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      val dataWriterFactory = new XORSummaryFactory()

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithSourceHash,
        dataWriterFactory = dataWriterFactory.apply,
        sourceFilter = false)

      // Summarize the summaries down to the index level.
      val summaryByIndex: Map[String, XORSummary] = dataWriterFactory.shardSummaries
        .groupBy { case (shard, _) => shard.indexName }
        .map { case (indexName, summaryMap) => indexName -> summaryMap.values.reduce(XORSummary.combine) }

      // TODO: Fix questionable JSON generation
      val r = "{" +
        summaryByIndex.map { case (index, summary) =>
          val x = Hex.encodeHexString(summary.summary)
          s""" { "index": "$index", "summary": "$x" } """
        }.mkString("\n") + "}"

      println(r)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 12
Source File: TestResults.scala    From mimir   with Apache License 2.0
package mimir.util

import java.nio.file.Files
import java.nio.file.Paths
import java.nio.charset.Charset
import java.nio.charset.StandardCharsets
import java.io.ByteArrayOutputStream
import java.io.PrintWriter
import org.rogach.scallop.ScallopConf

object TestResults {
  def main(args: Array[String]): Unit = {
    val config = new TestResultConfig(args)
    println("running tests....")
    parseTestResults(config.sbtPath(), config.sbtCmd())
  }
  
  def parseTestResults(sbtPath:String = "/opt/local/bin/sbt", sbtCmd:String = "test") = {
    val procOutput = runCommand(Seq(sbtPath,sbtCmd))._2.replaceAll("""\x1b\[[0-9;]*[a-zA-Z]""", "")
    
    val pattern = """(?m)^.*\[info\] Total.*$|^.*\[info\] Finished.*$|^.*\[info\] [\d]+ examp.*$""".r
    
    val header = "test_name,seconds,examples,expectations,failures,errors,skipped\n"
    
    val pattern2 = """\[info\] Total for specification (\w+)\s+\[info\] Finished in (.+)\R\[info\] (.+)\R""".r
    val pattern3 = """([a-zA-Z]+): (?:(\d+) minutes? )?(?:(\d+) seconds?[,:] )?(?:(\d+) ms[,:] )?(\d+) examples?, (?:(\d+) expectations?, )?(\d+) failures?, (\d+) errors?(?:, (\d+) skipped)?""".r
    val string = pattern2.findAllMatchIn(procOutput).map(mat => s"${mat.group(1)}: ${mat.group(2)}: ${mat.group(3)}")
      .map(nline => nline match {
        case pattern3(test_name,minutes,seconds,ms,examples,expectations,failures,errors,skipped) => {
          val allseconds = (minutes match {
            case "" => 0
            case null => 0
            case x => x.toInt*60
          }) + (seconds match {
            case "" => 0
            case null => 0
            case x => x.toInt
          }) +  (ms match {
            case "" => 0.0
            case null => 0.0
            case x => x.toDouble/1000.0
          })
          s"$test_name,$allseconds,$examples,$expectations,$failures,$errors,$skipped"
        }
      }).mkString("\n")
    
    val outStr = header + string
      
    println(outStr)
    Files.write(Paths.get("test_output.csv"), outStr.getBytes(StandardCharsets.UTF_8))
  }
  
  import sys.process._
  def runCommand(cmd: Seq[String]): (Int, String, String) = {
    val stdoutStream = new ByteArrayOutputStream
    val stderrStream = new ByteArrayOutputStream
    val stdoutWriter = new PrintWriter(stdoutStream)
    val stderrWriter = new PrintWriter(stderrStream)
    val exitValue = cmd.!(ProcessLogger(stdoutWriter.println, stderrWriter.println))
    stdoutWriter.close()
    stderrWriter.close()
    (exitValue, stdoutStream.toString, stderrStream.toString)
  }
  
  
}

class TestResultConfig(arguments: Seq[String]) extends ScallopConf(arguments) {
  val experimental = opt[List[String]]("X", default = Some(List[String]()))
  val sparkHost = opt[String]("sparkHost", descr = "The IP or hostname of the spark master",
    default = Some("spark-master.local"))
  val sparkPort = opt[String]("sparkPort", descr = "The port of the spark master",
    default = Some("7077"))
  val sbtPath = opt[String]("sbtPath", descr = "The path to the sbt binary",
    default = Some("/opt/local/bin/sbt"))
  val sbtCmd = opt[String]("sbtCmd", descr = "The sbt command to run",
    default = Some("test"))
  verify()
}
Example 13
Source File: AnalyzeInconsistenciesResult.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import cmwell.analytics.data.InfotonAndIndexWithSystemFields
import cmwell.analytics.util.Connector
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.collection.breakOut

object AnalyzeInconsistenciesResult {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(AnalyzeInconsistenciesResult.getClass)

    try {

      object Opts extends ScallopConf(args) {

        val in: ScallopOption[String] = opt[String]("in", short = 'i', descr = "The path to read the (parquet) inconsistencies dataset from", required = true)
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the (csv) output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))

        verify()
      }

      Connector(
        appName = "Analyze InfotonAndIndexWithSystemFields Output",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds: DataFrame = spark.read.parquet(Opts.in())

        import org.apache.spark.sql.functions._

        // A column expression that counts the number of failures for each constraint.
        // This will also include null counts, needed to interpret the results.
        val constraints: Seq[(String, Column)] = InfotonAndIndexWithSystemFields.constraints(ds).map { case (name, predicate) =>
          name -> sum(when(predicate, 0L).otherwise(1L)).as(name)
        }(breakOut)

        // Compute the failure counts
        val failureCounts: Row = ds.agg(constraints.head._2, constraints.tail.map(_._2): _*).head

        val results = for {
          i <- constraints.indices
          constraintName = constraints(i)._1
          failureCount = if (failureCounts.isNullAt(i)) 0 else failureCounts.getAs[Long](i)
        } yield s"$constraintName,$failureCount"

        FileUtils.write(new File(Opts.out()), "constraint,failures\n" + results.mkString("\n"), UTF_8)
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 14
Source File: Options.scala    From ai.vitk.ner   with GNU General Public License v3.0
package ai.vitk.ner

import org.rogach.scallop.ScallopConf


class Options(arguments: Seq[String]) extends ScallopConf(arguments) {
  val master = opt[String](default = Some("local[*]"), descr = "the Spark master URL")
  val memory = opt[String](default = Some("8g"), descr = "executor memory")
  val mode = opt[String](default = Some("tag"), descr = "mode of the tagger, either 'train', 'tag' or 'eval'")
  val verbose = opt[Boolean](default = Some(false), descr = "verbose mode")
  val language = opt[String](default = Some("vi"), descr = "natural language in use, either 'vi', 'en' or 'ja'")
  val dimension = opt[Int](default = Some(32768), descr = "domain dimension for feature hashing")
  val iteration = opt[Int](default = Some(600), descr = "max number of iterations in training")
  val independent = opt[Boolean](default = Some(false), descr = "use only independent features")
  val reversed = opt[Boolean](default = Some(false), descr = "backward model")
  val input = opt[String](default = Some("test.txt"), descr = "input file for tagging")
  verify()
} 
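A small usage sketch (hypothetical, not part of the project) showing how the defaults above fill in whatever is not supplied on the command line:

import ai.vitk.ner.Options

// Hypothetical usage of the Options class above.
object OptionsDemo extends App {
  val opts = new Options(Seq("--mode", "train", "--language", "en"))
  println(opts.mode())       // train
  println(opts.language())   // en
  println(opts.master())     // local[*]  (default)
  println(opts.dimension())  // 32768     (default)
}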
Example 15
Source File: Run.scala    From flamy   with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.run.GraphRunner
import com.flaminem.flamy.exec.utils.{ReturnFailure, ReturnStatus}
import com.flaminem.flamy.model.ItemArgs
import com.flaminem.flamy.model.names.ItemName
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


class Run extends Subcommand("run") with FlamySubcommand{

  banner("Perform a run on the specified environment")

  private val environment: ScallopOption[Environment] =
    opt(name="on", default=None, descr="Specifies environment to run on.", required=false, noshort=true)

  private val dryRun: ScallopOption[Boolean] =
    opt(name="dry", default=Some(false), descr="Perform a dry-run", noshort=true)

  validateOpt(environment, dryRun) {
    case (None,Some(false)) => Left("Please specify an environment to run on (with the --on option), or use the --dry option to perform a local dry-run")
    case _ => Right(())
  }

  private val from: ScallopOption[List[ItemName]] =
    opt[List[ItemName]](name="from", default=Some(Nil), descr="start from the given schemas/tables.", noshort=true, argName = "items")

  private val to: ScallopOption[List[ItemName]] =
    opt[List[ItemName]](name="to", default=Some(Nil), descr="stop at the given schemas/tables.", noshort=true, argName = "items")
  codependent(from,to)

  private val items: ScallopOption[List[ItemName]] =
    trailArg[List[ItemName]](default=Some(Nil),required=false)

  lazy val itemArgs = ItemArgs(items(), from(), to())

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    val context = new FlamyContext(globalOptions, this.environment.get)
    context.dryRun = this.dryRun()
    if (itemArgs.isEmpty) {
      System.err.println("Please specify items to run on")
      ReturnFailure
    }
    else {
      val graphRunner = GraphRunner(itemArgs, context)
      graphRunner.run()
    }
  }

} 
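The codependent(from, to) call above makes --from and --to an all-or-nothing pair. A minimal sketch (hypothetical class, simplified to plain String lists) of that validation on its own:

import org.rogach.scallop.ScallopConf

// Hedged sketch: codependent requires that either both options or neither are supplied.
class RangeConf(args: Seq[String]) extends ScallopConf(args) {
  val from = opt[List[String]]("from", default = Some(Nil), noshort = true)
  val to = opt[List[String]]("to", default = Some(Nil), noshort = true)
  codependent(from, to)
  verify()
}
// new RangeConf(Seq("--from", "db.a", "--to", "db.b"))  // passes validation
// new RangeConf(Seq("--from", "db.a"))                  // fails: --to must also be given
// new RangeConf(Nil)                                    // passes: neither option was supplied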
Example 16
Source File: Check.scala    From flamy   with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.spark.ModelSparkContext
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.files.{FileRunner, ItemFileAction}
import com.flaminem.flamy.exec.hive.{HivePartitionFetcher, ModelHivePartitionFetcher}
import com.flaminem.flamy.exec.utils._
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.graph.TableGraph
import com.flaminem.flamy.model._
import com.flaminem.flamy.model.core.Model
import com.flaminem.flamy.model.files.FilePath
import com.flaminem.flamy.model.names.ItemName
import org.apache.spark.sql.SQLContext
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


      val runGraph: TableGraph = baseGraph.subGraph(items())

      val dryRunner: FlamyRunner = FlamyRunner(context)
      println("Creating schemas and tables ...")
      try {
        dryRunner.checkAll(baseGraph)
      }
      finally{
        //TODO: For some strange reason, closing the connection here will result in ClassNotFoundErrors for udfs in the RunActions...
        //      dryRunner.close()
      }
      FlamyOutput.out.info("Running Populates ...")
      dryRunner.populateAll(runGraph.model, context)
      dryRunner.close()
      ReturnStatus(success = dryRunner.getStats.getFailCount==0)
    }

  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case  (command: FlamySubcommand)::Nil => command.doCommand(globalOptions, Nil)
      case Nil => throw new IllegalArgumentException("A subcommand is expected")
      case _ =>
        printHelp()
        ReturnFailure
    }
  }


} 
Example 17
Source File: Drop.scala    From flamy   with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.actions.{DropSchemaAction, DropTableAction}
import com.flaminem.flamy.exec.hive.HiveTableFetcher
import com.flaminem.flamy.exec.utils.{Action, _}
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.model.{ItemFilter, TableInfo}
import com.flaminem.flamy.model.exceptions.FlamyException
import com.flaminem.flamy.model.names.{ItemName, SchemaName, TableName}
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

class Drop extends Subcommand("drop") with FlamySubcommand {

  val schemas: Subcommand = new Subcommand("schemas") with FlamySubcommand {
    banner("Drop the specified schemas on the specified environment")
    val environment: ScallopOption[Environment] =
      opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
    val dryRun: ScallopOption[Boolean] =
      opt(name = "dry", default = Some(false), descr = "Perform a dry-run", required = false, noshort = true)
    val all: ScallopOption[Boolean] =
      opt(
        name = "all",
        default = Some(false),
        descr = "Unlike other commands, not providing any schema name will not do anything. Unless you use this option.",
        noshort = true
      )
    val items: ScallopOption[List[String]] =
      trailArg[List[String]](default = Some(List()), required = false)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      if(all() && items().nonEmpty) {
        throw new FlamyException("Using the --all option will drop all schemas, and no schema name should be specified.")
      }
      val context = new FlamyContext(globalOptions, environment.get)
      context.dryRun = dryRun()
      val itemFilter = ItemFilter(items(), acceptIfEmpty = all())
      val fetcher = HiveTableFetcher(context)
      val schemaNames: Iterable[SchemaName] = fetcher.listSchemaNames.filter{itemFilter}.filterNot{_.fullName == "default"}

      val flamyRunner: FlamyRunner = FlamyRunner(context)
      val actionRunner = new ActionRunner(silentOnSuccess = false, silentOnFailure = false)
      val dropActions = schemaNames.map{schemaName => new DropSchemaAction(schemaName, flamyRunner)}
      actionRunner.run(dropActions)

      ReturnStatus(success = actionRunner.getStats.getFailCount == 0)
    }
  }

  val tables: Subcommand = new Subcommand("tables") with FlamySubcommand {
    banner("Drop the specified tables on the specified environment")
    val environment: ScallopOption[Environment] =
      opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
    val dryRun: ScallopOption[Boolean] =
      opt(name = "dry", default = Some(false), descr = "Perform a dry-run", required = false, noshort = true)
    val all: ScallopOption[Boolean] =
      opt(
        name = "all",
        default = Some(false),
        descr = "Unlike other commands, not providing any table name will not do anything. Unless you use this option.",
        noshort = true
      )
    val items: ScallopOption[List[ItemName]] =
      trailArg[List[ItemName]](default = Some(List()), required = false)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      if(all() && items().nonEmpty) {
        throw new FlamyException("Using the --all option will drop all tables, and no table name should be specified.")
      }
      if(!all() && items().isEmpty) {
        throw new FlamyException("If you really want to drop all the tables, you should add the --all option.")
      }
      val context = new FlamyContext(globalOptions, environment.get)
      context.dryRun = dryRun()
      val itemFilter = ItemFilter(items(), acceptIfEmpty = all())
      val fetcher = HiveTableFetcher(context)
      val tables: Iterable[TableInfo] = fetcher.listTables(itemFilter)

      val flamyRunner: FlamyRunner = FlamyRunner(context)
      val actionRunner = new ActionRunner(silentOnSuccess = false, silentOnFailure = false)
      val dropActions = tables.map{table => new DropTableAction(table, flamyRunner)}
      actionRunner.run(dropActions)

      ReturnStatus(success = actionRunner.getStats.getFailCount == 0)
    }
  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case (command: FlamySubcommand) :: Nil =>
        command.doCommand(globalOptions, Nil)
      case _ => printHelp()
    }
    ReturnSuccess
  }
} 
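The flamy commands above are Subcommand instances; the top-level parser that registers them is not part of this listing. A hedged sketch (hypothetical names, not the actual flamy launcher) of how nested subcommands are wired into a parent ScallopConf:

import org.rogach.scallop.{ScallopConf, Subcommand}

// Assumed wiring: register nested subcommands with addSubcommand, then verify().
class CliSketch(args: Seq[String]) extends ScallopConf(args) {
  val drop = new Subcommand("drop") {
    val tables = new Subcommand("tables") {
      val all = opt[Boolean]("all", noshort = true)
      val items = trailArg[List[String]](default = Some(Nil), required = false)
    }
    addSubcommand(tables)
  }
  addSubcommand(drop)
  verify()
}
// new CliSketch(Seq("drop", "tables", "--all")).subcommands
//   returns the matched chain of subcommand configurations.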
Example 18
Source File: GatherInfo.scala    From flamy   with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.hive.HivePartitionFetcher
import com.flaminem.flamy.exec.utils.{ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.model.ItemFilter
import com.flaminem.flamy.model.names.ItemName
import com.flaminem.flamy.utils.AutoClose
import com.flaminem.flamy.utils.time.TimeUtils
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


class GatherInfo extends Subcommand("gather-info") with FlamySubcommand {

  banner("Gather all partitioning information on specified items (everything if no argument is given) and output this as csv on stdout.")

  val environment: ScallopOption[Environment] =
    opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)

  val items: ScallopOption[List[ItemName]] =
    trailArg[List[ItemName]](default = Some(List()), required = false)

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    val context = new FlamyContext(globalOptions, this.environment.get)
    for{
      fetcher: HivePartitionFetcher <- AutoClose(HivePartitionFetcher(context))
    } {
      val itemFilter = new ItemFilter(this.items(), true)
      for {
        tpInfo <- fetcher.listTableNames.filter{itemFilter}.map{fetcher.getTablePartitioningInfo}
        partition <- tpInfo.sortedTablePartitions
      } {
        println(
          Seq(
            tpInfo.tableName.schemaName,
            tpInfo.tableName.name,
            partition.partitionName,
            partition.getFileSize.getOrElse("\\N"),
            partition.getModificationTime(context, refresh = false).map {
              TimeUtils.timestampToUniversalTime
            }.getOrElse("\\N")
          ).mkString("\t")
        )
      }
    }
    ReturnSuccess
  }

} 
Example 19
Source File: Export.scala    From flamy   with Apache License 2.0
package com.flaminem.flamy.commands.tools

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyContextFormatter, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.utils.{ReturnFailure, ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.graph.TableGraph
import com.flaminem.flamy.model.core.Model
import com.flaminem.flamy.model.exceptions.UnexpectedBehaviorException
import com.flaminem.flamy.model.names.ItemName
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}


class Export extends Subcommand("export") with FlamySubcommand {

  val conf = new Subcommand("conf") with FlamySubcommand {
    banner("Automatically generate a configuration template or doc")
    private lazy val template: ScallopOption[Boolean] = toggle(name = "template", default = Some(false), noshort = true)
    private lazy val markdown: ScallopOption[Boolean] = toggle(name = "markdown", default = Some(false), noshort = true)
    private lazy val rst: ScallopOption[Boolean] = toggle(name = "rst", default = Some(false), noshort = true)

    requireOne(template, markdown, rst)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      val context = new FlamyContext(globalOptions, Some(Environment("<ENV>")))
      if(template()) {
        FlamyOutput.out.println(new FlamyContextFormatter(context).toTemplate)
      }
      else if(markdown()) {
        FlamyOutput.out.println(new FlamyContextFormatter(context).toMarkdown)
      }
      else if(rst()) {
        FlamyOutput.out.println(new FlamyContextFormatter(context).toRST)
      }
      else {
        throw new UnexpectedBehaviorException("Either the --template, --markdown or --rst option should be used")
      }
      ReturnSuccess
    }
  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case  (command: FlamySubcommand)::Nil => command.doCommand(globalOptions, Nil)
      case Nil => throw new IllegalArgumentException("A subcommand is expected")
      case _ =>
        printHelp()
        ReturnFailure
    }
  }

} 
Example 20
Source File: Repair.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.hive.HiveTableFetcher
import com.flaminem.flamy.exec.utils.{Action, ActionRunner, ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.model.ItemFilter
import com.flaminem.flamy.model.names.{ItemName, TableName}
import com.flaminem.flamy.utils.AutoClose
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


class Repair extends Subcommand("repair") with FlamySubcommand{

  val tables = new Subcommand("tables") {
    banner("Execute a msck repair table on every specified table. " +
      "This will automatically add to the metastore the partitions that exists on hdfs but not yet in the metastore.")
    val environment: ScallopOption[Environment] =
      opt(name="on", descr="Specifies environment to run on", required = true, noshort=true)
    val dryRun: ScallopOption[Boolean] =
      opt(name="dry", default=Some(false), descr="Perform a dry-run", required = false, noshort=true)
    val items: ScallopOption[List[ItemName]] =
      trailArg[List[ItemName]](default=Some(List()),required = false)
  }

  private class RepairTableAction(runner: FlamyRunner, tableName: TableName) extends Action{

    @throws(classOf[Exception])
    override def run(): Unit = {
      runner.runText(f"use ${tableName.schemaName} ; MSCK REPAIR TABLE ${tableName.name}")
    }

    override val name: String = tableName.fullName
    override val logPath: String = f"${tableName.schemaName}.db/${tableName.name}/REPAIR.hql"
  }

  private def repairTables(context: FlamyContext, items: ItemName*): Unit = {
    val itemFilter = new ItemFilter(items, acceptIfEmpty = true)
    val fetcher = HiveTableFetcher(context)
    val tables: Iterable[TableName] = fetcher.listTables(itemFilter).filterNot{_.isView}.filter{_.isPartitioned}.map{_.tableName}

    val actionRunner: ActionRunner = new ActionRunner(silentOnSuccess = false)
    for {
      flamyRunner: FlamyRunner <- AutoClose(FlamyRunner(context))
    } {
      val actions = tables.map{tableName => new RepairTableAction(flamyRunner, tableName)}
      actionRunner.run(actions)
    }
  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case (command @ this.tables) :: Nil =>
        val context = new FlamyContext(globalOptions, command.environment.get)
        context.dryRun = command.dryRun()
        repairTables(context, command.items():_*)
      case _ => printHelp()
    }
    ReturnSuccess
  }


} 
Example 21
Source File: Count.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.hive.{HivePartitionFetcher, HiveTableFetcher, RemoteHiveRunner}
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.exec.utils.{ReturnFailure, ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.model.ItemFilter
import com.flaminem.flamy.model.names.{ItemName, TableName, TablePartitionName}
import com.flaminem.flamy.utils.AutoClose
import com.flaminem.flamy.utils.prettyprint.Tabulator
import com.flaminem.flamy.utils.sql.hive.StreamedResultSet
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


class Count extends Subcommand("count") with FlamySubcommand {

  val tables = new Subcommand("tables") with FlamySubcommand {
    banner("Execute a select count(1) on every specified table.")
    val environment: ScallopOption[Environment] =
      opt(name="on", descr="Specifies environment to run on", required=false, noshort=true)
    val items: ScallopOption[List[ItemName]] =
      trailArg[List[ItemName]](default = Some(List()), required = false)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      val context = new FlamyContext(globalOptions, environment.get)
      val itemFilter = ItemFilter(items(), acceptIfEmpty = true)
      val fetcher = HiveTableFetcher(context)
      val tables: Iterable[TableName] = fetcher.listTableNames.filter{itemFilter}

      val hiveRunner: RemoteHiveRunner = new RemoteHiveRunner(context)
      try {
        for {
          tableName <- tables if !Thread.currentThread().isInterrupted
        } try {
          val res: StreamedResultSet = hiveRunner.executeQuery(f"SELECT COUNT(1) FROM $tableName")
          val row = res.next()
          FlamyOutput.out.success(f"ok: $tableName : ${row(0)}")
        } catch {
          case e: Throwable =>
            e.printStackTrace()
            FlamyOutput.err.failure(f"not ok: $tableName : ${e.getMessage}")
        }
      }
      finally{
        hiveRunner.close()
      }
      ReturnSuccess
    }

  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case  (command: FlamySubcommand)::Nil => command.doCommand(globalOptions, Nil)
      case Nil => throw new IllegalArgumentException("A subcommand is expected")
      case _ =>
        printHelp()
        ReturnFailure
    }
  }

} 
Example 22
Source File: WaitForPartition.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.hive.PartitionWaiter
import com.flaminem.flamy.exec.utils.ReturnStatus
import com.flaminem.flamy.model.names.ItemName
import com.flaminem.flamy.utils.AutoClose
import com.flaminem.flamy.utils.time.TimeUtils
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


class WaitForPartition extends Subcommand("wait-for-partition") with FlamySubcommand{

  banner("Wait for a partition to be created.")

  val environment: ScallopOption[Environment] =
    opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
  val timeout: ScallopOption[Long] =
    opt(
      name = "timeout",
      descr = "Number of seconds after which flamy will fail if the partitions still does not exist",
      default = Some(12 * 3600),
      noshort = true
    )
  val after: ScallopOption[String] =
    opt(
      name = "after",
      argName = "yyyy-MM-dd HH:mm:ss",
      descr = """Wait for the partition to be created or refreshed after this time. Expected format is "yyyy-MM-dd HH:mm:ss"""",
      default = None,
      noshort = true
    )
  val retryInterval: ScallopOption[Long] =
    opt(
      name = "retry-interval",
      argName = "INTERVAL",
      descr = "When a partition is not found, retry after INTERVAL seconds",
      default = Some(60),
      noshort = true
    )
  val items: ScallopOption[List[ItemName]] =
    trailArg[List[ItemName]](required = true)

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    val context = new FlamyContext(globalOptions, environment.get)
    for{
      waiter: PartitionWaiter <- AutoClose(new PartitionWaiter(context))
    } yield {
      waiter.waitForPartition(items(), timeout(), after.get.map{TimeUtils.universalTimeToTimeStamp}, retryInterval())
    }
  }

} 
Example 23
Source File: CliArgs.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.exec.shell

import com.flaminem.flamy.Launcher
import com.flaminem.flamy.Launcher.Options
import org.rogach.scallop.{CliOption, Scallop, ScallopConf}


    val lastOptionArgs: Seq[String] =
      if(lastOptionAndArgs.isEmpty) {
        Nil
      }
      else {
        lastOptionAndArgs.tail
      }

    val trailArgs: Seq[String] =
      if(lastOption.isDefined) {
        Nil
      }
      else {
        subCommands.foldLeft(args){
          case (args, command) => args.dropWhile(_ != command).drop(1)
        }
      }

    new CliArgs(
      builder = builder,
      args = args,
      lastOption = lastOption,
      lastOptionArgs = lastOptionArgs,
      previousOptions = previousOptions,
      trailArgs = trailArgs,
      lastWord = lastWord
    )
  }


} 
Example 24
Source File: VariablesTest.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.model

import org.rogach.scallop.{ScallopConf, Subcommand}
import org.scalatest.{FreeSpec, Matchers}


class VariablesTest extends FreeSpec with Matchers{

  "replaceInText should work" in {
    val variables = new Variables
    variables += ("TO_REPLACE" -> "REPLACED")
    val text = "TO_REPLACE has been ${TO_REPLACE}"
    val expected = "TO_REPLACE has been REPLACED"

    assert(variables.replaceInText(text)===expected)
  }

  "subsetInText should work" in {
    val variables = new Variables
    variables += ("IN_KEY" -> "IN_VALUE")
    variables += ("OUT_KEY" -> "OUT_VALUE")
    val text = "this text contains ${IN_KEY} but does not contains OUT_KEY"

    val expectedVariables = new Variables
    expectedVariables += ("IN_KEY" -> "IN_VALUE")

    assert(variables.subsetInText(text, Nil) === expectedVariables)
  }

  "replaceInText should preserve partition variables" in {
    val text: String = """INSERT OVERWRITE TABLE db1.dest PARTITION(part=${partition:toto}) SELECT ${partition:toto} as num FROM db2.source"""
    val vars = new Variables()
    vars += "partition:toto" -> "${partition:toto}0"
    val expected: String = """INSERT OVERWRITE TABLE db1.dest PARTITION(part="${partition:toto}") SELECT "${partition:toto}" as num FROM db2.source"""
    assert(vars.replaceInText(text) == expected)
  }


  "the scallopConverter should work" in {
    object Conf extends ScallopConf(Seq("--variables", "HELLO=yes", "LIST=(1,2,3)", "sub")) {
      val variables = opt[Variables](name = "variables")(Variables.scallopConverter)
      val sub =
        new Subcommand("sub") {
          banner("Print version information of this program")
          override def toString = "sub"
        }
    }
    assert(Conf.variables() === Variables("HELLO" -> "yes", "LIST" -> "(1,2,3)"))
    assert(Conf.subcommand.get.toString === "sub")
  }

} 
Example 25
Source File: Main.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.dc.stream

import akka.actor.ActorSystem
import cmwell.ctrl.hc.HealthControl
import cmwell.dc.{LazyLogging, Settings}
import cmwell.tools.data.sparql.{SparqlProcessorManager, SparqlProcessorManagerSettings}
import cmwell.tracking.ResurrectorActor
import k.grid.service.ServiceTypes
import k.grid.{Grid, GridConnection}
import org.rogach.scallop.ScallopConf
import uk.org.lidalia.sysoutslf4j.context.SysOutOverSLF4J


object Main extends App with LazyLogging {
  import Settings._
  logger.info("Starting Dc-Sync using stream")
  //SLF4J initialization is not thread safe, so it's "initialized" by writing a log line and only then calling sendSystemOutAndErrToSLF4J.
  //Without it there would be an error on stderr and some log lines at the beginning would be lost.
  SysOutOverSLF4J.sendSystemOutAndErrToSLF4J()

  Grid.setGridConnection(GridConnection(memberName = "dc"))
  Grid.declareServices(
    ServiceTypes()
      .add("DataCenterSyncManager", classOf[DataCenterSyncManager], destinationHostsAndPorts(rawTarget), None)
      .add(HealthControl.services)
      .add(SparqlProcessorManager.name, classOf[SparqlProcessorManager], new SparqlProcessorManagerSettings)
      .add("Resurrector", classOf[ResurrectorActor])
  )
  Grid.joinClient
  HealthControl.init
  Thread.sleep(10000)
}

object MainStandAlone extends App with LazyLogging {
  import Settings._

  implicit val sys = ActorSystem("ExternalSystem")

  val conf = new Conf(args)

  val ar =
    sys.actorOf(DataCenterSyncManager.props(destinationHostsAndPorts(conf.destinationHosts()), Some(conf.syncJson())))
}

class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
//  val syncJson = opt[String](required = true)
  val syncJson = trailArg[String]()
  val destinationHosts = trailArg[String]()
  verify()
} 
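For reference, a minimal stand-alone sketch (hypothetical names, not part of CM-Well) of how two required trailArg declarations like syncJson and destinationHosts above are bound positionally, in declaration order, from the command line:

import org.rogach.scallop.ScallopConf

// Hypothetical illustration: two unnamed trailing arguments are filled in declaration order.
class TrailDemo(args: Seq[String]) extends ScallopConf(args) {
  val syncJson = trailArg[String]()         // first trailing argument, required by default
  val destinationHosts = trailArg[String]() // second trailing argument
  verify()
}

object TrailDemoMain extends App {
  val conf = new TrailDemo(Seq("sync.json", "host1:9000,host2:9000"))
  println(conf.syncJson())         // prints "sync.json"
  println(conf.destinationHosts()) // prints "host1:9000,host2:9000"
}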
Example 26
Source File: CollectionRollerCliParser.scala    From pulse   with Apache License 2.0 5 votes vote down vote up
package io.phdata.pulse.collectionroller

import org.rogach.scallop.ScallopConf

class CollectionRollerCliArgsParser(args: Seq[String]) extends ScallopConf(args) {
  lazy val conf = opt[String]("conf",
                              's',
                              required = true,
                              descr = "Path to the collection roller yaml configuration")
  lazy val daemonize = opt[Boolean](
    "daemonize",
    required = false,
    default = Some(false),
    descr = "Daemonize the process and run the CollectionRoller on a schedule")
  lazy val zkHosts = opt[String]("zk-hosts", required = true, descr = "Zookeeper hosts")
  lazy val deleteApplications =
    opt[String]("delete-applications", required = false, descr = "Delete applications (operation)")
  lazy val listApplications =
    opt[Boolean]("list-applications", required = false, descr = "List all applications (operation)")
  lazy val verbose =
    opt[Boolean](
      "verbose",
      required = false,
      descr = "List additional info (aliases and collections) for all applications (operation)")
  verify()
} 
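As a usage note, here is a minimal sketch (illustrative names, not part of pulse) of how options declared on a ScallopConf such as the parser above are read after verify(): apply() returns the value (or throws if it is absent), while toOption gives safe access.

import org.rogach.scallop.ScallopConf

// Hypothetical, trimmed-down parser for illustration only.
class DemoArgs(args: Seq[String]) extends ScallopConf(args) {
  val config = opt[String]("conf", required = true, descr = "Path to a yaml configuration")
  val daemonize = opt[Boolean]("daemonize", default = Some(false))
  verify()
}

object DemoArgsMain extends App {
  val parsed = new DemoArgs(Seq("--conf", "roller.yaml", "--daemonize"))
  println(parsed.config())        // "roller.yaml" - apply() returns the value, or throws if it is absent
  println(parsed.daemonize())     // true
  println(parsed.config.toOption) // Some("roller.yaml") - safe access as an Option
}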
Example 27
Source File: AlertEngineCliParser.scala    From pulse   with Apache License 2.0 5 votes vote down vote up
package io.phdata.pulse.alertengine

import org.rogach.scallop.{ ScallopConf, ScallopOption }

class AlertEngineCliParser(args: Seq[String]) extends ScallopConf(args) {
  lazy val conf: ScallopOption[String] = opt[String](
    "conf",
    's',
    required = true,
    descr =
      "Alert Engine config yaml file. See https://github.com/phdata/pulse/blob/master/alert-engine/README.md for schema")
  lazy val daemonize: ScallopOption[Boolean] = opt[Boolean](
    "daemonize",
    required = false,
    default = Some(false),
    descr = "Daemonize the process and run alerting on an interval")
  lazy val smtpServer: ScallopOption[String] =
    opt[String]("smtp-server", required = false, descr = "SMTP server hostmane")
  lazy val smtpUser: ScallopOption[String] = opt[String](
    "smtp-user",
    required = false,
    descr = "SMTP username (from address), like '[email protected]'")

  // default smtpPassword is "", this will turn Some("") into None
  lazy val smtpPassword: Option[String] = sys.env.get("SMTP_PASSWORD").filter(_ != "")
  lazy val smtpPort: ScallopOption[Long] =
    opt[Long]("smtp-port", required = false, descr = "SMTP server port. Defaults to 25")
  lazy val smtp_tls: ScallopOption[Boolean] = opt[Boolean](
    "smtp-tls",
    required = false,
    descr = "Whether to use START_TLS. Defaults to false")
  lazy val silencedApplicationsFile: ScallopOption[String] = opt[String](
    "silenced-application-file",
    required = false,
    descr = "File containing applications ignore when alerting, one application per line")

  lazy val zkHost: ScallopOption[String] = opt[String](
    "zk-hosts",
    required = false,
    descr = "Zookeeper hosts. Used to connect to Solr Cloud")

  lazy val dbUrl: ScallopOption[String] =
    opt[String]("db-url", required = false, descr = "URL to connect to the database")
  lazy val dbUser: ScallopOption[String] =
    opt[String]("db-user", required = false, descr = "User to connect to the database as")
  lazy val dbPassword: ScallopOption[String] =
    opt[String]("db-password", required = false, descr = "Password to connect to the database with")
  lazy val dbOptions: ScallopOption[String] = opt[String](
    "db-options",
    required = false,
    descr = "Database connection options in the form `key1=value1;key2=value2`")

  verify()
} 
Example 28
Source File: Conf.scala    From osstracker   with Apache License 2.0 5 votes vote down vote up
package com.netflix.oss.tools.osstrackerscraper

import org.rogach.scallop.ScallopConf

class Conf(args: Seq[String]) extends ScallopConf(args) {
  val action = opt[String](required = true)
  verify()
}

object Conf {
  val ACTION_UPDATE_CASSANDRA = "updatecassandra"
  val ACTION_UPDATE_ELASTICSEARCH = "updateelasticsearch"
  val OSSTRACKER_KEYSPACE = "osstracker"
  val SENTINAL_DEV_LEAD_ID = "111111"; // Assign to valid emp id
  val SENTINAL_MGR_LEAD_ID = "222222"; // Assign to valid emp id
  val SENTINAL_ORG = "UNKNOWN"; // Assign to unknown org until edited in console
} 
Example 29
Source File: DataLoader.scala    From variantsdwh   with Apache License 2.0 5 votes vote down vote up
package pl.edu.pw.ii.zsibio.dwh.benchmark

import com.typesafe.config.ConfigFactory
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.rogach.scallop.ScallopConf
import org.apache.kudu.spark.kudu._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DataType, StructField, StructType}


object DataLoader {
  class RunConf(args:Array[String]) extends ScallopConf(args){

    val csvFile = opt[String]("csvFile", required = true, descr = "A CSV file to load")
    val tableName = opt[String]("tableName", required = true, descr = "A table to load")
    val storageType = opt[String]("storageType", required = true, descr = "Storage type parquet|orc|kudu|carbon")
    val dbName = opt[String]("dbName", required = true, descr = "Database name")


    verify()
  }
  def main(args: Array[String]): Unit = {
    val runConf = new RunConf(args)
    val scConf = new SparkConf()
        .setAppName("DataLoader")
    val sc = new SparkContext(scConf)
    val sqlContext = new HiveContext(sc)


    if(runConf.storageType().toLowerCase() == "orc" || runConf.storageType().toLowerCase() == "parquet") {
      val df = sqlContext.read
        .format("com.databricks.spark.csv")
        .option("delimiter", "|")
        .option("nullValue","\\N")
        .option("inferSchema", "true") // Automatically infer data types
        .load(runConf.csvFile())
        .repartition(10)
      df.registerTempTable("temp_csv")
      sqlContext.sql(
        s"""
        |INSERT OVERWRITE TABLE ${runConf.dbName()}.${runConf.tableName()}
        |SELECT * FROM temp_csv
        """.stripMargin)
      }
    if(runConf.storageType().toLowerCase() == "kudu"){
      val confFile = ConfigFactory.load()
      val kuduMaster = confFile.getString("kudu.master.server")
      val kuduContext = new KuduContext(kuduMaster)
      val dfTarget = sqlContext.read.options(Map("kudu.master" -> kuduMaster,"kudu.table" -> runConf.tableName())).kudu
      val df = sqlContext.read
        .format("com.databricks.spark.csv")
        .option("delimiter", "|")
        .option("nullValue","\\N")
        .schema(dfTarget.schema)
        .load(runConf.csvFile())
        .repartition(10)
      kuduContext.upsertRows(df,runConf.tableName())
    }

  }

  private def synSchemas(inSchema:StructType, outSchema:StructType) = {

    val size = inSchema.fields.length
    val structFields = (0 to size - 1).map{
      i => StructField(outSchema.fields(i).name,inSchema.fields(i).dataType,outSchema.fields(i).nullable)
    }
    new StructType(structFields.toArray)

  }

} 
Example 30
Source File: Arguments.scala    From ZparkIO   with MIT License 5 votes vote down vote up
package com.leobenkel.zparkioProjectExample

import com.leobenkel.zparkio.Services.CommandLineArguments
import com.leobenkel.zparkio.Services.CommandLineArguments.CommandLineArguments
import org.rogach.scallop.{ScallopConf, ScallopOption}
import zio.ZIO

case class Arguments(input: List[String])
    extends ScallopConf(input) with CommandLineArguments.Service {
  val inputId: ScallopOption[Int] = opt[Int](
    default = Some(10),
    required = false,
    noshort = true
  )

  val sparkFoo: ScallopOption[String] = opt[String](
    default = Some("hello"),
    required = false,
    noshort = true
  )
}

object Arguments {
  def apply[A](f: Arguments => A): ZIO[CommandLineArguments[Arguments], Throwable, A] = {
    CommandLineArguments.get[Arguments].apply(f)
  }
} 
Example 31
Source File: Arguments.scala    From ZparkIO   with MIT License 5 votes vote down vote up
package com.leobenkel.zparkioProfileExampleMoreComplex

import com.leobenkel.zparkio.Services.CommandLineArguments
import com.leobenkel.zparkio.Services.CommandLineArguments.CommandLineArguments
import com.leobenkel.zparkioProfileExampleMoreComplex.Services.Database
import org.rogach.scallop.{ScallopConf, ScallopOption}
import zio.ZIO

case class Arguments(input: List[String])
    extends ScallopConf(input) with CommandLineArguments.Service {

  val databaseUsername: ScallopOption[String] = opt[String](
    default = Some("admin"),
    required = false,
    noshort = true
  )

  
  val databasePassword: ScallopOption[String] = opt[String](
    default = Some("123456"),
    required = false,
    noshort = true
  )

  val databaseHost: ScallopOption[String] = opt[String](
    default = Some("database://host.com/database"),
    required = false,
    noshort = true
  )

  val generatedInputSize: ScallopOption[Int] = opt[Int](
    default = Some(100),
    required = false,
    noshort = true,
    descr = "The size of the sample data generated"
  )

  val sparkConfig: ScallopOption[String] = opt[String](
    default = Some("foo"),
    required = false,
    noshort = true
  )

  lazy val credentials: Database.Credentials = Database.Credentials(
    user = databaseUsername(),
    psw = databasePassword(),
    host = databaseHost()
  )
}

object Arguments {
  def apply[A](f: Arguments => A): ZIO[CommandLineArguments[Arguments], Throwable, A] = {
    CommandLineArguments.get[Arguments].apply(f)
  }
} 
Example 32
Source File: CommandLineArgumentsTest.scala    From ZparkIO   with MIT License 5 votes vote down vote up
package com.leobenkel.zparkio.Services

import com.leobenkel.zparkio.Services.CommandLineArguments.CommandLineArguments
import org.rogach.scallop.exceptions.{RequiredOptionNotFound, UnknownOption}
import org.rogach.scallop.{ScallopConf, ScallopOption}
import org.scalatest._
import zio.Exit.{Failure, Success}
import zio.{BootstrapRuntime, Layer, Task, ZIO, ZLayer}

class CommandLineArgumentsTest extends FreeSpec {
  "CommandLineService" - {
    case class ArgumentsService(input: Seq[String])
        extends ScallopConf(input) with CommandLineArguments.Service {
      val test: ScallopOption[String] = opt[String](
        default = None,
        required = true,
        noshort = true
      )
    }

    object Arguments {
      def get[A](
        f: ArgumentsService => A
      ): ZIO[CommandLineArguments[ArgumentsService], Throwable, A] = {
        CommandLineArguments.get[ArgumentsService].apply(f)
      }

      def apply(input: Seq[String]): Layer[Nothing, CommandLineArguments[ArgumentsService]] = {
        ZLayer.succeed(ArgumentsService(input))
      }
    }

    val runtime = new BootstrapRuntime {}

    "should work" in {
      val test: String = "qwe-asd-asd-zxc"

      runtime.unsafeRunSync {
        Arguments.get(_.test.toOption).provideLayer(Arguments(Seq("--test", test)))
      } match {
        case Success(Some(value)) => assertResult(value)(test)
        case Success(None)        => fail("Did not find any value")
        case Failure(ex)          => fail(ex.prettyPrint)
      }
    }

    "should fail - missing required" in {
      runtime.unsafeRunSync(for {
        arg <- Task(Arguments(Nil))
        a   <- Arguments.get(_.test.toOption).provideLayer(arg)
      } yield {
        a
      }) match {
        case Success(_)  => fail("Should have failed")
        case Failure(ex) => assertThrows[RequiredOptionNotFound](throw ex.squash)
      }
    }

    "should fail - unknonw option" in {
      runtime.unsafeRunSync(for {
        arg <- Task(Arguments(Seq("--abc", "foo")))
        a   <- Arguments.get(_.test.toOption).provideLayer(arg)
      } yield {
        a
      }) match {
        case Success(_)  => fail("Should have failed")
        case Failure(ex) => assertThrows[UnknownOption](throw ex.squash)
      }
    }
  }
} 
Example 33
Source File: ParametricFaceImageGeneratorOptions.scala    From parametric-face-image-generator   with Apache License 2.0 5 votes vote down vote up
package faces.utils

import org.rogach.scallop.{ScallopConf, ScallopOption}
import org.rogach.scallop.exceptions.ScallopException


class ParametricFaceImageGeneratorOptions(args: Seq[String]) extends ScallopConf(args) {
  banner(
    """|parametric-face-image-generator
       |© University of Basel
       |License: http://www.apache.org/licenses/LICENSE-2.0
       |
       |Options:""".stripMargin)

  val configurationFile: ScallopOption[String] = opt[String](required = true, descr = "configuration file with the parameters")

  footer(
    """""".stripMargin
  )

  override def onError(e: Throwable): Unit = e match {
    case ScallopException(message) =>
      printHelp
      println("You provided the arguments: "+args.mkString(" "))
      println(message)
      sys.exit(1)
    case ex => super.onError(ex)
  }
} 
Example 34
Source File: Conf.scala    From ncdbg   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.programmaticallyspeaking.ncd.config

import org.rogach.scallop.{ScallopConf, ValueConverter}

case class Address(host: String, port: Int) {
  override def toString = host + ":" + port
}
class AddressConverter extends ValueConverter[Address] {

  val valueRegexp = "([^:]+(?::))?([0-9]+)".r

  override def parse(s: List[(String, List[String])]): Either[String, Option[Address]] = {
    s match {
      case (_, valueRegexp(host, port) :: Nil) :: Nil =>
        // I tried getting rid of the trailing : using a non-capturing group, but it didn't work.
        val theHost = Option(host).map(h => h.dropRight(1)).getOrElse("localhost")
        Right(Some(Address(theHost, port.toInt)))
      case Nil =>
        Right(None)
      case _ =>
        Left("address must have format <host>:<port> or only <port>")
    }
  }

  override val tag = scala.reflect.runtime.universe.typeTag[Address]
  override val argType = org.rogach.scallop.ArgType.SINGLE
}

class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
  implicit val addressConverter = new AddressConverter

  private val defaultListen = Address("localhost", 7778)
  private val defaultConnect = Address("localhost", 7777)

  banner(
    """Usage: ncdbg [OPTION]...
      |
      |Ncdbg (Nashorn-Chrome-debugger) connects to a debuggable Java process running Nashorn scripts,
      |while acting as a server for Chrome Developer Tools. This makes it possible to debug Nashorn scripts
      |using Chrome.
      |
      |Options:
    """.stripMargin)

  val listen = opt[Address](default = Some(defaultListen),
    descr = s"address to listen on, in <host>:<port> format or port only. Defaults to $defaultListen.")
  val connect = opt[Address](default = Some(defaultConnect),
    descr = s"address to connect to, in <host>:<port> format or port only. Defaults to $defaultConnect.")
  val isLazy = toggle(name = "lazy", default = Some(false),
    descrYes = "defer connection until DevTools connects, and stay alive when the debug target dies.",
    descrNo = "connect right away and require the debug target to live. This is the default.")
  verify()
} 
Example 35
Source File: AddressConverterTest.scala    From ncdbg   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.programmaticallyspeaking.ncd.boot

import com.programmaticallyspeaking.ncd.config.{Address, AddressConverter}
import com.programmaticallyspeaking.ncd.testing.UnitTest
import org.rogach.scallop.exceptions.WrongOptionFormat
import org.rogach.scallop.{ScallopConf, throwError}

class AddressConverterTest extends UnitTest {
  // Throw error instead of exiting on option error.
  throwError.value = true

  def conf(args: String*) = new ScallopConf(args.toSeq) {
    val address = opt[Address]()(new AddressConverter)
    verify()
  }

  "should parse host:port" in {
    val c = conf("--address", "foo:1234")
    c.address.toOption should be (Some(Address("foo", 1234)))
  }

  "should parse only port" in {
    val c = conf("--address", "1234")
    c.address.toOption should be (Some(Address("localhost", 1234)))
  }

  "should handle no address" in {
    val c = conf()
    c.address.toOption should be (None)
  }

  "should reject non-integer port" in {
    intercept[WrongOptionFormat](conf("--address", "foo"))
  }
} 
Example 36
Source File: IngesterMain.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.tools.data.ingester

import java.io.FileInputStream
import java.util.zip.GZIPInputStream

import akka.stream.scaladsl.Sink
import cmwell.tools.data.utils.akka.stats.IngesterStats
//import cmwell.tools.data.sparql.SparqlProcessorMain.Opts.opt
import cmwell.tools.data.utils.ArgsManipulations._
import cmwell.tools.data.utils.akka.Implicits._
import cmwell.tools.data.utils.akka._
import cmwell.tools.data.utils.ops._
import com.typesafe.scalalogging.LazyLogging
import org.rogach.scallop.ScallopConf

import scala.concurrent.ExecutionContext.Implicits.global

object IngesterMain extends App with LazyLogging {
  object Opts extends ScallopConf(args) {
    version(s"cm-well ingester ${getVersionFromManifest()} (c) 2015")

    val host = opt[String]("host", descr = "cm-well host name", required = true)
    val format = opt[String]("format", descr = "input format (e.g. ntriples, nquads, jsonld)", required = true)
    val file = opt[String]("file", descr = "input file path", default = None)
    val gzip = opt[Boolean]("gzip", descr = "is input file gzipped", default = Some(false))
    val token = opt[String]("token", descr = "cm-well write permission token", default = None)
    val replaceMode =
      opt[Boolean]("with-replace-mode", descr = "replace-mode parameter in cm-well", default = Some(false))
    val force = opt[Boolean]("force", descr = "force parameter in cm-well", default = Some(false))
    val priority = opt[Boolean]("priority", default = Some(false), descr = "ingest data in priority mode")
    val numConnections = opt[Int]("num-connections", descr = "number of http connections to open")

    dependsOnAll(gzip, List(file))
    verify()
  }

  val start = System.currentTimeMillis()

  var totalIngestedBytes = 0L
  var ingestedBytesInWindow = 0
  var ingestedInfotonsInWindow = 0
  var totalIngestedInfotons = 0L
  var totalFailedInfotons = 0L
  var lastTime = start
  var nextPrint = 0L
  var lastMessageSize = 0
  val windowSizeMillis = 1000

  val formatter = java.text.NumberFormat.getNumberInstance

  // resize akka http connection pool
  Opts.numConnections.toOption.map { numConnections =>
    System.setProperty("akka.http.host-connection-pool.max-connections", numConnections.toString)
  }

  val inputStream = if (Opts.file.isSupplied) {
    val inputFile = new FileInputStream(Opts.file())
    if (Opts.gzip()) {
      new GZIPInputStream(inputFile)
    } else {
      inputFile
    }
  } else {
    System.in
  }

  val result = Ingester
    .fromInputStream(
      baseUrl = formatHost(Opts.host()),
      format = Opts.format(),
      writeToken = Opts.token.toOption,
      replaceMode = Opts.replaceMode(),
      force = Opts.force(),
      isPriority = Opts.priority(),
      in = inputStream
    )
    .via(IngesterStats(isStderr = true))
    .runWith(Sink.ignore)

  // actor system is still alive, will be destroyed when finished
  result.onComplete { x =>
    System.err.println("\n")
    System.err.println(s"finished: $x")
    cleanup()
  }
} 
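The dependsOnAll(gzip, List(file)) call above makes --gzip acceptable only when --file is also supplied. Below is a minimal sketch of that dependency in isolation (hypothetical names; throwError is used as in the AddressConverterTest example above so failures surface as exceptions instead of exiting):

import org.rogach.scallop.{ScallopConf, throwError}
import org.rogach.scallop.exceptions.ScallopException

// Hypothetical parser isolating the --gzip / --file dependency used by IngesterMain above.
class GzipArgs(args: Seq[String]) extends ScallopConf(args) {
  val file = opt[String]("file", descr = "input file path")
  val gzip = opt[Boolean]("gzip", descr = "is input file gzipped", default = Some(false))
  dependsOnAll(gzip, List(file))
  verify()
}

object GzipArgsDemo extends App {
  throwError.value = true
  new GzipArgs(Seq("--file", "data.nq.gz", "--gzip"))  // accepted: --gzip together with --file
  try new GzipArgs(Seq("--gzip"))                      // rejected: --gzip without --file
  catch { case e: ScallopException => println(s"rejected: ${e.getMessage}") }
}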
Example 37
Source File: LogCollectorCliParser.scala    From pulse   with Apache License 2.0 5 votes vote down vote up
package io.phdata.pulse.logcollector

import org.rogach.scallop.ScallopConf

class LogCollectorCliParser(args: Seq[String]) extends ScallopConf(args) {
  lazy val port        = opt[Int]("port", required = false, descr = "HTTP Server Listening port")
  lazy val zkHosts     = opt[String]("zk-hosts", required = true, descr = "Zookeeper hosts")
  lazy val kuduMasters = opt[String]("kudu-masters", required = false, descr = "Kudu masters")
  lazy val mode = opt[String]("consume-mode",
                              required = false,
                              descr = "'http' or 'kafka'",
                              default = Some("http"))
  lazy val kafkaProps =
    opt[String]("kafka-properties", required = false, descr = "Kafka properties file")
  lazy val topic = opt[String]("topic", required = false, descr = "Kafka Topic")

  validateOpt(mode, port) {
    case (Some("http") | None, None) => Left("Need a port if running http mode")
    case _                           => Right(Unit)
  }

  validateOpt(mode, kafkaProps, topic) {
    case (Some("kafka"), None, Some(_)) =>
      Left("--kafka-properties argument needed if --consume-mode=kafka")
    case (Some("kafka"), Some(_), None) => Left("--topic argument needed if --consume-mode=kafka")
    case (Some("kafka"), None, None) =>
      Left("--topic and --kafka-properties arguments needed if --consume-mode=kafka")
    case _ => Right(Unit)
  }

  verify()
} 
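The validateOpt blocks above run as part of verify(). Here is a minimal sketch (hypothetical names, assuming only that scallop is on the classpath) of the consume-mode/port check, showing one invocation that passes and one that is rejected:

import org.rogach.scallop.{ScallopConf, throwError}
import org.rogach.scallop.exceptions.ScallopException

// Illustrative parser reproducing the consume-mode/port dependency from the parser above.
class ModeArgs(args: Seq[String]) extends ScallopConf(args) {
  val mode = opt[String]("consume-mode", default = Some("http"))
  val port = opt[Int]("port")

  validateOpt(mode, port) {
    case (Some("http") | None, None) => Left("Need a port if running http mode")
    case _                           => Right(())
  }
  verify()
}

object ModeArgsDemo extends App {
  throwError.value = true // surface option errors as exceptions rather than exiting
  new ModeArgs(Seq("--consume-mode", "http", "--port", "8080")) // passes: http mode with a port
  try new ModeArgs(Seq("--consume-mode", "http"))               // rejected: http mode without a port
  catch { case e: ScallopException => println(s"rejected: ${e.getMessage}") }
}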
Example 38
Source File: Main2.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.tools.file.export

import cmwell.tools.neptune.export.ExportToNeptuneManager
import org.rogach.scallop.ScallopConf


class Conf1(arguments: Seq[String]) extends ScallopConf(arguments) {
  val sourceCluster = opt[String]("source-cluster", required = true, descr = "the source cluster which data is being exported from")
  val lengthHint = opt[Int]("length-hint", default = Some(16000), validate = 300000.>=, descr="number of infotons that should be consumed in each bulk-consume call")
  val qp = opt[String](name="qp-param", default=None, descr = "cm well qp param")
  val directory = opt[String](name="directory", required = true, default=Some("./"), descr = "the s3 directory which neptune reads data from")

  verify()
}

object Main2 {
  def main(args: Array[String]) {
    val conf = new Conf1(args)
    println("Source cluster is: " + conf.sourceCluster())
    println("length-hint: " + conf.lengthHint())
    println("qp: " + conf.qp.getOrElse("(not provided)"))
    println("s3 bucket:" + conf.directory())
    val qpParam :Option[String]= conf.qp.toOption.map(s => s",$s")
    println("About to Export..")
    val exportToNeptuneManager = new ExportToNeptuneManager(1)
    exportToNeptuneManager.exportToNeptune(conf.sourceCluster(), "", conf.lengthHint(), false, qpParam, false, None, None, "", Some(conf.directory()))
  }

} 
Example 39
Source File: Main.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.tools.neptune.export

import org.rogach.scallop.ScallopConf


class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
  val sourceCluster = opt[String]("source-cluster", required = true, descr = "the source cluster which data is being exported from")
  val neptuneCluster = opt[String]("neptune-cluster", required = true, descr="neptune cluster which data is being exported to")
  val ingestConnectionPoolSize = opt[Int]("ingest-connection-pool-size", default = Some(5), validate = 50.>=, descr="size of the connection pool that the tool creates in order to ingest to neptune")
  val lengthHint = opt[Int]("length-hint", default = Some(16000), validate = 300000.>=, descr="number of infotons that should be consumed in each bulk-consume call")
  val qp = opt[String](name="qp-param", default=None, descr = "cm well qp param")
  val updateInfotons = opt[Boolean]("update-infotons", descr = "enable this parameter when you use update mode or delete infotons")
  val bulkLoader = opt[Boolean]("bulk-loader", descr = "enable this parameter in order to export by using the s3 bulk loader api. bulk loader is only for the initial load")
  val proxyHost = opt[String]("proxy-host", default=None, descr = "proxy host is provided when you use the bulk loader and your machine uses a proxy")
  val proxyPort = opt[Int]("proxy-port", default=None, descr = "proxy port is provided when you use the bulk loader and your machine uses a proxy")
  val s3Bucket = opt[String](name="s3-bucket", default=Some("cm-well/sync"), descr = "the s3 directory which neptune reads data from")

  verify()
}

object Main {
  def main(args: Array[String]) {
    val conf = new Conf(args)
    println("Source cluster is: " + conf.sourceCluster())
    println("Neptune cluster is: " + conf.neptuneCluster())
    println("Connection pool size is: " + conf.ingestConnectionPoolSize())
    println("length-hint: " + conf.lengthHint())
    println("update infotons: " + conf.updateInfotons())
    println("qp: " + conf.qp.getOrElse("(not provided)"))
    println("bulk loader: " + conf.bulkLoader())
    println("proxy host: " + conf.proxyHost.getOrElse("not provided"))
    println("proxy port: " + conf.proxyPort.getOrElse(-1))
    println("s3 bucket:" + conf.s3Bucket())
    val qpParam :Option[String]= conf.qp.toOption.map(s => s",$s")
    val proxyHost :Option[String]= conf.proxyHost.toOption
    val proxyPort :Option[Int]= conf.proxyPort.toOption
    println("About to Export..")
    val exportToNeptuneManager = new ExportToNeptuneManager(conf.ingestConnectionPoolSize())
    exportToNeptuneManager.exportToNeptune(conf.sourceCluster(), conf.neptuneCluster(), conf.lengthHint(), conf.updateInfotons(), qpParam,
      conf.bulkLoader(), proxyHost, proxyPort, conf.s3Bucket(), None)
  }

} 
Example 40
Source File: Main.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
import akka.actor.{ActorSystem, Props}
import org.rogach.scallop.ScallopConf


  class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
    val sourceUrl = opt[String]("source-url", required = true, descr = "the source url from which the rdf file is downloaded")
    val format = opt[String]("format", required = true, descr = "the file format")
    var cluster = opt[String]("cluster", required = true, descr = "the target cluster which content is ingested to")
    verify()
  }

  object Main {
    def main(args: Array[String]) {
      val conf = new Conf(args)  // Note: This line also works for "object Main extends App"
      println("source file is: " + conf.sourceUrl())
      println("output format is: " + conf.format())
      val system = ActorSystem("MySystem")
      println("About to Start import tool flow...")
      val mainActor = system.actorOf(Props(new AkkaFileReaderWithActor(conf.sourceUrl(), conf.format(), conf.cluster())), name = "myactor")
      mainActor ! ActorInput
    }

} 
Example 41
Source File: DumpIndexWithSystemFields.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.IndexWithSystemFields
import cmwell.analytics.util.CmwellConnector
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}


object DumpIndexWithSystemFields {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpIndexWithSystemFields.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump system fields from Elasticsearch indexes",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds = IndexWithSystemFields()(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }

      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 42
Source File: DumpInfotonWithUuidOnly.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.InfotonWithUuidOnly
import cmwell.analytics.util.CmwellConnector
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object DumpInfotonWithUuidOnly {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpInfotonWithUuidOnly.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump infoton table - uuid only",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds = InfotonWithUuidOnly()(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 43
Source File: FindInfotonIndexInconsistencies.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.InfotonAndIndexWithSystemFields
import cmwell.analytics.data.InfotonAndIndexWithSystemFields.{isConsistent, isWellFormed}
import cmwell.analytics.util.CmwellConnector
import cmwell.analytics.util.ConsistencyThreshold.defaultConsistencyThreshold
import cmwell.analytics.util.ISO8601.{instantToMillis, instantToText}
import org.apache.log4j.LogManager
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions._
import org.joda.time.format.ISODateTimeFormat
import org.rogach.scallop.{ScallopConf, ScallopOption, ValueConverter, singleArgConverter}

import scala.util.control.NonFatal

object FindInfotonIndexInconsistencies {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(FindInfotonIndexInconsistencies.getClass)

    try {

      object Opts extends ScallopConf(args) {

        private val instantConverter: ValueConverter[Long] = singleArgConverter[Long](instantToMillis)

        // If this parameter is not supplied, the (unreliable) ES Spark connector is used to extract the data from the es index.
        val esExtract: ScallopOption[String] = opt[String]("es", short = 'e', descr = "The path where the (parquet) extract of system fields the es index are stored", required = false)

        val consistencyThreshold: ScallopOption[Long] = opt[Long]("consistency-threshold", short = 'c', descr = "Ignore any inconsistencies at or after this instant", default = Some(defaultConsistencyThreshold))(instantConverter)

        val outParquet: ScallopOption[String] = opt[String]("out-parquet", short = 'p', descr = "The path to save the output to (in parquet format)", required = false)
        val outCsv: ScallopOption[String] = opt[String]("out-csv", short = 'v', descr = "The path to save the output to (in CSV format)", required = false)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Find inconsistencies between system fields in Infoton and Index",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        logger.info(s"Using a consistency threshold of ${instantToText(Opts.consistencyThreshold())}.")

        val ds = InfotonAndIndexWithSystemFields(esExtractPath = Opts.esExtract.toOption)(spark)

        // Filter out any inconsistencies found if more current than this point in time.
        val i = ds.schema.indexWhere(_.name == "infoton_lastModified")
        val filterCurrent: Row => Boolean = { row: Row =>

          val parser = ISODateTimeFormat.dateTimeParser
          if (row.isNullAt(i))
            true // Shouldn't be null, but don't filter out if we can't get a lastModified
          else
            try {
              parser.parseMillis(row.getAs[String](i)) < Opts.consistencyThreshold()
            }
            catch {
              case NonFatal(_) => true // Don't filter out if lastModified couldn't be converted
            }
        }

        val inconsistentData = ds.filter(not(isConsistent(ds) && isWellFormed(ds)))
          .filter(filterCurrent)
          .cache()

        // Save the inconsistent data in Parquet format suitable for additional analysis
        if (Opts.outParquet.isDefined)
          inconsistentData
            .write
            .parquet(Opts.outParquet())

        // Save the inconsistent data to a single CSV file suitable for reporting.
        if (Opts.outCsv.isDefined)
          inconsistentData
            .coalesce(1)
            .write
            .option("header", value = true)
            .csv(Opts.outCsv())
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 44
Source File: DumpPathWithKeyFields.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.PathWithKeyFields
import cmwell.analytics.util.CmwellConnector
import cmwell.analytics.util.DatasetFilter
import cmwell.analytics.util.TimestampConversion.timestampConverter
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}


object DumpPathWithKeyFields {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpPathWithKeyFields.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump path table - key fields",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val datasetFilter = DatasetFilter(
          lastModifiedGte = Opts.lastModifiedGteFilter.toOption,
          pathPrefix = Opts.pathPrefixFilter.toOption)

        val ds = PathWithKeyFields(Some(datasetFilter))(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 45
Source File: DumpPathWithUuidOnly.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.PathWithUuidOnly
import cmwell.analytics.util.CmwellConnector
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object DumpPathWithUuidOnly {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpPathWithUuidOnly.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump path table - uuid only",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds = PathWithUuidOnly()(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 46
Source File: FindDuplicatedSystemFields.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.InfotonWithDuplicatedSystemFields
import cmwell.analytics.util.CmwellConnector
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object FindDuplicatedSystemFields {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(FindDuplicatedSystemFields.getClass)

    try {

      object Opts extends ScallopConf(args) {

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Find infotons with duplicated system fields"
      ).withSparkSessionDo { spark =>

        import spark.implicits._

        InfotonWithDuplicatedSystemFields()(spark)
          .toDF
          .write.csv(Opts.out())
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 47
Source File: DumpIndexWithUuidOnly.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.IndexWithUuidsOnly
import cmwell.analytics.util.CmwellConnector
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}


object DumpIndexWithUuidOnly {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpIndexWithUuidOnly.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only include current", required = false, default = Some(true))
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump UUIDs from Elasticsearch indexes",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds = IndexWithUuidsOnly(currentOnly = Opts.currentOnly())(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
}