org.rogach.scallop.ScallopConf Scala Examples

The following examples show how to use org.rogach.scallop.ScallopConf. You can go to the original project or source file by following the link above each example.
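Before diving into the full examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the pattern they all share: declare options and trailing arguments as vals inside a ScallopConf subclass, call verify(), then read the parsed values by applying each option.

import org.rogach.scallop.{ScallopConf, ScallopOption}

// Minimal sketch of the common ScallopConf pattern: declare options, verify(), read values.
class ExampleConf(arguments: Seq[String]) extends ScallopConf(arguments) {
  val verbose: ScallopOption[Boolean] = opt[Boolean]("verbose", descr = "Enable verbose output")
  val count: ScallopOption[Int] = opt[Int]("count", descr = "How many times to run", default = Some(1))
  val input: ScallopOption[String] = trailArg[String]("input", descr = "Input file", required = true)
  verify()
}

object ExampleConfDemo extends App {
  val conf = new ExampleConf(Seq("--verbose", "--count", "3", "data.txt"))
  println(conf.verbose())  // true
  println(conf.count())    // 3
  println(conf.input())    // data.txt
}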
Example 1
Source File: FeatureCounts.scala    From bdg-sequila   with Apache License 2.0
package org.biodatageeks.sequila.apps

import htsjdk.samtools.ValidationStringency
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.SparkSession
import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim
import org.biodatageeks.sequila.utils.Columns
import org.rogach.scallop.ScallopConf
import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable}
import org.seqdoop.hadoop_bam.util.SAMHeaderReader

object FeatureCounts {
  case class Region(contig:String, pos_start:Int, pos_end:Int)
  class RunConf(args:Array[String]) extends ScallopConf(args){

    val output = opt[String](required = true)
    val annotations = opt[String](required = true)
    val readsFile = trailArg[String](required = true)
    val format = trailArg[String](required = false)
    verify()
  }

  def main(args: Array[String]): Unit = {
    val runConf = new RunConf(args)
    val spark = SparkSession
      .builder()
      .appName("SeQuiLa-FC")
      .getOrCreate()

    spark.sqlContext.setConf("spark.biodatageeks.rangejoin.useJoinOrder","true")
    //spark.sqlContext.setConf("spark.biodatageeks.rangejoin.maxBroadcastSize", (1024).toString)
    spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim(spark) :: Nil



    val query =
      """SELECT targets.GeneId AS GeneId,
        |       targets.Chr AS Chr,
        |       targets.Start AS Start,
        |       targets.End AS End,
        |       targets.Strand AS Strand,
        |       CAST(targets.End AS INTEGER) - CAST(targets.Start AS INTEGER) + 1 AS Length,
        |       count(*) AS Counts
        |FROM reads JOIN targets
        |ON (
        |  targets.Chr = reads.contigName
        |  AND
        |  reads.end >= CAST(targets.Start AS INTEGER)
        |  AND
        |  reads.start <= CAST(targets.End AS INTEGER)
        |)
        |GROUP BY targets.GeneId, targets.Chr, targets.Start, targets.End, targets.Strand""".stripMargin
      spark
        .sparkContext
        .setLogLevel("ERROR")

      spark
        .sparkContext
        .hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString)

      val alignments = spark
        .sparkContext.newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat](runConf.readsFile())
        .map(_._2.get)
        .map(r => Region(r.getContig, r.getStart, r.getEnd))

      val readsTable = spark.sqlContext.createDataFrame(alignments)
      readsTable.createOrReplaceTempView("reads")

      val targets = spark
        .read
        .option("header", "true")
        .option("delimiter", "\t")
        .csv(runConf.annotations())
      targets
        .withColumnRenamed("contigName", Columns.CONTIG)
        .createOrReplaceTempView("targets")

      spark.sql(query)
        .orderBy("GeneId")
        .coalesce(1)
        .write
        .option("header", "true")
        .option("delimiter", "\t")
        .csv(runConf.output())
  }

} 
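Scallop derives the command-line names from the val names in RunConf, so the program above is driven with --output and --annotations options plus a trailing reads file. A small sketch (hypothetical paths, not part of the project) of constructing RunConf directly:

import org.biodatageeks.sequila.apps.FeatureCounts

// Hypothetical invocation of the RunConf defined above; the paths are placeholders.
object RunConfDemo extends App {
  val demoConf = new FeatureCounts.RunConf(Array(
    "--output", "/tmp/feature_counts",
    "--annotations", "/data/targets.tsv",
    "/data/reads.bam"            // trailing argument: readsFile
  ))
  println(demoConf.output())     // /tmp/feature_counts
  println(demoConf.readsFile())  // /data/reads.bam
}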
Example 2
Source File: DumpInfotonWithKeyFields.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import cmwell.analytics.data.InfotonWithKeyFields
import cmwell.analytics.util.{CmwellConnector, DatasetFilter}
import cmwell.analytics.util.DatasetFilter._
import cmwell.analytics.util.TimestampConversion.timestampConverter
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}


object DumpInfotonWithKeyFields {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpInfotonWithKeyFields.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to ", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(())
          case _ => Left("Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump infoton table - uuid, lastModified, path",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val datasetFilter = DatasetFilter(
          lastModifiedGte = Opts.lastModifiedGteFilter.toOption,
          pathPrefix = Opts.pathPrefixFilter.toOption)

        val ds = InfotonWithKeyFields(Some(datasetFilter))(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
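Several of these CM-Well tools parse java.sql.Timestamp options through a shared timestampConverter. That converter's implementation is not shown in this listing; the following is a hedged sketch of how such a converter could be built with Scallop's singleArgConverter (the real cmwell.analytics.util version may accept more timestamp formats):

import java.sql.Timestamp
import java.time.Instant
import org.rogach.scallop.{ScallopConf, ScallopOption, singleArgConverter}

// Assumed stand-in for TimestampConversion.timestampConverter, built from Instant.parse.
object TimestampConversionSketch {
  val timestampConverter =
    singleArgConverter[Timestamp](s => Timestamp.from(Instant.parse(s)))
}

class FilterOpts(args: Seq[String]) extends ScallopConf(args) {
  val lastModifiedGteFilter: ScallopOption[Timestamp] =
    opt[Timestamp]("lastmodified-gte-filter", default = None)(TimestampConversionSketch.timestampConverter)
  verify()
}
// new FilterOpts(Seq("--lastmodified-gte-filter", "2019-01-01T00:00:00Z")).lastModifiedGteFilter()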
Example 3
Source File: CheckInfotonDataIntegrity.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import cmwell.analytics.data.InfotonDataIntegrity
import cmwell.analytics.util.{CmwellConnector, DatasetFilter}
import cmwell.analytics.util.TimestampConversion.timestampConverter
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object CheckInfotonDataIntegrity {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CheckInfotonDataIntegrity.getClass)

    try {

      object Opts extends ScallopConf(args) {

        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Check infoton data integrity"
      ).withSparkSessionDo { spark =>

        val datasetFilter = DatasetFilter(
          lastModifiedGte = Opts.lastModifiedGteFilter.toOption,
          pathPrefix = Opts.pathPrefixFilter.toOption)

        val ds = InfotonDataIntegrity(Some(datasetFilter))(spark)

        val damagedInfotons = ds.filter(infoton =>
          infoton.hasIncorrectUuid ||
            infoton.hasDuplicatedSystemFields ||
            infoton.hasInvalidContent ||
            infoton.hasMissingOrIllFormedSystemFields
        )

        damagedInfotons.select("uuid", "lastModified", "path",
          "hasIncorrectUuid", "hasMissingOrIllFormedSystemFields", "hasDuplicatedSystemFields", "hasInvalidContent", "hasUnknownSystemField")
          .write.csv(Opts.out())
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 4
Source File: ExtractFromParquet.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.data

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import cmwell.analytics.util.Connector
import cmwell.analytics.util.StringUtil._
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.rogach.scallop.{ScallopConf, ScallopOption}

object ExtractFromParquet {

  def main(args: Array[String]): Unit = {

    object Opts extends ScallopConf(args) {

      val pathsToFind: ScallopOption[String] = opt[String]("paths-to-find", short = 'f', descr = "A file containing the list of paths to look for", required = true)
      val parquetData: ScallopOption[String] = opt[String]("parquet-file", short = 'p', descr = "A Parquet file containing the data; single string column rdfStatement", required = true)
      val extractedData: ScallopOption[String] = opt[String]("extracted-data", short = 'd', descr = "The file that extracted data will be written to (in nquads format)", required = true)
      val pathsNotFound: ScallopOption[String] = opt[String]("paths-not-found", short = 'n', descr = "The output file that any paths that were not found are written to", required = true)
      val pathsFound: ScallopOption[String] = opt[String]("paths-found", short = 'a', descr = "The output file containing the paths that we found are written to", required = true)

      verify()
    }

    Connector(sparkShell = true, appName = "Extract from parquet").withSparkSessionDo {
      spark: SparkSession =>

        val pathsToFind = Set(splitLines(FileUtils.readFileToString(new File(Opts.pathsToFind()), UTF_8)): _*)

        val ds: DataFrame = spark.read.parquet(Opts.parquetData())

        // Cheesy parsing of path from an RDF nquad, but sufficient for this purpose
        def extractPath(rdfStatement: String): String = rdfStatement.substring(7, rdfStatement.indexOf(">"))

        val statementsFound = ds.rdd.filter { row: Row =>

          val statement = row.getAs[String]("rdfStatement")
          val path = extractPath(statement)

          pathsToFind.contains(path)
        }.collect() // expect the result to be small, so collect is OK

        // Save all the paths that were not found to file - look for them in other files.
        val pathsFound: Set[String] = Set(statementsFound.map(row => extractPath(row.getString(0))): _*)
        println(s"There were ${pathsFound.size} paths found (out of ${pathsToFind.size}).")
        FileUtils.writeStringToFile(new File(Opts.pathsFound()), pathsFound.mkString("\n"), UTF_8, false)

        val pathsNotFound = pathsToFind.diff(pathsFound)
        println(s"There were ${pathsNotFound.size} paths not found.")
        FileUtils.writeStringToFile(new File(Opts.pathsNotFound()), pathsNotFound.mkString("\n"), UTF_8, false)

        // Save the RDF statements for the paths that were found
        val x = statementsFound.map(row => row.getString(0)).mkString("\n")
        FileUtils.writeStringToFile(new File(Opts.extractedData()), x, UTF_8, false)
    }
  }
} 
Example 5
Source File: DumpCompleteDocumentFromEs.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpCompleteDocumentFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpCompleteDocumentFromEs.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("dump-complete-document-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-filter", short = 'c', descr = "Filter on current status", default = None)
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithCompleteDocument
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 6
Source File: CopyIndex.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object CopyIndex {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CopyIndex.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("copy-index")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from", required = true)
        val writeIndex: ScallopOption[String] = opt[String]("write-index", short = 'w', descr = "The name of the index to write to", required = true)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      val dataWriterFactory = DataWriterFactory.index[IndexWithCompleteDocument](
        indexName = Opts.writeIndex(),
        esEndpoint = esContactPoint)

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithCompleteDocument,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 7
Source File: DumpKeyFieldsFromEs.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithKeyFields}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpKeyFieldsFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpKeyFieldsFromEs.getClass)

    implicit val system: ActorSystem = ActorSystem("dump-key-fields-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
      // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
      // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
      val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)

        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default = Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithKeyFields
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
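The source-filter option above uses Scallop's toggle, which generates a complementary pair of flags: --source-filter to enable and, because of prefix = "no-", --no-source-filter to disable. A minimal sketch (hypothetical class name) of that declaration in isolation:

import org.rogach.scallop.{ScallopConf, ScallopOption}

// Hedged sketch of a toggle with a "no-" prefix, defaulting to true.
class ToggleConf(args: Seq[String]) extends ScallopConf(args) {
  val sourceFilter: ScallopOption[Boolean] =
    toggle("source-filter", noshort = true, default = Some(true), prefix = "no-",
      descrYes = "Use source filtering to reduce network traffic",
      descrNo = "Do not filter _source fields")
  verify()
}
// new ToggleConf(Nil).sourceFilter()                       // true (default)
// new ToggleConf(Seq("--no-source-filter")).sourceFilter() // false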
Example 8
Source File: DumpUuidOnlyFromEs.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithUuidOnly}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpUuidOnlyFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpUuidOnlyFromEs.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("dump-uuid-only-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default=Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithUuidOnly
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 9
Source File: DumpSystemFieldsFromEs.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithSystemFields}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.TimestampConversion.timestampConverter
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object DumpSystemFieldsFromEs {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpSystemFieldsFromEs.getClass)

    implicit val system: ActorSystem = ActorSystem("dump-system-fields-from-es")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {
      // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
      // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
      // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
      val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from (default: cm_well_all)", required = false)
        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only download current uuids")
        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The data format: either 'parquet' or 'csv'", default = Some("parquet"))
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val sourceFilter: ScallopOption[Boolean] = toggle("source-filter", noshort = true, default=Some(true), prefix = "no-",
          descrNo = "Do not filter _source fields (workaround for bad index)", descrYes = "Use source filtering to reduce network traffic")

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      // Calling script should clear output directory as necessary.

      val objectExtractor = IndexWithSystemFields
      val dataWriterFactory = DataWriterFactory.file(format = Opts.format(), objectExtractor, outDirectory = Opts.out())

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),

        currentOnly = Opts.currentOnly(),
        lastModifiedGteFilter = Opts.lastModifiedGteFilter.toOption,
        pathPrefixFilter = Opts.pathPrefixFilter.toOption,

        objectExtractor = objectExtractor,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = Opts.sourceFilter())

      // The Hadoop convention is to touch the (empty) _SUCCESS file to signal successful completion.
      FileUtils.touch(Paths.get(Opts.out(), "_SUCCESS").toFile)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 10
Source File: CopyIndexesWithMapping.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{DataWriterFactory, IndexWithCompleteDocument}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.collection.JavaConverters._
import scala.concurrent.ExecutionContextExecutor

object CopyIndexesWithMapping {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CopyIndexesWithMapping.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("copy-index-with-mapping")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {

      object Opts extends ScallopConf(args) {

        val indexMap: ScallopOption[String] = opt[String]("index-map", short = 'i', descr = "A map from source to target index names, in JSON format", required = true)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())

      // Expect a map in the form: { "sourceIndex1": "targetIndex1", "sourceIndex2": "targetIndex2", ... }
      val indexMap: Map[String, String] = new ObjectMapper().readTree(Opts.indexMap()).fields.asScala.map { entry =>
        entry.getKey -> entry.getValue.asText
      }.toMap

      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexMap.keys.toSeq)

      // Validate that the index-map parameter specified valid index names, and not aliases.
      for (indexName <- indexMap.keys)
        if (!esTopology.allIndexNames.contains(indexName))
          throw new RuntimeException(s"index-map parameter included $indexName as a source, which is not a valid index name.")

      for (indexName <- indexMap.values)
        if (!esTopology.allIndexNames.contains(indexName))
          throw new RuntimeException(s"index-map parameter included $indexName as a target, which is not a valid index name.")

      val dataWriterFactory = DataWriterFactory.index[IndexWithCompleteDocument](
        indexMap = indexMap,
        esEndpoint = esContactPoint)

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithCompleteDocument,
        dataWriterFactory = dataWriterFactory,
        sourceFilter = false)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 11
Source File: CalculateXORSummary.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.data.{IndexWithSourceHash, XORSummary, XORSummaryFactory}
import cmwell.analytics.downloader.PartitionedDownloader
import cmwell.analytics.util.{DiscoverEsTopology, FindContactPoints}
import org.apache.commons.codec.binary.Hex
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.concurrent.ExecutionContextExecutor

object CalculateXORSummary {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(CalculateXORSummary.getClass)

    // Since we expect this to be run on a CM-Well node, the default parallelism is to use half the processors
    // so as to avoid starving the CM-Well node of processor resources. A higher level of parallelism might
    // be possible (without interfering with CM-Well) since most of the work will actually be on the ES side.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    implicit val system: ActorSystem = ActorSystem("xor-summary")
    implicit val executionContext: ExecutionContextExecutor = system.dispatcher
    implicit val actorMaterializer: ActorMaterializer = ActorMaterializer()

    try {

      object Opts extends ScallopConf(args) {

        val readIndex: ScallopOption[String] = opt[String]("read-index", short = 'i', descr = "The name of the index to read from", required = false)

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      val esContactPoint = FindContactPoints.es(Opts.url())
      val indexesOrAliasesToRead = Opts.readIndex.toOption.fold(Seq("cm_well_all"))(Seq(_))
      val esTopology = DiscoverEsTopology(esContactPoint = esContactPoint, aliases = indexesOrAliasesToRead)

      val dataWriterFactory = new XORSummaryFactory()

      PartitionedDownloader.runDownload(
        esTopology = esTopology,
        parallelism = Opts.parallelism(),
        objectExtractor = IndexWithSourceHash,
        dataWriterFactory = dataWriterFactory.apply,
        sourceFilter = false)

      // Summarize the summaries down to the index level.
      val summaryByIndex: Map[String, XORSummary] = dataWriterFactory.shardSummaries
        .groupBy { case (shard, _) => shard.indexName }
        .map { case (indexName, summaryMap) => indexName -> summaryMap.values.reduce(XORSummary.combine) }

      // TODO: Fix questionable JSON generation
      val r = "{" +
        summaryByIndex.map { case (index, summary) =>
          val x = Hex.encodeHexString(summary.summary)
          s""" { "index": "$index", "summary": "$x" } """
        }.mkString("\n") + "}"

      println(r)
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
    finally {
      system.terminate()
    }
  }
} 
Example 12
Source File: TestResults.scala    From mimir   with Apache License 2.0
package mimir.util

import java.nio.file.Files
import java.nio.file.Paths
import java.nio.charset.Charset
import java.nio.charset.StandardCharsets
import java.io.ByteArrayOutputStream
import java.io.PrintWriter
import org.rogach.scallop.ScallopConf

object TestResults {
  def main(args: Array[String]): Unit = {
    val config = new TestResultConfig(args)
    println("running tests....")
    parseTestResults(config.sbtPath(), config.sbtCmd())
  }
  
  def parseTestResults(sbtPath:String = "/opt/local/bin/sbt", sbtCmd:String = "test") = {
    val procOutput = runCommand(Seq(sbtPath,sbtCmd))._2.replaceAll("""\x1b\[[0-9;]*[a-zA-Z]""", "")
    
    val pattern = """(?m)^.*\[info\] Total.*$|^.*\[info\] Finished.*$|^.*\[info\] [\d]+ examp.*$""".r
    
    val header = "test_name,seconds,examples,expectations,failures,errors,skipped\n"
    
    val pattern2 = """\[info\] Total for specification (\w+)\s+\[info\] Finished in (.+)\R\[info\] (.+)\R""".r
    val pattern3 = """([a-zA-Z]+): (?:(\d+) minutes? )?(?:(\d+) seconds?[,:] )?(?:(\d+) ms[,:] )?(\d+) examples?, (?:(\d+) expectations?, )?(\d+) failures?, (\d+) errors?(?:, (\d+) skipped)?""".r
    val string = pattern2.findAllMatchIn(procOutput).map(mat => s"${mat.group(1)}: ${mat.group(2)}: ${mat.group(3)}")
      .map(nline => nline match {
        case pattern3(test_name,minutes,seconds,ms,examples,expectations,failures,errors,skipped) => {
          val allseconds = (minutes match {
            case "" => 0
            case null => 0
            case x => x.toInt*60
          }) + (seconds match {
            case "" => 0
            case null => 0
            case x => x.toInt
          }) +  (ms match {
            case "" => 0.0
            case null => 0.0
            case x => x.toDouble/1000.0
          })
          s"$test_name,$allseconds,$examples,$expectations,$failures,$errors,$skipped"
        }
      }).mkString("\n")
    
    val outStr = header + string
      
    println(outStr)
    Files.write(Paths.get("test_output.csv"), outStr.getBytes(StandardCharsets.UTF_8))
  }
  
  import sys.process._
  def runCommand(cmd: Seq[String]): (Int, String, String) = {
    val stdoutStream = new ByteArrayOutputStream
    val stderrStream = new ByteArrayOutputStream
    val stdoutWriter = new PrintWriter(stdoutStream)
    val stderrWriter = new PrintWriter(stderrStream)
    val exitValue = cmd.!(ProcessLogger(stdoutWriter.println, stderrWriter.println))
    stdoutWriter.close()
    stderrWriter.close()
    (exitValue, stdoutStream.toString, stderrStream.toString)
  }
  
  
}

class TestResultConfig(arguments: Seq[String]) extends ScallopConf(arguments) {
  val experimental = opt[List[String]]("X", default = Some(List[String]()))
  val sparkHost = opt[String]("sparkHost", descr = "The IP or hostname of the spark master",
    default = Some("spark-master.local"))
  val sparkPort = opt[String]("sparkPort", descr = "The port of the spark master",
    default = Some("7077"))
  val sbtPath = opt[String]("sbtPath", descr = "The path to the sbt binary",
    default = Some("/opt/local/bin/sbt"))
  val sbtCmd = opt[String]("sbtCmd", descr = "The sbt command to run",
    default = Some("test"))
  verify()
}
Example 13
Source File: AnalyzeInconsistenciesResult.scala    From CM-Well   with Apache License 2.0
package cmwell.analytics.main

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import cmwell.analytics.data.InfotonAndIndexWithSystemFields
import cmwell.analytics.util.Connector
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.collection.breakOut

object AnalyzeInconsistenciesResult {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(AnalyzeInconsistenciesResult.getClass)

    try {

      object Opts extends ScallopConf(args) {

        val in: ScallopOption[String] = opt[String]("in", short = 'i', descr = "The path to read the (parquet) inconsistencies dataset from", required = true)
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the (csv) output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))

        verify()
      }

      Connector(
        appName = "Analyze InfotonAndIndexWithSystemFields Output",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds: DataFrame = spark.read.parquet(Opts.in())

        import org.apache.spark.sql.functions._

        // A column expression that counts the number of failures for each constraint.
        // This will also include null counts, needed to interpret the results.
        val constraints: Seq[(String, Column)] = InfotonAndIndexWithSystemFields.constraints(ds).map { case (name, predicate) =>
          name -> sum(when(predicate, 0L).otherwise(1L)).as(name)
        }(breakOut)

        // Compute the failure counts
        val failureCounts: Row = ds.agg(constraints.head._2, constraints.tail.map(_._2): _*).head

        val results = for {
          i <- constraints.indices
          constraintName = constraints(i)._1
          failureCount = if (failureCounts.isNullAt(i)) 0 else failureCounts.getAs[Long](i)
        } yield s"$constraintName,$failureCount"

        FileUtils.write(new File(Opts.out()), "constraint,failures\n" + results.mkString("\n"), UTF_8)
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 14
Source File: Options.scala    From ai.vitk.ner   with GNU General Public License v3.0
package ai.vitk.ner

import org.rogach.scallop.ScallopConf


class Options(arguments: Seq[String]) extends ScallopConf(arguments) {
  val master = opt[String](default = Some("local[*]"), descr = "the Spark master URL")
  val memory = opt[String](default = Some("8g"), descr = "executor memory")
  val mode = opt[String](default = Some("tag"), descr = "mode of the tagger, either 'train', 'tag' or 'eval'")
  val verbose = opt[Boolean](default = Some(false), descr = "verbose mode")
  val language = opt[String](default = Some("vi"), descr = "natural language in use, either 'vi', 'en' or 'ja'")
  val dimension = opt[Int](default = Some(32768), descr = "domain dimension for feature hashing")
  val iteration = opt[Int](default = Some(600), descr = "max number of iterations in training")
  val independent = opt[Boolean](default = Some(false), descr = "use only independent features")
  val reversed = opt[Boolean](default = Some(false), descr = "backward model")
  val input = opt[String](default = Some("test.txt"), descr = "input file for tagging")
  verify()
} 
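A small usage sketch (hypothetical, not part of the project) showing how the defaults above fill in whatever is not supplied on the command line:

import ai.vitk.ner.Options

// Hypothetical usage of the Options class above.
object OptionsDemo extends App {
  val opts = new Options(Seq("--mode", "train", "--language", "en"))
  println(opts.mode())       // train
  println(opts.language())   // en
  println(opts.master())     // local[*]  (default)
  println(opts.dimension())  // 32768     (default)
}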
Example 15
Source File: Run.scala    From flamy   with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.run.GraphRunner
import com.flaminem.flamy.exec.utils.{ReturnFailure, ReturnStatus}
import com.flaminem.flamy.model.ItemArgs
import com.flaminem.flamy.model.names.ItemName
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


class Run extends Subcommand("run") with FlamySubcommand{

  banner("Perform a run on the specified environment")

  private val environment: ScallopOption[Environment] =
    opt(name="on", default=None, descr="Specifies environment to run on.", required=false, noshort=true)

  private val dryRun: ScallopOption[Boolean] =
    opt(name="dry", default=Some(false), descr="Perform a dry-run", noshort=true)

  validateOpt(environment, dryRun) {
    case (None,Some(false)) => Left("Please specify an environment to run on (with the --on option), or use the --dry option to perform a local dry-run")
    case _ => Right(())
  }

  private val from: ScallopOption[List[ItemName]] =
    opt[List[ItemName]](name="from", default=Some(Nil), descr="start from the given schemas/tables.", noshort=true, argName = "items")

  private val to: ScallopOption[List[ItemName]] =
    opt[List[ItemName]](name="to", default=Some(Nil), descr="stop at the given schemas/tables.", noshort=true, argName = "items")
  codependent(from,to)

  private val items: ScallopOption[List[ItemName]] =
    trailArg[List[ItemName]](default=Some(Nil),required=false)

  lazy val itemArgs = ItemArgs(items(), from(), to())

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    val context = new FlamyContext(globalOptions, this.environment.get)
    context.dryRun = this.dryRun()
    if (itemArgs.isEmpty) {
      System.err.println("Please specify items to run on")
      ReturnFailure
    }
    else {
      val graphRunner = GraphRunner(itemArgs, context)
      graphRunner.run()
    }
  }

} 
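The codependent(from, to) call above makes --from and --to an all-or-nothing pair. A minimal sketch (hypothetical class, simplified to plain String lists) of that validation on its own:

import org.rogach.scallop.ScallopConf

// Hedged sketch: codependent requires that either both options or neither are supplied.
class RangeConf(args: Seq[String]) extends ScallopConf(args) {
  val from = opt[List[String]]("from", default = Some(Nil), noshort = true)
  val to = opt[List[String]]("to", default = Some(Nil), noshort = true)
  codependent(from, to)
  verify()
}
// new RangeConf(Seq("--from", "db.a", "--to", "db.b"))  // passes validation
// new RangeConf(Seq("--from", "db.a"))                  // fails: --to must also be given
// new RangeConf(Nil)                                    // passes: neither option was supplied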
Example 16
Source File: Check.scala    From flamy   with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.spark.ModelSparkContext
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.files.{FileRunner, ItemFileAction}
import com.flaminem.flamy.exec.hive.{HivePartitionFetcher, ModelHivePartitionFetcher}
import com.flaminem.flamy.exec.utils._
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.graph.TableGraph
import com.flaminem.flamy.model._
import com.flaminem.flamy.model.core.Model
import com.flaminem.flamy.model.files.FilePath
import com.flaminem.flamy.model.names.ItemName
import org.apache.spark.sql.SQLContext
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


      val runGraph: TableGraph = baseGraph.subGraph(items())

      val dryRunner: FlamyRunner = FlamyRunner(context)
      println("Creating schemas and tables ...")
      try {
        dryRunner.checkAll(baseGraph)
      }
      finally{
        //TODO: For some strange reason, closing the connection here will result in ClassNotFoundErrors for udfs in the RunActions...
        //      dryRunner.close()
      }
      FlamyOutput.out.info("Running Populates ...")
      dryRunner.populateAll(runGraph.model, context)
      dryRunner.close()
      ReturnStatus(success = dryRunner.getStats.getFailCount==0)
    }

  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case  (command: FlamySubcommand)::Nil => command.doCommand(globalOptions, Nil)
      case Nil => throw new IllegalArgumentException("A subcommand is expected")
      case _ =>
        printHelp()
        ReturnFailure
    }
  }


} 
Example 17
Source File: Drop.scala    From flamy   with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.actions.{DropSchemaAction, DropTableAction}
import com.flaminem.flamy.exec.hive.HiveTableFetcher
import com.flaminem.flamy.exec.utils.{Action, _}
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.model.{ItemFilter, TableInfo}
import com.flaminem.flamy.model.exceptions.FlamyException
import com.flaminem.flamy.model.names.{ItemName, SchemaName, TableName}
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

class Drop extends Subcommand("drop") with FlamySubcommand {

  val schemas: Subcommand = new Subcommand("schemas") with FlamySubcommand {
    banner("Drop the specified schemas on the specified environment")
    val environment: ScallopOption[Environment] =
      opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
    val dryRun: ScallopOption[Boolean] =
      opt(name = "dry", default = Some(false), descr = "Perform a dry-run", required = false, noshort = true)
    val all: ScallopOption[Boolean] =
      opt(
        name = "all",
        default = Some(false),
        descr = "Unlike other commands, not providing any schema name will not do anything. Unless you use this option.",
        noshort = true
      )
    val items: ScallopOption[List[String]] =
      trailArg[List[String]](default = Some(List()), required = false)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      if(all() && items().nonEmpty) {
        throw new FlamyException("Using the --all option will drop all schemas, and no schema name should be specified.")
      }
      val context = new FlamyContext(globalOptions, environment.get)
      context.dryRun = dryRun()
      val itemFilter = ItemFilter(items(), acceptIfEmpty = all())
      val fetcher = HiveTableFetcher(context)
      val schemaNames: Iterable[SchemaName] = fetcher.listSchemaNames.filter{itemFilter}.filterNot{_.fullName == "default"}

      val flamyRunner: FlamyRunner = FlamyRunner(context)
      val actionRunner = new ActionRunner(silentOnSuccess = false, silentOnFailure = false)
      val dropActions = schemaNames.map{schemaName => new DropSchemaAction(schemaName, flamyRunner)}
      actionRunner.run(dropActions)

      ReturnStatus(success = actionRunner.getStats.getFailCount == 0)
    }
  }

  val tables: Subcommand = new Subcommand("tables") with FlamySubcommand {
    banner("Drop the specified tables on the specified environment")
    val environment: ScallopOption[Environment] =
      opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
    val dryRun: ScallopOption[Boolean] =
      opt(name = "dry", default = Some(false), descr = "Perform a dry-run", required = false, noshort = true)
    val all: ScallopOption[Boolean] =
      opt(
        name = "all",
        default = Some(false),
        descr = "Unlike other commands, not providing any table name will not do anything. Unless you use this option.",
        noshort = true
      )
    val items: ScallopOption[List[ItemName]] =
      trailArg[List[ItemName]](default = Some(List()), required = false)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      if(all() && items().nonEmpty) {
        throw new FlamyException("Using the --all option will drop all tables, and no table name should be specified.")
      }
      if(!all() && items().isEmpty) {
        throw new FlamyException("If you really want to drop all the tables, you should add the --all option.")
      }
      val context = new FlamyContext(globalOptions, environment.get)
      context.dryRun = dryRun()
      val itemFilter = ItemFilter(items(), acceptIfEmpty = all())
      val fetcher = HiveTableFetcher(context)
      val tables: Iterable[TableInfo] = fetcher.listTables(itemFilter)

      val flamyRunner: FlamyRunner = FlamyRunner(context)
      val actionRunner = new ActionRunner(silentOnSuccess = false, silentOnFailure = false)
      val dropActions = tables.map{table => new DropTableAction(table, flamyRunner)}
      actionRunner.run(dropActions)

      ReturnStatus(success = actionRunner.getStats.getFailCount == 0)
    }
  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case (command: FlamySubcommand) :: Nil =>
        command.doCommand(globalOptions, Nil)
      case _ => printHelp()
    }
    ReturnSuccess
  }
} 
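The flamy commands above are Subcommand instances; the top-level parser that registers them is not part of this listing. A hedged sketch (hypothetical names, not the actual flamy launcher) of how nested subcommands are wired into a parent ScallopConf:

import org.rogach.scallop.{ScallopConf, Subcommand}

// Assumed wiring: register nested subcommands with addSubcommand, then verify().
class CliSketch(args: Seq[String]) extends ScallopConf(args) {
  val drop = new Subcommand("drop") {
    val tables = new Subcommand("tables") {
      val all = opt[Boolean]("all", noshort = true)
      val items = trailArg[List[String]](default = Some(Nil), required = false)
    }
    addSubcommand(tables)
  }
  addSubcommand(drop)
  verify()
}
// new CliSketch(Seq("drop", "tables", "--all")).subcommands
//   returns the matched chain of subcommand configurations.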
Example 18
Source File: GatherInfo.scala    From flamy   with Apache License 2.0
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.hive.HivePartitionFetcher
import com.flaminem.flamy.exec.utils.{ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.model.ItemFilter
import com.flaminem.flamy.model.names.ItemName
import com.flaminem.flamy.utils.AutoClose
import com.flaminem.flamy.utils.time.TimeUtils
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


class GatherInfo extends Subcommand("gather-info") with FlamySubcommand {

  banner("Gather all partitioning information on specified items (everything if no argument is given) and output this as csv on stdout.")

  val environment: ScallopOption[Environment] =
    opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)

  val items: ScallopOption[List[ItemName]] =
    trailArg[List[ItemName]](default = Some(List()), required = false)

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    val context = new FlamyContext(globalOptions, this.environment.get)
    for{
      fetcher: HivePartitionFetcher <- AutoClose(HivePartitionFetcher(context))
    } {
      val itemFilter = new ItemFilter(this.items(), true)
      for {
        tpInfo <- fetcher.listTableNames.filter{itemFilter}.map{fetcher.getTablePartitioningInfo}
        partition <- tpInfo.sortedTablePartitions
      } {
        println(
          Seq(
            tpInfo.tableName.schemaName,
            tpInfo.tableName.name,
            partition.partitionName,
            partition.getFileSize.getOrElse("\\N"),
            partition.getModificationTime(context, refresh = false).map {
              TimeUtils.timestampToUniversalTime
            }.getOrElse("\\N")
          ).mkString("\t")
        )
      }
    }
    ReturnSuccess
  }

} 
Example 19
Source File: Export.scala    From flamy   with Apache License 2.0
package com.flaminem.flamy.commands.tools

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyContextFormatter, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.utils.{ReturnFailure, ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.graph.TableGraph
import com.flaminem.flamy.model.core.Model
import com.flaminem.flamy.model.exceptions.UnexpectedBehaviorException
import com.flaminem.flamy.model.names.ItemName
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}


class Export extends Subcommand("export") with FlamySubcommand {

  val conf = new Subcommand("conf") with FlamySubcommand {
    banner("Automatically generate a configuration template or doc")
    private lazy val template: ScallopOption[Boolean] = toggle(name = "template", default = Some(false), noshort = true)
    private lazy val markdown: ScallopOption[Boolean] = toggle(name = "markdown", default = Some(false), noshort = true)
    private lazy val rst: ScallopOption[Boolean] = toggle(name = "rst", default = Some(false), noshort = true)

    requireOne(template, markdown, rst)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      val context = new FlamyContext(globalOptions, Some(Environment("<ENV>")))
      if(template()) {
        FlamyOutput.out.println(new FlamyContextFormatter(context).toTemplate)
      }
      else if(markdown()) {
        FlamyOutput.out.println(new FlamyContextFormatter(context).toMarkdown)
      }
      else if(rst()) {
        FlamyOutput.out.println(new FlamyContextFormatter(context).toRST)
      }
      else {
        throw new UnexpectedBehaviorException("Either the --template, --markdown or --rst option should be used")
      }
      ReturnSuccess
    }
  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case  (command: FlamySubcommand)::Nil => command.doCommand(globalOptions, Nil)
      case Nil => throw new IllegalArgumentException("A subcommand is expected")
      case _ =>
        printHelp()
        ReturnFailure
    }
  }

} 
Example 20
Source File: Repair.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.FlamyRunner
import com.flaminem.flamy.exec.hive.HiveTableFetcher
import com.flaminem.flamy.exec.utils.{Action, ActionRunner, ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.model.ItemFilter
import com.flaminem.flamy.model.names.{ItemName, TableName}
import com.flaminem.flamy.utils.AutoClose
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


class Repair extends Subcommand("repair") with FlamySubcommand{

  val tables = new Subcommand("tables") {
    banner("Execute a msck repair table on every specified table. " +
      "This will automatically add to the metastore the partitions that exists on hdfs but not yet in the metastore.")
    val environment: ScallopOption[Environment] =
      opt(name="on", descr="Specifies environment to run on", required = true, noshort=true)
    val dryRun: ScallopOption[Boolean] =
      opt(name="dry", default=Some(false), descr="Perform a dry-run", required = false, noshort=true)
    val items: ScallopOption[List[ItemName]] =
      trailArg[List[ItemName]](default=Some(List()),required = false)
  }

  private class RepairTableAction(runner: FlamyRunner, tableName: TableName) extends Action{

    @throws(classOf[Exception])
    override def run(): Unit = {
      runner.runText(f"use ${tableName.schemaName} ; MSCK REPAIR TABLE ${tableName.name}")
    }

    override val name: String = tableName.fullName
    override val logPath: String = f"${tableName.schemaName}.db/${tableName.name}/REPAIR.hql"
  }

  private def repairTables(context: FlamyContext, items: ItemName*): Unit = {
    val itemFilter = new ItemFilter(items, acceptIfEmpty = true)
    val fetcher = HiveTableFetcher(context)
    val tables: Iterable[TableName] = fetcher.listTables(itemFilter).filterNot{_.isView}.filter{_.isPartitioned}.map{_.tableName}

    val actionRunner: ActionRunner = new ActionRunner(silentOnSuccess = false)
    for {
      flamyRunner: FlamyRunner <- AutoClose(FlamyRunner(context))
    } {
      val actions = tables.map{tableName => new RepairTableAction(flamyRunner, tableName)}
      actionRunner.run(actions)
    }
  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case (command @ this.tables) :: Nil =>
        val context = new FlamyContext(globalOptions, command.environment.get)
        context.dryRun = command.dryRun()
        repairTables(context, command.items():_*)
      case _ => printHelp()
    }
    ReturnSuccess
  }


} 
Example 21
Source File: Count.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.hive.{HivePartitionFetcher, HiveTableFetcher, RemoteHiveRunner}
import com.flaminem.flamy.exec.utils.io.FlamyOutput
import com.flaminem.flamy.exec.utils.{ReturnFailure, ReturnStatus, ReturnSuccess}
import com.flaminem.flamy.model.ItemFilter
import com.flaminem.flamy.model.names.{ItemName, TableName, TablePartitionName}
import com.flaminem.flamy.utils.AutoClose
import com.flaminem.flamy.utils.prettyprint.Tabulator
import com.flaminem.flamy.utils.sql.hive.StreamedResultSet
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


class Count extends Subcommand("count") with FlamySubcommand {

  val tables = new Subcommand("tables") with FlamySubcommand {
    banner("Execute a select count(1) on every specified table.")
    val environment: ScallopOption[Environment] =
      opt(name="on", descr="Specifies environment to run on", required=false, noshort=true)
    val items: ScallopOption[List[ItemName]] =
      trailArg[List[ItemName]](default = Some(List()), required = false)

    override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
      val context = new FlamyContext(globalOptions, environment.get)
      val itemFilter = ItemFilter(items(), acceptIfEmpty = true)
      val fetcher = HiveTableFetcher(context)
      val tables: Iterable[TableName] = fetcher.listTableNames.filter{itemFilter}

      val hiveRunner: RemoteHiveRunner = new RemoteHiveRunner(context)
      try {
        for {
          tableName <- tables if !Thread.currentThread().isInterrupted
        } try {
          val res: StreamedResultSet = hiveRunner.executeQuery(f"SELECT COUNT(1) FROM $tableName")
          val row = res.next()
          FlamyOutput.out.success(f"ok: $tableName : ${row(0)}")
        } catch {
          case e: Throwable =>
            e.printStackTrace()
            FlamyOutput.err.failure(f"not ok: $tableName : ${e.getMessage}")
        }
      }
      finally{
        hiveRunner.close()
      }
      ReturnSuccess
    }

  }

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    subCommands match {
      case  (command: FlamySubcommand)::Nil => command.doCommand(globalOptions, Nil)
      case Nil => throw new IllegalArgumentException("A subcommand is expected")
      case _ =>
        printHelp()
        ReturnFailure
    }
  }

} 
Example 22
Source File: WaitForPartition.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.commands

import com.flaminem.flamy.commands.utils.FlamySubcommand
import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions}
import com.flaminem.flamy.exec.hive.PartitionWaiter
import com.flaminem.flamy.exec.utils.ReturnStatus
import com.flaminem.flamy.model.names.ItemName
import com.flaminem.flamy.utils.AutoClose
import com.flaminem.flamy.utils.time.TimeUtils
import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand}

import scala.language.reflectiveCalls


class WaitForPartition extends Subcommand("wait-for-partition") with FlamySubcommand{

  banner("Wait for a partition to be created.")

  val environment: ScallopOption[Environment] =
    opt(name = "on", descr = "Specifies environment to run on", required = true, noshort = true)
  val timeout: ScallopOption[Long] =
    opt(
      name = "timeout",
      descr = "Number of seconds after which flamy will fail if the partitions still does not exist",
      default = Some(12 * 3600),
      noshort = true
    )
  val after: ScallopOption[String] =
    opt(
      name = "after",
      argName = "yyyy-MM-dd HH:mm:ss",
      descr = """Wait for the partition to be created or refreshed after this time. Expected format is "yyyy-MM-dd HH:mm:ss"""",
      default = None,
      noshort = true
    )
  val retryInterval: ScallopOption[Long] =
    opt(
      name = "retry-interval",
      argName = "INTERVAL",
      descr = "When a partition is not found, retry after INTERVAL seconds",
      default = Some(60),
      noshort = true
    )
  val items: ScallopOption[List[ItemName]] =
    trailArg[List[ItemName]](required = true)

  override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = {
    val context = new FlamyContext(globalOptions, environment.get)
    for{
      waiter: PartitionWaiter <- AutoClose(new PartitionWaiter(context))
    } yield {
      waiter.waitForPartition(items(), timeout(), after.get.map{TimeUtils.universalTimeToTimeStamp}, retryInterval())
    }
  }

} 
Example 23
Source File: CliArgs.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.exec.shell

import com.flaminem.flamy.Launcher
import com.flaminem.flamy.Launcher.Options
import org.rogach.scallop.{CliOption, Scallop, ScallopConf}


    val lastOptionArgs: Seq[String] =
      if(lastOptionAndArgs.isEmpty) {
        Nil
      }
      else {
        lastOptionAndArgs.tail
      }

    val trailArgs: Seq[String] =
      if(lastOption.isDefined) {
        Nil
      }
      else {
        subCommands.foldLeft(args){
          case (args, command) => args.dropWhile(_ != command).drop(1)
        }
      }

    new CliArgs(
      builder = builder,
      args = args,
      lastOption = lastOption,
      lastOptionArgs = lastOptionArgs,
      previousOptions = previousOptions,
      trailArgs = trailArgs,
      lastWord = lastWord
    )
  }


} 
Example 24
Source File: VariablesTest.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.model

import org.rogach.scallop.{ScallopConf, Subcommand}
import org.scalatest.{FreeSpec, Matchers}


class VariablesTest extends FreeSpec with Matchers{

  "replaceInText should work" in {
    val variables = new Variables
    variables += ("TO_REPLACE" -> "REPLACED")
    val text = "TO_REPLACE has been ${TO_REPLACE}"
    val expected = "TO_REPLACE has been REPLACED"

    assert(variables.replaceInText(text)===expected)
  }

  "subsetInText should work" in {
    val variables = new Variables
    variables += ("IN_KEY" -> "IN_VALUE")
    variables += ("OUT_KEY" -> "OUT_VALUE")
    val text = "this text contains ${IN_KEY} but does not contains OUT_KEY"

    val expectedVariables = new Variables
    expectedVariables += ("IN_KEY" -> "IN_VALUE")

    assert(variables.subsetInText(text, Nil) === expectedVariables)
  }

  "replaceInText should preserve partition variables" in {
    val text: String = """INSERT OVERWRITE TABLE db1.dest PARTITION(part=${partition:toto}) SELECT ${partition:toto} as num FROM db2.source"""
    val vars = new Variables()
    vars += "partition:toto" -> "${partition:toto}0"
    val expected: String = """INSERT OVERWRITE TABLE db1.dest PARTITION(part="${partition:toto}") SELECT "${partition:toto}" as num FROM db2.source"""
    assert(vars.replaceInText(text) == expected)
  }


  "the scallopConverter should work" in {
    object Conf extends ScallopConf(Seq("--variables", "HELLO=yes", "LIST=(1,2,3)", "sub")) {
      val variables = opt[Variables](name = "variables")(Variables.scallopConverter)
      val sub =
        new Subcommand("sub") {
          banner("Print version information of this program")
          override def toString = "sub"
        }
    }
    assert(Conf.variables() === Variables("HELLO" -> "yes", "LIST" -> "(1,2,3)"))
    assert(Conf.subcommand.get.toString === "sub")
  }

} 
Example 25
Source File: Main.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.dc.stream

import akka.actor.ActorSystem
import cmwell.ctrl.hc.HealthControl
import cmwell.dc.{LazyLogging, Settings}
import cmwell.tools.data.sparql.{SparqlProcessorManager, SparqlProcessorManagerSettings}
import cmwell.tracking.ResurrectorActor
import k.grid.service.ServiceTypes
import k.grid.{Grid, GridConnection}
import org.rogach.scallop.ScallopConf
import uk.org.lidalia.sysoutslf4j.context.SysOutOverSLF4J


object Main extends App with LazyLogging {
  import Settings._
  logger.info("Starting Dc-Sync using stream")
  //SLF4J initialization is not thread safe, so it's "initialized" by writing a log line and only then calling sendSystemOutAndErrToSLF4J.
  //Without it there would be an error on stderr and some log lines at the beginning would be lost.
  SysOutOverSLF4J.sendSystemOutAndErrToSLF4J()

  Grid.setGridConnection(GridConnection(memberName = "dc"))
  Grid.declareServices(
    ServiceTypes()
      .add("DataCenterSyncManager", classOf[DataCenterSyncManager], destinationHostsAndPorts(rawTarget), None)
      .add(HealthControl.services)
      .add(SparqlProcessorManager.name, classOf[SparqlProcessorManager], new SparqlProcessorManagerSettings)
      .add("Resurrector", classOf[ResurrectorActor])
  )
  Grid.joinClient
  HealthControl.init
  Thread.sleep(10000)
}

object MainStandAlone extends App with LazyLogging {
  import Settings._

  implicit val sys = ActorSystem("ExternalSystem")

  val conf = new Conf(args)

  val ar =
    sys.actorOf(DataCenterSyncManager.props(destinationHostsAndPorts(conf.destinationHosts()), Some(conf.syncJson())))
}

class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
//  val syncJson = opt[String](required = true)
  val syncJson = trailArg[String]()
  val destinationHosts = trailArg[String]()
  verify()
} 
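For reference, a minimal stand-alone sketch (hypothetical names, not part of CM-Well) of how two required trailArg declarations like syncJson and destinationHosts above are bound positionally, in declaration order, from the command line:

import org.rogach.scallop.ScallopConf

// Hypothetical illustration: two unnamed trailing arguments are filled in declaration order.
class TrailDemo(args: Seq[String]) extends ScallopConf(args) {
  val syncJson = trailArg[String]()         // first trailing argument, required by default
  val destinationHosts = trailArg[String]() // second trailing argument
  verify()
}

object TrailDemoMain extends App {
  val conf = new TrailDemo(Seq("sync.json", "host1:9000,host2:9000"))
  println(conf.syncJson())         // prints "sync.json"
  println(conf.destinationHosts()) // prints "host1:9000,host2:9000"
}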
Example 26
Source File: CollectionRollerCliParser.scala    From pulse   with Apache License 2.0 5 votes vote down vote up
package io.phdata.pulse.collectionroller

import org.rogach.scallop.ScallopConf

class CollectionRollerCliArgsParser(args: Seq[String]) extends ScallopConf(args) {
  lazy val conf = opt[String]("conf",
                              's',
                              required = true,
                              descr = "Path to the collection roller yaml configuration")
  lazy val daemonize = opt[Boolean](
    "daemonize",
    required = false,
    default = Some(false),
    descr = "Daemonize the process and run the CollectionRoller on a schedule")
  lazy val zkHosts = opt[String]("zk-hosts", required = true, descr = "Zookeeper hosts")
  lazy val deleteApplications =
    opt[String]("delete-applications", required = false, descr = "Delete applications (operation)")
  lazy val listApplications =
    opt[Boolean]("list-applications", required = false, descr = "List all applications (operation)")
  lazy val verbose =
    opt[Boolean](
      "verbose",
      required = false,
      descr = "List additional info (aliases and collections) for all applications (operation)")
  verify()
} 
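As a usage note, here is a minimal sketch (illustrative names, not part of pulse) of how options declared on a ScallopConf such as the parser above are read after verify(): apply() returns the value (or throws if it is absent), while toOption gives safe access.

import org.rogach.scallop.ScallopConf

// Hypothetical, trimmed-down parser for illustration only.
class DemoArgs(args: Seq[String]) extends ScallopConf(args) {
  val config = opt[String]("conf", required = true, descr = "Path to a yaml configuration")
  val daemonize = opt[Boolean]("daemonize", default = Some(false))
  verify()
}

object DemoArgsMain extends App {
  val parsed = new DemoArgs(Seq("--conf", "roller.yaml", "--daemonize"))
  println(parsed.config())        // "roller.yaml" - apply() returns the value, or throws if it is absent
  println(parsed.daemonize())     // true
  println(parsed.config.toOption) // Some("roller.yaml") - safe access as an Option
}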
Example 27
Source File: AlertEngineCliParser.scala    From pulse   with Apache License 2.0 5 votes vote down vote up
package io.phdata.pulse.alertengine

import org.rogach.scallop.{ ScallopConf, ScallopOption }

class AlertEngineCliParser(args: Seq[String]) extends ScallopConf(args) {
  lazy val conf: ScallopOption[String] = opt[String](
    "conf",
    's',
    required = true,
    descr =
      "Alert Engine config yaml file. See https://github.com/phdata/pulse/blob/master/alert-engine/README.md for schema")
  lazy val daemonize: ScallopOption[Boolean] = opt[Boolean](
    "daemonize",
    required = false,
    default = Some(false),
    descr = "Daemonize the process and run alerting on an interval")
  lazy val smtpServer: ScallopOption[String] =
    opt[String]("smtp-server", required = false, descr = "SMTP server hostmane")
  lazy val smtpUser: ScallopOption[String] = opt[String](
    "smtp-user",
    required = false,
    descr = "SMTP username (from address), like '[email protected]'")

  // default smtpPassword is "", this will turn Some("") into None
  lazy val smtpPassword: Option[String] = sys.env.get("SMTP_PASSWORD").filter(_ != "")
  lazy val smtpPort: ScallopOption[Long] =
    opt[Long]("smtp-port", required = false, descr = "SMTP server port. Defaults to 25")
  lazy val smtp_tls: ScallopOption[Boolean] = opt[Boolean](
    "smtp-tls",
    required = false,
    descr = "Whether to use START_TLS. Defaults to false")
  lazy val silencedApplicationsFile: ScallopOption[String] = opt[String](
    "silenced-application-file",
    required = false,
    descr = "File containing applications ignore when alerting, one application per line")

  lazy val zkHost: ScallopOption[String] = opt[String](
    "zk-hosts",
    required = false,
    descr = "Zookeeper hosts. Used to connect to Solr Cloud")

  lazy val dbUrl: ScallopOption[String] =
    opt[String]("db-url", required = false, descr = "URL to connect to the database")
  lazy val dbUser: ScallopOption[String] =
    opt[String]("db-user", required = false, descr = "User to connect to the database as")
  lazy val dbPassword: ScallopOption[String] =
    opt[String]("db-password", required = false, descr = "Password to connect to the database with")
  lazy val dbOptions: ScallopOption[String] = opt[String](
    "db-options",
    required = false,
    descr = "Database connection options in the form `key1=value1;key2=value2`")

  verify()
} 
Example 28
Source File: Conf.scala    From osstracker   with Apache License 2.0 5 votes vote down vote up
package com.netflix.oss.tools.osstrackerscraper

import org.rogach.scallop.ScallopConf

class Conf(args: Seq[String]) extends ScallopConf(args) {
  val action = opt[String](required = true)
  verify()
}

object Conf {
  val ACTION_UPDATE_CASSANDRA = "updatecassandra"
  val ACTION_UPDATE_ELASTICSEARCH = "updateelasticsearch"
  val OSSTRACKER_KEYSPACE = "osstracker"
  val SENTINAL_DEV_LEAD_ID = "111111"; // Assign to valid emp id
  val SENTINAL_MGR_LEAD_ID = "222222"; // Assign to valid emp id
  val SENTINAL_ORG = "UNKNOWN"; // Assign to unknown org until edited in console
} 
Example 29
Source File: DataLoader.scala    From variantsdwh   with Apache License 2.0 5 votes vote down vote up
package pl.edu.pw.ii.zsibio.dwh.benchmark

import com.typesafe.config.ConfigFactory
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.rogach.scallop.ScallopConf
import org.apache.kudu.spark.kudu._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DataType, StructField, StructType}


object DataLoader {
  class RunConf(args:Array[String]) extends ScallopConf(args){

    val csvFile = opt[String]("csvFile", required = true, descr = "A CSV file to load")
    val tableName = opt[String]("tableName", required = true, descr = "A table to load")
    val storageType = opt[String]("storageType", required = true, descr = "Storage type parquet|orc|kudu|carbon")
    val dbName = opt[String]("dbName", required = true, descr = "Database name")


    verify()
  }
  def main(args: Array[String]): Unit = {
    val runConf = new RunConf(args)
    val scConf = new SparkConf()
        .setAppName("DataLoader")
    val sc = new SparkContext(scConf)
    val sqlContext = new HiveContext(sc)


    if(runConf.storageType().toLowerCase() == "orc" || runConf.storageType().toLowerCase() == "parquet") {
      val df = sqlContext.read
        .format("com.databricks.spark.csv")
        .option("delimiter", "|")
        .option("nullValue","\\N")
        .option("inferSchema", "true") // Automatically infer data types
        .load(runConf.csvFile())
        .repartition(10)
      df.registerTempTable("temp_csv")
      sqlContext.sql(
        s"""
        |INSERT OVERWRITE TABLE ${runConf.dbName()}.${runConf.tableName()}
        |SELECT * FROM temp_csv
        """.stripMargin)
      }
    if(runConf.storageType().toLowerCase() == "kudu"){
      val confFile = ConfigFactory.load()
      val kuduMaster = confFile.getString("kudu.master.server")
      val kuduContext = new KuduContext(kuduMaster)
      val dfTarget = sqlContext.read.options(Map("kudu.master" -> kuduMaster,"kudu.table" -> runConf.tableName())).kudu
      val df = sqlContext.read
        .format("com.databricks.spark.csv")
        .option("delimiter", "|")
        .option("nullValue","\\N")
        .schema(dfTarget.schema)
        .load(runConf.csvFile())
        .repartition(10)
      kuduContext.upsertRows(df,runConf.tableName())
    }

  }

  private def synSchemas(inSchema:StructType, outSchema:StructType) = {

    val size = inSchema.fields.length
    val structFields = (0 to size - 1).map{
      i => StructField(outSchema.fields(i).name,inSchema.fields(i).dataType,outSchema.fields(i).nullable)
    }
    new StructType(structFields.toArray)

  }

} 
Example 30
Source File: Arguments.scala    From ZparkIO   with MIT License 5 votes vote down vote up
package com.leobenkel.zparkioProjectExample

import com.leobenkel.zparkio.Services.CommandLineArguments
import com.leobenkel.zparkio.Services.CommandLineArguments.CommandLineArguments
import org.rogach.scallop.{ScallopConf, ScallopOption}
import zio.ZIO

case class Arguments(input: List[String])
    extends ScallopConf(input) with CommandLineArguments.Service {
  val inputId: ScallopOption[Int] = opt[Int](
    default = Some(10),
    required = false,
    noshort = true
  )

  val sparkFoo: ScallopOption[String] = opt[String](
    default = Some("hello"),
    required = false,
    noshort = true
  )
}

object Arguments {
  def apply[A](f: Arguments => A): ZIO[CommandLineArguments[Arguments], Throwable, A] = {
    CommandLineArguments.get[Arguments].apply(f)
  }
} 
Example 31
Source File: Arguments.scala    From ZparkIO   with MIT License 5 votes vote down vote up
package com.leobenkel.zparkioProfileExampleMoreComplex

import com.leobenkel.zparkio.Services.CommandLineArguments
import com.leobenkel.zparkio.Services.CommandLineArguments.CommandLineArguments
import com.leobenkel.zparkioProfileExampleMoreComplex.Services.Database
import org.rogach.scallop.{ScallopConf, ScallopOption}
import zio.ZIO

case class Arguments(input: List[String])
    extends ScallopConf(input) with CommandLineArguments.Service {

  val databaseUsername: ScallopOption[String] = opt[String](
    default = Some("admin"),
    required = false,
    noshort = true
  )

  
  val databasePassword: ScallopOption[String] = opt[String](
    default = Some("123456"),
    required = false,
    noshort = true
  )

  val databaseHost: ScallopOption[String] = opt[String](
    default = Some("database://host.com/database"),
    required = false,
    noshort = true
  )

  val generatedInputSize: ScallopOption[Int] = opt[Int](
    default = Some(100),
    required = false,
    noshort = true,
    descr = "The size of the sample data generated"
  )

  val sparkConfig: ScallopOption[String] = opt[String](
    default = Some("foo"),
    required = false,
    noshort = true
  )

  lazy val credentials: Database.Credentials = Database.Credentials(
    user = databaseUsername(),
    psw = databasePassword(),
    host = databaseHost()
  )
}

object Arguments {
  def apply[A](f: Arguments => A): ZIO[CommandLineArguments[Arguments], Throwable, A] = {
    CommandLineArguments.get[Arguments].apply(f)
  }
} 
Example 32
Source File: CommandLineArgumentsTest.scala    From ZparkIO   with MIT License 5 votes vote down vote up
package com.leobenkel.zparkio.Services

import com.leobenkel.zparkio.Services.CommandLineArguments.CommandLineArguments
import org.rogach.scallop.exceptions.{RequiredOptionNotFound, UnknownOption}
import org.rogach.scallop.{ScallopConf, ScallopOption}
import org.scalatest._
import zio.Exit.{Failure, Success}
import zio.{BootstrapRuntime, Layer, Task, ZIO, ZLayer}

class CommandLineArgumentsTest extends FreeSpec {
  "CommandLineService" - {
    case class ArgumentsService(input: Seq[String])
        extends ScallopConf(input) with CommandLineArguments.Service {
      val test: ScallopOption[String] = opt[String](
        default = None,
        required = true,
        noshort = true
      )
    }

    object Arguments {
      def get[A](
        f: ArgumentsService => A
      ): ZIO[CommandLineArguments[ArgumentsService], Throwable, A] = {
        CommandLineArguments.get[ArgumentsService].apply(f)
      }

      def apply(input: Seq[String]): Layer[Nothing, CommandLineArguments[ArgumentsService]] = {
        ZLayer.succeed(ArgumentsService(input))
      }
    }

    val runtime = new BootstrapRuntime {}

    "should work" in {
      val test: String = "qwe-asd-asd-zxc"

      runtime.unsafeRunSync {
        Arguments.get(_.test.toOption).provideLayer(Arguments(Seq("--test", test)))
      } match {
        case Success(Some(value)) => assertResult(value)(test)
        case Success(None)        => fail("Did not find any value")
        case Failure(ex)          => fail(ex.prettyPrint)
      }
    }

    "should fail - missing required" in {
      runtime.unsafeRunSync(for {
        arg <- Task(Arguments(Nil))
        a   <- Arguments.get(_.test.toOption).provideLayer(arg)
      } yield {
        a
      }) match {
        case Success(_)  => fail("Should have failed")
        case Failure(ex) => assertThrows[RequiredOptionNotFound](throw ex.squash)
      }
    }

    "should fail - unknonw option" in {
      runtime.unsafeRunSync(for {
        arg <- Task(Arguments(Seq("--abc", "foo")))
        a   <- Arguments.get(_.test.toOption).provideLayer(arg)
      } yield {
        a
      }) match {
        case Success(_)  => fail("Should have failed")
        case Failure(ex) => assertThrows[UnknownOption](throw ex.squash)
      }
    }
  }
} 
Example 33
Source File: ParametricFaceImageGeneratorOptions.scala    From parametric-face-image-generator   with Apache License 2.0 5 votes vote down vote up
package faces.utils

import org.rogach.scallop.{ScallopConf, ScallopOption}
import org.rogach.scallop.exceptions.ScallopException


class ParametricFaceImageGeneratorOptions(args: Seq[String]) extends ScallopConf(args) {
  banner(
    """|parametric-face-image-generator
       |© University of Basel
       |License: http://www.apache.org/licenses/LICENSE-2.0
       |
       |Options:""".stripMargin)

  val configurationFile: ScallopOption[String] = opt[String](required = true, descr = "configuration file with the parameters")

  footer(
    """""".stripMargin
  )

  override def onError(e: Throwable): Unit = e match {
    case ScallopException(message) =>
      printHelp
      println("You provided the arguments: "+args.mkString(" "))
      println(message)
      sys.exit(1)
    case ex => super.onError(ex)
  }
} 
Example 34
Source File: Conf.scala    From ncdbg   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.programmaticallyspeaking.ncd.config

import org.rogach.scallop.{ScallopConf, ValueConverter}

case class Address(host: String, port: Int) {
  override def toString = host + ":" + port
}
class AddressConverter extends ValueConverter[Address] {

  val valueRegexp = "([^:]+(?::))?([0-9]+)".r

  override def parse(s: List[(String, List[String])]): Either[String, Option[Address]] = {
    s match {
      case (_, valueRegexp(host, port) :: Nil) :: Nil =>
        // I tried getting rid of the trailing : using a non-capturing group, but it didn't work.
        val theHost = Option(host).map(h => h.dropRight(1)).getOrElse("localhost")
        Right(Some(Address(theHost, port.toInt)))
      case Nil =>
        Right(None)
      case _ =>
        Left("address must have format <host>:<port> or only <port>")
    }
  }

  override val tag = scala.reflect.runtime.universe.typeTag[Address]
  override val argType = org.rogach.scallop.ArgType.SINGLE
}

class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
  implicit val addressConverter = new AddressConverter

  private val defaultListen = Address("localhost", 7778)
  private val defaultConnect = Address("localhost", 7777)

  banner(
    """Usage: ncdbg [OPTION]...
      |
      |Ncdbg (Nashorn-Chrome-debugger) connects to a debuggable Java process running Nashorn scripts,
      |while acting as a server for Chrome Developer Tools. This makes it possible to debug Nashorn scripts
      |using Chrome.
      |
      |Options:
    """.stripMargin)

  val listen = opt[Address](default = Some(defaultListen),
    descr = s"address to listen on, in <host>:<port> format or port only. Defaults to $defaultListen.")
  val connect = opt[Address](default = Some(defaultConnect),
    descr = s"address to connect to, in <host>:<port> format or port only. Defaults to $defaultConnect.")
  val isLazy = toggle(name = "lazy", default = Some(false),
    descrYes = "defer connection until DevTools connects, and stay alive when the debug target dies.",
    descrNo = "connect right away and require the debug target to live. This is the default.")
  verify()
} 
Example 35
Source File: AddressConverterTest.scala    From ncdbg   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.programmaticallyspeaking.ncd.boot

import com.programmaticallyspeaking.ncd.config.{Address, AddressConverter}
import com.programmaticallyspeaking.ncd.testing.UnitTest
import org.rogach.scallop.exceptions.WrongOptionFormat
import org.rogach.scallop.{ScallopConf, throwError}

class AddressConverterTest extends UnitTest {
  // Throw error instead of exiting on option error.
  throwError.value = true

  def conf(args: String*) = new ScallopConf(args.toSeq) {
    val address = opt[Address]()(new AddressConverter)
    verify()
  }

  "should parse host:port" in {
    val c = conf("--address", "foo:1234")
    c.address.toOption should be (Some(Address("foo", 1234)))
  }

  "should parse only port" in {
    val c = conf("--address", "1234")
    c.address.toOption should be (Some(Address("localhost", 1234)))
  }

  "should handle no address" in {
    val c = conf()
    c.address.toOption should be (None)
  }

  "should reject non-integer port" in {
    intercept[WrongOptionFormat](conf("--address", "foo"))
  }
} 
Example 36
Source File: IngesterMain.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.tools.data.ingester

import java.io.FileInputStream
import java.util.zip.GZIPInputStream

import akka.stream.scaladsl.Sink
import cmwell.tools.data.utils.akka.stats.IngesterStats
//import cmwell.tools.data.sparql.SparqlProcessorMain.Opts.opt
import cmwell.tools.data.utils.ArgsManipulations._
import cmwell.tools.data.utils.akka.Implicits._
import cmwell.tools.data.utils.akka._
import cmwell.tools.data.utils.ops._
import com.typesafe.scalalogging.LazyLogging
import org.rogach.scallop.ScallopConf

import scala.concurrent.ExecutionContext.Implicits.global

object IngesterMain extends App with LazyLogging {
  object Opts extends ScallopConf(args) {
    version(s"cm-well ingester ${getVersionFromManifest()} (c) 2015")

    val host = opt[String]("host", descr = "cm-well host name", required = true)
    val format = opt[String]("format", descr = "input format (e.g. ntriples, nquads, jsonld)", required = true)
    val file = opt[String]("file", descr = "input file path", default = None)
    val gzip = opt[Boolean]("gzip", descr = "is input file gzipped", default = Some(false))
    val token = opt[String]("token", descr = "cm-well write permission token", default = None)
    val replaceMode =
      opt[Boolean]("with-replace-mode", descr = "replace-mode parameter in cm-well", default = Some(false))
    val force = opt[Boolean]("force", descr = "force parameter in cm-well", default = Some(false))
    val priority = opt[Boolean]("priority", default = Some(false), descr = "ingest data in priority mode")
    val numConnections = opt[Int]("num-connections", descr = "number of http connections to open")

    dependsOnAll(gzip, List(file))
    verify()
  }

  val start = System.currentTimeMillis()

  var totalIngestedBytes = 0L
  var ingestedBytesInWindow = 0
  var ingestedInfotonsInWindow = 0
  var totalIngestedInfotons = 0L
  var totalFailedInfotons = 0L
  var lastTime = start
  var nextPrint = 0L
  var lastMessageSize = 0
  val windowSizeMillis = 1000

  val formatter = java.text.NumberFormat.getNumberInstance

  // resize akka http connection pool
  Opts.numConnections.toOption.map { numConnections =>
    System.setProperty("akka.http.host-connection-pool.max-connections", numConnections.toString)
  }

  val inputStream = if (Opts.file.isSupplied) {
    val inputFile = new FileInputStream(Opts.file())
    if (Opts.gzip()) {
      new GZIPInputStream(inputFile)
    } else {
      inputFile
    }
  } else {
    System.in
  }

  val result = Ingester
    .fromInputStream(
      baseUrl = formatHost(Opts.host()),
      format = Opts.format(),
      writeToken = Opts.token.toOption,
      replaceMode = Opts.replaceMode(),
      force = Opts.force(),
      isPriority = Opts.priority(),
      in = inputStream
    )
    .via(IngesterStats(isStderr = true))
    .runWith(Sink.ignore)

  // actor system is still alive, will be destroyed when finished
  result.onComplete { x =>
    System.err.println("\n")
    System.err.println(s"finished: $x")
    cleanup()
  }
} 
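The dependsOnAll(gzip, List(file)) call above makes --gzip acceptable only when --file is also supplied. Below is a minimal sketch of that dependency in isolation (hypothetical names; throwError is used as in the AddressConverterTest example above so failures surface as exceptions instead of exiting):

import org.rogach.scallop.{ScallopConf, throwError}
import org.rogach.scallop.exceptions.ScallopException

// Hypothetical parser isolating the --gzip / --file dependency used by IngesterMain above.
class GzipArgs(args: Seq[String]) extends ScallopConf(args) {
  val file = opt[String]("file", descr = "input file path")
  val gzip = opt[Boolean]("gzip", descr = "is input file gzipped", default = Some(false))
  dependsOnAll(gzip, List(file))
  verify()
}

object GzipArgsDemo extends App {
  throwError.value = true
  new GzipArgs(Seq("--file", "data.nq.gz", "--gzip"))  // accepted: --gzip together with --file
  try new GzipArgs(Seq("--gzip"))                      // rejected: --gzip without --file
  catch { case e: ScallopException => println(s"rejected: ${e.getMessage}") }
}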
Example 37
Source File: LogCollectorCliParser.scala    From pulse   with Apache License 2.0 5 votes vote down vote up
package io.phdata.pulse.logcollector

import org.rogach.scallop.ScallopConf

class LogCollectorCliParser(args: Seq[String]) extends ScallopConf(args) {
  lazy val port        = opt[Int]("port", required = false, descr = "HTTP Server Listening port")
  lazy val zkHosts     = opt[String]("zk-hosts", required = true, descr = "Zookeeper hosts")
  lazy val kuduMasters = opt[String]("kudu-masters", required = false, descr = "Kudu masters")
  lazy val mode = opt[String]("consume-mode",
                              required = false,
                              descr = "'http' or 'kafka'",
                              default = Some("http"))
  lazy val kafkaProps =
    opt[String]("kafka-properties", required = false, descr = "Kafka properties file")
  lazy val topic = opt[String]("topic", required = false, descr = "Kafka Topic")

  validateOpt(mode, port) {
    case (Some("http") | None, None) => Left("Need a port if running http mode")
    case _                           => Right(Unit)
  }

  validateOpt(mode, kafkaProps, topic) {
    case (Some("kafka"), None, Some(_)) =>
      Left("--kafka-properties argument needed if --consume-mode=kafka")
    case (Some("kafka"), Some(_), None) => Left("--topic argument needed if --consume-mode=kafka")
    case (Some("kafka"), None, None) =>
      Left("--topic and --kafka-properties arguments needed if --consume-mode=kafka")
    case _ => Right(Unit)
  }

  verify()
} 
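The validateOpt blocks above run as part of verify(). Here is a minimal sketch (hypothetical names, assuming only that scallop is on the classpath) of the consume-mode/port check, showing one invocation that passes and one that is rejected:

import org.rogach.scallop.{ScallopConf, throwError}
import org.rogach.scallop.exceptions.ScallopException

// Illustrative parser reproducing the consume-mode/port dependency from the parser above.
class ModeArgs(args: Seq[String]) extends ScallopConf(args) {
  val mode = opt[String]("consume-mode", default = Some("http"))
  val port = opt[Int]("port")

  validateOpt(mode, port) {
    case (Some("http") | None, None) => Left("Need a port if running http mode")
    case _                           => Right(())
  }
  verify()
}

object ModeArgsDemo extends App {
  throwError.value = true // surface option errors as exceptions rather than exiting
  new ModeArgs(Seq("--consume-mode", "http", "--port", "8080")) // passes: http mode with a port
  try new ModeArgs(Seq("--consume-mode", "http"))               // rejected: http mode without a port
  catch { case e: ScallopException => println(s"rejected: ${e.getMessage}") }
}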
Example 38
Source File: Main2.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.tools.file.export

import cmwell.tools.neptune.export.ExportToNeptuneManager
import org.rogach.scallop.ScallopConf


class Conf1(arguments: Seq[String]) extends ScallopConf(arguments) {
  val sourceCluster = opt[String]("source-cluster", required = true, descr = "the source cluster which data is being exported from")
  val lengthHint = opt[Int]("length-hint", default = Some(16000), validate = 300000.>=, descr="number of infotons that should be consumed in each bulk-consume call")
  val qp = opt[String](name="qp-param", default=None, descr = "cm well qp param")
  val directory = opt[String](name="directory", required = true, default=Some("./"), descr = "the s3 directory which neptune reads data from")

  verify()
}

object Main2 {
  def main(args: Array[String]) {
    val conf = new Conf1(args)
    println("Source cluster is: " + conf.sourceCluster())
    println("length-hint: " + conf.lengthHint())
    println("qp: " + conf.qp.getOrElse("(not provided)"))
    println("s3 bucket:" + conf.directory())
    val qpParam :Option[String]= conf.qp.toOption.map(s => s",$s")
    println("About to Export..")
    val exportToNeptuneManager = new ExportToNeptuneManager(1)
    exportToNeptuneManager.exportToNeptune(conf.sourceCluster(), "", conf.lengthHint(), false, qpParam, false, None, None, "", Some(conf.directory()))
  }

} 
Example 39
Source File: Main.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.tools.neptune.export

import org.rogach.scallop.ScallopConf


class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
  val sourceCluster = opt[String]("source-cluster", required = true, descr = "the source cluster which data is being exported from")
  val neptuneCluster = opt[String]("neptune-cluster", required = true, descr="neptune cluster which data is being exported to")
  val ingestConnectionPoolSize = opt[Int]("ingest-connection-pool-size", default = Some(5), validate = 50.>=, descr="size of the connection pool that the tool creates in order to ingest to neptune")
  val lengthHint = opt[Int]("length-hint", default = Some(16000), validate = 300000.>=, descr="number of infotons that should be consumed in each bulk-consume call")
  val qp = opt[String](name="qp-param", default=None, descr = "cm well qp param")
  val updateInfotons = opt[Boolean]("update-infotons", descr = "enable this parameter when you use update mode or delete infotons")
  val bulkLoader = opt[Boolean]("bulk-loader", descr = "enable this parameter in order to export by using the s3 bulk loader api. bulk loader is only for the initial load")
  val proxyHost = opt[String]("proxy-host", default=None, descr = "proxy host is provided when you use the bulk loader and your machine uses a proxy")
  val proxyPort = opt[Int]("proxy-port", default=None, descr = "proxy port is provided when you use the bulk loader and your machine uses a proxy")
  val s3Bucket = opt[String](name="s3-bucket", default=Some("cm-well/sync"), descr = "the s3 directory which neptune reads data from")

  verify()
}

object Main {
  def main(args: Array[String]) {
    val conf = new Conf(args)
    println("Source cluster is: " + conf.sourceCluster())
    println("Neptune cluster is: " + conf.neptuneCluster())
    println("Connection pool size is: " + conf.ingestConnectionPoolSize())
    println("length-hint: " + conf.lengthHint())
    println("update infotons: " + conf.updateInfotons())
    println("qp: " + conf.qp.getOrElse("(not provided)"))
    println("bulk loader: " + conf.bulkLoader())
    println("proxy host: " + conf.proxyHost.getOrElse("not provided"))
    println("proxy port: " + conf.proxyPort.getOrElse(-1))
    println("s3 bucket:" + conf.s3Bucket())
    val qpParam :Option[String]= conf.qp.toOption.map(s => s",$s")
    val proxyHost :Option[String]= conf.proxyHost.toOption
    val proxyPort :Option[Int]= conf.proxyPort.toOption
    println("About to Export..")
    val exportToNeptuneManager = new ExportToNeptuneManager(conf.ingestConnectionPoolSize())
    exportToNeptuneManager.exportToNeptune(conf.sourceCluster(), conf.neptuneCluster(), conf.lengthHint(), conf.updateInfotons(), qpParam,
      conf.bulkLoader(), proxyHost, proxyPort, conf.s3Bucket(), None)
  }

} 
Example 40
Source File: Main.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
import akka.actor.{ActorSystem, Props}
import org.rogach.scallop.ScallopConf


  class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
    val sourceUrl = opt[String]("source-url", required = true, descr = "the source url from which the rdf file is downloaded")
    val format = opt[String]("format", required = true, descr = "the file format")
    var cluster = opt[String]("cluster", required = true, descr = "the target cluster which content is ingested to")
    verify()
  }

  object Main {
    def main(args: Array[String]) {
      val conf = new Conf(args)  // Note: This line also works for "object Main extends App"
      println("source file is: " + conf.sourceUrl())
      println("output format is: " + conf.format())
      val system = ActorSystem("MySystem")
      println("About to Start import tool flow...")
      val mainActor = system.actorOf(Props(new AkkaFileReaderWithActor(conf.sourceUrl(), conf.format(), conf.cluster())), name = "myactor")
      mainActor ! ActorInput
    }

} 
Example 41
Source File: DumpIndexWithSystemFields.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.IndexWithSystemFields
import cmwell.analytics.util.CmwellConnector
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}


object DumpIndexWithSystemFields {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpIndexWithSystemFields.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump system fields from Elasticsearch indexes",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds = IndexWithSystemFields()(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }

      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 42
Source File: DumpInfotonWithUuidOnly.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.InfotonWithUuidOnly
import cmwell.analytics.util.CmwellConnector
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object DumpInfotonWithUuidOnly {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpInfotonWithUuidOnly.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump infoton table - uuid only",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds = InfotonWithUuidOnly()(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 43
Source File: FindInfotonIndexInconsistencies.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.InfotonAndIndexWithSystemFields
import cmwell.analytics.data.InfotonAndIndexWithSystemFields.{isConsistent, isWellFormed}
import cmwell.analytics.util.CmwellConnector
import cmwell.analytics.util.ConsistencyThreshold.defaultConsistencyThreshold
import cmwell.analytics.util.ISO8601.{instantToMillis, instantToText}
import org.apache.log4j.LogManager
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions._
import org.joda.time.format.ISODateTimeFormat
import org.rogach.scallop.{ScallopConf, ScallopOption, ValueConverter, singleArgConverter}

import scala.util.control.NonFatal

object FindInfotonIndexInconsistencies {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(FindInfotonIndexInconsistencies.getClass)

    try {

      object Opts extends ScallopConf(args) {

        private val instantConverter: ValueConverter[Long] = singleArgConverter[Long](instantToMillis)

        // If this parameter is not supplied, the (unreliable) ES Spark connector is used to extract the data from the es index.
        val esExtract: ScallopOption[String] = opt[String]("es", short = 'e', descr = "The path where the (parquet) extract of system fields the es index are stored", required = false)

        val consistencyThreshold: ScallopOption[Long] = opt[Long]("consistency-threshold", short = 'c', descr = "Ignore any inconsistencies at or after this instant", default = Some(defaultConsistencyThreshold))(instantConverter)

        val outParquet: ScallopOption[String] = opt[String]("out-parquet", short = 'p', descr = "The path to save the output to (in parquet format)", required = false)
        val outCsv: ScallopOption[String] = opt[String]("out-csv", short = 'v', descr = "The path to save the output to (in CSV format)", required = false)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Find inconsistencies between system fields in Infoton and Index",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        logger.info(s"Using a consistency threshold of ${instantToText(Opts.consistencyThreshold())}.")

        val ds = InfotonAndIndexWithSystemFields(esExtractPath = Opts.esExtract.toOption)(spark)

        // Filter out any inconsistencies found if more current than this point in time.
        val i = ds.schema.indexWhere(_.name == "infoton_lastModified")
        val filterCurrent: Row => Boolean = { row: Row =>

          val parser = ISODateTimeFormat.dateTimeParser
          if (row.isNullAt(i))
            true // Shouldn't be null, but don't filter out if we can't get a lastModified
          else
            try {
              parser.parseMillis(row.getAs[String](i)) < Opts.consistencyThreshold()
            }
            catch {
              case NonFatal(_) => true // Don't filter out if lastModified couldn't be converted
            }
        }

        val inconsistentData = ds.filter(not(isConsistent(ds) && isWellFormed(ds)))
          .filter(filterCurrent)
          .cache()

        // Save the inconsistent data in Parquet format suitable for additional analysis
        if (Opts.outParquet.isDefined)
          inconsistentData
            .write
            .parquet(Opts.outParquet())

        // Save the inconsistent data to a single CSV file suitable for reporting.
        if (Opts.outCsv.isDefined)
          inconsistentData
            .coalesce(1)
            .write
            .option("header", value = true)
            .csv(Opts.outCsv())
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 44
Source File: DumpPathWithKeyFields.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.PathWithKeyFields
import cmwell.analytics.util.CmwellConnector
import cmwell.analytics.util.DatasetFilter
import cmwell.analytics.util.TimestampConversion.timestampConverter
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}


object DumpPathWithKeyFields {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpPathWithKeyFields.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val lastModifiedGteFilter: ScallopOption[java.sql.Timestamp] = opt[java.sql.Timestamp]("lastmodified-gte-filter", descr = "Filter on lastModified >= <value>, where value is an ISO8601 timestamp", default = None)(timestampConverter)
        val pathPrefixFilter: ScallopOption[String] = opt[String]("path-prefix-filter", descr = "Filter on the path prefix matching <value>", default = None)

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump path table - key fields",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val datasetFilter = DatasetFilter(
          lastModifiedGte = Opts.lastModifiedGteFilter.toOption,
          pathPrefix = Opts.pathPrefixFilter.toOption)

        val ds = PathWithKeyFields(Some(datasetFilter))(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 45
Source File: DumpPathWithUuidOnly.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.PathWithUuidOnly
import cmwell.analytics.util.CmwellConnector
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object DumpPathWithUuidOnly {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpPathWithUuidOnly.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump path table - uuid only",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds = PathWithUuidOnly()(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 46
Source File: FindDuplicatedSystemFields.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.InfotonWithDuplicatedSystemFields
import cmwell.analytics.util.CmwellConnector
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}

object FindDuplicatedSystemFields {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(FindDuplicatedSystemFields.getClass)

    try {

      object Opts extends ScallopConf(args) {

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Find infotons with duplicated system fields"
      ).withSparkSessionDo { spark =>

        import spark.implicits._

        InfotonWithDuplicatedSystemFields()(spark)
          .toDF
          .write.csv(Opts.out())
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
} 
Example 47
Source File: DumpIndexWithUuidOnly.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import cmwell.analytics.data.IndexWithUuidsOnly
import cmwell.analytics.util.CmwellConnector
import org.apache.log4j.LogManager
import org.rogach.scallop.{ScallopConf, ScallopOption}


object DumpIndexWithUuidOnly {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(DumpIndexWithUuidOnly.getClass)

    // Here, the parallelism defines how many partitions are produced.
    // Having too many partitions (esp. with a shuffle) creates pathological I/O patterns.
    val defaultParallelism = 1 max (Runtime.getRuntime.availableProcessors / 2)

    try {

      object Opts extends ScallopConf(args) {

        val parallelism: ScallopOption[Int] = opt[Int]("parallelism", short = 'p', descr = "The parallelism level", default = Some(defaultParallelism))

        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the output to", required = true)
        val currentOnly: ScallopOption[Boolean] = opt[Boolean]("current-only", short = 'c', descr = "Only include current", required = false, default = Some(true))
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))
        val url: ScallopOption[String] = trailArg[String]("url", descr = "A CM-Well URL", required = true)

        val format: ScallopOption[String] = opt[String]("format", short = 'f', descr = "The output format: csv | parquet", required = false, default = Some("parquet"))

        validateOpt(format) {
          case Some("parquet") | Some("csv") => Right(Unit)
          case _ => Left(s"Invalid format - must be 'csv' or 'parquet'.")
        }

        verify()
      }

      CmwellConnector(
        cmwellUrl = Opts.url(),
        appName = "Dump UUIDs from Elasticsearch indexes",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds = IndexWithUuidsOnly(currentOnly = Opts.currentOnly())(spark)
          .coalesce(Opts.parallelism() * CmwellConnector.coalesceParallelismMultiplier)

        Opts.format() match {
          case "parquet" => ds.write.parquet(Opts.out())
          case "csv" => ds.write.csv(Opts.out())
        }
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
}