org.apache.spark.ml.param.StringArrayParam Scala Examples

The following examples show how to use org.apache.spark.ml.param.StringArrayParam. They are drawn from open-source projects; each example names its original project, source file, and license.
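All of the examples below follow the same basic pattern: declare a StringArrayParam on a pipeline stage, expose a getter/setter pair, and read the current value with $(...). As a minimal self-contained sketch of that pattern (the ColumnDropper class and its param are illustrative, not taken from any of the projects below):

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}

class ColumnDropper(override val uid: String) extends Transformer {

  def this() = this(Identifiable.randomUID("columnDropper"))

  // StringArrayParam holds an Array[String] and handles JSON
  // (de)serialization for ML persistence.
  val dropCols: StringArrayParam =
    new StringArrayParam(this, "dropCols", "Columns to drop")

  def getDropCols: Array[String] = $(dropCols)

  def setDropCols(value: Array[String]): this.type = set(dropCols, value)
  setDefault(dropCols -> Array.empty[String])

  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields.filterNot(f => $(dropCols).contains(f.name)))

  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.toDF().drop($(dropCols): _*)

  override def copy(extra: ParamMap): ColumnDropper = defaultCopy(extra)
}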
Example 1
Source File: NerOverwriter.scala    From spark-nlp    with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.ner

import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
import org.apache.spark.ml.param.{Param, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}

class NerOverwriter(override val uid: String) extends AnnotatorModel[NerOverwriter] {

  import com.johnsnowlabs.nlp.AnnotatorType.NAMED_ENTITY

  // Consumes and produces NAMED_ENTITY annotations.
  override val inputAnnotatorTypes: Array[String] = Array(NAMED_ENTITY)
  override val outputAnnotatorType: AnnotatorType = NAMED_ENTITY

  def this() = this(Identifiable.randomUID("NER_OVERWRITER"))

  // Words whose NER tag should be replaced.
  val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "The words to be overwritten")

  def setStopWords(value: Array[String]): this.type = set(stopWords, value)

  def getStopWords: Array[String] = $(stopWords)

  // The tag to assign to those words.
  val newResult: Param[String] = new Param(this, "newResult", "NER tag to apply to the stop words")

  def setNewResult(value: String): this.type = set(newResult, value)

  def getNewResult: String = $(newResult)

  setDefault(
    newResult -> "I-OVERWRITE"
  )

  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    // Build the lookup set once instead of once per token.
    val stopWordsSet = $(stopWords).toSet

    annotations.map { tokenAnnotation =>
      if (stopWordsSet.contains(tokenAnnotation.metadata("word"))) {
        // Matched: keep the span and metadata but swap in the new tag.
        Annotation(
          outputAnnotatorType,
          tokenAnnotation.begin,
          tokenAnnotation.end,
          $(newResult),
          tokenAnnotation.metadata
        )
      } else {
        // Not matched: re-emit the annotation unchanged.
        Annotation(
          outputAnnotatorType,
          tokenAnnotation.begin,
          tokenAnnotation.end,
          tokenAnnotation.result,
          tokenAnnotation.metadata
        )
      }
    }
  }

}

object NerOverwriter extends DefaultParamsReadable[NerOverwriter] 
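A sketch of how this stage is typically wired up after a NER stage, assuming spark-nlp's usual setInputCols/setOutputCol annotator API; the tag values are illustrative, and nerTaggedDF stands for a hypothetical DataFrame already holding a "ner" column of NAMED_ENTITY annotations:

// Overwrite the NER tag of specific words after tagging has run.
val nerOverwriter = new NerOverwriter()
  .setInputCols("ner")
  .setOutputCol("ner_overwritten")
  .setStopWords(Array("Tesla"))   // tokens whose tag should be replaced
  .setNewResult("I-ORG")          // tag to assign; defaults to "I-OVERWRITE"

val overwritten = nerOverwriter.transform(nerTaggedDF)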
Example 2
Source File: NerConverter.scala    From spark-nlp    with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.ner

import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT, NAMED_ENTITY, TOKEN}
import com.johnsnowlabs.nlp.annotators.common.NerTagged
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, ParamsAndFeaturesReadable}
import org.apache.spark.ml.param.{BooleanParam, StringArrayParam}
import org.apache.spark.ml.util.Identifiable

import scala.collection.Map


class NerConverter(override val uid: String) extends AnnotatorModel[NerConverter] {

  def this() = this(Identifiable.randomUID("NER_CONVERTER"))

  // Needs the original document, its tokens, and the token-level NER tags.
  override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT, TOKEN, NAMED_ENTITY)

  override val outputAnnotatorType: AnnotatorType = CHUNK

  // Optional list of entity labels to keep (without IOB prefixes).
  val whiteList: StringArrayParam = new StringArrayParam(this, "whiteList",
    "If defined, list of entities to process. The rest will be discarded. Do not include IOB prefix on labels")

  def setWhiteList(list: String*): this.type = set(whiteList, list.toArray)

  // Whether to report positions relative to the original document.
  val preservePosition: BooleanParam = new BooleanParam(this, "preservePosition",
    "Whether to preserve the original position of the tokens in the original document or use the modified tokens")

  def setPreservePosition(value: Boolean): this.type = set(preservePosition, value)

  setDefault(
    preservePosition -> true
  )

  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val sentences = NerTagged.unpack(annotations)
    val docs = annotations.filter(a => a.annotatorType == AnnotatorType.DOCUMENT)
    // Decode each sentence's IOB tags against its source document.
    val entities = sentences.zip(docs.zipWithIndex).flatMap { case (sentence, (doc, idx)) =>
      NerTagsEncoding.fromIOB(sentence, doc, sentenceIndex = idx, $(preservePosition))
    }

    // get(whiteList) is an Option, so an unset whiteList keeps every entity.
    entities
      .filter(entity => get(whiteList).forall(_.contains(entity.entity)))
      .zipWithIndex
      .map { case (entity, idx) =>
        Annotation(
          outputAnnotatorType,
          entity.start,
          entity.end,
          entity.text,
          Map("entity" -> entity.entity, "sentence" -> entity.sentenceId, "chunk" -> idx.toString)
        )
      }
  }

}

object NerConverter extends ParamsAndFeaturesReadable[NerConverter] 
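Usage mirrors NerOverwriter above; a sketch assuming the same spark-nlp annotator API, with illustrative column names and entity labels (nerTaggedDF is again a hypothetical DataFrame with upstream document, token, and ner columns):

// Collapse IOB-tagged tokens into entity chunks, keeping only PER and LOC.
val nerConverter = new NerConverter()
  .setInputCols("document", "token", "ner")
  .setOutputCol("ner_chunk")
  .setWhiteList("PER", "LOC")   // entities to keep; omit to keep everything

val chunks = nerConverter.transform(nerTaggedDF)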
Example 3
Source File: OpIndexToStringNoFilter.scala    From TransmogrifAI    with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.UnaryTransformer
import org.apache.spark.ml.attribute.{Attribute, NominalAttribute}
import org.apache.spark.ml.param.{Param, StringArrayParam}

class OpIndexToStringNoFilter(uid: String = UID[OpIndexToStringNoFilter])
  extends UnaryTransformer[RealNN, Text](operationName = "idx2str", uid = uid) {

  // Optional index-to-label mapping; when unset or empty, the mapping is read
  // from the input column's NominalAttribute metadata instead.
  final val labels = new StringArrayParam(this, "labels",
    "Optional array of labels specifying index-string mapping. " +
      "If not provided or if empty, then metadata from input column is used instead.")

  def setLabels(value: Array[String]): this.type = set(labels, value)

  // Label emitted for indices outside the known range (instead of filtering them out).
  final val unseenName = new Param[String](this, "unseenName", "Label to emit for unseen indices")
  setDefault(unseenName, OpIndexToStringNoFilter.unseenDefault)

  def setUnseen(value: String): this.type = set(unseenName, value)

  override def transformFn: RealNN => Text = {
    (input: RealNN) => {
      val inputColSchema = getInputSchema()(in1.name)
      // If the labels array is empty use column metadata
      val lbls = $(labels)
      val unseen = $(unseenName)
      val values = if (!isDefined(labels) || lbls.isEmpty) {
        Attribute.fromStructField(inputColSchema)
          .asInstanceOf[NominalAttribute].values.get
      } else {
        lbls
      }
      val idx = input.value.get.toInt
      if (0 <= idx && idx < values.length) {
        values(idx).toText
      } else {
        unseen.toText
      }
    }
  }
}

object OpIndexToStringNoFilter {
  val unseenDefault: String = "UnseenIndex"
} 
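TransmogrifAI transformers operate on typed features rather than raw columns. A rough usage sketch, assuming the standard TransmogrifAI setInput/getOutput feature API; indexedFeature is a hypothetical RealNN feature produced by an earlier indexing stage, and the labels are illustrative:

// Map indices back to their string labels without filtering unseen indices.
val indexToString = new OpIndexToStringNoFilter()
  .setLabels(Array("red", "green", "blue")) // optional; falls back to column metadata
  .setUnseen("UnseenIndex")                 // label for out-of-range indices
  .setInput(indexedFeature)

val labelFeature = indexToString.getOutput()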
Example 4
Source File: IntermediateCacher.scala    From albedo    with MIT License
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

class IntermediateCacher(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("intermediateCacher"))
  }

  val inputCols = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
  setDefault(inputCols -> Array.empty[String])

  // Caching is the only effect; the schema passes through unchanged.
  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    // Project onto the requested columns (or keep all columns when inputCols
    // is empty), then mark the intermediate DataFrame for caching.
    val intermediateDF = if ($(inputCols).isEmpty) dataset.toDF() else dataset.select($(inputCols).map(col(_)): _*)
    intermediateDF.cache()
  }

  override def copy(extra: ParamMap): IntermediateCacher = {
    defaultCopy(extra)
  }
}

object IntermediateCacher extends DefaultParamsReadable[IntermediateCacher] 
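Since the class above is shown in full, usage is plain Spark ML; a small sketch with illustrative column names (rawDF is a hypothetical input DataFrame):

// Keep only the two ID columns and cache the projection, so downstream
// stages in the same Pipeline reuse the cached intermediate result.
val cacher = new IntermediateCacher()
  .setInputCols(Array("user_id", "repo_id"))

val cachedDF = cacher.transform(rawDF)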
Example 5
Source File: UserRepoTransformer.scala    From albedo    with MIT License
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._

class UserRepoTransformer(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("userRepoTransformer"))
  }

  val inputCols: StringArrayParam = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  override def transformSchema(schema: StructType): StructType = {
    // Every configured input column must already exist in the incoming schema.
    $(inputCols).foreach { inputColName =>
      require(schema.fieldNames.contains(inputColName), s"Input column $inputColName must exist.")
    }

    // Two derived integer columns are appended by transform().
    val newFields: Array[StructField] = Array(
      StructField("repo_language_index_in_user_recent_repo_languages", IntegerType, nullable = false),
      StructField("repo_language_count_in_user_recent_repo_languages", IntegerType, nullable = false)
    )
    StructType(schema.fields ++ newFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    import dataset.sparkSession.implicits._

    dataset
      .withColumn("repo_language_index_in_user_recent_repo_languages", repoLanguageIndexInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
      .withColumn("repo_language_count_in_user_recent_repo_languages", repoLanguageCountInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
  }

  override def copy(extra: ParamMap): UserRepoTransformer = {
    defaultCopy(extra)
  }
}

object UserRepoTransformer extends DefaultParamsReadable[UserRepoTransformer]
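Usage follows the same shape as IntermediateCacher above; a sketch with a hypothetical input DataFrame userRepoDF that carries the two expected columns (the UDFs themselves come from ws.vinta.albedo.closures.UDFs):

// Validate the input columns exist, then append the derived
// repo-language index and count columns.
val userRepoTransformer = new UserRepoTransformer()
  .setInputCols(Array("repo_language", "user_recent_repo_languages"))

val transformedDF = userRepoTransformer.transform(userRepoDF)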