java.text.Normalizer Scala Examples

The following examples show how to use java.text.Normalizer. They are drawn from open-source Scala projects; the original project and source file are noted above each example.
Example 1
Source File: TextUtil.scala    From play-table-of-contents   with MIT License
package util

//https://gist.github.com/sam/5213151

object TextUtil {

  def slugify(input: String): String = {
    import java.text.Normalizer
    Normalizer
      .normalize(input, Normalizer.Form.NFD)
      .replaceAll("[^\\w\\s-]", "") // Remove all non-word, non-space or non-dash characters
      .replace('-', ' ') // Replace dashes with spaces
      .trim              // Trim leading/trailing whitespace (including what used to be leading/trailing dashes)
      .replaceAll("\\s+", "-") // Replace whitespace (including newlines and repetitions) with single dashes
      .toLowerCase // Lowercase the final results
  }
} 
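A quick usage sketch (the input string is made up; the expected result follows from the NFD decomposition plus Java's ASCII-only \w character class):

TextUtil.slugify("Héllo, Wörld!")  // "hello-world"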
Example 2
Source File: StringUtils.scala    From hepek   with Apache License 2.0
package ba.sake.hepek.utils

import java.text.Normalizer

object StringUtils {
  private val UnsafeURLCharsRegex = """[& +$,:;=?@"#{}|^~\[`%!'\]./()*\\]"""

  
  def unindent(str: String): String = {
    var minWhitespaceLength = Int.MaxValue
    str.linesIterator.foreach { line =>
      val currLength = line.takeWhile(c => c == ' ' || c == '\t').length
      // ignore blank / whitespace-only lines
      if (currLength < minWhitespaceLength && !line.matches("^\\s*$")) {
        minWhitespaceLength = currLength
      }
    }
    // drop the first minWhitespaceLength characters of every line,
    // whether they are spaces or tabs
    str.linesIterator
      .map { line =>
        val res = line.zipWithIndex
          .dropWhile { case (_, index) => index < minWhitespaceLength }
          .map(_._1)
        res.mkString
      }
      .mkString("\n")
  }
} 
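A quick usage sketch of unindent (input and expected output are illustrative):

val raw = "  first\n    second\n  third"
StringUtils.unindent(raw)  // "first\n  second\nthird"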
Example 3
Source File: Slugs.scala    From HAT2.0   with GNU Affero General Public License v3.0
package org.hatdex.hat.utils

object Slugs {
  def slugify(str: String): String = {
    import java.text.Normalizer
    Normalizer.normalize(str, Normalizer.Form.NFD)
      .replaceAll("[^\\w ]", "")
      .replace(" ", "-")
      .toLowerCase
  }

  def slugifyUnique(str: String, suffix: Option[String], existing: Seq[String]): String =
    generateUniqueSlug(slugify(str), suffix, existing)

  private def generateUniqueSlug(slug: String, suffix: Option[String], existingSlugs: Seq[String]): String = {
    val slugSuffix = suffix.getOrElse("")
    if (!(existingSlugs contains slug + slugSuffix)) {
      s"$slug$slugSuffix"
    }
    else {
      val endsWithNumber = s"(.+-)([0-9]+)$slugSuffix".r
      val suffixes = existingSlugs.map {
        case endsWithNumber(_, number) => number.toInt
        case _                         => 0
      }
      s"$slug-${suffixes.max + 1}$slugSuffix"
    }
  }
} 
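A quick usage sketch (inputs and expected results are illustrative):

Slugs.slugify("Hello World")                                  // "hello-world"
Slugs.slugifyUnique("Hello World", None, Seq("hello-world"))  // "hello-world-1"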
Example 4
Source File: MllibHelper.scala    From twitter-stream-ml   with GNU General Public License v3.0
package com.giorgioinf.twtml.spark

import java.text.Normalizer
import org.apache.spark.Logging
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import scala.math.BigDecimal
import twitter4j.Status

object MllibHelper extends Logging {

  val numNumberFeatures = 4

  var numRetweetBegin = 100
  var numRetweetEnd = 1000
  var numTextFeatures = 1000
  var hashText = new HashingTF(numTextFeatures)
  var numFeatures = numTextFeatures + numNumberFeatures
  var numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

  def reset(conf: ConfArguments): Unit = {
    numRetweetBegin = conf.numRetweetBegin
    numRetweetEnd = conf.numRetweetEnd
    numTextFeatures = conf.numTextFeatures

    // reassign the object-level fields; declaring new local `var`s here
    // would leave the shared state unchanged
    hashText = new HashingTF(numTextFeatures)
    numFeatures = numTextFeatures + numNumberFeatures
    numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

    log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures")
  }

  
  def featurizeText(statuses: Status): SparseVector = {
    val text = statuses.getRetweetedStatus
      .getText
      .toLowerCase

    // Separate accents from their base characters (NFD), then strip the
    // combining marks so only unaccented text remains
    val noAccentText = Normalizer
      .normalize(text, Normalizer.Form.NFD)
      .replaceAll("\\p{M}", "")

    // hash character bigrams of the accent-stripped text
    hashText.transform(noAccentText.sliding(2).toSeq)
      .asInstanceOf[SparseVector]
  }

  def featurizeNumbers(statuses: Status): Vector = {
    val user = statuses.getRetweetedStatus.getUser
    val created = statuses.getRetweetedStatus.getCreatedAt
    val timeLeft = (System.currentTimeMillis - created.getTime)

    Vectors.dense(
      user.getFollowersCount * Math.pow(10, -12),
      user.getFavouritesCount * Math.pow(10, -12),
      user.getFriendsCount * Math.pow(10, -12),
      timeLeft * Math.pow(10, -14)
      //retweeted.getURLEntities.length,
      //retweeted.getUserMentionEntities.length
    )
  }

  def featurize(statuses: Status): LabeledPoint = {
    val textFeatures = featurizeText(statuses)
    val numberFeatures = featurizeNumbers(statuses)
    val features = Vectors.sparse(
      numFeatures,
      textFeatures.indices ++ numberFeatureIndices,
      textFeatures.values ++ numberFeatures.toArray
    )
    LabeledPoint( statuses.getRetweetedStatus.getRetweetCount.toDouble, features )
  }

  def retweetInterval(statuses: Status, start:Long, end:Long):Boolean = {
    val n = statuses.getRetweetedStatus.getRetweetCount
    (n >= start && n <= end)
  }

  def filtrate(statuses: Status): Boolean = {
    (
      statuses.isRetweet &&
      //statuses.getLang == "en" &&
      retweetInterval(statuses, numRetweetBegin, numRetweetEnd)
    )
  }
} 
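The interesting step in featurize is how the hashed text features and the dense number features are concatenated into one sparse vector. Below is a small stand-alone sketch of that concatenation with made-up sizes, indices and values (none of them come from the twitter-stream-ml project):

import org.apache.spark.mllib.linalg.Vectors

// hypothetical layout: 5 text-feature slots followed by 2 number-feature slots
val numTextFeatures      = 5
val numberFeatureIndices = Array(5, 6)

val textIndices  = Array(1, 3)        // non-zero hashed bigram positions
val textValues   = Array(1.0, 2.0)
val numberValues = Array(0.5, 0.25)

val combined = Vectors.sparse(
  numTextFeatures + numberFeatureIndices.length,
  textIndices ++ numberFeatureIndices,
  textValues ++ numberValues)
// combined: (7,[1,3,5,6],[1.0,2.0,0.5,0.25])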
Example 5
Source File: StringUtils.scala    From gospeak   with Apache License 2.0
package gospeak.libs.scala

import java.text.Normalizer

object StringUtils {
  def leftPad(str: String, length: Int = 10, char: Char = ' '): String = {
    val toPad = (length - str.length).max(0)
    (char.toString * toPad) + str
  }

  def rightPad(str: String, length: Int = 10, char: Char = ' '): String = {
    val toPad = (length - str.length).max(0)
    str + (char.toString * toPad)
  }

  def removeDiacritics(str: String): String =
    Normalizer.normalize(str, Normalizer.Form.NFD)
      .replaceAll("\\p{InCombiningDiacriticalMarks}+", "")

  def slugify(str: String): String =
    removeDiacritics(str).trim.toLowerCase()
      .replaceAll("[ _+'\"]", "-")
      .replaceAll("--+", "-")
      .replaceAll("[^a-z0-9-]", "")
} 
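A quick usage sketch (inputs and expected results are illustrative):

StringUtils.leftPad("42", 5, '0')      // "00042"
StringUtils.slugify("Café com Leite")  // "cafe-com-leite"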
Example 6
Source File: Dns1123Formatter.scala    From cloudflow   with Apache License 2.0
package cloudflow.blueprint.deployment

import java.text.Normalizer


object Dns1123Formatter {
  // NOTE: the original file elides the `normalize` and `trim` helpers; the
  // versions below are a minimal reconstruction so the snippet compiles,
  // not necessarily the project's exact implementation.
  private def normalize(label: String): String =
    Normalizer.normalize(label, Normalizer.Form.NFKD).replaceAll("\\p{M}", "").toLowerCase.replaceAll("[^a-z0-9-]", "-")

  private def trim(label: String): String =
    label.dropWhile(_ == '-').reverse.dropWhile(_ == '-').reverse

  def transformToDNS1123SubDomain(name: String): String = {
    val subDomainMaxLength = 253
    name
      .split('.')
      .map(label ⇒ trim(normalize(label)))
      .mkString(".")
      .take(subDomainMaxLength)
      .stripSuffix(".")
  }
}
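A quick check, assuming the reconstructed helpers above (the exact normalization rules in the original cloudflow source may differ):

Dns1123Formatter.transformToDNS1123SubDomain("My App.Stream_01")
// with the sketch helpers: "my-app.stream-01"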
}