java.text.Normalizer Scala Examples

The following examples show how to use java.text.Normalizer. They are drawn from open-source Scala projects; the original project and source file are noted above each example.
Example 1
Source File: TextUtil.scala    From play-table-of-contents   with MIT License
package util

//https://gist.github.com/sam/5213151

object TextUtil {

  def slugify(input: String): String = {
    import java.text.Normalizer
    Normalizer
      .normalize(input, Normalizer.Form.NFD)
      .replaceAll("[^\\w\\s-]", "") // Remove all non-word, non-space or non-dash characters
      .replace('-', ' ') // Replace dashes with spaces
      .trim              // Trim leading/trailing whitespace (including what used to be leading/trailing dashes)
      .replaceAll("\\s+", "-") // Replace whitespace (including newlines and repetitions) with single dashes
      .toLowerCase // Lowercase the final results
  }
} 
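A quick usage sketch (the input string is made up; the expected result follows from the NFD decomposition plus Java's ASCII-only \w character class):

TextUtil.slugify("Héllo, Wörld!")  // "hello-world"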
Example 2
Source File: StringUtils.scala    From hepek   with Apache License 2.0
package ba.sake.hepek.utils

import java.text.Normalizer

object StringUtils {
  private val UnsafeURLCharsRegex = """[& +$,:;=?@"#{}|^~\[`%!'\]./()*\\]"""

  
  def unindent(str: String): String = {
    var minWhitespaceLength = Int.MaxValue
    str.linesIterator.foreach { line =>
      val currLength = line.takeWhile(c => c == ' ' || c == '\t').length
      // ignore blank / whitespace-only lines
      if (currLength < minWhitespaceLength && !line.matches("^\\s*$")) {
        minWhitespaceLength = currLength
      }
    }
    // drop the first minWhitespaceLength characters of every line,
    // whether they are spaces or tabs
    str.linesIterator
      .map { line =>
        val res = line.zipWithIndex
          .dropWhile { case (_, index) => index < minWhitespaceLength }
          .map(_._1)
        res.mkString
      }
      .mkString("\n")
  }
} 
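A quick usage sketch of unindent (input and expected output are illustrative):

val raw = "  first\n    second\n  third"
StringUtils.unindent(raw)  // "first\n  second\nthird"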
Example 3
Source File: Slugs.scala    From HAT2.0   with GNU Affero General Public License v3.0
package org.hatdex.hat.utils

object Slugs {
  def slugify(str: String): String = {
    import java.text.Normalizer
    Normalizer.normalize(str, Normalizer.Form.NFD)
      .replaceAll("[^\\w ]", "")
      .replace(" ", "-")
      .toLowerCase
  }

  def slugifyUnique(str: String, suffix: Option[String], existing: Seq[String]): String =
    generateUniqueSlug(slugify(str), suffix, existing)

  private def generateUniqueSlug(slug: String, suffix: Option[String], existingSlugs: Seq[String]): String = {
    val slugSuffix = suffix.getOrElse("")
    if (!(existingSlugs contains slug + slugSuffix)) {
      s"$slug$slugSuffix"
    }
    else {
      val endsWithNumber = s"(.+-)([0-9]+)$slugSuffix".r
      val suffixes = existingSlugs.map {
        case endsWithNumber(_, number) => number.toInt
        case _                         => 0
      }
      s"$slug-${suffixes.max + 1}$slugSuffix"
    }
  }
} 
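A quick usage sketch (inputs and expected results are illustrative):

Slugs.slugify("Hello World")                                  // "hello-world"
Slugs.slugifyUnique("Hello World", None, Seq("hello-world"))  // "hello-world-1"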
Example 4
Source File: MllibHelper.scala    From twitter-stream-ml   with GNU General Public License v3.0
package com.giorgioinf.twtml.spark

import java.text.Normalizer
import org.apache.spark.Logging
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import scala.math.BigDecimal
import twitter4j.Status

object MllibHelper extends Logging {

  val numNumberFeatures = 4

  var numRetweetBegin = 100
  var numRetweetEnd = 1000
  var numTextFeatures = 1000
  var hashText = new HashingTF(numTextFeatures)
  var numFeatures = numTextFeatures + numNumberFeatures
  var numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

  def reset(conf: ConfArguments): Unit = {
    numRetweetBegin = conf.numRetweetBegin
    numRetweetEnd = conf.numRetweetEnd
    numTextFeatures = conf.numTextFeatures

    // reassign the object-level fields; declaring new local `var`s here
    // would leave the shared state unchanged
    hashText = new HashingTF(numTextFeatures)
    numFeatures = numTextFeatures + numNumberFeatures
    numberFeatureIndices = (numTextFeatures to numFeatures-1).toArray

    log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures")
  }

  
  def featurizeText(statuses: Status): SparseVector = {
    val text = statuses.getRetweetedStatus
      .getText
      .toLowerCase

    // Separate accents from their base characters (NFD), then strip the
    // combining marks so only unaccented text remains
    val noAccentText = Normalizer
      .normalize(text, Normalizer.Form.NFD)
      .replaceAll("\\p{M}", "")

    // hash character bigrams of the accent-stripped text
    hashText.transform(noAccentText.sliding(2).toSeq)
      .asInstanceOf[SparseVector]
  }

  def featurizeNumbers(statuses: Status): Vector = {
    val user = statuses.getRetweetedStatus.getUser
    val created = statuses.getRetweetedStatus.getCreatedAt
    val timeLeft = (System.currentTimeMillis - created.getTime)

    Vectors.dense(
      user.getFollowersCount * Math.pow(10, -12),
      user.getFavouritesCount * Math.pow(10, -12),
      user.getFriendsCount * Math.pow(10, -12),
      timeLeft * Math.pow(10, -14)
      //retweeted.getURLEntities.length,
      //retweeted.getUserMentionEntities.length
    )
  }

  def featurize(statuses: Status): LabeledPoint = {
    val textFeatures = featurizeText(statuses)
    val numberFeatures = featurizeNumbers(statuses)
    val features = Vectors.sparse(
      numFeatures,
      textFeatures.indices ++ numberFeatureIndices,
      textFeatures.values ++ numberFeatures.toArray
    )
    LabeledPoint( statuses.getRetweetedStatus.getRetweetCount.toDouble, features )
  }

  def retweetInterval(statuses: Status, start:Long, end:Long):Boolean = {
    val n = statuses.getRetweetedStatus.getRetweetCount
    (n >= start && n <= end)
  }

  def filtrate(statuses: Status): Boolean = {
    (
      statuses.isRetweet &&
      //statuses.getLang == "en" &&
      retweetInterval(statuses, numRetweetBegin, numRetweetEnd)
    )
  }
} 
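The interesting step in featurize is how the hashed text features and the dense number features are concatenated into one sparse vector. Below is a small stand-alone sketch of that concatenation with made-up sizes, indices and values (none of them come from the twitter-stream-ml project):

import org.apache.spark.mllib.linalg.Vectors

// hypothetical layout: 5 text-feature slots followed by 2 number-feature slots
val numTextFeatures      = 5
val numberFeatureIndices = Array(5, 6)

val textIndices  = Array(1, 3)        // non-zero hashed bigram positions
val textValues   = Array(1.0, 2.0)
val numberValues = Array(0.5, 0.25)

val combined = Vectors.sparse(
  numTextFeatures + numberFeatureIndices.length,
  textIndices ++ numberFeatureIndices,
  textValues ++ numberValues)
// combined: (7,[1,3,5,6],[1.0,2.0,0.5,0.25])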
Example 5
Source File: StringUtils.scala    From gospeak   with Apache License 2.0
package gospeak.libs.scala

import java.text.Normalizer

object StringUtils {
  def leftPad(str: String, length: Int = 10, char: Char = ' '): String = {
    val toPad = (length - str.length).max(0)
    (char.toString * toPad) + str
  }

  def rightPad(str: String, length: Int = 10, char: Char = ' '): String = {
    val toPad = (length - str.length).max(0)
    str + (char.toString * toPad)
  }

  def removeDiacritics(str: String): String =
    Normalizer.normalize(str, Normalizer.Form.NFD)
      .replaceAll("\\p{InCombiningDiacriticalMarks}+", "")

  def slugify(str: String): String =
    removeDiacritics(str).trim.toLowerCase()
      .replaceAll("[ _+'\"]", "-")
      .replaceAll("--+", "-")
      .replaceAll("[^a-z0-9-]", "")
} 
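A quick usage sketch (inputs and expected results are illustrative):

StringUtils.leftPad("42", 5, '0')      // "00042"
StringUtils.slugify("Café com Leite")  // "cafe-com-leite"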
Example 6
Source File: Dns1123Formatter.scala    From cloudflow   with Apache License 2.0
package cloudflow.blueprint.deployment

import java.text.Normalizer


object Dns1123Formatter {
  // NOTE: the original file elides the `normalize` and `trim` helpers; the
  // versions below are a minimal reconstruction so the snippet compiles,
  // not necessarily the project's exact implementation.
  private def normalize(label: String): String =
    Normalizer.normalize(label, Normalizer.Form.NFKD).replaceAll("\\p{M}", "").toLowerCase.replaceAll("[^a-z0-9-]", "-")

  private def trim(label: String): String =
    label.dropWhile(_ == '-').reverse.dropWhile(_ == '-').reverse

  def transformToDNS1123SubDomain(name: String): String = {
    val subDomainMaxLength = 253
    name
      .split('.')
      .map(label ⇒ trim(normalize(label)))
      .mkString(".")
      .take(subDomainMaxLength)
      .stripSuffix(".")
  }
}
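A quick check, assuming the reconstructed helpers above (the exact normalization rules in the original cloudflow source may differ):

Dns1123Formatter.transformToDNS1123SubDomain("My App.Stream_01")
// with the sketch helpers: "my-app.stream-01"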
}