java.text.Normalizer Scala Examples
The following examples show how to use java.text.Normalizer.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: TextUtil.scala From play-table-of-contents with MIT License | 5 votes |
package util

// https://gist.github.com/sam/5213151
object TextUtil {

  /** Turns arbitrary text into a lowercase, dash-separated slug.
    *
    * The input is first decomposed (NFD) so that accented letters become a base
    * letter followed by combining marks; the marks are then dropped by the
    * ASCII-only `\w` filter, leaving the plain base letter.
    *
    * @param input text to convert
    * @return the slug: only `[a-z0-9_]` runs joined by single dashes
    */
  def slugify(input: String): String = {
    import java.text.Normalizer
    import java.util.Locale

    Normalizer
      .normalize(input, Normalizer.Form.NFD)
      .replaceAll("[^\\w\\s-]", "") // Remove all non-word, non-space or non-dash characters
      .replace('-', ' ')            // Replace dashes with spaces
      .trim                         // Trim leading/trailing whitespace (including what used to be leading/trailing dashes)
      .replaceAll("\\s+", "-")      // Replace whitespace (including newlines and repetitions) with single dashes
      // Locale.ROOT keeps the result independent of the JVM default locale
      // (e.g. a Turkish default would otherwise map 'I' to dotless 'ı').
      .toLowerCase(Locale.ROOT)
  }
}
Example 2
Source File: StringUtils.scala From hepek with Apache License 2.0 | 5 votes |
package ba.sake.hepek.utils

import java.text.Normalizer

object StringUtils {
  // Not referenced by the code visible in this excerpt.
  private val UnsafeURLCharsRegex = """[& +$,:;=?@"#{}|^~\[`%!'\]./()*\\]"""

  /** Removes the common leading indentation from every line of `str`.
    *
    * The common indentation is the smallest count of leading space/tab
    * characters found on any non-blank line. That many characters are then
    * dropped from the start of every line (blank lines included), and the
    * lines are re-joined with `\n`.
    *
    * @param str possibly multi-line text
    * @return the text with the shared indentation stripped
    */
  def unindent(str: String): String = {
    // Indentation widths of the lines that contain something other than whitespace.
    val indentWidths = str.linesIterator
      .filterNot(_.matches("^\\s*$"))
      .map(_.takeWhile(ch => ch == ' ' || ch == '\t').length)

    // Stays at Int.MaxValue when there is no non-blank line, which empties every line.
    val commonIndent = indentWidths.foldLeft(Int.MaxValue)((a, b) => math.min(a, b))

    // Drop that many characters regardless of whether they are spaces or tabs.
    str.linesIterator.map(_.drop(commonIndent)).mkString("\n")
  }
}
Example 3
Source File: Slugs.scala From HAT2.0 with GNU Affero General Public License v3.0 | 5 votes |
package org.hatdex.hat.utils

object Slugs {

  /** Converts text into a lowercase slug.
    *
    * The text is decomposed (NFD), every character that is not an ASCII word
    * character or a space is removed (this also drops the combining marks the
    * decomposition produced), and each space becomes a dash.
    *
    * @param str text to convert
    * @return the slug
    */
  def slugify(str: String): String = {
    import java.text.Normalizer
    import java.util.Locale

    Normalizer
      .normalize(str, Normalizer.Form.NFD)
      .replaceAll("[^\\w ]", "")
      .replace(" ", "-")
      // Locale.ROOT: the result must not depend on the JVM default locale.
      .toLowerCase(Locale.ROOT)
  }

  /** Slugifies `str` and, if the result is already taken, appends a counter.
    *
    * @param str      text to convert
    * @param suffix   optional literal text appended after the slug (and after the counter)
    * @param existing slugs that are already in use
    * @return a slug (including the suffix) that is not contained in `existing`
    */
  def slugifyUnique(str: String, suffix: Option[String], existing: Seq[String]): String =
    generateUniqueSlug(slugify(str), suffix, existing)

  /** Returns `slug + suffix` when free, otherwise `slug-N + suffix` where `N` is
    * one more than the highest counter already used for this exact slug and suffix.
    */
  private def generateUniqueSlug(slug: String, suffix: Option[String], existingSlugs: Seq[String]): String = {
    import java.util.regex.Pattern

    val slugSuffix = suffix.getOrElse("")
    val candidate  = s"$slug$slugSuffix"

    if (!existingSlugs.contains(candidate)) {
      candidate
    } else {
      // Slug and suffix are quoted so they match literally: a suffix such as "(" or
      // ".md" must not be interpreted as regex syntax. Anchoring on this slug keeps
      // counters of unrelated slugs from inflating the number.
      val numbered = s"${Pattern.quote(slug)}-([0-9]+)${Pattern.quote(slugSuffix)}".r

      // BigInt: an arbitrarily long digit run must not overflow.
      val highest = existingSlugs
        .collect { case numbered(number) => BigInt(number) }
        .foldLeft(BigInt(0))(_ max _)

      s"$slug-${highest + 1}$slugSuffix"
    }
  }
}
Example 4
Source File: MllibHelper.scala From twitter-stream-ml with GNU General Public License v3.0 | 5 votes |
package com.giorgioinf.twtml.spark

import java.text.Normalizer
import org.apache.spark.Logging
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import scala.math.BigDecimal
import twitter4j.Status

/** Turns retweeted statuses into MLlib feature vectors and labeled points.
  *
  * A feature vector has `numTextFeatures` hashed text slots followed by
  * `numNumberFeatures` numeric slots.
  */
object MllibHelper extends Logging {

  /** Number of numeric features produced by [[featurizeNumbers]]. */
  val numNumberFeatures = 4

  var numRetweetBegin = 100
  var numRetweetEnd = 1000
  var numTextFeatures = 1000

  // Derived from numTextFeatures; must be recomputed whenever it changes (see reset).
  var hashText = new HashingTF(numTextFeatures)
  var numFeatures = numTextFeatures + numNumberFeatures
  var numberFeatureIndices = (numTextFeatures to numFeatures - 1).toArray

  /** Applies the retweet range and text-feature count from `conf` and rebuilds
    * the values derived from the text-feature count.
    *
    * @param conf configuration providing `numRetweetBegin`, `numRetweetEnd` and `numTextFeatures`
    */
  def reset(conf: ConfArguments): Unit = {
    numRetweetBegin = conf.numRetweetBegin
    numRetweetEnd = conf.numRetweetEnd
    numTextFeatures = conf.numTextFeatures

    // Plain assignments (no `var`): these must update the object's fields,
    // not create locals that are discarded when the method returns.
    hashText = new HashingTF(numTextFeatures)
    numFeatures = numTextFeatures + numNumberFeatures
    numberFeatureIndices = (numTextFeatures to numFeatures - 1).toArray

    log.debug(s"retweet range: ($numRetweetBegin - $numRetweetEnd), numTextFeatures: $numTextFeatures")
  }

  /** Hashes the character bigrams of the retweeted status text.
    *
    * @param statuses a status whose `getRetweetedStatus` is used as the text source
    * @return term-frequency vector of size `numTextFeatures`
    */
  def featurizeText(statuses: Status): SparseVector = {
    val text = statuses.getRetweetedStatus
      .getText
      .toLowerCase(java.util.Locale.ROOT) // locale-independent lowercasing

    // Separate accents from their base characters (NFD) and then remove the
    // combining marks, so accented and unaccented spellings hash identically.
    val noAccentText = Normalizer
      .normalize(text, Normalizer.Form.NFD)
      .replaceAll("\\p{M}", "")

    // bigrams of the accent-stripped text
    hashText.transform(noAccentText.sliding(2).toSeq)
      .asInstanceOf[SparseVector]
  }

  /** Builds the numeric features of the retweeted status: follower, favourite
    * and friend counts of its user plus the time since creation, each scaled
    * down by a fixed power of ten.
    *
    * @param statuses a status whose `getRetweetedStatus` is used as the data source
    * @return dense vector with `numNumberFeatures` entries
    */
  def featurizeNumbers(statuses: Status): Vector = {
    val user = statuses.getRetweetedStatus.getUser
    val created = statuses.getRetweetedStatus.getCreatedAt
    // Difference between the current time and the creation timestamp.
    val timeLeft = (System.currentTimeMillis - created.getTime)

    Vectors.dense(
      user.getFollowersCount * Math.pow(10, -12),
      user.getFavouritesCount * Math.pow(10, -12),
      user.getFriendsCount * Math.pow(10, -12),
      timeLeft * Math.pow(10, -14)
      //retweeted.getURLEntities.length,
      //retweeted.getUserMentionEntities.length
    )
  }

  /** Combines text and numeric features into one sparse vector, labeled with
    * the retweet count of the retweeted status.
    *
    * @param statuses a status whose `getRetweetedStatus` is used as the data source
    * @return labeled point whose feature vector has `numFeatures` entries
    */
  def featurize(statuses: Status): LabeledPoint = {
    val textFeatures = featurizeText(statuses)
    val numberFeatures = featurizeNumbers(statuses)

    // Numeric features occupy the index range directly after the text features.
    val features = Vectors.sparse(
      numFeatures,
      textFeatures.indices ++ numberFeatureIndices,
      textFeatures.values ++ numberFeatures.toArray
    )

    LabeledPoint(
      statuses.getRetweetedStatus.getRetweetCount.toDouble,
      features
    )
  }

  /** @return true when the retweet count of the retweeted status lies in `[start, end]` */
  def retweetInterval(statuses: Status, start: Long, end: Long): Boolean = {
    val n = statuses.getRetweetedStatus.getRetweetCount
    (n >= start && n <= end)
  }

  /** @return true when the status is a retweet whose retweet count lies within
    *         `[numRetweetBegin, numRetweetEnd]`
    */
  def filtrate(statuses: Status): Boolean = {
    (
      statuses.isRetweet &&
      //statuses.getLang == "en" &&
      retweetInterval(statuses, numRetweetBegin, numRetweetEnd)
    )
  }
}
Example 5
Source File: StringUtils.scala From gospeak with Apache License 2.0 | 5 votes |
package gospeak.libs.scala

import java.text.Normalizer

object StringUtils {

  /** Left-pads `str` with `char` until it is `length` characters long.
    * Strings already at least `length` long are returned unchanged.
    */
  def leftPad(str: String, length: Int = 10, char: Char = ' '): String = {
    val toPad = (length - str.length).max(0)
    (char.toString * toPad) + str
  }

  /** Right-pads `str` with `char` until it is `length` characters long.
    * Strings already at least `length` long are returned unchanged.
    */
  def rightPad(str: String, length: Int = 10, char: Char = ' '): String = {
    val toPad = (length - str.length).max(0)
    str + (char.toString * toPad)
  }

  /** Strips diacritics: the text is decomposed (NFD) and every character from
    * the Combining Diacritical Marks block is removed.
    */
  def removeDiacritics(str: String): String =
    Normalizer.normalize(str, Normalizer.Form.NFD)
      .replaceAll("\\p{InCombiningDiacriticalMarks}+", "")

  /** Converts text into a slug made only of `[a-z0-9-]`.
    *
    * Diacritics are removed, the text is trimmed and lowercased, the separator
    * characters space, `_`, `+`, `'` and `"` become dashes, every other
    * unsupported character is dropped, and runs of dashes are collapsed.
    *
    * @param str text to convert
    * @return the slug
    */
  def slugify(str: String): String =
    removeDiacritics(str).trim
      // Locale.ROOT: lowercasing must not depend on the JVM default locale.
      .toLowerCase(java.util.Locale.ROOT)
      .replaceAll("[ _+'\"]", "-")
      .replaceAll("[^a-z0-9-]", "")
      // Collapse last: dropping a character between two dashes ("a-&-b")
      // would otherwise leave a double dash behind.
      .replaceAll("--+", "-")
}
Example 6
Source File: Dns1123Formatter.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.blueprint.deployment

import java.text.Normalizer

  /** Rewrites `name` label by label and caps the result at 253 characters.
    *
    * The name is split on dots, each label is passed through `normalize` and
    * then `trim`, and the labels are re-joined with dots. The joined text is cut
    * to 253 characters and one trailing dot, if present, is removed.
    *
    * NOTE(review): `trim` and `normalize` are helpers that are not visible in
    * this excerpt; their exact behavior should be confirmed there.
    *
    * @param name dot-separated name to transform
    * @return the transformed name, at most 253 characters long
    */
  def transformToDNS1123SubDomain(name: String): String = {
    val maxSubDomainLength = 253
    val labels = name.split('.').map(label => trim(normalize(label)))
    labels.mkString(".").take(maxSubDomainLength).stripSuffix(".")
  }
} // closes an enclosing definition whose header is not visible in this excerpt