it.unimi.dsi.lang.MutableString#length

Source File: Chars.java From tagme with Apache License 2.0

6 votes

/**
 * Splits the input string into char sequences, using {@code c} as the delimiter
 * (the delimiter itself is discarded).
 *
 * <p>If {@code c} does not occur, the returned array contains {@code input} itself.
 * An empty token that precedes a delimiter (leading or between two consecutive
 * delimiters) is returned as an empty {@link MutableString}; an empty token after the
 * last delimiter is not returned.
 *
 * @param input the string to split.
 * @param c the delimiter character.
 * @return the sub-strings found between delimiters.
 */
public static CharSequence[] split (MutableString input, char c){

	int hit = input.indexOf(c);
	if (hit < 0) return new CharSequence[]{input};

	final ObjectArrayList<CharSequence> pieces = new ObjectArrayList<CharSequence>();
	int from = 0;
	do {
		// One token per delimiter occurrence, possibly empty.
		pieces.add(from < hit ? input.subSequence(from, hit) : new MutableString(""));
		from = hit + 1;
		hit = input.indexOf(c, from);
	} while (hit >= 0);

	// Whatever follows the last delimiter is a token only when it is non-empty.
	final int total = input.length();
	if (from < total) pieces.add(input.subSequence(from, total));
	return pieces.toArray(Chars.EMPTY_STRINGS);
}

Source File: FrontCodedStringList.java From database with GNU General Public License v2.0

6 votes

/** Returns the element at the specified position in this front-coded list by storing it in a mutable string.
 *
 * <p>In UTF-8 mode the encoded bytes are fetched, the string is resized to the decoded
 * character count and the bytes are decoded into its backing array. Otherwise the
 * characters are extracted directly into the backing array; when that call returns a
 * negative value the string is enlarged by that amount (presumably the number of
 * missing characters) and the extraction is repeated.
 *
 * @param index an index in the list.
 * @param s a mutable string that will contain the string at the specified position.
 */
public void get( final int index, MutableString s ) { 
	if ( ! utf8 ) {
		final int stored = charFrontCodedList.get( index, s.array() );
		if ( stored >= 0 ) {
			s.length( stored );
			return;
		}
		// The backing array was too short: grow the string and extract again.
		s.length( s.array().length - stored );
		charFrontCodedList.get( index, s.array() );
		return;
	}

	final byte[] encoded = byteFrontCodedList.getArray( index );
	s.length( countUTF8Chars( encoded ) );
	byte2Char( encoded, s.array() );
}

Source File: Chars.java From tagme with Apache License 2.0

5 votes

/**
 * Converts a string from UTF-16 to ASCII, in place.
 *
 * <p>Each of the first {@code input.length()} characters of the backing array is
 * replaced by the entry of {@code UTF16toASCII.MAP} indexed by that character.
 *
 * @param input the string to convert; its content is overwritten.
 */
public static void normalizeASCII(MutableString input)
{
	final char[] backing = input.array();
	for (int idx = 0, n = input.length(); idx < n; idx++) {
		final char current = backing[idx];
		backing[idx] = UTF16toASCII.MAP[current];
	}
}

Source File: FastBufferedReader.java From database with GNU General Public License v2.0

5 votes

/** Extracts the next word and the run of non-word characters that follows it.
 *
 * <p>Both mutable strings are emptied and then filled by appending characters from the
 * internal buffer: first the maximal run of characters accepted by
 * {@code isWordConstituent()}, then the maximal run of characters rejected by it.
 * Either run may be empty.
 *
 * @param word receives the word (possibly empty).
 * @param nonWord receives the non-word run following the word (possibly empty).
 * @return <code>false</code> if no characters were available when the method was called
 * (in which case the arguments are left untouched), <code>true</code> otherwise.
 */
public boolean next( final MutableString word, final MutableString nonWord ) throws IOException {
	int i;
	final char buffer[] = this.buffer;

	// Nothing left to read: report end of input before modifying the arguments.
	if ( noMoreCharacters() ) return false;

	word.length( 0 );
	nonWord.length( 0 );

	// Phase 1: accumulate word constituents, possibly across several buffer fills.
	for(;;) {
		for( i = 0; i < avail && isWordConstituent( buffer[ pos + i ] ); i++ );

		word.append( buffer, pos, i  );
		pos += i; 
		avail -= i;
		
		// Stop if the scan hit a non-constituent still in the buffer, or if the input is over;
		// otherwise noMoreCharacters() has presumably refilled the buffer and the scan goes on.
		if ( avail > 0 || noMoreCharacters() ) break;
	}
	
	// The input ended right after the word: the non-word part stays empty.
	if ( noMoreCharacters() ) return true;

	// Phase 2: same scan with the predicate negated, accumulating the non-word run.
	for(;;) {
		for( i = 0; i < avail && ! isWordConstituent( buffer[ pos + i ] ); i++ );

		nonWord.append( buffer, pos, i  );
		pos += i; 
		avail -= i;

		if ( avail > 0 || noMoreCharacters() ) return true;
	}
}

Source File: ImmutableExternalPrefixMap.java From database with GNU General Public License v2.0

5 votes

/** Returns the rank of a term, or -1 if the term is not in the map.
 *
 * <p>The term is looked up by sequentially decoding the front-coded strings of the
 * left block of its approximated interval and comparing each of them with the term.
 *
 * @param o the term, which must be a {@link CharSequence}.
 * @return the rank of the term, or -1 if it cannot be encoded, its approximated
 * interval is empty, or it does not appear in the scanned block.
 */
private long getIndex( final Object o ) {
	final CharSequence term = (CharSequence)o;
	ensureStream();
	// A term containing a character unknown to the prefix coder cannot be in the map.
	if ( ! isEncodable( term ) ) return -1;

	/* Any string extending term must follow term, so term can only be found
	 * in the left block of an approximated interval for itself. */
	final Interval candidates = intervalApproximator.getApproximatedInterval( term );
	if ( candidates == Intervals.EMPTY_INTERVAL ) return -1;

	try {
		dumpStream.position( blockOffset[ candidates.left ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;
		final MutableString current = new MutableString();
		int rank = blockStart[ candidates.left ];
		final int blockEnd = blockStart[ candidates.left + 1 ];
		boolean firstInBlock = true;

		/* Scan the dump file until the block is exhausted. */
		for( ; rank < blockEnd; rank++ ) {
			final int shared;
			if ( firstInBlock ) {
				// The first string of a block has no stored prefix length.
				shared = 0;
				firstInBlock = false;
			}
			else shared = dumpStream.readUnary();
			final int fresh = dumpStream.readUnary();

			// Keep the shared prefix of the previous string and decode the new suffix after it.
			current.delete( shared, current.length() );
			current.length( shared + fresh );
			for( int k = 0; k < fresh; k++ ) current.charAt( shared + k, symbol2char[ decoder.decode( dumpStream ) ] );

			if ( current.equals( term ) ) return rank;
		}

		return -1;
	}
	catch (IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
}

Source File: ImmutableExternalPrefixMap.java From database with GNU General Public License v2.0

5 votes

/** Decodes the term of given rank into a mutable string.
 *
 * <p>The block containing the term is located by binary search on {@code blockStart};
 * the front-coded strings of that block are then decoded one after the other, each
 * overwriting the previous one in {@code s}, until the requested term is reached.
 *
 * @param index the rank of the term.
 * @param s the mutable string that will contain the term.
 * @return {@code s}.
 */
protected MutableString getTerm( final int index, final MutableString s ) {
	ensureStream();
	// On a miss, binarySearch returns -(insertion point) - 1: the enclosing block precedes the insertion point.
	final int found = Arrays.binarySearch( blockStart, index );
	final int block = found >= 0 ? found : - found - 2;

	try {
		dumpStream.position( blockOffset[ block ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;

		final int toDecode = index - blockStart[ block ] + 1;
		for( int decoded = 0; decoded != toDecode; decoded++ ) {
			// The first string of a block has no stored prefix length.
			final int shared = decoded == 0 ? 0 : dumpStream.readUnary();
			final int fresh = dumpStream.readUnary();

			// Keep the shared prefix and decode the new suffix after it.
			s.delete( shared, s.length() );
			s.length( shared + fresh );
			for( int k = 0; k < fresh; k++ ) s.charAt( shared + k, symbol2char[ decoder.decode( dumpStream ) ] );
		}

		return s;
	}
	catch( IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
}

Source File: BulletParser.java From database with GNU General Public License v2.0

5 votes

/**
 * Replaces entities with the corresponding characters.
 * 
 * <P>This method modifies the mutable string <code>s</code> in place: every position
 * holding an ampersand at which
 * {@link #scanEntity(char[], int, int, boolean, MutableString)} recognises an entity
 * is replaced by the content of <code>lastEntity</code> (presumably set by the scan).
 * 
 * @param s a mutable string whose entities will be replaced by the corresponding characters.
 * @param entity a support mutable string used by {@link #scanEntity(char[], int, int, boolean, MutableString)}.
 * @param loose a parameter that will be passed to {@link #scanEntity(char[], int, int, boolean, MutableString)}.
 */
protected void replaceEntities( final MutableString s, final MutableString entity, final boolean loose ) {

	final char[] a = s.array();
	int length = s.length();

	/* The string is walked right to left, so a replacement never moves
	 * the positions that are still to be examined. */
	for( int at = length - 1; at >= 0; at-- ) {
		if ( a[ at ] != '&' ) continue;

		final int entityEnd = scanEntity( a, at, length - at, loose, entity );
		if ( entityEnd == -1 ) continue;

		// Track the current length, as it bounds the following scans.
		length = s.replace( at, entityEnd, lastEntity ).length();
	}
}

Source File: RuntimeConfiguration.java From BUbiNG with Apache License 2.0

5 votes

/** Adds a new IPv4 address (or a set of them) to the black list; the address can be specified directly,
 *  or the specification can name a file (prefixed by <code>file:</code>) containing one address per line.
 *
 * <p>Empty specifications and empty lines are skipped. The file is read as ISO-8859-1 and
 * is closed before this method returns, whether it completes normally or not.
 *
 * @param spec the specification (an IP address, or a file name prefixed by <code>file:</code>).
 * @throws ConfigurationException if an address is rejected by <code>handleIPv4()</code>, or if an
 * I/O error occurs while closing the file.
 * @throws FileNotFoundException if the file named by the specification cannot be opened.
 */
public void addBlackListedIPv4(final String spec) throws ConfigurationException, FileNotFoundException {
	if (spec.length() == 0) return; // Skip empty specs
	if (!spec.startsWith("file:")) {
		blackListedIPv4Addresses.add(handleIPv4(spec));
		return;
	}

	final String fileName = spec.substring(5);
	// Opened outside the try block so that a missing file still surfaces as FileNotFoundException.
	final FileInputStream fileStream = new FileInputStream(fileName);
	// Closing the outermost reader also closes the underlying file stream.
	try (InputStreamReader reader = new InputStreamReader(fileStream, Charsets.ISO_8859_1)) {
		final LineIterator lineIterator = new LineIterator(new FastBufferedReader(reader));
		while (lineIterator.hasNext()) {
			final MutableString line = lineIterator.next();
			if (line.length() > 0) blackListedIPv4Addresses.add(handleIPv4(line.toString()));
		}
	} catch (java.io.IOException e) {
		// The declared exceptions cannot change, so I/O failures are reported as configuration errors.
		throw new ConfigurationException("I/O error while processing black list file " + fileName, e);
	}
}

Source File: WikiTextExtractor.java From tagme with Apache License 2.0

5 votes

/**
 * Extracts one link per bulleted line of a cleaned disambiguation page.
 *
 * <p>The text is read line by line; only lines whose first non-blank character is
 * <code>'*'</code> are considered. The leading run of asterisks is removed and the
 * first link found in the remainder of the line (if any) is collected.
 *
 * @param cleanText the cleaned text of the page.
 * @return the collected links, in order of appearance (possibly empty).
 */
public List<WikiLink> extractDisambiguationLinks(MutableString cleanText)
	{
		FastBufferedReader tokenizer = new FastBufferedReader(cleanText);
		MutableString buffer = new MutableString(1024);
		List<WikiLink> links = new ArrayList<WikiLink>();
		
		try {
			while(tokenizer.readLine(buffer) != null)
			{
				buffer.trim();
				if (buffer.length() == 0) continue;
				if (buffer.charAt(0) != '*') continue;

				// Measure the leading run of bullet markers...
				int start = 1;
				while(start<buffer.length() && buffer.charAt(start)=='*') start++;
				// ...and drop it, keeping the item text. (Deleting [start, length) instead
				// would keep only the asterisks, so no link could ever be found.)
				buffer.delete(0, start).trim();

				if (buffer.length() == 0) continue;

				List<WikiLink> lineLinks = extractLinkFromCleanedLine(buffer);
				if (!lineLinks.isEmpty()) links.add(lineLinks.get(0));
			}
		} catch (IOException ignored){
			// Best effort: on a read failure, the links collected so far are returned.
		}
		
		return links;
		
	}

Source File: WikipediaAnchorParser.java From tagme with Apache License 2.0

5 votes

/**
 * Manages dots, removing them if they are part of an abbreviation, or replacing them with
 * a whitespace if they end a word.
 *
 * <p>The input is scanned as an alternation of <em>tokens</em> (runs of characters that are
 * neither <code>'.'</code> nor whitespace) and <em>separators</em> (runs of dots and/or
 * whitespace). Tokens are copied to the result; a single space is inserted before a token
 * only when the preceding separator was flagged as a word boundary, otherwise the token is
 * glued to the previous one. Separators at the start or end of the input never produce a space.
 *
 * @param input the string to process; it is not modified.
 * @return a new mutable string with dots removed or replaced as described above.
 */
public static MutableString removeDots(MutableString input)
{
	int len = input.length();
	char[] array = input.array();
	MutableString res = new MutableString(len);

	// True when the separator just skipped must be rendered as a space.
	boolean isLastDot = false;
	int i=0, last=0;
	while(i<len)
	{
		// Advance over a token: [last, i) is the run of regular characters.
		while(i<len && array[i]!='.' && !Character.isWhitespace(array[i]))
			i++;

		if (i>last) {
			if (isLastDot && res.length() > 0) res.append(' ');
			res.append(array, last, i-last);
		}

		isLastDot = false;
		// Advance over a separator, deciding whether it is a word boundary.
		while(i<len && (array[i]=='.' || Character.isWhitespace(array[i])))
		{
			// A separator character marks a boundary if it is whitespace, or if the character
			// two positions ahead is a regular one (so the dot is not followed by
			// "letter + dot", as inside an abbreviation), or, for the second-to-last
			// position, if the character two positions behind is a regular one.
			if (Character.isWhitespace(array[i]) ||
					(i<len-2 && array[i+2]!='.' && !Character.isWhitespace(array[i+2])) ||
					(i==len-2 && i>1 && array[i-2]!='.' && !Character.isWhitespace(array[i-2]))
					)
				isLastDot = true;
			i++;
			last=i;
		}
	}
	return res;


}

Source File: WikipediaAnchorParser.java From tagme with Apache License 2.0

5 votes

/**
 * Removes all punctuation from an anchor, i.e. removes everything but letters, digits and
 * the explicitly preserved characters, separating the surviving runs with a single space.
 *
 * @param input the string to process; it is not modified.
 * @param ignoreChars a set of characters (no digits, no letters) that are preserved; may be <code>null</code>.
 * @param ignoreSequences if true, non-whitespace characters that are equal to one of their
 * neighbours (i.e. sequences such as '!!!') are preserved too.
 * @return a new MutableString containing the preserved runs, joined by single spaces.
 */
public static MutableString removePunctuations(MutableString input, String ignoreChars, boolean ignoreSequences)
{
	final int len = input.length();
	final char[] array = input.array();
	final MutableString norm = new MutableString(len);

	int i = 0;
	while(i<len)
	{
		// Copy the next run of preserved characters, if any.
		final int runStart = i;
		while(i<len && isPreservedChar(array, i, len, ignoreChars, ignoreSequences)) i++;

		if (i>runStart) {
			if (norm.length() > 0) norm.append(' ');
			norm.append(array, runStart, i-runStart);
		}

		// Skip the run of characters to be removed.
		while(i<len && !isPreservedChar(array, i, len, ignoreChars, ignoreSequences)) i++;
	}
	return norm;
}

/** Tells whether the character at position <code>i</code> survives {@link #removePunctuations(MutableString, String, boolean)}. */
private static boolean isPreservedChar(char[] array, int i, int len, String ignoreChars, boolean ignoreSequences)
{
	final char c = array[i];
	if (Character.isLetter(c) || Character.isDigit(c)) return true;
	if (ignoreChars != null && ignoreChars.indexOf(c) >= 0) return true;
	if (!ignoreSequences || Character.isWhitespace(c)) return false;
	// Part of a sequence: equal to the previous or to the next character.
	return (i>0 && array[i-1]==c) || (i<len-1 && array[i+1]==c);
}

Source File: TransformationStrategies.java From database with GNU General Public License v2.0

4 votes

public Utf16MutableStringBitVector( final MutableString s, final boolean prefixFree ) {
	this.a = s.array();
	actualEnd = s.length() * Character.SIZE;
	length = actualEnd + ( prefixFree ? Character.SIZE : 0 );
}

Source File: TransformationStrategies.java From database with GNU General Public License v2.0

4 votes

public ISOMutableStringBitVector( final MutableString s, final boolean prefixFree ) {
	this.a = s.array();
	actualEnd = s.length() * Byte.SIZE;
	length = actualEnd + ( prefixFree ? Byte.SIZE : 0 );
}

Source File: ImmutableExternalPrefixMap.java From database with GNU General Public License v2.0

4 votes

/** Returns the interval of ranks of the terms starting with a given prefix.
 *
 * <p>The prefix is first mapped to an approximated interval of blocks. The left block is
 * then decoded sequentially to find the first term starting with the prefix, and the right
 * block (the same block, if the approximated interval contains a single point) is decoded
 * to find the first term that no longer starts with it.
 *
 * @param prefix a prefix.
 * @return the interval of ranks of the terms starting with <code>prefix</code>, or
 * {@link Intervals#EMPTY_INTERVAL} if the prefix cannot be encoded, its approximated
 * interval is empty, or no term in a single-block interval starts with it.
 */
public Interval getInterval( final CharSequence prefix ) {
	ensureStream();
	// If prefix contains any character not coded by the prefix coder, we can return the empty interval.
	if ( ! isEncodable( prefix ) ) return Intervals.EMPTY_INTERVAL;

	// We recover the left extremes of the intervals where extensions of prefix could possibly lie.
	Interval interval = intervalApproximator.getApproximatedInterval( prefix );
	// System.err.println( "Approximate interval: " + interval + " , terms: [" + blockStart[ interval.left ] + ", " + blockStart[ interval.right ] + "]" );

	if ( interval == Intervals.EMPTY_INTERVAL ) return interval;
	try {
		// Position the dump stream at the beginning of the left block.
		dumpStream.position( blockOffset[ interval.left ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;
		MutableString s = new MutableString();
		// prefixLength == -1 marks the first string of a block, which has no stored prefix length.
		int suffixLength, prefixLength = -1, count = blockStart[ interval.left ], blockEnd = blockStart[ interval.left + 1 ], start = -1, end = -1;

		/* We scan the dump file, stopping if we exhaust the block */
		while( count < blockEnd ) {
			if ( prefixLength < 0 ) prefixLength = 0;
			else prefixLength = dumpStream.readUnary();
			suffixLength = dumpStream.readUnary();
			// Keep the prefix shared with the previous string and decode the new suffix after it.
			s.delete( prefixLength, s.length() );
			s.length( prefixLength + suffixLength );
			for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
			if ( s.startsWith( prefix ) ) {
				start = count;
				break; 
			}
			count++;
		}
		
		/* If we did not find our string, there are two possibilities: if the
		 * interval contains one point, there is no string extending prefix. But
		 * if  the interval  is larger, the first string of the second block in the
		 * interval must be an extension of prefix. */
		if ( start < 0 && interval.length() == 1 ) return Intervals.EMPTY_INTERVAL;
		// Either a match was found (count == start) or the block was exhausted (count == blockEnd).
		else start = count;
		
		end = start + 1;
		//assert dumpStream.readBits() <= blockSize;

		/* If the interval contains more than one point, the last string with
		 * given prefix is necessarily contained in the last block, and we
		 * must restart the search process. */
		if ( interval.length() > 1  ) {
			dumpStream.position( blockOffset[ interval.right ] * blockSize );
			dumpStream.readBits( 0 );
			s.length( 0 );
			end = blockStart[ interval.right ];
			blockEnd = blockStart[ interval.right + 1 ];
			prefixLength = -1;
		}
		
		
		// Advance end past every term that still starts with the prefix (or up to the end of the block).
		while( end < blockEnd ) {
			if ( prefixLength < 0 ) prefixLength = 0;
			else prefixLength = dumpStream.readUnary();
			suffixLength = dumpStream.readUnary();
			s.delete( prefixLength, s.length() );
			s.length( prefixLength + suffixLength );
			for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
			if ( ! s.startsWith( prefix ) ) break;
			end++;
		}
		
		// end is exclusive here, while the returned interval has inclusive extremes.
		return Interval.valueOf( start, end - 1 );
	} catch (IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
	
}

Source File: WikiTextExtractor.java From tagme with Apache License 2.0

4 votes

/**
 * Flattens a structured text into a sequence of sentences.
 *
 * <p>The input is read line by line; every line is trimmed and handled according to its
 * first character (see the comments on the single cases). Retained lines are appended to
 * the result followed by a space, after adding a <code>'.'</code> when the text does not
 * already end with one of the characters in <code>":.;,-"</code>. The trailing space is
 * removed before returning.
 *
 * @param input the text to process.
 * @param onlyAbstract if true, processing stops as soon as, after reading a new line, the
 * collected text is longer than <code>MIN_ABSTRACT_CHARS</code> characters.
 * @return a new mutable string containing the flattened text (possibly empty).
 */
public MutableString removeStructure(MutableString input, boolean onlyAbstract)
	{
		
		MutableString buffer = new MutableString(1024);
		FastBufferedReader tokenizer = new FastBufferedReader(input);
		
		MutableString text = new MutableString(2048);
		// Characters after which no closing dot is added.
		String punts = ":.;,-";
		
		try {
			while(tokenizer.readLine(buffer) != null)
			{
				// Enough text for an abstract: drop the trailing space and stop.
				if (text.length() > MIN_ABSTRACT_CHARS && onlyAbstract){
					text.deleteCharAt(text.length()-1);
					return text;					
				}
				
//				MutableString linestr = new MutableString(buffer.trim());
				MutableString linestr = buffer.trim();
				if (linestr.length() == 0) continue;
				
				int start;
				int end;
				String chars;
				char[] line = linestr.array();
				int line_len = linestr.length();
				
				char first = linestr.charAt(0);
				switch (first)
				{
				// Lines starting with '=' (presumably headings): strip the surrounding '=' and
				// spaces and close the title with ". ".
				case '=':{
					chars = " =";
					for(start=0; start <line_len && chars.indexOf(line[start])>=0; start++);
					for(end=line_len-1; end >= 0  && chars.indexOf(line[end])>=0; end--);
					
					// Note: a title made of a single character (start == end) is discarded.
					if (start < end){
						text.append(linestr.subSequence(start, end+1));
						text.append(". ");
					}
					break;
				}
					
				// Lines starting with a list/indentation marker: strip the leading markers and spaces.
				case '*':
				case '#':
				case ':':
				case ';':{
					
					chars = "*#:; ";
					for(start=0; start<line_len && chars.indexOf(line[start])>=0 ; start++);
					
					// Note: a remainder made of a single character is discarded.
					if (start < line_len-1){
						text.append(linestr.subSequence(start, linestr.length()));
						if (punts.indexOf(text.lastChar())<0)
							text.append('.');
						text.append(' ');
					}
					
					break;
				}
				// Lines starting with '{' or '|' are skipped altogether.
				case '{':
				case '|':
					break;
				// Lines starting with '.' or '-': delete(char[]) presumably removes every
				// occurrence of these two characters from the line, not just the leading ones.
				case '.':
				case '-':{
					linestr.delete(new char[]{'.','-'});
					if (linestr.length() > 0){
						text.append(linestr);
						if (punts.indexOf(text.lastChar())<0)
							text.append('.');
						text.append(' ');
					}
					break;
				}
				// Any other line is kept as it is, unless it ends with '}'.
				default:{
					if (linestr.lastChar() == '}')
						break;
					text.append(linestr);
					if (punts.indexOf(text.lastChar())<0)
						text.append('.');
					text.append(' ');
				}
				}
			}
		// NOTE(review): a read failure is silently ignored and the text collected so far is returned.
		} catch (IOException e) {}
		// Drop the trailing space (every appended line ends with one).
		if (text.length()>0) text.deleteCharAt(text.length()-1);
		return text;
	}

Source File: LineWordReader.java From database with GNU General Public License v2.0

4 votes

/** Reads the next line as a single word.
 *
 * @param word receives the next line read from the underlying fast buffered reader.
 * @param nonWord is always emptied.
 * @return true if a line was read, false if the underlying reader returned <code>null</code>.
 */
public boolean next( final MutableString word, final MutableString nonWord ) throws IOException {
	nonWord.length( 0 );
	final MutableString line = fastBufferedReader.readLine( word );
	return line != null;
}

Source File: FastBufferedReader.java From database with GNU General Public License v2.0

4 votes

/** Reads a line into the given mutable string.
 *
 * <P>The next line of input (defined as in {@link java.io.BufferedReader#readLine()})
 * will be stored into <code>s</code>. Note that if <code>s</code> is 
 * not {@linkplain it.unimi.dsi.lang.MutableString loose}
 * this method will be quite inefficient.
 *
 * <P>The line terminator (LF, CR or CR/LF) is consumed but not stored.
 *
 * @param s a mutable string that will be used to store the next line (which could be empty).
 * @return <code>s</code>, or <code>null</code> if the end of file was found, in which
 * case <code>s</code> is unchanged.
 */

public MutableString readLine( final MutableString s ) throws IOException {
	char c = 0;
	int i;

	if ( noMoreCharacters() ) return null;

	s.length( 0 );

	for(;;) {
		// Scan the buffered characters up to the first line terminator; c holds the last character examined.
		for( i = 0; i < avail && ( c = buffer[ pos + i ] ) != '\n' && c != '\r' ; i++ );

		s.append( buffer, pos, i  );
		pos += i; 
		avail -= i;

		// avail > 0 means the scan stopped on a terminator, which is still to be consumed.
		if ( avail > 0 ) {
			if ( c == '\n' ) { // LF only.
				pos++;
				avail--;
			}
			else { // c == '\r'
				pos++;
				avail--;
				if ( avail > 0 ) {
					if ( buffer[ pos ] == '\n' ) { // CR/LF with LF already in the buffer.
						pos ++;
						avail--;
					}
				}
				else { // We must search for the LF.
					if ( noMoreCharacters() ) return s;
					// NOTE(review): this reads buffer[ 0 ] rather than buffer[ pos ]; presumably
					// noMoreCharacters() refills the buffer from index 0 -- confirm.
					if ( buffer[ 0 ] == '\n' ) {
						pos++;
						avail--;
					}
				}
			}
			return s;
		}
		// The buffer ran out without a terminator: stop at end of input, otherwise keep scanning.
		else if ( noMoreCharacters() ) return s;
	}
}

Source File: WikipediaAnchorParser.java From tagme with Apache License 2.0

4 votes

/**
 * Normalizes the text of an anchor:
 * <ol>
 * <li>ASCII normalization, whitespace squeezing, trimming and lower-casing;
 * <li>deletion of the first match of <code>P_FINAL_BRACKETS</code> (i.e. brackets at the end of titles);
 * <li>deletion of the first match of <code>anchorStart</code>, if a pattern is given;
 * <li>dot handling, see {@link #removeDots(MutableString)};
 * <li>replacement of all punctuation with whitespace, except for {@link WikipediaAnchorParser#SPECIAL_PUNCTS};
 * <li>if the result still contains any of {@link WikipediaAnchorParser#SPECIAL_PUNCTS}, up to two
 *     anchors are returned (one with those characters replaced, one with those characters deleted);
 *     otherwise, the result of the previous step is returned.
 * </ol>
 * If at any step the anchor becomes shorter than <code>MIN_ANCHOR_LEN</code>, or it is found
 * not to contain text, an empty array is returned.
 *
 * @param original the text of the anchor.
 * @param anchorStart a pattern that identifies some common articles or prepositions to be
 * deleted if they occur at the beginning of the anchor; may be <code>null</code>.
 * @return the normalized anchors (zero, one or two elements).
 */
public static CharSequence[] parseAnchor(CharSequence original, Pattern anchorStart)
{
	MutableString anchor = Chars.toNormalizedASCII(original);
	anchor.squeezeSpace();
	anchor.trim();
	if (anchor.length() < MIN_ANCHOR_LEN || !contaisText(anchor)) return Chars.EMPTY_STRINGS;

	anchor.loose();
	anchor.toLowerCase();

	// Step 2: trailing brackets.
	final Matcher trailingBrackets = P_FINAL_BRACKETS.matcher(anchor);
	if (trailingBrackets.find()) anchor.delete(trailingBrackets.start(), trailingBrackets.end());
	anchor.trim();
	if (anchor.length() < MIN_ANCHOR_LEN) return Chars.EMPTY_STRINGS;

	// Step 3: leading article or preposition.
	if (anchorStart != null)
	{
		final Matcher leadingWord = anchorStart.matcher(anchor);
		if (leadingWord.find()) anchor.delete(leadingWord.start(), leadingWord.end());
		anchor.trim();
		if (anchor.length() < MIN_ANCHOR_LEN) return Chars.EMPTY_STRINGS;
	}

	// Step 4: dots.
	anchor = removeDots(anchor);
	if (anchor.length() < MIN_ANCHOR_LEN) return Chars.EMPTY_STRINGS;

	// Step 5: punctuation, keeping the special characters for now.
	anchor = removePunctuations(anchor, SPECIAL_PUNCTS, false);
	if (anchor.length() < MIN_ANCHOR_LEN) return Chars.EMPTY_STRINGS;

	// Step 6: no special punctuation left means a single anchor at most.
	final boolean hasSpecialPuncts = anchor.indexOfAnyOf(SPECIAL_PUNCTS_CHARS, 0) >= 0;
	if (!hasSpecialPuncts) {
		if (contaisText(anchor)) return new CharSequence[]{anchor};
		return Chars.EMPTY_STRINGS;
	}

	// The copy must be taken before the special characters are replaced in the original.
	final MutableString withoutPuncts = new MutableString(anchor);

	anchor.replace(SPECIAL_PUNCTS_CHARS, SPECIAL_PUNCTS_CHAR_MAP);
	anchor.squeezeSpace().trim();
	if (anchor.length() < MIN_ANCHOR_LEN || !contaisText(anchor)) return Chars.EMPTY_STRINGS;

	withoutPuncts.delete(SPECIAL_PUNCTS_CHARS);
	withoutPuncts.squeezeSpace().trim();
	final boolean secondIsValid = withoutPuncts.length() >= MIN_ANCHOR_LEN && contaisText(withoutPuncts);
	if (secondIsValid) return new CharSequence[]{anchor, withoutPuncts};
	return new CharSequence[]{anchor};
}

Source File: FastBufferedReader.java From database with GNU General Public License v2.0

2 votes

/** Creates a new fast buffered reader by wrapping a given mutable string and using a set of additional word constituents.
 * 
 * <p>The effect of {@link #setReader(Reader)} on a buffer created with
 * this constructor is undefined.
 *
 * <p>The backing array of <code>s</code> is handed over as is, together with offset 0 and
 * the current length of <code>s</code>, so (presumably) no copy of the characters is made
 * and later changes to <code>s</code> may affect this reader.
 *
 * @param s the mutable string that will be wrapped by the reader.
 * @param wordConstituents a set of characters that will be considered word constituents.
 */
public FastBufferedReader( final MutableString s, final CharSet wordConstituents ) {
	this( s.array(), 0, s.length(), wordConstituents );
}

Source File: FastBufferedReader.java From database with GNU General Public License v2.0

2 votes

/** Creates a new fast buffered reader by wrapping a given mutable string.
 *
 * <p>The effect of {@link #setReader(Reader)} on a buffer created with
 * this constructor is undefined.
 *
 * <p>The backing array of <code>s</code> is handed over as is, together with offset 0 and
 * the current length of <code>s</code>, so (presumably) no copy of the characters is made
 * and later changes to <code>s</code> may affect this reader.
 *
 * @param s the mutable string that will be wrapped by the reader.
 */
public FastBufferedReader( final MutableString s ) {
	this( s.array(), 0, s.length() );
}

Java Code Examples for it.unimi.dsi.lang.MutableString#length()