it.unimi.dsi.lang.MutableString#delete

Source File: ImmutableBinaryTrie.java From database with GNU General Public License v2.0

6 votes

/**
 * Appends to {@code result} an indented dump of the subtree rooted at {@code n}, one node per line.
 *
 * <p>{@code printPrefix} and {@code path} act as shared scratch buffers: they are extended before
 * visiting the children and cut back by the same number of characters before returning
 * (for {@code path} this presumes that the bits appended for a node take exactly
 * {@code n.pathLength} characters, and that {@code n.pathLength} is zero when {@code n.path}
 * is {@code null} &mdash; to be confirmed against {@code Node}).
 *
 * @param n the root of the subtree to dump; if {@code null}, nothing is appended.
 * @param printPrefix the text emitted at the beginning of the line describing {@code n}.
 * @param result the string accumulating the dump.
 * @param path the bits accumulated on the way down to {@code n}.
 * @param level the depth of {@code n}, printed between parentheses.
 */
private void recToString( final Node n, final MutableString printPrefix, final MutableString result, final MutableString path, final int level ) {
	if ( n == null ) return;

	// Line header: the current prefix followed by the depth.
	result.append( printPrefix );
	result.append( '(' ).append( level ).append( ')' );

	// The compacted path of this node (if any) extends the running path and is printed as well.
	if ( n.path != null ) {
		path.append( LongArrayBitVector.wrap( n.path, n.pathLength ) );
		result.append( " path:" );
		result.append( LongArrayBitVector.wrap( n.path, n.pathLength ) );
	}

	// Nodes carrying a word also show the full path leading to them.
	if ( n.word >= 0 ) {
		result.append( " word: " ).append( n.word );
		result.append( " (" ).append( path ).append( ')' );
	}

	result.append( '\n' );

	final int childLevel = level + 1;

	// Left child: the path gains a '0', the prefix gains a tab and the "0 => " marker (6 characters).
	path.append( '0' );
	final MutableString leftPrefix = printPrefix.append( '\t' ).append( "0 => " );
	recToString( n.left, leftPrefix, result, path, childLevel );

	// Right child: flip the last path bit and overwrite the 5-character marker in place.
	path.charAt( path.length() - 1, '1' );
	final int markerEnd = printPrefix.length();
	final MutableString rightPrefix = printPrefix.replace( markerEnd - 5, markerEnd, "1 => " );
	recToString( n.right, rightPrefix, result, path, childLevel );

	// Undo the child-specific bit and the tab-plus-marker added above.
	final int pathEnd = path.length();
	path.delete( pathEnd - 1, pathEnd );
	final int prefixEnd = printPrefix.length();
	printPrefix.delete( prefixEnd - 6, prefixEnd );

	// Finally drop the bits contributed by this node's own path.
	path.delete( path.length() - n.pathLength, path.length() );
}

Source File: WikiTextExtractor.java From tagme with Apache License 2.0

5 votes

/**
 * Rewrites a link found in {@code input}: an empty link is deleted, any other link is
 * replaced by its anchor.
 *
 * @param input the text being processed; it is modified by this call.
 * @param start the first offset passed to {@code delete}/{@code replace} for the link span.
 * @param end the second offset passed to {@code delete}/{@code replace} for the link span.
 * @param link the link occupying the span.
 * @return {@code start} when the link was deleted, otherwise {@code start} plus the anchor length.
 */
@Override
		public int handleLink(MutableString input, int start, int end, WikiLink link) {
			if (link != WikiLink.EMPTY) {
				// Put the anchor in place of the link; the returned offset is just past the anchor.
				input.replace(start, end, link.anchor);
				return start + link.anchor.length();
			}

			// Empty link: drop the whole span.
			input.delete(start, end);
			return start;
		}

Source File: WikiTextExtractor.java From tagme with Apache License 2.0

5 votes

/**
 * Deletes empty links from {@code input} and leaves every other link untouched.
 *
 * @param input the text being processed; it is modified only when {@code link} is the empty link.
 * @param start the first offset of the link span.
 * @param end the second offset of the link span.
 * @param link the link occupying the span.
 * @return {@code start} when the span was deleted, {@code end} otherwise.
 */
@Override
public int handleLink(MutableString input, int start, int end, WikiLink link) {
	if (link != WikiLink.EMPTY) {
		// Nothing to rewrite: report the end of the untouched span.
		return end;
	}

	input.delete(start, end);
	return start;
}

Source File: ImmutableExternalPrefixMap.java From database with GNU General Public License v2.0

5 votes

/**
 * Decodes the term of given index into {@code s} by scanning the dump stream from the
 * beginning of the block located for that index.
 *
 * <p>Each term is stored as the length of the prefix shared with the previous term followed
 * by the length of the remaining suffix (both read with {@code readUnary()}) and by the coded
 * suffix characters; the first term of a block has an implicit prefix length of zero.
 * As a side effect, {@code iteratorIsUsable} is set to {@code false}.
 *
 * @param index the index of the term to retrieve.
 * @param s the string that will be overwritten with the term.
 * @return {@code s}.
 * @throws RuntimeException wrapping any {@link IOException} raised by the dump stream.
 */
protected MutableString getTerm( final int index, final MutableString s ) {
	ensureStream();

	// Locate the block: on a miss, binarySearch returns -(insertionPoint) - 1, and we want the block before the insertion point.
	final int found = Arrays.binarySearch( blockStart, index );
	final int block = found >= 0 ? found : - found - 2;

	try {
		dumpStream.position( blockOffset[ block ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;

		// A negative prefix length marks the first term of the block, which has no stored prefix length.
		int prefixLength = -1;
		int toDecode = index - blockStart[ block ] + 1;

		while( toDecode-- != 0 ) {
			prefixLength = prefixLength < 0 ? 0 : dumpStream.readUnary();
			final int suffixLength = dumpStream.readUnary();
			final int termLength = prefixLength + suffixLength;

			// Keep the shared prefix of the previous term and make room for the new suffix.
			s.delete( prefixLength, s.length() );
			s.length( termLength );
			for( int pos = prefixLength; pos < termLength; pos++ ) {
				s.charAt( pos, symbol2char[ decoder.decode( dumpStream ) ] );
			}
		}

		return s;
	}
	catch( IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
}

Source File: ImmutableExternalPrefixMap.java From database with GNU General Public License v2.0

5 votes

/**
 * Returns the index of a term by scanning the left block of its approximated interval.
 *
 * <p>As a side effect, {@code iteratorIsUsable} is set to {@code false} once the dump stream
 * has been repositioned.
 *
 * @param o the term to look up; it is cast to {@link CharSequence}.
 * @return the index of the term, or -1 if the term is not encodable, its approximated
 * interval is empty, or no term of the scanned block equals it.
 * @throws RuntimeException wrapping any {@link IOException} raised by the dump stream.
 */
private long getIndex( final Object o ) {
	final CharSequence term = (CharSequence)o;
	ensureStream();

	// A term containing characters unknown to the prefix coder cannot be in the map.
	if ( ! isEncodable( term ) ) return -1;

	/* Every string extending term follows term, so term, if present, must appear
	 * in the left block of the approximated interval computed for term itself. */
	final Interval interval = intervalApproximator.getApproximatedInterval( term );
	if ( interval == Intervals.EMPTY_INTERVAL ) return -1;

	try {
		dumpStream.position( blockOffset[ interval.left ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;

		final MutableString current = new MutableString();
		final int firstIndex = blockStart[ interval.left ];
		final int blockEnd = blockStart[ interval.left + 1 ];
		// A negative prefix length marks the first term of the block, which has no stored prefix length.
		int prefixLength = -1;

		/* Decode the terms of the block one by one, stopping at the first match. */
		for( int count = firstIndex; count < blockEnd; count++ ) {
			prefixLength = prefixLength < 0 ? 0 : dumpStream.readUnary();
			final int suffixLength = dumpStream.readUnary();
			final int termLength = prefixLength + suffixLength;

			current.delete( prefixLength, current.length() );
			current.length( termLength );
			for( int pos = prefixLength; pos < termLength; pos++ ) {
				current.charAt( pos, symbol2char[ decoder.decode( dumpStream ) ] );
			}

			if ( current.equals( term ) ) return count;
		}

		return -1;
	}
	catch (IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
}

Source File: WikipediaAnchorParser.java From tagme with Apache License 2.0

4 votes

/**
 * Normalizes an anchor text, producing zero, one or two normalized variants.
 *
 * <p>The steps, in order:
 * <ol>
 * <li>ASCII normalization, whitespace squeezing and trimming, lower-casing;
 * <li>removal of the first match of {@code P_FINAL_BRACKETS} (brackets at the end, as in titles);
 * <li>removal of the first match of {@code anchorStart}, if a pattern is given;
 * <li>removal of dots via {@code removeDots};
 * <li>punctuation handling via {@code removePunctuations}, which spares
 *     {@link WikipediaAnchorParser#SPECIAL_PUNCTS};
 * <li>if any of the special punctuation characters is still present, two variants are built:
 *     one with those characters mapped through {@code SPECIAL_PUNCTS_CHAR_MAP}, and one with
 *     those characters deleted.
 * </ol>
 *
 * <p>After most steps the anchor is rejected if it has become shorter than {@code MIN_ANCHOR_LEN}.
 *
 * @param original the raw anchor text.
 * @param anchorStart a pattern identifying common articles or prepositions to be deleted
 * when they occur at the beginning of the anchor; may be {@code null} to skip this step.
 * @return {@code Chars.EMPTY_STRINGS} if no valid anchor could be produced; otherwise an array
 * holding the normalized anchor, followed by the variant without special punctuation when
 * that variant is valid as well.
 */
public static CharSequence[] parseAnchor(CharSequence original, Pattern anchorStart)
{
	MutableString anchor = Chars.toNormalizedASCII(original);
	anchor.squeezeSpace();
	anchor.trim();

	if (anchor.length() < MIN_ANCHOR_LEN || !contaisText(anchor)) {
		return Chars.EMPTY_STRINGS;
	}

	anchor.loose();
	anchor.toLowerCase();

	// Strip the trailing bracketed part, if any.
	Matcher bracketsMatcher = P_FINAL_BRACKETS.matcher(anchor);
	if (bracketsMatcher.find()) {
		anchor.delete(bracketsMatcher.start(), bracketsMatcher.end());
	}
	anchor.trim();
	if (anchor.length() < MIN_ANCHOR_LEN) {
		return Chars.EMPTY_STRINGS;
	}

	// Strip the leading pattern, when the caller supplied one.
	if (anchorStart != null)
	{
		Matcher startMatcher = anchorStart.matcher(anchor);
		if (startMatcher.find()) {
			anchor.delete(startMatcher.start(), startMatcher.end());
		}
		anchor.trim();
		if (anchor.length() < MIN_ANCHOR_LEN) {
			return Chars.EMPTY_STRINGS;
		}
	}

	anchor = removeDots(anchor);
	if (anchor.length() < MIN_ANCHOR_LEN) {
		return Chars.EMPTY_STRINGS;
	}

	anchor = removePunctuations(anchor, SPECIAL_PUNCTS, false);
	if (anchor.length() < MIN_ANCHOR_LEN) {
		return Chars.EMPTY_STRINGS;
	}

	// Without special punctuation there is a single candidate.
	final boolean hasSpecialPuncts = anchor.indexOfAnyOf(SPECIAL_PUNCTS_CHARS, 0) >= 0;
	if (!hasSpecialPuncts) {
		if (contaisText(anchor)) {
			return new CharSequence[]{anchor};
		}
		return Chars.EMPTY_STRINGS;
	}

	// Keep a copy before mapping, so that the second variant starts from the same text.
	MutableString anchorNoPuncts = new MutableString(anchor);

	anchor.replace(SPECIAL_PUNCTS_CHARS, SPECIAL_PUNCTS_CHAR_MAP);
	anchor.squeezeSpace().trim();
	if (anchor.length() < MIN_ANCHOR_LEN || !contaisText(anchor)) {
		return Chars.EMPTY_STRINGS;
	}

	anchorNoPuncts.delete(SPECIAL_PUNCTS_CHARS);
	anchorNoPuncts.squeezeSpace().trim();
	final boolean secondVariantValid = anchorNoPuncts.length() >= MIN_ANCHOR_LEN && contaisText(anchorNoPuncts);
	if (secondVariantValid) {
		return new CharSequence[]{anchor, anchorNoPuncts};
	}
	return new CharSequence[]{anchor};
}

Source File: WikiTextExtractor.java From tagme with Apache License 2.0

4 votes

/**
 * Flattens line-oriented wiki markup into a single run of sentences.
 *
 * <p>The input is read line by line; each non-empty trimmed line is handled according to its
 * first character:
 * <ul>
 * <li>{@code '='}: leading and trailing {@code '='} and spaces are stripped and what is left
 *     (if longer than one character) is appended followed by {@code ". "};
 * <li>{@code '*'}, {@code '#'}, {@code ':'}, {@code ';'}: the leading run of those characters
 *     and spaces is stripped and the rest (if longer than one character) is appended;
 * <li>{@code '{'}, {@code '|'}: the line is skipped;
 * <li>{@code '.'}, {@code '-'}: all dots and hyphens are deleted from the line and the rest,
 *     if any, is appended;
 * <li>anything else: the line is appended unless its last character is {@code '}'}.
 * </ul>
 * Except for the {@code '='} case, an appended line is followed by a dot (unless the text
 * already ends with one of {@code ":.;,-"}) and by a space. The final trailing character is
 * removed before returning.
 *
 * @param input the text to process.
 * @param onlyAbstract if true, processing stops as soon as the collected text is longer than
 * {@code MIN_ABSTRACT_CHARS}.
 * @return the collected text.
 */
public MutableString removeStructure(MutableString input, boolean onlyAbstract)
	{
		MutableString lineBuffer = new MutableString(1024);
		FastBufferedReader reader = new FastBufferedReader(input);
		MutableString text = new MutableString(2048);

		// Characters after which no closing dot is added.
		final String closers = ":.;,-";

		try {
			while (reader.readLine(lineBuffer) != null)
			{
				// Enough text for an abstract: drop the trailing character and stop.
				if (onlyAbstract && text.length() > MIN_ABSTRACT_CHARS) {
					text.deleteCharAt(text.length()-1);
					return text;
				}

				MutableString current = lineBuffer.trim();
				if (current.length() == 0) continue;

				char[] raw = current.array();
				int rawLen = current.length();

				switch (current.charAt(0))
				{
				case '=': {
					// Strip '=' and spaces on both sides.
					String skip = " =";
					int from = 0;
					while (from < rawLen && skip.indexOf(raw[from]) >= 0) from++;
					int to = rawLen - 1;
					while (to >= 0 && skip.indexOf(raw[to]) >= 0) to--;

					if (from < to) {
						text.append(current.subSequence(from, to + 1));
						text.append(". ");
					}
					break;
				}

				case '*':
				case '#':
				case ':':
				case ';': {
					// Strip the leading markers.
					String skip = "*#:; ";
					int from = 0;
					while (from < rawLen && skip.indexOf(raw[from]) >= 0) from++;

					if (from < rawLen - 1) {
						text.append(current.subSequence(from, rawLen));
						if (closers.indexOf(text.lastChar()) < 0) {
							text.append('.');
						}
						text.append(' ');
					}
					break;
				}

				case '{':
				case '|':
					break;

				case '.':
				case '-': {
					current.delete(new char[]{'.','-'});
					if (current.length() > 0) {
						text.append(current);
						if (closers.indexOf(text.lastChar()) < 0) {
							text.append('.');
						}
						text.append(' ');
					}
					break;
				}

				default: {
					if (current.lastChar() != '}') {
						text.append(current);
						if (closers.indexOf(text.lastChar()) < 0) {
							text.append('.');
						}
						text.append(' ');
					}
				}
				}
			}
		} catch (IOException ignored) {
			// Deliberately ignored: whatever has been collected so far is returned below.
		}

		if (text.length() > 0) {
			text.deleteCharAt(text.length()-1);
		}
		return text;
	}

Source File: ImmutableExternalPrefixMap.java From database with GNU General Public License v2.0

4 votes

/**
 * Returns the interval of term indices whose terms start with the given prefix.
 *
 * <p>The approximated interval returned by {@code intervalApproximator} is refined by
 * decoding terms from the dump stream: a first scan over the left block looks for the first
 * term starting with {@code prefix}; a second scan (continuing in the same block, or restarting
 * from the right block when the approximated interval spans more than one block) looks for
 * the first term that no longer starts with {@code prefix}.
 *
 * <p>As a side effect, {@code iteratorIsUsable} is set to {@code false} once the dump stream
 * has been repositioned.
 *
 * @param prefix the prefix to search for.
 * @return {@code Intervals.EMPTY_INTERVAL} if {@code prefix} is not encodable, if its
 * approximated interval is empty, or if the approximated interval has length one and no term
 * of the scanned block starts with {@code prefix}; otherwise
 * {@code Interval.valueOf( start, end - 1 )} as computed by the two scans.
 * @throws RuntimeException wrapping any {@link IOException} raised by the dump stream.
 */
public Interval getInterval( final CharSequence prefix ) {
	ensureStream();
	// If prefix contains any character not coded by the prefix coder, we can return the empty interval.
	if ( ! isEncodable( prefix ) ) return Intervals.EMPTY_INTERVAL;

	// We recover the left extremes of the intervals where extensions of prefix could possibly lie.
	Interval interval = intervalApproximator.getApproximatedInterval( prefix );
	// System.err.println( "Approximate interval: " + interval + " , terms: [" + blockStart[ interval.left ] + ", " + blockStart[ interval.right ] + "]" );

	if ( interval == Intervals.EMPTY_INTERVAL ) return interval;
	try {
		// Move to the beginning of the left block and reset the stream's bit counter.
		dumpStream.position( blockOffset[ interval.left ] * blockSize );
		dumpStream.readBits( 0 );
		iteratorIsUsable = false;
		MutableString s = new MutableString();
		// prefixLength starts negative to mark the first term of a block, which has no stored prefix length.
		int suffixLength, prefixLength = -1, count = blockStart[ interval.left ], blockEnd = blockStart[ interval.left + 1 ], start = -1, end = -1;

		/* We scan the dump file, stopping if we exhaust the block */
		while( count < blockEnd ) {
			if ( prefixLength < 0 ) prefixLength = 0;
			else prefixLength = dumpStream.readUnary();
			suffixLength = dumpStream.readUnary();
			// Keep the part shared with the previous term, then decode the new suffix in place.
			s.delete( prefixLength, s.length() );
			s.length( prefixLength + suffixLength );
			for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
			if ( s.startsWith( prefix ) ) {
				start = count;
				break; 
			}
			count++;
		}
		
		/* If we did not find our string, there are two possibilities: if the
		 * interval contains one point, there is no string extending prefix. But
		 * if  the interval  is larger, the first string of the second block in the
		 * interval must be an extension of prefix. */
		// In the "else" branch count is either the index of the match (equal to start) or,
		// when the loop ran to exhaustion, blockEnd.
		if ( start < 0 && interval.length() == 1 ) return Intervals.EMPTY_INTERVAL;
		else start = count;
		
		end = start + 1;
		//assert dumpStream.readBits() <= blockSize;

		/* If the interval contains more than one point, the last string with
		 * given prefix is necessarily contained in the last block, and we
		 * must restart the search process. */
		if ( interval.length() > 1  ) {
			dumpStream.position( blockOffset[ interval.right ] * blockSize );
			dumpStream.readBits( 0 );
			s.length( 0 );
			end = blockStart[ interval.right ];
			blockEnd = blockStart[ interval.right + 1 ];
			// Mark again the first term of a block.
			prefixLength = -1;
		}
		
		
		// Advance end past every term that still starts with prefix. In the single-block case this
		// continues decoding right after the match found above, reusing s and the stream position.
		while( end < blockEnd ) {
			if ( prefixLength < 0 ) prefixLength = 0;
			else prefixLength = dumpStream.readUnary();
			suffixLength = dumpStream.readUnary();
			s.delete( prefixLength, s.length() );
			s.length( prefixLength + suffixLength );
			for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
			if ( ! s.startsWith( prefix ) ) break;
			end++;
		}
		
		// end is one past the last matching index, hence the - 1.
		return Interval.valueOf( start, end - 1 );
	} catch (IOException rethrow ) {
		throw new RuntimeException( rethrow );
	}
	
}

Java Code Examples for it.unimi.dsi.lang.MutableString#delete()