it.unimi.dsi.io.FastBufferedReader Java Examples

The following examples show how to use it.unimi.dsi.io.FastBufferedReader. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FastBufferedReaderTest.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/** Checks the string returned by {@code toSpec()} for four different ways of constructing a reader. */
public void testToSpec() {
	final String prefix = FastBufferedReader.class.getName();

	// No-argument constructor: the spec is the bare class name.
	final FastBufferedReader noArgs = new FastBufferedReader();
	assertEquals( prefix, noArgs.toSpec() );

	// Single int argument.
	final FastBufferedReader fromInt = new FastBufferedReader( 100 );
	assertEquals( prefix + "(100)", fromInt.toSpec() );

	// Single string argument.
	final FastBufferedReader fromString = new FastBufferedReader( "_" );
	assertEquals( prefix + "(\"_\")", fromString.toSpec() );

	// Two string arguments; the first one shows up unquoted in the spec.
	final FastBufferedReader fromTwoStrings = new FastBufferedReader( "100", "_" );
	assertEquals( prefix + "(100,\"_\")", fromTwoStrings.toSpec() );
}
 
Example #2
Source File: LineIteratorTest.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Walks over {@code TEXT} with a {@code LineIterator} and compares every returned line with the
 * corresponding entry of {@code LINES}; finally checks that the number of lines matches.
 *
 * @param pl the progress logger handed to the line iterator.
 */
public void testLineIterator( ProgressLogger pl ) {
	final FastBufferedReader reader = new FastBufferedReader( new StringReader( TEXT ) );
	final LineIterator lines = new LineIterator( reader, pl );

	int seen = 0;
	for ( ; lines.hasNext(); seen++ ) {
		final String expected = LINES[ seen ].toString();
		final String actual = lines.next().toString();
		assertEquals( expected, actual );
	}

	assertEquals( seen, LINES.length );
}
 
Example #3
Source File: BloomFilter.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Command-line entry point: reads a newline-separated list of terms from standard input,
 * adds each term to a Bloom filter and serialises the filter to a file.
 *
 * <p>Positional arguments are the output filename, the filter size and the filter precision;
 * the I/O buffer size and the input encoding are optional flags.
 *
 * @param arg the command-line arguments, parsed by JSAP.
 * @throws IOException if reading the terms or storing the filter fails.
 * @throws JSAPException if the command-line parser cannot be set up.
 * @throws NoSuchMethodException declared by the parser construction.
 */
public static void main( final String[] arg ) throws IOException, JSAPException, NoSuchMethodException {
	
	final SimpleJSAP jsap = new SimpleJSAP( BloomFilter.class.getName(), "Creates a Bloom filter reading from standard input a newline-separated list of terms.",
			new Parameter[] {
				new FlaggedOption( "bufferSize", IntSizeStringParser.getParser(), "64Ki", JSAP.NOT_REQUIRED, 'b',  "buffer-size", "The size of the I/O buffer used to read terms." ),
				new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term file encoding." ),
				// The help text used to mention a "front-coded list", which this tool does not produce.
				new UnflaggedOption( "bloomFilter", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised Bloom filter." ),
				new UnflaggedOption( "size", JSAP.INTSIZE_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The size of the filter (i.e., the expected number of elements in the filter; usually, the number of terms)." ),
				new UnflaggedOption( "precision", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The precision of the filter." )
	});
	
	JSAPResult jsapResult = jsap.parse( arg );
	// A message (help or parse error) was printed: nothing else to do.
	if ( jsap.messagePrinted() ) return;
	
	final int bufferSize = jsapResult.getInt( "bufferSize" );
	final String filterName = jsapResult.getString( "bloomFilter" );
	final Charset encoding = (Charset)jsapResult.getObject( "encoding" );

	BloomFilter filter = new BloomFilter( jsapResult.getInt( "size" ), jsapResult.getInt( "precision" ) );
	final ProgressLogger pl = new ProgressLogger();
	pl.itemsName = "terms";
	pl.start( "Reading terms..." );
	// The same mutable string is reused for every line.
	MutableString s = new MutableString();
	FastBufferedReader reader = new FastBufferedReader( new InputStreamReader( System.in, encoding ), bufferSize );
	while( reader.readLine( s ) != null ) { 
		filter.add( s );
		pl.lightUpdate();
	}
	pl.done();

	BinIO.storeObject( filter, filterName );
}
 
Example #4
Source File: WikiTextExtractor.java    From tagme with Apache License 2.0 5 votes vote down vote up
/**
 * Collects the first link of every bulleted line of a cleaned page text.
 *
 * <p>Each line is trimmed; lines that do not start with {@code '*'} are skipped. The leading
 * run of asterisks is stripped, the remainder is trimmed again and handed to
 * {@code extractLinkFromCleanedLine}; if that call returns at least one link, the first one
 * is added to the result.
 *
 * @param cleanText the cleaned text, read line by line.
 * @return the collected links, in the order in which their lines appear.
 */
public List<WikiLink> extractDisambiguationLinks(MutableString cleanText)
{
	FastBufferedReader tokenizer = new FastBufferedReader(cleanText);
	MutableString buffer = new MutableString(1024);
	List<WikiLink> links = new ArrayList<WikiLink>();

	try {
		while (tokenizer.readLine(buffer) != null)
		{
			buffer.trim();
			if (buffer.length() == 0) continue;
			if (buffer.charAt(0) != '*') continue;

			// Find the end of the leading run of asterisks.
			int start = 1;
			while (start < buffer.length() && buffer.charAt(start) == '*') start++;

			// Drop the bullet markers and keep the line content. Deleting the range
			// [start, length) instead would keep only the asterisks and lose every link.
			buffer.delete(0, start).trim();

			if (buffer.length() == 0) continue;
//			if (!buffer.startsWith("[[")) continue;

			List<WikiLink> lineLinks = extractLinkFromCleanedLine(buffer);
			if (!lineLinks.isEmpty()) links.add(lineLinks.get(0));
		}
	} catch (IOException ignored) {
		// Best effort: the reader wraps an in-memory string; whatever was collected so far is returned.
	}

	return links;
}
 
Example #5
Source File: TernaryIntervalSearchTree.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Command-line entry point: reads newline-separated terms from standard input, adds each of
 * them to a ternary interval search tree and serialises the tree to the file passed as the
 * {@code tree} argument.
 *
 * @param arg the command-line arguments, parsed by JSAP.
 */
public static void main( final String[] arg ) throws IOException, JSAPException, NoSuchMethodException {
	final Parameter[] options = {
		new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b',  "buffer-size", "The size of the I/O buffer used to read terms." ),
		new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term file encoding." ),
		new UnflaggedOption( "tree", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised tree." )
	};
	final SimpleJSAP parser = new SimpleJSAP( TernaryIntervalSearchTree.class.getName(),
			"Builds a ternary interval search tree reading from standard input a newline-separated list of terms.",
			options );

	final JSAPResult parsed = parser.parse( arg );
	if ( parser.messagePrinted() ) return; // Help or error already shown

	final TernaryIntervalSearchTree result = new TernaryIntervalSearchTree();
	final MutableString current = new MutableString(); // Reused for every line
	final ProgressLogger progress = new ProgressLogger();
	progress.itemsName = "terms";

	final Charset encoding = (Charset)parsed.getObject( "encoding" );
	final InputStreamReader stdin = new InputStreamReader( System.in, encoding );
	final FastBufferedReader input = new FastBufferedReader( stdin, parsed.getInt( "bufferSize" ) );

	progress.start( "Reading terms..." );
	for ( ;; ) {
		if ( input.readLine( current ) == null ) break;
		progress.update();
		result.add( current );
	}
	progress.done();

	final String treeFile = parsed.getString( "tree" );
	BinIO.storeObject( result, treeFile );
}
 
Example #6
Source File: BuildRepetitionSet.java    From BUbiNG with Apache License 2.0 5 votes vote down vote up
/**
 * Reads tab-separated lines from standard input and records those whose third field (everything
 * after the second tab) equals the third field of the previous line.
 *
 * <p>For each such line the first field is parsed as an int and the second as a long; the pair is
 * packed into a single long (first field shifted left by 48 bits, OR-ed with the second), added to
 * a set, and echoed to standard output. The set is finally serialised to the file named by the
 * first command-line argument.
 *
 * @param arg the command-line arguments; {@code arg[0]} is the output filename.
 */
public static void main(String[] arg) throws IOException {
	if (arg.length == 0) {
		System.err.println("Usage: " + BuildRepetitionSet.class.getSimpleName() + " REPETITIONSET");
		System.exit(1);
	}

	final FastBufferedReader stdin = new FastBufferedReader(new InputStreamReader(System.in, Charsets.US_ASCII));
	final MutableString line = new MutableString();
	final LongOpenHashSet repetitions = new LongOpenHashSet();
	final String target = arg[0];
	final ProgressLogger progress = new ProgressLogger();

	MutableString previous = new MutableString();
	progress.itemsName = "lines";
	progress.start("Reading... ");
	for (;;) {
		if (stdin.readLine(line) == null) break;

		final int tab1 = line.indexOf('\t');
		final int tab2 = line.indexOf('\t', tab1 + 1);
		final MutableString current = line.substring(tab2 + 1);

		if (current.equals(previous)) {
			final char[] chars = line.array();
			final int index = Integer.parseInt(new String(chars, 0, tab1));
			final long position = Long.parseLong(new String(chars, tab1 + 1, tab2 - tab1 - 1));
			repetitions.add(((long)index << 48) | position);
			System.out.println(index + "\t" + position + "\t" + current);
		}

		previous = current;
		progress.lightUpdate();
	}
	progress.done();

	stdin.close();
	BinIO.storeObject(repetitions, target);
}
 
Example #7
Source File: RuntimeConfiguration.java    From BUbiNG with Apache License 2.0 5 votes vote down vote up
/** Adds one IPv4 address, or all the addresses listed in a file, to the black list.
 *
 * <p>An empty specification is ignored. A specification starting with <code>file:</code> names a file
 * that is read line by line (as ISO-8859-1); every non-empty line is passed through {@code handleIPv4}
 * and added. Any other specification is itself passed through {@code handleIPv4} and added.
 *
 * <p>NOTE(review): the file stream opened here is not closed by this method.
 *
 * @param spec the specification (an IP address, or a filename prefixed by <code>file:</code>).
 * @throws ConfigurationException
 * @throws FileNotFoundException
 */
public void addBlackListedIPv4(final String spec) throws ConfigurationException, FileNotFoundException {
	if (spec.isEmpty()) return; // Nothing to add

	if (! spec.startsWith("file:")) {
		// A directly specified address
		blackListedIPv4Addresses.add(handleIPv4(spec));
		return;
	}

	final String filename = spec.substring(5);
	final FastBufferedReader reader = new FastBufferedReader(new InputStreamReader(new FileInputStream(filename), Charsets.ISO_8859_1));
	for (final LineIterator lines = new LineIterator(reader); lines.hasNext();) {
		final MutableString address = lines.next();
		if (address.length() == 0) continue; // Skip blank lines
		blackListedIPv4Addresses.add(handleIPv4(address.toString()));
	}
}
 
Example #8
Source File: RuntimeConfiguration.java    From BUbiNG with Apache License 2.0 5 votes vote down vote up
/** Adds one host, or all the hosts listed in a file, to the black list.
 *
 * <p>An empty specification is ignored. A specification starting with <code>file:</code> names a file
 * that is read line by line (as ISO-8859-1); the hash code of every trimmed line is added. Any other
 * specification is trimmed and its hash code is added.
 *
 * <p>NOTE(review): the file stream opened here is not closed by this method, and blank lines of
 * the file are not skipped (the hash code of the empty string is added for them).
 *
 * @param spec the specification (a host, or a filename prefixed by <code>file:</code>).
 * @throws ConfigurationException
 * @throws FileNotFoundException
 */
public void addBlackListedHost(final String spec) throws ConfigurationException, FileNotFoundException 	{
	if (spec.isEmpty()) return; // Nothing to add

	if (! spec.startsWith("file:")) {
		// A directly specified host
		blackListedHostHashes.add(spec.trim().hashCode());
		return;
	}

	final String filename = spec.substring(5);
	final FastBufferedReader reader = new FastBufferedReader(new InputStreamReader(new FileInputStream(filename), Charsets.ISO_8859_1));
	for (final LineIterator lines = new LineIterator(reader); lines.hasNext();) {
		final String host = lines.next().toString().trim();
		blackListedHostHashes.add(host.hashCode());
	}
}
 
Example #9
Source File: AnchorIndexer.java    From tagme with Apache License 2.0 5 votes vote down vote up
/**
 * Opens the given file as UTF-8 text and primes the iterator with its first line.
 *
 * @param inputFile the file to read.
 * @throws IOException if the file cannot be opened or the first line cannot be read.
 */
public AnchorIterator(File inputFile) throws IOException
{
	anchor = null;
	links = new Int2IntOpenHashMap(1024);
	// Missing keys report 0.
	links.defaultReturnValue(0);
	originals = new HashSet<String>(32);
	in = new FastBufferedReader(new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8")));
	line = new MutableString(1024);
	// NOTE(review): the result of readLine is not checked, so an empty file is not detected here.
	in.readLine(line);
	// The first field of the first line (fields are separated by TextDataset.SEP_CHAR).
	lastAnchor = Chars.split(line, TextDataset.SEP_CHAR)[0].toString();
	scroll = 1;
	end = false;
}
 
Example #10
Source File: URLRespectsRobots.java    From BUbiNG with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the robots file named by {@code arg[0]} for the user agent {@code arg[1]}, prints the
 * resulting prefixes to standard error, then reads URLs from standard input (one per line) and
 * prints, for each, the result of {@code apply} followed by a tab and the URL.
 *
 * @param arg the robots filename and the user agent.
 */
public static void main(String arg[]) throws IOException {
	final char[][] prefixes = URLRespectsRobots.parseRobotsReader(new FileReader(arg[0]), arg[1]);
	for (int i = 0; i < prefixes.length; i++) System.err.println(String.valueOf(prefixes[i]));

	final FastBufferedReader stdin = new FastBufferedReader(new InputStreamReader(System.in, Charsets.US_ASCII));
	final MutableString line = new MutableString();
	for (;;) {
		if (stdin.readLine(line) == null) break;
		final URI url = BURL.parse(line);
		System.out.println(apply(prefixes, url) + "\t" + url);
	}
	stdin.close();
}
 
Example #11
Source File: ShiftAddXorSignedStringMap.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Command-line entry point: loads a serialised function, reads a newline-separated list of strings
 * (from a file or standard input, optionally gzip-compressed) and stores the shift-add-xor signed
 * string map built from the two.
 *
 * @param arg the command-line arguments, parsed by JSAP.
 */
@SuppressWarnings("unchecked")
public static void main( final String[] arg ) throws NoSuchMethodException, IOException, JSAPException, ClassNotFoundException {
	final Parameter[] options = {
		new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b',  "buffer-size", "The size of the I/O buffer used to read strings." ),
		new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The string file encoding." ),
		new Switch( "zipped", 'z', "zipped", "The string list is compressed in gzip format." ),
		new FlaggedOption( "width", JSAP.INTEGER_PARSER, Integer.toString( Integer.SIZE ), JSAP.NOT_REQUIRED, 'w', "width", "The signature width in bits." ),
		new UnflaggedOption( "function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename of the function to be signed." ),
		new UnflaggedOption( "map", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename of the resulting serialised signed string map." ),
		new UnflaggedOption( "stringFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "Read strings from this file instead of standard input." ),
	};
	final SimpleJSAP parser = new SimpleJSAP( ShiftAddXorSignedStringMap.class.getName(),
			"Builds a shift-add-xor signed string map by reading a newline-separated list of strings and a function built on the same list of strings.",
			options );

	final JSAPResult parsed = parser.parse( arg );
	if ( parser.messagePrinted() ) return; // Help or error already shown

	final int ioBufferSize = parsed.getInt( "bufferSize" );
	final String functionFile = parsed.getString( "function" );
	final String mapFile = parsed.getString( "map" );
	final String stringsFile = parsed.getString( "stringFile" );
	final Charset charset = (Charset)parsed.getObject( "encoding" );
	final int signatureWidth = parsed.getInt( "width" );
	final boolean gzipped = parsed.getBoolean( "zipped" );

	// Strings come from the given file if there is one, from standard input otherwise.
	final InputStream raw;
	if ( stringsFile != null ) raw = new FileInputStream( stringsFile );
	else raw = System.in;

	final InputStream source = gzipped ? new GZIPInputStream( raw ) : raw;
	final FastBufferedReader reader = new FastBufferedReader( new InputStreamReader( source, charset ), ioBufferSize );
	final Iterator<MutableString> strings = new LineIterator( reader );

	final Object2LongFunction<CharSequence> signed = (Object2LongFunction<CharSequence>)BinIO.loadObject( functionFile );
	LOGGER.info( "Signing..." );
	final ShiftAddXorSignedStringMap map = new ShiftAddXorSignedStringMap( strings, signed, signatureWidth );
	BinIO.storeObject( map, mapFile );
	LOGGER.info( "Completed." );
}
 
Example #12
Source File: FrontCodedStringList.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Command-line entry point: reads a newline-separated list of terms from standard input
 * (optionally gzip-compressed), builds a front-coded string list with the requested ratio and
 * storage mode, and serialises it to the file given as the {@code frontCodedList} argument.
 *
 * @param arg the command-line arguments, parsed by JSAP.
 */
public static void main( final String[] arg ) throws IOException, JSAPException, NoSuchMethodException {
	final Parameter[] options = {
		new FlaggedOption( "bufferSize", IntSizeStringParser.getParser(), "64Ki", JSAP.NOT_REQUIRED, 'b',  "buffer-size", "The size of the I/O buffer used to read terms." ),
		new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term file encoding." ),
		new FlaggedOption( "ratio", IntSizeStringParser.getParser(), "4", JSAP.NOT_REQUIRED, 'r',  "ratio", "The compression ratio." ),
		new Switch( "utf8", 'u', "utf8", "Store the strings as UTF-8 byte arrays." ),
		new Switch( "zipped", 'z', "zipped", "The term list is compressed in gzip format." ),
		new UnflaggedOption( "frontCodedList", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised front-coded list." )
	};
	final SimpleJSAP parser = new SimpleJSAP( FrontCodedStringList.class.getName(),
			"Builds a front-coded string list reading from standard input a newline-separated ordered list of terms.",
			options );

	final JSAPResult parsed = parser.parse( arg );
	if ( parser.messagePrinted() ) return; // Help or error already shown

	final int ioBufferSize = parsed.getInt( "bufferSize" );
	final int codingRatio = parsed.getInt( "ratio" );
	final boolean asUtf8 = parsed.getBoolean( "utf8" );
	final boolean gzipped = parsed.getBoolean( "zipped" );
	final String outputFile = parsed.getString( "frontCodedList" );
	final Charset charset = (Charset)parsed.getObject( "encoding" );

	final ProgressLogger progress = new ProgressLogger();
	progress.itemsName = "words";
	progress.start( "Reading words..." );

	final InputStreamReader stdin = new InputStreamReader( gzipped ? new GZIPInputStream( System.in ) : System.in, charset );
	final LineIterator words = new LineIterator( new FastBufferedReader( stdin, ioBufferSize ), progress );
	final FrontCodedStringList list = new FrontCodedStringList( words, codingRatio, asUtf8 );
	progress.done();

	System.err.print( "Writing to file..." );
	BinIO.storeObject( list, outputFile );
	System.err.println( " done." );
}
 
Example #13
Source File: URLRespectsRobots.java    From BUbiNG with Apache License 2.0 4 votes vote down vote up
/** Parses the argument as if it were the content of a <code>robots.txt</code> file,
 * and returns a sorted array of prefixes of URLs that the agent should not follow.
 *
 * <p>The content is tokenized into whitespace-separated words with significant end-of-lines.
 * Only lines whose first word is <code>user-agent:</code> or <code>disallow:</code>
 * (case-insensitively) are interpreted; lines starting with <code>#</code> are skipped, and
 * anything following the first value on a line is ignored.
 *
 * <p>If at least one record targeted at <code>userAgent</code> was seen, only the paths
 * collected for it are returned; otherwise, the paths collected for <code>*</code> records
 * are returned (if any such record was seen).
 *
 * @param content the content of the  <code>robots.txt</code> file.
 * @param userAgent the string representing the user agent of interest.
 * @return an array of character arrays, which are prefixes of the URLs not to follow, in sorted order.
 * @throws IOException if reading from <code>content</code> fails.
 */
public static char[][] parseRobotsReader(final Reader content, final String userAgent) throws IOException {
	/* The set of disallowed paths specifically aimed at userAgent. */
	Set<String> set = new ObjectOpenHashSet<>();
	/* The set of disallowed paths specifically aimed at *. */
	Set<String> setStar = new ObjectOpenHashSet<>();
	/* True if the currently examined record is targeted to us. */
	boolean doesMatter = false;
	/* True if we have seen a section targeted to our agent. */
	boolean specific = false;
	/* True if we have seen a section targeted to *. */
	boolean generic = false;
	/* True if we are in a star section. */
	boolean starSection = false;

	StreamTokenizer st = new StreamTokenizer(new FastBufferedReader(content));
	int token;

	// Start from an empty syntax table: characters 33..255 form words, 0..32 are whitespace.
	st.resetSyntax();
	st.eolIsSignificant(true); // We need EOLs to separate records
	st.wordChars(33, 255); // All characters may appear
	st.whitespaceChars(0, 32);
	st.ordinaryChar('#'); // We must manually simulate comments 8^(
	st.lowerCaseMode(false);

	// Each iteration consumes one line, dispatching on its first token.
	while (true) {
		int lineFirstToken = st.nextToken();
		if (lineFirstToken == StreamTokenizer.TT_EOF) break;

			switch (lineFirstToken) {
				// Blank line: a new block is starting
			case StreamTokenizer.TT_EOL:
				doesMatter = false;
				break;

			// Comment or number: ignore until the end of line
			case StreamTokenizer.TT_NUMBER:
			case '#':
				do {
					token = st.nextToken();
				} while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF);
				break;

			// A string
			case StreamTokenizer.TT_WORD:
				if (st.sval.equalsIgnoreCase("user-agent:")) {
					token = st.nextToken();
					// The if/else-if/else chain below is entirely nested in this unbraced if.
					if (token == StreamTokenizer.TT_WORD)
						// Our agent is matched if the declared agent is a case-insensitive prefix of userAgent
						// (assuming StringUtils.startsWithIgnoreCase(str, prefix) semantics).
						if (StringUtils.startsWithIgnoreCase(userAgent, st.sval)) {
							doesMatter = true;
							specific = true;
							starSection = false;
						}
						else if (st.sval.equals("*")) {
							starSection = true;
							generic = true;
						} else starSection = false;
					// Ignore the rest of the line
					while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF)
						token = st.nextToken();
				} else if (st.sval.equalsIgnoreCase("disallow:")) {
					token = st.nextToken();
					//System.out.println(st.sval + " " + starSection + " " + set + " " + setStar);
					if (token == StreamTokenizer.TT_EOL) {
						// An empty disallow clears what has been collected so far for the current kind of record.
						if (doesMatter) set.clear();
						else if (starSection) setStar.clear();
					} else if (token == StreamTokenizer.TT_WORD) {
						String disallowed = st.sval;
						if (disallowed.endsWith("*")) disallowed = disallowed.substring(0, disallowed.length()-1); // Someone (erroneously) uses * to denote any suffix
						if (doesMatter) set.add(disallowed);
						else if (starSection) setStar.add(disallowed);
					}
					// Ignore the rest of the line
					while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF)
						token = st.nextToken();
				} else if (LOGGER.isTraceEnabled()) LOGGER.trace("Line first token {} ununderstandable in robots.txt", st.sval);
				// NOTE(review): for an unrecognised first word the rest of the line is not skipped here;
				// its remaining tokens are examined by the following iterations.
				break;

			// Something else: a syntax error
			default:
				if (LOGGER.isTraceEnabled()) LOGGER.trace("Found unknown token type {} in robots.txt", Integer.valueOf(lineFirstToken));
		}
	}

	if (specific) return toSortedPrefixFreeCharArrays(set); // Some instructions specific to us
	// At this point specific is false, so the first half of the test below is always true.
	if (! specific && generic) return toSortedPrefixFreeCharArrays(setStar); // No specific instruction, but some generic ones
	// Neither a specific nor a generic record was seen: set has never been added to.
	return toSortedPrefixFreeCharArrays(set);
}
 
Example #14
Source File: ImmutableExternalPrefixMap.java    From database with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Command-line entry point: obtains a list of terms (a serialised list, a text file, or
 * newline-separated standard input, optionally gzip-compressed), builds an immutable external
 * prefix map from it and serialises the map to the file given as the {@code map} argument.
 *
 * @param arg the command-line arguments, parsed by JSAP.
 */
@SuppressWarnings("unchecked")
public static void main( final String[] arg ) throws ClassNotFoundException, IOException, JSAPException, SecurityException, NoSuchMethodException {
	final Parameter[] options = {
		new FlaggedOption( "blockSize", JSAP.INTSIZE_PARSER, ( STD_BLOCK_SIZE / 1024 ) + "Ki", JSAP.NOT_REQUIRED, 'b', "block-size", "The size of a block in the dump stream." ),
		new Switch( "serialised", 's', "serialised", "The data source (file or standard input) provides a serialised java.util.List of terms." ),
		new Switch( "zipped", 'z', "zipped", "Standard input is compressed in gzip format." ),
		new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file instead of standard input." ),
		new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term list encoding." ),
		new UnflaggedOption( "map", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised map." ),
		new UnflaggedOption( "dump", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "An optional dump stream (the resulting map will not be self-contained)." )
	};
	final SimpleJSAP parser = new SimpleJSAP( ImmutableExternalPrefixMap.class.getName(),
			"Builds an external map reading from standard input a newline-separated list of terms or a serialised term list. If the dump stream name is not specified, the map will be self-contained.",
			options );

	final JSAPResult parsed = parser.parse( arg );
	if ( parser.messagePrinted() ) return; // Help or error already shown

	final String termsFile = parsed.getString( "termFile" );
	final Charset charset = (Charset)parsed.getObject( "encoding" );
	final boolean gzipped = parsed.getBoolean( "zipped" );
	final boolean isSerialised = parsed.getBoolean( "serialised" );

	if ( gzipped && isSerialised ) throw new IllegalArgumentException( "The zipped and serialised options are incompatible" );

	final Collection<? extends CharSequence> terms;
	if ( isSerialised ) {
		// A serialised list, from the file if given, from standard input otherwise.
		final Object loaded;
		if ( termsFile != null ) loaded = BinIO.loadObject( termsFile );
		else loaded = BinIO.loadObject( System.in );
		terms = (List<? extends CharSequence>)loaded;
	}
	else if ( termsFile != null ) {
		terms = new FileLinesCollection( termsFile, charset.name(), gzipped );
	}
	else {
		// Newline-separated terms on standard input: copy each line into an in-memory list.
		final ObjectArrayList<MutableString> lines = new ObjectArrayList<MutableString>();
		terms = lines;
		final FastBufferedReader stdin = new FastBufferedReader( new InputStreamReader(
				gzipped ? new GZIPInputStream( System.in ) : System.in, charset.name() ) );
		final MutableString current = new MutableString();
		for ( ;; ) {
			if ( stdin.readLine( current ) == null ) break;
			lines.add( current.copy() );
		}
		stdin.close();
	}

	final int blockSize = parsed.getInt( "blockSize" );
	final String dumpName = parsed.getString( "dump" );
	final ImmutableExternalPrefixMap prefixMap = new ImmutableExternalPrefixMap( terms, blockSize, dumpName );
	BinIO.storeObject( prefixMap, parsed.getString( "map" ) );
}
 
Example #15
Source File: WikiTextExtractor.java    From tagme with Apache License 2.0 4 votes vote down vote up
/**
 * Flattens a text into a single run of sentences by stripping line-level markup.
 *
 * <p>The input is read line by line; each line is trimmed and, depending on its first
 * character, its leading (and for headings also trailing) markup characters are dropped
 * before the remainder is appended to the result followed by a space. A full stop is
 * inserted before the space when the appended text does not already end with one of
 * the characters in <code>":.;,-"</code> (headings always get <code>". "</code>).
 *
 * @param input the text to process.
 * @param onlyAbstract if true, processing stops as soon as more than
 *        <code>MIN_ABSTRACT_CHARS</code> characters have been accumulated.
 * @return the accumulated text, without its last character (the separator appended after
 *         the last piece) when non-empty.
 */
public MutableString removeStructure(MutableString input, boolean onlyAbstract)
	{
		
		MutableString buffer = new MutableString(1024);
		FastBufferedReader tokenizer = new FastBufferedReader(input);
		
		MutableString text = new MutableString(2048);
		// Characters after which no extra full stop is appended.
		String punts = ":.;,-";
		
		try {
			while(tokenizer.readLine(buffer) != null)
			{
				// Enough text for an abstract: drop the last character and stop.
				if (text.length() > MIN_ABSTRACT_CHARS && onlyAbstract){
					text.deleteCharAt(text.length()-1);
					return text;					
				}
				
//				MutableString linestr = new MutableString(buffer.trim());
				// linestr is the trimmed read buffer, not a copy.
				MutableString linestr = buffer.trim();
				if (linestr.length() == 0) continue;
				
				int start;
				int end;
				String chars;
				// Backing array of the line; only the first line_len characters are used.
				char[] line = linestr.array();
				int line_len = linestr.length();
				
				char first = linestr.charAt(0);
				switch (first)
				{
				// Line starting with '=': strip '=' and spaces at both ends.
				case '=':{
					chars = " =";
					for(start=0; start <line_len && chars.indexOf(line[start])>=0; start++);
					for(end=line_len-1; end >= 0  && chars.indexOf(line[end])>=0; end--);
					
					// NOTE(review): a single-character title (start == end) is skipped by this test.
					if (start < end){
						text.append(linestr.subSequence(start, end+1));
						text.append(". ");
					}
					break;
				}
					
				// Line starting with '*', '#', ':' or ';': strip the leading run of those characters and spaces.
				case '*':
				case '#':
				case ':':
				case ';':{
					
					chars = "*#:; ";
					for(start=0; start<line_len && chars.indexOf(line[start])>=0 ; start++);
					
					// At least two characters must remain after the markers.
					if (start < line_len-1){
						text.append(linestr.subSequence(start, linestr.length()));
						if (punts.indexOf(text.lastChar())<0)
							text.append('.');
						text.append(' ');
					}
					
					break;
				}
				// Lines starting with '{' or '|' are dropped.
				case '{':
				case '|':
					break;
				// Line starting with '.' or '-': the delete call is given both characters.
				case '.':
				case '-':{
					linestr.delete(new char[]{'.','-'});
					if (linestr.length() > 0){
						text.append(linestr);
						if (punts.indexOf(text.lastChar())<0)
							text.append('.');
						text.append(' ');
					}
					break;
				}
				// Any other line is kept as is, unless it ends with '}'.
				default:{
					if (linestr.lastChar() == '}')
						break;
					text.append(linestr);
					if (punts.indexOf(text.lastChar())<0)
						text.append('.');
					text.append(' ');
				}
				}
			}
		// An I/O error is silently ignored: whatever has been accumulated so far is returned.
		} catch (IOException e) {}
		// Drop the last character (each appended piece ends with a space).
		if (text.length()>0) text.deleteCharAt(text.length()-1);
		return text;
	}
 
Example #16
Source File: WikipediaIndexer.java    From tagme with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the DBpedia categories file for the given language into a map from article
 * title to the list of its categories.
 *
 * <p>The file is read as UTF-8, one statement per line. Lines starting with <code>#</code>
 * are skipped. On every other line, the title is the text between the first
 * <code>/resource/</code> and the following <code>&gt;</code>; the category is searched after
 * the title, as the text following the colon in a <code>/resource/xxx:</code> prefix. In both
 * values underscores are replaced by spaces. Lines on which either value cannot be found are
 * counted as errors and skipped.
 *
 * @param lang the language whose categories file must be parsed.
 * @return a map from titles to category lists, in file order within each list.
 * @throws IOException if the file cannot be opened or read.
 */
public static HashMap<String, List<String>> parseDBPediaCategories(String lang) throws IOException
{
	PLogger plog = new PLogger(log, Step.TEN_MINUTES, "Lines", "Articles", "Errors");
	plog.start("Parsing DBPEDIA categories");
	
	HashMap<String, List<String>> cats = new HashMap<String, List<String>>(1600000);

	Pattern patTitle = Pattern.compile("/resource/([^>]*)>");
	Pattern patCat = Pattern.compile("/resource/[^:</]*:([^>]*)>");

	
	File dbpedia_cat = WikipediaFiles.DBPEDIA_CAT.getSourceFile(lang);
	FastBufferedReader fbr = new FastBufferedReader(new InputStreamReader(new FileInputStream(dbpedia_cat), Charset.forName("UTF-8")));
	
	// The reader is closed even when reading fails half-way, so the file handle is not leaked.
	try {
		MutableString line = new MutableString(1024);
		while(fbr.readLine(line) != null)
		{
			plog.update(0); // One more line
			line.trim();
			if (line.startsWith("#")) continue;
			
			Matcher m = patTitle.matcher(line);
			if (!m.find())
			{
				plog.update(2); // One more error
				continue;
			}
			String title = m.group(1).replace('_', ' ');
			
			// The category is searched only after the end of the title match.
			int lastCharTitle = m.end();
			m = patCat.matcher(line);
			if (!m.find(lastCharTitle))
			{
				plog.update(2); // One more error
				continue;
			}
			String cat = m.group(1).replace('_', ' ');
			
			List<String> titleCats = cats.get(title);
			if (titleCats == null)
			{
				plog.update(1); // One more article
				titleCats = new ArrayList<String>();
				cats.put(title, titleCats);
			}
			titleCats.add(cat);
		}
		plog.stop();
	} finally {
		fbr.close();
	}
	
	return cats;
}
 
Example #17
Source File: WikipediaArticleParser.java    From tagme with Apache License 2.0 4 votes vote down vote up
/**
 * Opens {@code input} as UTF-8 text for reading and puts the parser in the idle state.
 *
 * @throws IOException if the input cannot be opened.
 */
protected void start() throws IOException
{
	final FileInputStream rawInput = new FileInputStream(input);
	final Charset utf8 = Charset.forName("UTF-8");
	reader = new FastBufferedReader(new InputStreamReader(rawInput, utf8));

	state = State.IDLE;
}
 
Example #18
Source File: ExternalSort.java    From tagme with Apache License 2.0 3 votes vote down vote up
/**
 * Reads rows from the reader registered for the given run until the run is exhausted or the
 * accumulated row length reaches {@code pageSize}.
 *
 * <p>Every row is stored in {@code map} under its key, in a fresh tuple that inherits the run
 * and the lines of the tuple previously stored under the same key, if any. When {@code uniq}
 * is set, the row is appended only if the tuple has no lines yet and the key differs from
 * {@code currKey}. The tuple of the row that fills the page is marked with the run number.
 *
 * @param runNumber the run whose next page must be loaded.
 * @throws IOException if reading from the run fails.
 */
protected void loadNextPage(long runNumber) throws IOException {
	final FastBufferedReader runReader = (FastBufferedReader)runsMap.get(runNumber);
	int loadedChars = 0;

	while (runReader.readLine(buff) != null) {
		final String row = buff.toString();
		numberOfInputRows++;
		loadedChars += buff.length();

		final String rowKey = ExternalSortUtils.getKey(row, columns, sep, numeric);

		// The new tuple replaces the old one in the map, but takes over its content.
		final Tuple current = new Tuple();
		final Tuple previous = (Tuple)map.put(rowKey, current);
		if (previous != null) {
			current.run = previous.run;
			current.lines = previous.lines;
		}

		final boolean duplicate = uniq && (current.lines.size() != 0 || rowKey.equals(currKey));
		if (!duplicate) current.append(row);

		if (loadedChars >= pageSize) {
			// Page full: remember that this run still has rows to be read.
			current.appendRun(runNumber);
			break;
		}
	}
}