Java Code Examples for java.io.StreamTokenizer#ordinaryChar()
The following examples show how to use
java.io.StreamTokenizer#ordinaryChar().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ArffLoader.java From moa with GNU General Public License v3.0 | 6 votes |
/**
 * Instantiates a new ARFF loader, configures the tokenizer for the ARFF
 * syntax, and immediately reads the header of the stream.
 *
 * @param reader the reader supplying the ARFF content
 * @param range  output-attribute range for multi-label data; may be null
 */
public ArffLoader(Reader reader, Range range) {
    this.range = range;
    BufferedReader br = new BufferedReader(reader);
    //Init streamTokenizer
    streamTokenizer = new StreamTokenizer(br);
    // Start from an empty syntax table so only the rules below apply.
    streamTokenizer.resetSyntax();
    streamTokenizer.whitespaceChars(0, ' '); // control chars and space separate tokens
    streamTokenizer.wordChars(' ' + 1, '\u00FF'); // every printable Latin-1 char can form a word
    streamTokenizer.whitespaceChars(',', ','); // commas separate ARFF values
    streamTokenizer.commentChar('%'); // ARFF comment lines start with %
    streamTokenizer.quoteChar('"');
    streamTokenizer.quoteChar('\'');
    streamTokenizer.ordinaryChar('{'); // braces delimit sparse-instance rows
    streamTokenizer.ordinaryChar('}');
    streamTokenizer.eolIsSignificant(true); // instances are line-oriented
    this.instanceInformation = this.getHeader();
    if (range != null) { //is MultiLabel
        this.instanceInformation.setRangeOutputIndices(range);
    }
}
Example 2
Source File: ArffLoader.java From incubator-samoa with Apache License 2.0 | 6 votes |
/**
 * Instantiates a new ARFF loader and parses the stream header.
 *
 * @param reader the reader supplying the ARFF content
 * @param range  output-attribute range (multi-label case); null for single-label
 */
public ArffLoader(Reader reader, Range range) {
    this.range = range;
    BufferedReader br = new BufferedReader(reader);
    //Init streamTokenizer
    streamTokenizer = new StreamTokenizer(br);
    streamTokenizer.resetSyntax(); // rebuild the syntax table from scratch for ARFF
    streamTokenizer.whitespaceChars(0, ' ');
    streamTokenizer.wordChars(' ' + 1, '\u00FF');
    streamTokenizer.whitespaceChars(',', ','); // value separator
    streamTokenizer.commentChar('%'); // ARFF comments
    streamTokenizer.quoteChar('"');
    streamTokenizer.quoteChar('\'');
    streamTokenizer.ordinaryChar('{'); // sparse-format delimiters
    streamTokenizer.ordinaryChar('}');
    streamTokenizer.eolIsSignificant(true); // one instance per line
    this.instanceInformation = this.getHeader();
    if (range != null) { //is MultiLabel
        this.instanceInformation.setRangeOutputIndices(range);
    }
}
Example 3
Source File: Algorithm.java From KEEL with GNU General Public License v3.0 | 6 votes |
/**
 * Configures a tokenizer for reading KEEL data files: tokens are runs of
 * printable characters, commas and control characters act as separators,
 * quoting with " or ' is honoured, and the structural characters
 * = { } [ ] surface as individual single-character tokens.
 *
 * @param tokenizer the tokenizer to configure
 */
protected void initTokenizer( StreamTokenizer tokenizer ) {
    // Wipe the default syntax table and rebuild only what we need.
    tokenizer.resetSyntax();
    tokenizer.whitespaceChars( 0, ' ' );
    tokenizer.wordChars( ' ' + 1, '\u00FF' );
    tokenizer.whitespaceChars( ',', ',' );
    tokenizer.quoteChar( '"' );
    tokenizer.quoteChar( '\'' );
    // Structural characters must be returned on their own.
    for ( char structural : new char[] { '=', '{', '}', '[', ']' } ) {
        tokenizer.ordinaryChar( structural );
    }
    tokenizer.eolIsSignificant( true );
}
Example 4
Source File: ArffLoader.java From samoa with Apache License 2.0 | 6 votes |
/**
 * Builds the tokenizer used to parse the ARFF stream, reads the header and
 * sets the class index: when no class attribute was given the last attribute
 * becomes the class, otherwise the configured index is used.
 *
 * @param reader the reader supplying the ARFF content
 */
private void initStreamTokenizer(Reader reader) {
    BufferedReader br = new BufferedReader(reader);
    //Init streamTokenizer
    streamTokenizer = new StreamTokenizer(br);
    streamTokenizer.resetSyntax(); // drop the default syntax; rebuild for ARFF
    streamTokenizer.whitespaceChars(0, ' ');
    streamTokenizer.wordChars(' ' + 1, '\u00FF');
    streamTokenizer.whitespaceChars(',', ','); // commas separate values
    streamTokenizer.commentChar('%'); // ARFF comments
    streamTokenizer.quoteChar('"');
    streamTokenizer.quoteChar('\'');
    streamTokenizer.ordinaryChar('{'); // sparse-instance delimiters
    streamTokenizer.ordinaryChar('}');
    streamTokenizer.eolIsSignificant(true);
    this.instanceInformation = this.getHeader();
    if (classAttribute < 0) {
        // No explicit class attribute: default to the last attribute.
        this.instanceInformation.setClassIndex(this.instanceInformation.numAttributes() - 1);
        //System.out.print(this.instanceInformation.classIndex());
    } else if (classAttribute > 0) {
        // classAttribute appears to be 1-based (converted to 0-based here) —
        // NOTE(review): confirm against callers. classAttribute == 0 leaves
        // the class index untouched; verify that is intentional.
        this.instanceInformation.setClassIndex(classAttribute - 1);
    }
}
Example 5
Source File: Parser.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the join-expression string {@code s}: double-quoted strings and
 * numbers are parsed as units, the punctuation , ( ) is returned as
 * individual tokens, and $ and _ may appear inside words.
 *
 * @param s the expression to tokenize
 */
Lexer(String s) {
    tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
    tok.quoteChar('"');
    tok.parseNumbers();
    tok.ordinaryChar(','); // argument separator
    tok.ordinaryChar('('); // grouping
    tok.ordinaryChar(')');
    tok.wordChars('$','$'); // allow $ and _ inside identifiers
    tok.wordChars('_','_');
}
Example 6
Source File: Parser.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the join-expression string {@code s}: double-quoted strings and
 * numbers are parsed as units, the punctuation , ( ) is returned as
 * individual tokens, and $ and _ may appear inside words.
 *
 * @param s the expression to tokenize
 */
Lexer(String s) {
    tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
    tok.quoteChar('"');
    tok.parseNumbers();
    tok.ordinaryChar(','); // argument separator
    tok.ordinaryChar('('); // grouping
    tok.ordinaryChar(')');
    tok.wordChars('$','$'); // allow $ and _ inside identifiers
    tok.wordChars('_','_');
}
Example 7
Source File: StringUtils.java From CloverETL-Engine with GNU Lesser General Public License v2.1 | 5 votes |
/** * Abbreviates input char sequence by using first letter of each word composing the sequence. * Skips any spaces, punctuation and other special chars. * * @param input character sequence to abbreviate * @param elementLength how many characters from each word to use (if more than 1, then each element is divided by "_" in output * @param capitalize capitalize letters - i.e. convert to uppercase * @param useNumbers also include numbers in sequence * @return abbreviation of input sequence */ public static CharSequence abbreviateString(CharSequence input, int elementLength, boolean capitalize,boolean useNumbers){ StringBuilder out=new StringBuilder(); StreamTokenizer st = new StreamTokenizer(new CharSequenceReader(input)); st.ordinaryChar('.'); st.ordinaryChar('-'); try{ while(st.nextToken() != StreamTokenizer.TT_EOF) { switch(st.ttype) { case StreamTokenizer.TT_WORD: if (elementLength>1 && out.length()>0) out.append('_'); // append underscore to divide abbr.pieces if (capitalize){ String s = st.sval.subSequence(0, elementLength).toString(); out.append(s.toUpperCase()); }else{ out.append(st.sval.subSequence(0, elementLength)); } break; case StreamTokenizer.TT_NUMBER: if (useNumbers) out.append((int)st.nval); break; default: // do nothing } } }catch(IOException ex){ } return out; }
Example 8
Source File: Parser.java From big-c with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the join-expression string {@code s}: double-quoted strings and
 * numbers are parsed as units, the punctuation , ( ) is returned as
 * individual tokens, and $ and _ may appear inside words.
 *
 * @param s the expression to tokenize
 */
Lexer(String s) {
    tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
    tok.quoteChar('"');
    tok.parseNumbers();
    tok.ordinaryChar(','); // argument separator
    tok.ordinaryChar('('); // grouping
    tok.ordinaryChar(')');
    tok.wordChars('$','$'); // allow $ and _ inside identifiers
    tok.wordChars('_','_');
}
Example 9
Source File: Parser.java From big-c with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the join-expression string {@code s}: double-quoted strings and
 * numbers are parsed as units, the punctuation , ( ) is returned as
 * individual tokens, and $ and _ may appear inside words.
 *
 * @param s the expression to tokenize
 */
Lexer(String s) {
    tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
    tok.quoteChar('"');
    tok.parseNumbers();
    tok.ordinaryChar(','); // argument separator
    tok.ordinaryChar('('); // grouping
    tok.ordinaryChar(')');
    tok.wordChars('$','$'); // allow $ and _ inside identifiers
    tok.wordChars('_','_');
}
Example 10
Source File: NewAnalyzerTask.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Set the params (analyzerName only): a comma-separated list of Analyzer
 * class names. If the Analyzer lives in org.apache.lucene.analysis, the name
 * can be shortened by dropping the o.a.l.a part of the Fully Qualified Class
 * Name.
 * <p>
 * Analyzer names may also refer to previously defined AnalyzerFactory's.
 * <p>
 * Example declaration:
 * {@code NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer)}
 * <p>
 * Example AnalyzerFactory usage:
 * <pre>
 *   -AnalyzerFactory(name:'whitespace tokenized',WhitespaceTokenizer)
 *   -NewAnalyzer('whitespace tokenized')
 * </pre>
 * @param params analyzerClassName, or empty for the StandardAnalyzer
 */
@Override
public void setParams(String params) {
    super.setParams(params);
    // Names may be bare words or quoted (single or double) factory names.
    final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
    stok.quoteChar('"');
    stok.quoteChar('\'');
    stok.eolIsSignificant(false);
    stok.ordinaryChar(','); // commas separate names but carry no meaning
    try {
        while (stok.nextToken() != StreamTokenizer.TT_EOF) {
            switch (stok.ttype) {
                case ',': {
                    // Do nothing
                    break;
                }
                case '\'':
                case '\"':
                case StreamTokenizer.TT_WORD: {
                    // Quoted tokens and plain words both land in sval.
                    analyzerNames.add(stok.sval);
                    break;
                }
                default: {
                    throw new RuntimeException("Unexpected token: " + stok.toString());
                }
            }
        }
    } catch (RuntimeException e) {
        // Re-throw errors that already carry a line prefix; wrap the rest
        // with the offending .alg line number for easier debugging.
        // NOTE(review): e.getMessage() may be null for some RuntimeExceptions,
        // which would NPE here — confirm all thrown exceptions carry messages.
        if (e.getMessage().startsWith("Line #")) {
            throw e;
        } else {
            throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", e);
        }
    } catch (Throwable t) {
        throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", t);
    }
}
Example 11
Source File: Jatalog.java From Jatalog with Apache License 2.0 | 5 votes |
private static StreamTokenizer getTokenizer(Reader reader) throws IOException { StreamTokenizer scan = new StreamTokenizer(reader); scan.ordinaryChar('.'); // '.' looks like a number to StreamTokenizer by default scan.commentChar('%'); // Prolog-style % comments; slashSlashComments and slashStarComments can stay as well. scan.quoteChar('"'); scan.quoteChar('\''); // WTF? You can't disable parsing of numbers unless you reset the syntax (http://stackoverflow.com/q/8856750/115589) //scan.parseNumbers(); return scan; }
Example 12
Source File: RunCART.java From KEEL with GNU General Public License v3.0 | 5 votes |
/**
 * Sets up the tokenizer for KEEL dataset files: printable characters form
 * words, commas and control characters separate tokens, " and ' quote
 * strings, end-of-line is reported, and the structural characters
 * = { } [ ] come back as single-character tokens.
 *
 * @param tokenizer the tokenizer to configure
 */
private void initTokenizer(StreamTokenizer tokenizer) {
    tokenizer.resetSyntax(); // rebuild the syntax table from a clean slate
    tokenizer.whitespaceChars(0, ' ');
    tokenizer.wordChars(' ' + 1, '\u00FF');
    tokenizer.whitespaceChars(',', ',');
    tokenizer.quoteChar('"');
    tokenizer.quoteChar('\'');
    // Mark each structural character as ordinary so it is emitted alone.
    final String structural = "={}[]";
    for (int i = 0; i < structural.length(); i++) {
        tokenizer.ordinaryChar(structural.charAt(i));
    }
    tokenizer.eolIsSignificant(true);
}
Example 13
Source File: Lexer.java From jackrabbit-filevault with Apache License 2.0 | 5 votes |
/**
 * Creates a lexer for CND (compact node type definition) input. Word
 * characters cover letters plus ':' and '_' (JCR name characters); both
 * quote styles are honoured; C-style comments are skipped; and every CND
 * structural character is emitted as its own token.
 *
 * @param r the CND source
 * @param systemId identifier of the source, used for error reporting
 */
public Lexer(Reader r, String systemId) {
    this.systemId = systemId;
    st = new StreamTokenizer(r);
    st.eolIsSignificant(false); // CND is free-form; line breaks are whitespace
    st.lowerCaseMode(false); // JCR names are case-sensitive
    st.slashSlashComments(true);
    st.slashStarComments(true);
    st.wordChars('a', 'z');
    st.wordChars('A', 'Z');
    st.wordChars(':', ':'); // namespace-prefixed names, e.g. nt:base
    st.wordChars('_', '_');
    st.quoteChar(SINGLE_QUOTE);
    st.quoteChar(DOUBLE_QUOTE);
    // Each structural delimiter below must surface as a single token.
    st.ordinaryChar(BEGIN_NODE_TYPE_NAME);
    st.ordinaryChar(END_NODE_TYPE_NAME);
    st.ordinaryChar(EXTENDS);
    st.ordinaryChar(LIST_DELIMITER);
    st.ordinaryChar(PROPERTY_DEFINITION);
    st.ordinaryChar(CHILD_NODE_DEFINITION);
    st.ordinaryChar(BEGIN_TYPE);
    st.ordinaryChar(END_TYPE);
    st.ordinaryChar(DEFAULT);
    st.ordinaryChar(CONSTRAINT);
}
Example 14
Source File: Parser.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the join-expression string {@code s}: double-quoted strings and
 * numbers are parsed as units, the punctuation , ( ) is returned as
 * individual tokens, and $ and _ may appear inside words.
 *
 * @param s the expression to tokenize
 */
Lexer(String s) {
    tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
    tok.quoteChar('"');
    tok.parseNumbers();
    tok.ordinaryChar(','); // argument separator
    tok.ordinaryChar('('); // grouping
    tok.ordinaryChar(')');
    tok.wordChars('$','$'); // allow $ and _ inside identifiers
    tok.wordChars('_','_');
}
Example 15
Source File: STExample.java From icafe with Eclipse Public License 1.0 | 4 votes |
/**
 * Interactive expression-calculator REPL. Reads lines of the form
 * {@code name = expression} from stdin, evaluates them, and stores results
 * in a variable table. The words dump / clear / quit / exit / help are
 * treated as commands.
 *
 * @param args unused
 * @throws IOException if reading from stdin fails
 */
public static void main(String args[]) throws IOException {
    Hashtable<String, Double> variables = new Hashtable<String, Double>();
    // StreamTokenizer(InputStream) is deprecated, hence the suppression.
    @SuppressWarnings("deprecation")
    StreamTokenizer st = new StreamTokenizer(System.in);
    st.eolIsSignificant(true); // one expression per line
    st.lowerCaseMode(true); // commands/identifiers are case-insensitive
    st.ordinaryChar('/'); // '/' and '-' are operators, not comments/numbers
    st.ordinaryChar('-');
    while (true) {
        Expression res;
        int c = StreamTokenizer.TT_EOL;
        String varName = null;
        System.out.println("Enter an expression...");
        try {
            // Skip blank lines, handle commands, and capture the variable
            // name that must precede the '=' sign.
            while (true) {
                c = st.nextToken();
                if (c == StreamTokenizer.TT_EOF) {
                    System.exit(1); // stdin closed
                } else if (c == StreamTokenizer.TT_EOL) {
                    continue;
                } else if (c == StreamTokenizer.TT_WORD) {
                    if (st.sval.compareTo("dump") == 0) {
                        dumpVariables(variables);
                        continue;
                    } else if (st.sval.compareTo("clear") == 0) {
                        variables = new Hashtable<String, Double>();
                        continue;
                    } else if (st.sval.compareTo("quit") == 0) {
                        System.exit(0);
                    } else if (st.sval.compareTo("exit") == 0) {
                        System.exit(0);
                    } else if (st.sval.compareTo("help") == 0) {
                        help();
                        continue;
                    }
                    // Not a command: must be the assignment target.
                    varName = st.sval;
                    c = st.nextToken();
                }
                break;
            }
            if (c != '=') {
                throw new SyntaxError("missing initial '=' sign.");
            }
            res = ParseExpression.expression(st);
        } catch (SyntaxError se) {
            res = null;
            varName = null;
            System.out.println("\nSyntax Error detected! - "+se.getMsg());
            // Drain the rest of the offending line before re-prompting.
            while (c != StreamTokenizer.TT_EOL)
                c = st.nextToken();
            continue;
        }
        c = st.nextToken();
        if (c != StreamTokenizer.TT_EOL) {
            // Trailing garbage after a complete expression.
            if (c == ')')
                System.out.println("\nSyntax Error detected! - To many closing parens.");
            else
                System.out.println("\nBogus token on input - "+c);
            while (c != StreamTokenizer.TT_EOL)
                c = st.nextToken();
        } else {
            try {
                Double z;
                System.out.println("Parsed expression : "+res.unparse());
                z = new Double(res.value(variables)); // NOTE(review): Double ctor is deprecated; Double.valueOf would be preferred
                System.out.println("Value is : "+z);
                if (varName != null) {
                    variables.put(varName, z);
                    System.out.println("Assigned to : "+varName);
                }
            } catch (ExecError ee) {
                System.out.println("Execution error, "+ee.getMsg()+"!");
            }
        }
    }
}
Example 16
Source File: ReportStructureMatcher.java From pentaho-reporting with GNU Lesser General Public License v2.1 | 4 votes |
/**
 * Parses a CSS-like selector string into a NodeMatcher tree. Supported
 * syntax (as implemented below): element names and '*', '#id', '.class',
 * the child combinator '>', and whitespace as the descendant combinator.
 *
 * @param s the selector expression
 * @return the root matcher, or null for an empty selector
 * @throws IOException on tokenizer failure or an unexpected parser state
 */
public static NodeMatcher parse( final String s ) throws IOException {
    final StreamTokenizer tokenizer = new StreamTokenizer( new StringReader( s ) );
    tokenizer.wordChars( '0', '9' ); // digits may appear inside names
    tokenizer.ordinaryChar( '.' ); // '.' introduces a class selector
    tokenizer.ordinaryChar( ',' );
    tokenizer.ordinaryChars( 0, ' ' ); // whitespace is a significant combinator
    ElementMatcher elementMatcher = null;
    NodeMatcher n = null;
    // selectorType tracks what the NEXT word token should mean.
    Type selectorType = Type.Start;
    int token;
    while ( ( token = tokenizer.nextToken() ) != StreamTokenizer.TT_EOF ) {
        if ( token == StreamTokenizer.TT_WORD || token == '*' ) {
            NodeMatcher matcher = null;
            switch ( selectorType ) {
                case Start:
                    elementMatcher = createMatcher( tokenizer );
                    matcher = elementMatcher;
                    break;
                case Child:
                    // '>' seen: wrap current chain in a child relationship.
                    n = new ChildMatcher( n );
                    elementMatcher = createMatcher( tokenizer );
                    matcher = elementMatcher;
                    break;
                case Descendant:
                    // whitespace seen: wrap in a descendant relationship.
                    n = new DescendantMatcher( n );
                    elementMatcher = createMatcher( tokenizer );
                    matcher = elementMatcher;
                    break;
                case Id:
                    // '#id' — attach an id-attribute condition; create an
                    // implicit element matcher if none is active.
                    if ( elementMatcher == null ) {
                        if ( n != null ) {
                            n = new DescendantMatcher( n );
                        }
                        elementMatcher = createMatcher( tokenizer );
                        matcher = elementMatcher;
                    }
                    elementMatcher.add( new AttributeMatcher( AttributeNames.Xml.NAMESPACE,
                        AttributeNames.Xml.ID, tokenizer.sval ) );
                    break;
                case Class:
                    // '.class' — same pattern as Id, different attribute.
                    if ( elementMatcher == null ) {
                        if ( n != null ) {
                            n = new DescendantMatcher( n );
                        }
                        elementMatcher = createMatcher( tokenizer );
                        matcher = elementMatcher;
                    }
                    elementMatcher.add( new AttributeMatcher( AttributeNames.Core.NAMESPACE,
                        AttributeNames.Core.STYLE_CLASS, tokenizer.sval ) );
                    break;
                default:
                    throw new IOException();
            }
            selectorType = Type.Element;
            // Combine the new matcher with the chain built so far.
            if ( matcher != null ) {
                if ( n != null ) {
                    n = new AndMatcher( matcher, n );
                } else {
                    n = matcher;
                }
            }
        } else {
            // Combinator / selector-prefix tokens alter the parser state.
            if ( token == '>' ) {
                selectorType = Type.Child;
            }
            if ( token == '.' ) {
                selectorType = Type.Class;
            }
            if ( token == '#' ) {
                selectorType = Type.Id;
            }
            if ( Character.isWhitespace( token ) ) {
                // Whitespace directly after '.' or '#' is malformed input.
                if ( selectorType == Type.Class || selectorType == Type.Id ) {
                    throw new IllegalStateException();
                }
                // Whitespace after '>' does not demote child to descendant.
                if ( selectorType != Type.Child ) {
                    selectorType = Type.Descendant;
                }
            }
        }
    }
    return n;
}
Example 17
Source File: OldStreamTokenizerTest.java From j2objc with Apache License 2.0 | 4 votes |
/**
 * Exercises the basic StreamTokenizer behaviour: default word/number
 * tokenization with line counting (tokenizer a), and a customized syntax —
 * comment char 'u', significant EOLs, lower-case mode, ordinary 'y' and
 * slash-star comments (tokenizer b) — checked via toString() snapshots.
 */
public void test_basicStringTokenizerMethods() throws IOException {
    String str = "Testing 12345 \n alpha \r\n omega";
    String strb = "-3.8 'BLIND mice' \r sEe /* how */ they run";
    StringReader aa = new StringReader(str);
    StringReader ba = new StringReader(strb);
    StreamTokenizer a = new StreamTokenizer(aa);
    StreamTokenizer b = new StreamTokenizer(ba);
    // Default syntax: words and numbers, \n and \r\n both advance lineno.
    Assert.assertTrue(a.lineno() == 1);
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(a.toString().equals("Token[Testing], line 1"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_NUMBER);
    Assert.assertTrue(a.toString().equals("Token[n=12345.0], line 1"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(a.toString().equals("Token[alpha], line 2"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(a.toString().equals("Token[omega], line 3"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_EOF);
    Assert.assertTrue(a.toString().equals("Token[EOF], line 3"));
    // Customized syntax for tokenizer b.
    b.commentChar('u');
    b.eolIsSignificant(true);
    b.lowerCaseMode(true);
    b.ordinaryChar('y');
    b.slashStarComments(true);
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_NUMBER);
    Assert.assertTrue(b.nval == -3.8);
    Assert.assertTrue(b.toString().equals("Token[n=-3.8], line 1"));
    Assert.assertTrue(b.nextToken() == 39); // '
    Assert.assertTrue(b.toString().equals("Token[BLIND mice], line 1"));
    Assert.assertTrue(b.nextToken() == 10); // \n
    Assert.assertTrue(b.toString().equals("Token[EOL], line 2"));
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(b.toString().equals("Token[see], line 2")); // lower-cased
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_WORD);
    // "they" is truncated: 'y' is ordinary, so the word stops at "the".
    Assert.assertTrue(b.toString().equals("Token[the], line 2"));
    Assert.assertTrue(b.nextToken() == 121); // y
    Assert.assertTrue(b.toString().equals("Token['y'], line 2"));
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_WORD);
    // "run" is swallowed from the 'u' comment char onwards, leaving "r".
    Assert.assertTrue(b.toString().equals("Token[r], line 2"));
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_EOF);
    Assert.assertTrue(b.toString().equals("Token[EOF], line 2"));
}
Example 18
Source File: UnparsedTag.java From hlsparserj with Apache License 2.0 | 4 votes |
/**
 * Parses an HLS playlist tag line into tagName and the attributes map.
 * Attribute values may be quoted; bare values with no NAME= part are stored
 * under synthetic "NONAME&lt;n&gt;" keys. A URI attribute, when present, is
 * also copied into the uri field.
 *
 * @param line playlist line item
 */
private void parseTagLine(final String line) {
    final Matcher lineMatcher = TAGPATTERN.matcher(line); // Create a matcher that uses the TAGPATTERN
    if (lineMatcher.find()) {
        tagName = lineMatcher.group(1);
        final String attributeList = lineMatcher.group(2);
        final StreamTokenizer tokenizer = new StreamTokenizer(new StringReader(attributeList));
        // Custom syntax: everything printable is a word except the
        // structural ',' and '=' and quoted strings.
        tokenizer.resetSyntax();
        tokenizer.wordChars(' ', 255);
        tokenizer.quoteChar('"');
        tokenizer.ordinaryChar(',');
        tokenizer.ordinaryChar('=');
        String attributeName = null;
        String attributeValue = null;
        int noNameCount = 0;
        do {
            int ttype;
            try {
                ttype = tokenizer.nextToken();
            } catch (IOException e) {
                // Should never get here because we read from an in-memory String
                throw new IllegalStateException(e);
            }
            if (ttype == ',' || ttype == StreamTokenizer.TT_EOF) {
                // End of one attribute: flush what we collected.
                if (attributeValue == null) {
                    // Not actually an attribute - just a single value
                    attributes.put("NONAME" + noNameCount, attributeName);
                    noNameCount++;
                    attributeName = null;
                } else {
                    attributes.put(attributeName, attributeValue);
                    attributeName = null;
                    attributeValue = null;
                }
            } else if (ttype == StreamTokenizer.TT_WORD || ttype == '"') {
                // First word of a pair is the name, the next is the value
                // ('=' tokens are simply skipped).
                if (attributeName == null) {
                    attributeName = tokenizer.sval;
                } else {
                    attributeValue = tokenizer.sval;
                }
            }
        } while (tokenizer.ttype != StreamTokenizer.TT_EOF);
        // Set the URI if a URI attribute is present
        if (attributes.containsKey(URI_ATTR)) {
            uri = attributes.get(URI_ATTR);
        }
    } else {
        // If the line starts with #EXT but does not contain a colon, it is a
        // tag with no attributes.
        tagName = line.substring(1);
    }
}
Example 19
Source File: OldAndroidStreamTokenizerTest.java From j2objc with Apache License 2.0 | 4 votes |
/**
 * Exercises default and customized StreamTokenizer behaviour (comment char,
 * significant EOLs, lower-case mode, ordinary char, slash-star comments),
 * plus two Harmony regression cases for the deprecated InputStream
 * constructor: a lone '-' and a double-quoted string.
 */
public void testStreamTokenizer() throws Exception {
    String str = "Testing 12345 \n alpha \r\n omega";
    String strb = "-3.8 'BLIND mice' \r sEe /* how */ they run";
    StringReader aa = new StringReader(str);
    StringReader ba = new StringReader(strb);
    StreamTokenizer a = new StreamTokenizer(aa);
    StreamTokenizer b = new StreamTokenizer(ba);
    // Default syntax: words and numbers; \n and \r\n advance lineno.
    assertEquals(1, a.lineno());
    assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    assertEquals("Token[Testing], line 1", a.toString());
    assertEquals(StreamTokenizer.TT_NUMBER, a.nextToken());
    assertEquals("Token[n=12345.0], line 1", a.toString());
    assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    assertEquals("Token[alpha], line 2", a.toString());
    assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    assertEquals("Token[omega], line 3", a.toString());
    assertEquals(StreamTokenizer.TT_EOF, a.nextToken());
    assertEquals("Token[EOF], line 3", a.toString());
    // Customized syntax for tokenizer b.
    b.commentChar('u');
    b.eolIsSignificant(true);
    b.lowerCaseMode(true);
    b.ordinaryChar('y');
    b.slashStarComments(true);
    assertEquals(StreamTokenizer.TT_NUMBER, b.nextToken());
    assertEquals(-3.8, b.nval);
    assertEquals("Token[n=-3.8], line 1", b.toString());
    assertEquals(39, b.nextToken()); // '
    assertEquals("Token[BLIND mice], line 1", b.toString());
    assertEquals(10, b.nextToken()); // \n
    assertEquals("Token[EOL], line 2", b.toString());
    assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    assertEquals("Token[see], line 2", b.toString()); // lower-cased
    assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    // "they" stops at "the" because 'y' is now an ordinary char.
    assertEquals("Token[the], line 2", b.toString());
    assertEquals(121, b.nextToken()); // y
    assertEquals("Token['y'], line 2", b.toString());
    assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    // "run" is swallowed from the 'u' comment char onwards, leaving "r".
    assertEquals("Token[r], line 2", b.toString());
    assertEquals(StreamTokenizer.TT_EOF, b.nextToken());
    assertEquals("Token[EOF], line 2", b.toString());
    // A harmony regression test
    byte[] data = new byte[]{(byte) '-'};
    StreamTokenizer tokenizer = new StreamTokenizer(new ByteArrayInputStream(data));
    tokenizer.nextToken();
    String result = tokenizer.toString();
    assertEquals("Token['-'], line 1", result);
    // another harmony regression test
    byte[] data2 = new byte[]{(byte) '"', (byte) 'H', (byte) 'e', (byte) 'l',
            (byte) 'l', (byte) 'o', (byte) '"'};
    StreamTokenizer tokenizer2 = new StreamTokenizer(new ByteArrayInputStream(data2));
    tokenizer2.nextToken();
    result = tokenizer2.toString();
    assertEquals("Token[Hello], line 1", result);
}
Example 20
Source File: URLRespectsRobots.java From BUbiNG with Apache License 2.0 | 4 votes |
/** Parses the argument as if it were the content of a <code>robots.txt</code> file,
 * and returns a sorted array of prefixes of URLs that the agent should not follow.
 *
 * <p>Record selection: a record whose User-agent matches {@code userAgent}
 * takes precedence; otherwise the {@code *} record is used; if neither is
 * present an empty (fully permissive) result is returned.
 *
 * @param content the content of the <code>robots.txt</code> file.
 * @param userAgent the string representing the user agent of interest.
 * @return an array of character arrays, which are prefixes of the URLs not to follow, in sorted order.
 */
public static char[][] parseRobotsReader(final Reader content, final String userAgent) throws IOException {
    /* The set of disallowed paths specifically aimed at userAgent. */
    Set<String> set = new ObjectOpenHashSet<>();
    /* The set of disallowed paths specifically aimed at *. */
    Set<String> setStar = new ObjectOpenHashSet<>();
    /* True if the currently examined record is targetted to us. */
    boolean doesMatter = false;
    /* True if we have seen a section targetted to our agent. */
    boolean specific = false;
    /* True if we have seen a section targetted to *. */
    boolean generic = false;
    /* True if we are in a star section. */
    boolean starSection = false;
    StreamTokenizer st = new StreamTokenizer(new FastBufferedReader(content));
    int token;
    st.resetSyntax();
    st.eolIsSignificant(true); // We need EOLs to separate records
    st.wordChars(33, 255); // All characters may appear
    st.whitespaceChars(0, 32);
    st.ordinaryChar('#'); // We must manually simulate comments 8^(
    st.lowerCaseMode(false);
    while (true) {
        int lineFirstToken = st.nextToken();
        if (lineFirstToken == StreamTokenizer.TT_EOF) break;
        switch (lineFirstToken) {
            // Blank line: a new block is starting
            case StreamTokenizer.TT_EOL:
                doesMatter = false;
                break;
            // Comment or number: ignore until the end of line
            case StreamTokenizer.TT_NUMBER:
            case '#':
                do {
                    token = st.nextToken();
                } while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF);
                break;
            // A string
            case StreamTokenizer.TT_WORD:
                if (st.sval.equalsIgnoreCase("user-agent:")) {
                    // Decide whether the upcoming record applies to us, to *,
                    // or to some other crawler.
                    token = st.nextToken();
                    if (token == StreamTokenizer.TT_WORD)
                        if (StringUtils.startsWithIgnoreCase(userAgent, st.sval)) {
                            doesMatter = true;
                            specific = true;
                            starSection = false;
                        } else if (st.sval.equals("*")) {
                            starSection = true;
                            generic = true;
                        } else starSection = false;
                    // Ignore the rest of the line
                    while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF)
                        token = st.nextToken();
                } else if (st.sval.equalsIgnoreCase("disallow:")) {
                    token = st.nextToken();
                    if (token == StreamTokenizer.TT_EOL) {
                        // An empty Disallow line clears the active set
                        // (i.e. allows everything for that record).
                        if (doesMatter) set.clear();
                        else if (starSection) setStar.clear();
                    } else if (token == StreamTokenizer.TT_WORD) {
                        String disallowed = st.sval;
                        // Someone (erroneously) uses a trailing * to denote any suffix
                        if (disallowed.endsWith("*"))
                            disallowed = disallowed.substring(0, disallowed.length()-1);
                        if (doesMatter) set.add(disallowed);
                        else if (starSection) setStar.add(disallowed);
                    }
                    // Ignore the rest of the line
                    while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF)
                        token = st.nextToken();
                }
                else if (LOGGER.isTraceEnabled())
                    LOGGER.trace("Line first token {} ununderstandable in robots.txt", st.sval);
                break;
            // Something else: a syntax error
            default:
                if (LOGGER.isTraceEnabled())
                    LOGGER.trace("Found unknown token type {} in robots.txt", Integer.valueOf(lineFirstToken));
        }
    }
    if (specific) return toSortedPrefixFreeCharArrays(set); // Some instructions specific to us
    if (! specific && generic) return toSortedPrefixFreeCharArrays(setStar); // No specific instruction, but some generic ones
    return toSortedPrefixFreeCharArrays(set); // Neither: set is empty, everything is allowed
}