Java Code Examples for java.io.StreamTokenizer#ordinaryChar()
The following examples show how to use
java.io.StreamTokenizer#ordinaryChar().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ArffLoader.java From moa with GNU General Public License v3.0 | 6 votes |
/**
 * Instantiates a new ARFF loader, configures the tokenizer for the ARFF
 * syntax, and immediately reads the header of the stream.
 *
 * @param reader the reader supplying the ARFF content
 * @param range  output-attribute range for multi-label data; may be null
 */
public ArffLoader(Reader reader, Range range) {
    this.range = range;
    BufferedReader br = new BufferedReader(reader);
    //Init streamTokenizer
    streamTokenizer = new StreamTokenizer(br);
    // Start from an empty syntax table so only the rules below apply.
    streamTokenizer.resetSyntax();
    streamTokenizer.whitespaceChars(0, ' '); // control chars and space separate tokens
    streamTokenizer.wordChars(' ' + 1, '\u00FF'); // every printable Latin-1 char can form a word
    streamTokenizer.whitespaceChars(',', ','); // commas separate ARFF values
    streamTokenizer.commentChar('%'); // ARFF comment lines start with %
    streamTokenizer.quoteChar('"');
    streamTokenizer.quoteChar('\'');
    streamTokenizer.ordinaryChar('{'); // braces delimit sparse-instance rows
    streamTokenizer.ordinaryChar('}');
    streamTokenizer.eolIsSignificant(true); // instances are line-oriented
    this.instanceInformation = this.getHeader();
    if (range != null) { //is MultiLabel
        this.instanceInformation.setRangeOutputIndices(range);
    }
}
Example 2
Source File: ArffLoader.java From incubator-samoa with Apache License 2.0 | 6 votes |
/**
 * Instantiates a new ARFF loader and parses the stream header.
 *
 * @param reader the reader supplying the ARFF content
 * @param range  output-attribute range (multi-label case); null for single-label
 */
public ArffLoader(Reader reader, Range range) {
    this.range = range;
    BufferedReader br = new BufferedReader(reader);
    //Init streamTokenizer
    streamTokenizer = new StreamTokenizer(br);
    streamTokenizer.resetSyntax(); // rebuild the syntax table from scratch for ARFF
    streamTokenizer.whitespaceChars(0, ' ');
    streamTokenizer.wordChars(' ' + 1, '\u00FF');
    streamTokenizer.whitespaceChars(',', ','); // value separator
    streamTokenizer.commentChar('%'); // ARFF comments
    streamTokenizer.quoteChar('"');
    streamTokenizer.quoteChar('\'');
    streamTokenizer.ordinaryChar('{'); // sparse-format delimiters
    streamTokenizer.ordinaryChar('}');
    streamTokenizer.eolIsSignificant(true); // one instance per line
    this.instanceInformation = this.getHeader();
    if (range != null) { //is MultiLabel
        this.instanceInformation.setRangeOutputIndices(range);
    }
}
Example 3
Source File: Algorithm.java From KEEL with GNU General Public License v3.0 | 6 votes |
/**
 * Configures a tokenizer for reading KEEL data files: tokens are runs of
 * printable characters, commas and control characters act as separators,
 * quoting with " or ' is honoured, and the structural characters
 * = { } [ ] surface as individual single-character tokens.
 *
 * @param tokenizer the tokenizer to configure
 */
protected void initTokenizer( StreamTokenizer tokenizer ) {
    // Wipe the default syntax table and rebuild only what we need.
    tokenizer.resetSyntax();
    tokenizer.whitespaceChars( 0, ' ' );
    tokenizer.wordChars( ' ' + 1, '\u00FF' );
    tokenizer.whitespaceChars( ',', ',' );
    tokenizer.quoteChar( '"' );
    tokenizer.quoteChar( '\'' );
    // Structural characters must be returned on their own.
    for ( char structural : new char[] { '=', '{', '}', '[', ']' } ) {
        tokenizer.ordinaryChar( structural );
    }
    tokenizer.eolIsSignificant( true );
}
Example 4
Source File: ArffLoader.java From samoa with Apache License 2.0 | 6 votes |
/**
 * Builds the tokenizer used to parse the ARFF stream, reads the header and
 * sets the class index: when no class attribute was given the last attribute
 * becomes the class, otherwise the configured index is used.
 *
 * @param reader the reader supplying the ARFF content
 */
private void initStreamTokenizer(Reader reader) {
    BufferedReader br = new BufferedReader(reader);
    //Init streamTokenizer
    streamTokenizer = new StreamTokenizer(br);
    streamTokenizer.resetSyntax(); // drop the default syntax; rebuild for ARFF
    streamTokenizer.whitespaceChars(0, ' ');
    streamTokenizer.wordChars(' ' + 1, '\u00FF');
    streamTokenizer.whitespaceChars(',', ','); // commas separate values
    streamTokenizer.commentChar('%'); // ARFF comments
    streamTokenizer.quoteChar('"');
    streamTokenizer.quoteChar('\'');
    streamTokenizer.ordinaryChar('{'); // sparse-instance delimiters
    streamTokenizer.ordinaryChar('}');
    streamTokenizer.eolIsSignificant(true);
    this.instanceInformation = this.getHeader();
    if (classAttribute < 0) {
        // No explicit class attribute: default to the last attribute.
        this.instanceInformation.setClassIndex(this.instanceInformation.numAttributes() - 1);
        //System.out.print(this.instanceInformation.classIndex());
    } else if (classAttribute > 0) {
        // classAttribute appears to be 1-based (converted to 0-based here) —
        // NOTE(review): confirm against callers. classAttribute == 0 leaves
        // the class index untouched; verify that is intentional.
        this.instanceInformation.setClassIndex(classAttribute - 1);
    }
}
Example 5
Source File: Parser.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the join-expression string {@code s}: double-quoted strings and
 * numbers are parsed as units, the punctuation , ( ) is returned as
 * individual tokens, and $ and _ may appear inside words.
 *
 * @param s the expression to tokenize
 */
Lexer(String s) {
    tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
    tok.quoteChar('"');
    tok.parseNumbers();
    tok.ordinaryChar(','); // argument separator
    tok.ordinaryChar('('); // grouping
    tok.ordinaryChar(')');
    tok.wordChars('$','$'); // allow $ and _ inside identifiers
    tok.wordChars('_','_');
}
Example 6
Source File: Parser.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the join-expression string {@code s}: double-quoted strings and
 * numbers are parsed as units, the punctuation , ( ) is returned as
 * individual tokens, and $ and _ may appear inside words.
 *
 * @param s the expression to tokenize
 */
Lexer(String s) {
    tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
    tok.quoteChar('"');
    tok.parseNumbers();
    tok.ordinaryChar(','); // argument separator
    tok.ordinaryChar('('); // grouping
    tok.ordinaryChar(')');
    tok.wordChars('$','$'); // allow $ and _ inside identifiers
    tok.wordChars('_','_');
}
Example 7
Source File: StringUtils.java From CloverETL-Engine with GNU Lesser General Public License v2.1 | 5 votes |
/** * Abbreviates input char sequence by using first letter of each word composing the sequence. * Skips any spaces, punctuation and other special chars. * * @param input character sequence to abbreviate * @param elementLength how many characters from each word to use (if more than 1, then each element is divided by "_" in output * @param capitalize capitalize letters - i.e. convert to uppercase * @param useNumbers also include numbers in sequence * @return abbreviation of input sequence */ public static CharSequence abbreviateString(CharSequence input, int elementLength, boolean capitalize,boolean useNumbers){ StringBuilder out=new StringBuilder(); StreamTokenizer st = new StreamTokenizer(new CharSequenceReader(input)); st.ordinaryChar('.'); st.ordinaryChar('-'); try{ while(st.nextToken() != StreamTokenizer.TT_EOF) { switch(st.ttype) { case StreamTokenizer.TT_WORD: if (elementLength>1 && out.length()>0) out.append('_'); // append underscore to divide abbr.pieces if (capitalize){ String s = st.sval.subSequence(0, elementLength).toString(); out.append(s.toUpperCase()); }else{ out.append(st.sval.subSequence(0, elementLength)); } break; case StreamTokenizer.TT_NUMBER: if (useNumbers) out.append((int)st.nval); break; default: // do nothing } } }catch(IOException ex){ } return out; }
Example 8
Source File: Parser.java From big-c with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the join-expression string {@code s}: double-quoted strings and
 * numbers are parsed as units, the punctuation , ( ) is returned as
 * individual tokens, and $ and _ may appear inside words.
 *
 * @param s the expression to tokenize
 */
Lexer(String s) {
    tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
    tok.quoteChar('"');
    tok.parseNumbers();
    tok.ordinaryChar(','); // argument separator
    tok.ordinaryChar('('); // grouping
    tok.ordinaryChar(')');
    tok.wordChars('$','$'); // allow $ and _ inside identifiers
    tok.wordChars('_','_');
}
Example 9
Source File: Parser.java From big-c with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the join-expression string {@code s}: double-quoted strings and
 * numbers are parsed as units, the punctuation , ( ) is returned as
 * individual tokens, and $ and _ may appear inside words.
 *
 * @param s the expression to tokenize
 */
Lexer(String s) {
    tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
    tok.quoteChar('"');
    tok.parseNumbers();
    tok.ordinaryChar(','); // argument separator
    tok.ordinaryChar('('); // grouping
    tok.ordinaryChar(')');
    tok.wordChars('$','$'); // allow $ and _ inside identifiers
    tok.wordChars('_','_');
}
Example 10
Source File: NewAnalyzerTask.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Set the params (analyzerName only): a comma-separated list of Analyzer
 * class names. If the Analyzer lives in org.apache.lucene.analysis, the name
 * can be shortened by dropping the o.a.l.a part of the Fully Qualified Class
 * Name.
 * <p>
 * Analyzer names may also refer to previously defined AnalyzerFactory's.
 * <p>
 * Example declaration:
 * {@code NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer)}
 * <p>
 * Example AnalyzerFactory usage:
 * <pre>
 *   -AnalyzerFactory(name:'whitespace tokenized',WhitespaceTokenizer)
 *   -NewAnalyzer('whitespace tokenized')
 * </pre>
 * @param params analyzerClassName, or empty for the StandardAnalyzer
 */
@Override
public void setParams(String params) {
    super.setParams(params);
    // Names may be bare words or quoted (single or double) factory names.
    final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
    stok.quoteChar('"');
    stok.quoteChar('\'');
    stok.eolIsSignificant(false);
    stok.ordinaryChar(','); // commas separate names but carry no meaning
    try {
        while (stok.nextToken() != StreamTokenizer.TT_EOF) {
            switch (stok.ttype) {
                case ',': {
                    // Do nothing
                    break;
                }
                case '\'':
                case '\"':
                case StreamTokenizer.TT_WORD: {
                    // Quoted tokens and plain words both land in sval.
                    analyzerNames.add(stok.sval);
                    break;
                }
                default: {
                    throw new RuntimeException("Unexpected token: " + stok.toString());
                }
            }
        }
    } catch (RuntimeException e) {
        // Re-throw errors that already carry a line prefix; wrap the rest
        // with the offending .alg line number for easier debugging.
        // NOTE(review): e.getMessage() may be null for some RuntimeExceptions,
        // which would NPE here — confirm all thrown exceptions carry messages.
        if (e.getMessage().startsWith("Line #")) {
            throw e;
        } else {
            throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", e);
        }
    } catch (Throwable t) {
        throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", t);
    }
}
Example 11
Source File: Jatalog.java From Jatalog with Apache License 2.0 | 5 votes |
private static StreamTokenizer getTokenizer(Reader reader) throws IOException { StreamTokenizer scan = new StreamTokenizer(reader); scan.ordinaryChar('.'); // '.' looks like a number to StreamTokenizer by default scan.commentChar('%'); // Prolog-style % comments; slashSlashComments and slashStarComments can stay as well. scan.quoteChar('"'); scan.quoteChar('\''); // WTF? You can't disable parsing of numbers unless you reset the syntax (http://stackoverflow.com/q/8856750/115589) //scan.parseNumbers(); return scan; }
Example 12
Source File: RunCART.java From KEEL with GNU General Public License v3.0 | 5 votes |
/**
 * Sets up the tokenizer for KEEL dataset files: printable characters form
 * words, commas and control characters separate tokens, " and ' quote
 * strings, end-of-line is reported, and the structural characters
 * = { } [ ] come back as single-character tokens.
 *
 * @param tokenizer the tokenizer to configure
 */
private void initTokenizer(StreamTokenizer tokenizer) {
    tokenizer.resetSyntax(); // rebuild the syntax table from a clean slate
    tokenizer.whitespaceChars(0, ' ');
    tokenizer.wordChars(' ' + 1, '\u00FF');
    tokenizer.whitespaceChars(',', ',');
    tokenizer.quoteChar('"');
    tokenizer.quoteChar('\'');
    // Mark each structural character as ordinary so it is emitted alone.
    final String structural = "={}[]";
    for (int i = 0; i < structural.length(); i++) {
        tokenizer.ordinaryChar(structural.charAt(i));
    }
    tokenizer.eolIsSignificant(true);
}
Example 13
Source File: Lexer.java From jackrabbit-filevault with Apache License 2.0 | 5 votes |
/**
 * Creates a lexer for CND (compact node type definition) input. Word
 * characters cover letters plus ':' and '_' (JCR name characters); both
 * quote styles are honoured; C-style comments are skipped; and every CND
 * structural character is emitted as its own token.
 *
 * @param r the CND source
 * @param systemId identifier of the source, used for error reporting
 */
public Lexer(Reader r, String systemId) {
    this.systemId = systemId;
    st = new StreamTokenizer(r);
    st.eolIsSignificant(false); // CND is free-form; line breaks are whitespace
    st.lowerCaseMode(false); // JCR names are case-sensitive
    st.slashSlashComments(true);
    st.slashStarComments(true);
    st.wordChars('a', 'z');
    st.wordChars('A', 'Z');
    st.wordChars(':', ':'); // namespace-prefixed names, e.g. nt:base
    st.wordChars('_', '_');
    st.quoteChar(SINGLE_QUOTE);
    st.quoteChar(DOUBLE_QUOTE);
    // Each structural delimiter below must surface as a single token.
    st.ordinaryChar(BEGIN_NODE_TYPE_NAME);
    st.ordinaryChar(END_NODE_TYPE_NAME);
    st.ordinaryChar(EXTENDS);
    st.ordinaryChar(LIST_DELIMITER);
    st.ordinaryChar(PROPERTY_DEFINITION);
    st.ordinaryChar(CHILD_NODE_DEFINITION);
    st.ordinaryChar(BEGIN_TYPE);
    st.ordinaryChar(END_TYPE);
    st.ordinaryChar(DEFAULT);
    st.ordinaryChar(CONSTRAINT);
}
Example 14
Source File: Parser.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the join-expression string {@code s}: double-quoted strings and
 * numbers are parsed as units, the punctuation , ( ) is returned as
 * individual tokens, and $ and _ may appear inside words.
 *
 * @param s the expression to tokenize
 */
Lexer(String s) {
    tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
    tok.quoteChar('"');
    tok.parseNumbers();
    tok.ordinaryChar(','); // argument separator
    tok.ordinaryChar('('); // grouping
    tok.ordinaryChar(')');
    tok.wordChars('$','$'); // allow $ and _ inside identifiers
    tok.wordChars('_','_');
}
Example 15
Source File: STExample.java From icafe with Eclipse Public License 1.0 | 4 votes |
/**
 * Interactive expression-calculator REPL. Reads lines of the form
 * {@code name = expression} from stdin, evaluates them, and stores results
 * in a variable table. The words dump / clear / quit / exit / help are
 * treated as commands.
 *
 * @param args unused
 * @throws IOException if reading from stdin fails
 */
public static void main(String args[]) throws IOException {
    Hashtable<String, Double> variables = new Hashtable<String, Double>();
    // StreamTokenizer(InputStream) is deprecated, hence the suppression.
    @SuppressWarnings("deprecation")
    StreamTokenizer st = new StreamTokenizer(System.in);
    st.eolIsSignificant(true); // one expression per line
    st.lowerCaseMode(true); // commands/identifiers are case-insensitive
    st.ordinaryChar('/'); // '/' and '-' are operators, not comments/numbers
    st.ordinaryChar('-');
    while (true) {
        Expression res;
        int c = StreamTokenizer.TT_EOL;
        String varName = null;
        System.out.println("Enter an expression...");
        try {
            // Skip blank lines, handle commands, and capture the variable
            // name that must precede the '=' sign.
            while (true) {
                c = st.nextToken();
                if (c == StreamTokenizer.TT_EOF) {
                    System.exit(1); // stdin closed
                } else if (c == StreamTokenizer.TT_EOL) {
                    continue;
                } else if (c == StreamTokenizer.TT_WORD) {
                    if (st.sval.compareTo("dump") == 0) {
                        dumpVariables(variables);
                        continue;
                    } else if (st.sval.compareTo("clear") == 0) {
                        variables = new Hashtable<String, Double>();
                        continue;
                    } else if (st.sval.compareTo("quit") == 0) {
                        System.exit(0);
                    } else if (st.sval.compareTo("exit") == 0) {
                        System.exit(0);
                    } else if (st.sval.compareTo("help") == 0) {
                        help();
                        continue;
                    }
                    // Not a command: must be the assignment target.
                    varName = st.sval;
                    c = st.nextToken();
                }
                break;
            }
            if (c != '=') {
                throw new SyntaxError("missing initial '=' sign.");
            }
            res = ParseExpression.expression(st);
        } catch (SyntaxError se) {
            res = null;
            varName = null;
            System.out.println("\nSyntax Error detected! - "+se.getMsg());
            // Drain the rest of the offending line before re-prompting.
            while (c != StreamTokenizer.TT_EOL)
                c = st.nextToken();
            continue;
        }
        c = st.nextToken();
        if (c != StreamTokenizer.TT_EOL) {
            // Trailing garbage after a complete expression.
            if (c == ')')
                System.out.println("\nSyntax Error detected! - To many closing parens.");
            else
                System.out.println("\nBogus token on input - "+c);
            while (c != StreamTokenizer.TT_EOL)
                c = st.nextToken();
        } else {
            try {
                Double z;
                System.out.println("Parsed expression : "+res.unparse());
                z = new Double(res.value(variables)); // NOTE(review): Double ctor is deprecated; Double.valueOf would be preferred
                System.out.println("Value is : "+z);
                if (varName != null) {
                    variables.put(varName, z);
                    System.out.println("Assigned to : "+varName);
                }
            } catch (ExecError ee) {
                System.out.println("Execution error, "+ee.getMsg()+"!");
            }
        }
    }
}
Example 16
Source File: ReportStructureMatcher.java From pentaho-reporting with GNU Lesser General Public License v2.1 | 4 votes |
/**
 * Parses a CSS-like selector string into a NodeMatcher tree. Supported
 * syntax (as implemented below): element names and '*', '#id', '.class',
 * the child combinator '>', and whitespace as the descendant combinator.
 *
 * @param s the selector expression
 * @return the root matcher, or null for an empty selector
 * @throws IOException on tokenizer failure or an unexpected parser state
 */
public static NodeMatcher parse( final String s ) throws IOException {
    final StreamTokenizer tokenizer = new StreamTokenizer( new StringReader( s ) );
    tokenizer.wordChars( '0', '9' ); // digits may appear inside names
    tokenizer.ordinaryChar( '.' ); // '.' introduces a class selector
    tokenizer.ordinaryChar( ',' );
    tokenizer.ordinaryChars( 0, ' ' ); // whitespace is a significant combinator
    ElementMatcher elementMatcher = null;
    NodeMatcher n = null;
    // selectorType tracks what the NEXT word token should mean.
    Type selectorType = Type.Start;
    int token;
    while ( ( token = tokenizer.nextToken() ) != StreamTokenizer.TT_EOF ) {
        if ( token == StreamTokenizer.TT_WORD || token == '*' ) {
            NodeMatcher matcher = null;
            switch ( selectorType ) {
                case Start:
                    elementMatcher = createMatcher( tokenizer );
                    matcher = elementMatcher;
                    break;
                case Child:
                    // '>' seen: wrap current chain in a child relationship.
                    n = new ChildMatcher( n );
                    elementMatcher = createMatcher( tokenizer );
                    matcher = elementMatcher;
                    break;
                case Descendant:
                    // whitespace seen: wrap in a descendant relationship.
                    n = new DescendantMatcher( n );
                    elementMatcher = createMatcher( tokenizer );
                    matcher = elementMatcher;
                    break;
                case Id:
                    // '#id' — attach an id-attribute condition; create an
                    // implicit element matcher if none is active.
                    if ( elementMatcher == null ) {
                        if ( n != null ) {
                            n = new DescendantMatcher( n );
                        }
                        elementMatcher = createMatcher( tokenizer );
                        matcher = elementMatcher;
                    }
                    elementMatcher.add( new AttributeMatcher( AttributeNames.Xml.NAMESPACE,
                        AttributeNames.Xml.ID, tokenizer.sval ) );
                    break;
                case Class:
                    // '.class' — same pattern as Id, different attribute.
                    if ( elementMatcher == null ) {
                        if ( n != null ) {
                            n = new DescendantMatcher( n );
                        }
                        elementMatcher = createMatcher( tokenizer );
                        matcher = elementMatcher;
                    }
                    elementMatcher.add( new AttributeMatcher( AttributeNames.Core.NAMESPACE,
                        AttributeNames.Core.STYLE_CLASS, tokenizer.sval ) );
                    break;
                default:
                    throw new IOException();
            }
            selectorType = Type.Element;
            // Combine the new matcher with the chain built so far.
            if ( matcher != null ) {
                if ( n != null ) {
                    n = new AndMatcher( matcher, n );
                } else {
                    n = matcher;
                }
            }
        } else {
            // Combinator / selector-prefix tokens alter the parser state.
            if ( token == '>' ) {
                selectorType = Type.Child;
            }
            if ( token == '.' ) {
                selectorType = Type.Class;
            }
            if ( token == '#' ) {
                selectorType = Type.Id;
            }
            if ( Character.isWhitespace( token ) ) {
                // Whitespace directly after '.' or '#' is malformed input.
                if ( selectorType == Type.Class || selectorType == Type.Id ) {
                    throw new IllegalStateException();
                }
                // Whitespace after '>' does not demote child to descendant.
                if ( selectorType != Type.Child ) {
                    selectorType = Type.Descendant;
                }
            }
        }
    }
    return n;
}
Example 17
Source File: OldStreamTokenizerTest.java From j2objc with Apache License 2.0 | 4 votes |
/**
 * Exercises the basic StreamTokenizer behaviour: default word/number
 * tokenization with line counting (tokenizer a), and a customized syntax —
 * comment char 'u', significant EOLs, lower-case mode, ordinary 'y' and
 * slash-star comments (tokenizer b) — checked via toString() snapshots.
 */
public void test_basicStringTokenizerMethods() throws IOException {
    String str = "Testing 12345 \n alpha \r\n omega";
    String strb = "-3.8 'BLIND mice' \r sEe /* how */ they run";
    StringReader aa = new StringReader(str);
    StringReader ba = new StringReader(strb);
    StreamTokenizer a = new StreamTokenizer(aa);
    StreamTokenizer b = new StreamTokenizer(ba);
    // Default syntax: words and numbers, \n and \r\n both advance lineno.
    Assert.assertTrue(a.lineno() == 1);
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(a.toString().equals("Token[Testing], line 1"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_NUMBER);
    Assert.assertTrue(a.toString().equals("Token[n=12345.0], line 1"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(a.toString().equals("Token[alpha], line 2"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(a.toString().equals("Token[omega], line 3"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_EOF);
    Assert.assertTrue(a.toString().equals("Token[EOF], line 3"));
    // Customized syntax for tokenizer b.
    b.commentChar('u');
    b.eolIsSignificant(true);
    b.lowerCaseMode(true);
    b.ordinaryChar('y');
    b.slashStarComments(true);
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_NUMBER);
    Assert.assertTrue(b.nval == -3.8);
    Assert.assertTrue(b.toString().equals("Token[n=-3.8], line 1"));
    Assert.assertTrue(b.nextToken() == 39); // '
    Assert.assertTrue(b.toString().equals("Token[BLIND mice], line 1"));
    Assert.assertTrue(b.nextToken() == 10); // \n
    Assert.assertTrue(b.toString().equals("Token[EOL], line 2"));
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(b.toString().equals("Token[see], line 2")); // lower-cased
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_WORD);
    // "they" is truncated: 'y' is ordinary, so the word stops at "the".
    Assert.assertTrue(b.toString().equals("Token[the], line 2"));
    Assert.assertTrue(b.nextToken() == 121); // y
    Assert.assertTrue(b.toString().equals("Token['y'], line 2"));
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_WORD);
    // "run" is swallowed from the 'u' comment char onwards, leaving "r".
    Assert.assertTrue(b.toString().equals("Token[r], line 2"));
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_EOF);
    Assert.assertTrue(b.toString().equals("Token[EOF], line 2"));
}
Example 18
Source File: UnparsedTag.java From hlsparserj with Apache License 2.0 | 4 votes |
/**
 * Parses an HLS playlist tag line into tagName and the attributes map.
 * Attribute values may be quoted; bare values with no NAME= part are stored
 * under synthetic "NONAME&lt;n&gt;" keys. A URI attribute, when present, is
 * also copied into the uri field.
 *
 * @param line playlist line item
 */
private void parseTagLine(final String line) {
    final Matcher lineMatcher = TAGPATTERN.matcher(line); // Create a matcher that uses the TAGPATTERN
    if (lineMatcher.find()) {
        tagName = lineMatcher.group(1);
        final String attributeList = lineMatcher.group(2);
        final StreamTokenizer tokenizer = new StreamTokenizer(new StringReader(attributeList));
        // Custom syntax: everything printable is a word except the
        // structural ',' and '=' and quoted strings.
        tokenizer.resetSyntax();
        tokenizer.wordChars(' ', 255);
        tokenizer.quoteChar('"');
        tokenizer.ordinaryChar(',');
        tokenizer.ordinaryChar('=');
        String attributeName = null;
        String attributeValue = null;
        int noNameCount = 0;
        do {
            int ttype;
            try {
                ttype = tokenizer.nextToken();
            } catch (IOException e) {
                // Should never get here because we read from an in-memory String
                throw new IllegalStateException(e);
            }
            if (ttype == ',' || ttype == StreamTokenizer.TT_EOF) {
                // End of one attribute: flush what we collected.
                if (attributeValue == null) {
                    // Not actually an attribute - just a single value
                    attributes.put("NONAME" + noNameCount, attributeName);
                    noNameCount++;
                    attributeName = null;
                } else {
                    attributes.put(attributeName, attributeValue);
                    attributeName = null;
                    attributeValue = null;
                }
            } else if (ttype == StreamTokenizer.TT_WORD || ttype == '"') {
                // First word of a pair is the name, the next is the value
                // ('=' tokens are simply skipped).
                if (attributeName == null) {
                    attributeName = tokenizer.sval;
                } else {
                    attributeValue = tokenizer.sval;
                }
            }
        } while (tokenizer.ttype != StreamTokenizer.TT_EOF);
        // Set the URI if a URI attribute is present
        if (attributes.containsKey(URI_ATTR)) {
            uri = attributes.get(URI_ATTR);
        }
    } else {
        // If the line starts with #EXT but does not contain a colon, it is a
        // tag with no attributes.
        tagName = line.substring(1);
    }
}
Example 19
Source File: OldAndroidStreamTokenizerTest.java From j2objc with Apache License 2.0 | 4 votes |
/**
 * Exercises default and customized StreamTokenizer behaviour (comment char,
 * significant EOLs, lower-case mode, ordinary char, slash-star comments),
 * plus two Harmony regression cases for the deprecated InputStream
 * constructor: a lone '-' and a double-quoted string.
 */
public void testStreamTokenizer() throws Exception {
    String str = "Testing 12345 \n alpha \r\n omega";
    String strb = "-3.8 'BLIND mice' \r sEe /* how */ they run";
    StringReader aa = new StringReader(str);
    StringReader ba = new StringReader(strb);
    StreamTokenizer a = new StreamTokenizer(aa);
    StreamTokenizer b = new StreamTokenizer(ba);
    // Default syntax: words and numbers; \n and \r\n advance lineno.
    assertEquals(1, a.lineno());
    assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    assertEquals("Token[Testing], line 1", a.toString());
    assertEquals(StreamTokenizer.TT_NUMBER, a.nextToken());
    assertEquals("Token[n=12345.0], line 1", a.toString());
    assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    assertEquals("Token[alpha], line 2", a.toString());
    assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    assertEquals("Token[omega], line 3", a.toString());
    assertEquals(StreamTokenizer.TT_EOF, a.nextToken());
    assertEquals("Token[EOF], line 3", a.toString());
    // Customized syntax for tokenizer b.
    b.commentChar('u');
    b.eolIsSignificant(true);
    b.lowerCaseMode(true);
    b.ordinaryChar('y');
    b.slashStarComments(true);
    assertEquals(StreamTokenizer.TT_NUMBER, b.nextToken());
    assertEquals(-3.8, b.nval);
    assertEquals("Token[n=-3.8], line 1", b.toString());
    assertEquals(39, b.nextToken()); // '
    assertEquals("Token[BLIND mice], line 1", b.toString());
    assertEquals(10, b.nextToken()); // \n
    assertEquals("Token[EOL], line 2", b.toString());
    assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    assertEquals("Token[see], line 2", b.toString()); // lower-cased
    assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    // "they" stops at "the" because 'y' is now an ordinary char.
    assertEquals("Token[the], line 2", b.toString());
    assertEquals(121, b.nextToken()); // y
    assertEquals("Token['y'], line 2", b.toString());
    assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    // "run" is swallowed from the 'u' comment char onwards, leaving "r".
    assertEquals("Token[r], line 2", b.toString());
    assertEquals(StreamTokenizer.TT_EOF, b.nextToken());
    assertEquals("Token[EOF], line 2", b.toString());
    // A harmony regression test
    byte[] data = new byte[]{(byte) '-'};
    StreamTokenizer tokenizer = new StreamTokenizer(new ByteArrayInputStream(data));
    tokenizer.nextToken();
    String result = tokenizer.toString();
    assertEquals("Token['-'], line 1", result);
    // another harmony regression test
    byte[] data2 = new byte[]{(byte) '"', (byte) 'H', (byte) 'e', (byte) 'l',
            (byte) 'l', (byte) 'o', (byte) '"'};
    StreamTokenizer tokenizer2 = new StreamTokenizer(new ByteArrayInputStream(data2));
    tokenizer2.nextToken();
    result = tokenizer2.toString();
    assertEquals("Token[Hello], line 1", result);
}
Example 20
Source File: URLRespectsRobots.java From BUbiNG with Apache License 2.0 | 4 votes |
/** Parses the argument as if it were the content of a <code>robots.txt</code> file,
 * and returns a sorted array of prefixes of URLs that the agent should not follow.
 *
 * <p>Record selection: a record whose User-agent matches {@code userAgent}
 * takes precedence; otherwise the {@code *} record is used; if neither is
 * present an empty (fully permissive) result is returned.
 *
 * @param content the content of the <code>robots.txt</code> file.
 * @param userAgent the string representing the user agent of interest.
 * @return an array of character arrays, which are prefixes of the URLs not to follow, in sorted order.
 */
public static char[][] parseRobotsReader(final Reader content, final String userAgent) throws IOException {
    /* The set of disallowed paths specifically aimed at userAgent. */
    Set<String> set = new ObjectOpenHashSet<>();
    /* The set of disallowed paths specifically aimed at *. */
    Set<String> setStar = new ObjectOpenHashSet<>();
    /* True if the currently examined record is targetted to us. */
    boolean doesMatter = false;
    /* True if we have seen a section targetted to our agent. */
    boolean specific = false;
    /* True if we have seen a section targetted to *. */
    boolean generic = false;
    /* True if we are in a star section. */
    boolean starSection = false;
    StreamTokenizer st = new StreamTokenizer(new FastBufferedReader(content));
    int token;
    st.resetSyntax();
    st.eolIsSignificant(true); // We need EOLs to separate records
    st.wordChars(33, 255); // All characters may appear
    st.whitespaceChars(0, 32);
    st.ordinaryChar('#'); // We must manually simulate comments 8^(
    st.lowerCaseMode(false);
    while (true) {
        int lineFirstToken = st.nextToken();
        if (lineFirstToken == StreamTokenizer.TT_EOF) break;
        switch (lineFirstToken) {
            // Blank line: a new block is starting
            case StreamTokenizer.TT_EOL:
                doesMatter = false;
                break;
            // Comment or number: ignore until the end of line
            case StreamTokenizer.TT_NUMBER:
            case '#':
                do {
                    token = st.nextToken();
                } while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF);
                break;
            // A string
            case StreamTokenizer.TT_WORD:
                if (st.sval.equalsIgnoreCase("user-agent:")) {
                    // Decide whether the upcoming record applies to us, to *,
                    // or to some other crawler.
                    token = st.nextToken();
                    if (token == StreamTokenizer.TT_WORD)
                        if (StringUtils.startsWithIgnoreCase(userAgent, st.sval)) {
                            doesMatter = true;
                            specific = true;
                            starSection = false;
                        } else if (st.sval.equals("*")) {
                            starSection = true;
                            generic = true;
                        } else starSection = false;
                    // Ignore the rest of the line
                    while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF)
                        token = st.nextToken();
                } else if (st.sval.equalsIgnoreCase("disallow:")) {
                    token = st.nextToken();
                    if (token == StreamTokenizer.TT_EOL) {
                        // An empty Disallow line clears the active set
                        // (i.e. allows everything for that record).
                        if (doesMatter) set.clear();
                        else if (starSection) setStar.clear();
                    } else if (token == StreamTokenizer.TT_WORD) {
                        String disallowed = st.sval;
                        // Someone (erroneously) uses a trailing * to denote any suffix
                        if (disallowed.endsWith("*"))
                            disallowed = disallowed.substring(0, disallowed.length()-1);
                        if (doesMatter) set.add(disallowed);
                        else if (starSection) setStar.add(disallowed);
                    }
                    // Ignore the rest of the line
                    while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF)
                        token = st.nextToken();
                }
                else if (LOGGER.isTraceEnabled())
                    LOGGER.trace("Line first token {} ununderstandable in robots.txt", st.sval);
                break;
            // Something else: a syntax error
            default:
                if (LOGGER.isTraceEnabled())
                    LOGGER.trace("Found unknown token type {} in robots.txt", Integer.valueOf(lineFirstToken));
        }
    }
    if (specific) return toSortedPrefixFreeCharArrays(set); // Some instructions specific to us
    if (! specific && generic) return toSortedPrefixFreeCharArrays(setStar); // No specific instruction, but some generic ones
    return toSortedPrefixFreeCharArrays(set); // Neither: set is empty, everything is allowed
}