org.apache.commons.csv.CSVParser#iterator

Source File: SymSpellSearchBenchMark.java From customized-symspell with MIT License

7 votes

/**
 * Reads the first column of every record in a space-delimited classpath resource.
 *
 * <p>I/O failures are reported on {@code System.err} and whatever was read up to
 * that point is returned, so callers always get a non-null list.
 *
 * @param queryFile classpath-relative name of the resource to read
 * @return the first field of each record, in file order
 */
private List<String> readQueries(String queryFile) {
  List<String> queries = new ArrayList<>();
  URL queryResourceUrl = this.getClass().getClassLoader().getResource(queryFile);
  // try-with-resources: the parser owns the stream opened on the URL and must be
  // closed, otherwise the underlying handle is leaked.
  try (CSVParser qparser = CSVParser
      .parse(queryResourceUrl, Charset.forName("UTF-8"),
          CSVFormat.DEFAULT.withDelimiter(' '))) {
    for (CSVRecord csvRecord : qparser) {
      queries.add(csvRecord.get(0));
    }
  } catch (IOException ex) {
    System.err.println("Error occured " + ex);
  }
  return queries;
}

Source File: DelimitedTextReader.java From marklogic-contentpump with Apache License 2.0

6 votes

/**
 * Opens the file behind the given split and prepares the CSV parser and its
 * record iterator. Returns without creating a parser when {@code openFile}
 * yields {@code null}.
 *
 * @param inSplit the split to read; cast to {@code FileSplit} when an id
 *                generator has to be created
 * @throws IOException declared for the file and parser operations performed here
 * @throws InterruptedException declared for the split operations performed here
 */
protected void initParser(InputSplit inSplit) throws IOException,
    InterruptedException {
    fileIn = openFile(inSplit, true);
    if (fileIn == null) {
        return;
    }
    instream = new InputStreamReader(fileIn, encoding);

    // Reset progress bookkeeping for this split.
    bytesRead = 0;
    fileLen = inSplit.getLength();

    // Without a configured URI name, ids are either generated or counted from 0.
    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (!generateId) {
            uriId = 0;
        } else {
            // Seed the generator with "<file path>-<split start>".
            idGen = new IdGenerator(
                file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
        }
    }

    parser = new CSVParser(
        instream,
        CSVParserFormatter.getFormat(delimiter, encapsulator, true, true));
    parserIterator = parser.iterator();
}

Source File: SymSpellSearchBenchMark.java From customized-symspell with MIT License

5 votes

/**
 * Loads a space-delimited classpath resource into the given data holder.
 * Column 0 of each record is used as the term and column 1 is parsed as its
 * numeric value; the third {@code DictionaryItem} argument is always {@code 0d}.
 *
 * @param dataResourceName classpath-relative name of the resource to index
 * @param dataHolder       receives one {@code DictionaryItem} per record
 * @throws IOException         if the resource cannot be read or closed
 * @throws SpellCheckException propagated from {@code dataHolder.addItem}
 */
private void indexData(String dataResourceName, DataHolder dataHolder)
    throws IOException, SpellCheckException {
  URL resourceUrl = this.getClass().getClassLoader().getResource(dataResourceName);
  // try-with-resources so the stream behind the parser is released even when
  // addItem or number parsing throws part-way through the file.
  try (CSVParser parser = CSVParser
      .parse(resourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(' '))) {
    for (CSVRecord csvRecord : parser) {
      dataHolder
          .addItem(new DictionaryItem(csvRecord.get(0), Double.valueOf(csvRecord.get(1)), 0d));
    }
  }
}

Source File: SymSpellIndexBenchMark.java From customized-symspell with MIT License

5 votes

/**
 * Loads a space-delimited classpath resource into the given data holder.
 * Column 0 of each record is used as the term and column 1 is parsed as its
 * numeric value; the third {@code DictionaryItem} argument is always {@code 0d}.
 *
 * @param dataResourceName classpath-relative name of the resource to index
 * @param dataHolder       receives one {@code DictionaryItem} per record
 * @throws IOException         if the resource cannot be read or closed
 * @throws SpellCheckException propagated from {@code dataHolder.addItem}
 */
private void indexData(String dataResourceName, DataHolder dataHolder)
    throws IOException, SpellCheckException {
  URL resourceUrl = this.getClass().getClassLoader().getResource(dataResourceName);
  // try-with-resources so the stream behind the parser is released even when
  // addItem or number parsing throws part-way through the file.
  try (CSVParser parser = CSVParser
      .parse(resourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(' '))) {
    for (CSVRecord csvRecord : parser) {
      dataHolder
          .addItem(new DictionaryItem(csvRecord.get(0), Double.valueOf(csvRecord.get(1)), 0d));
    }
  }
}

Source File: CSVIngester.java From macrobase with Apache License 2.0

5 votes

/**
 * Reads up to {@code limit} rows from the configured CSV input file, keeping
 * only the rows accepted by {@code includeRow(record, preds)}.
 *
 * <p>{@code baseQuery} and {@code offset} are not used by this implementation.
 *
 * @param baseQuery unused here
 * @param preds     predicates handed to {@code includeRow} for every record
 * @param limit     maximum number of accepted rows to return
 * @param offset    unused here
 * @return the accepted rows, each holding one {@code ColumnValue} per header column
 * @throws Exception if the file cannot be opened, read or closed
 */
@Override
public RowSet getRows(String baseQuery,
                      Map<String, String> preds,
                      int limit,
                      int offset) throws Exception{

    filename = conf.getString(MacroBaseConf.CSV_INPUT_FILE);
    Compression compression = conf.getCsvCompression();

    if (compression == Compression.GZIP) {
        InputStream fileStream = new FileInputStream(filename);
        boolean parserCreated = false;
        try {
            InputStream gzipStream = new GZIPInputStream(fileStream);
            Reader decoder = new InputStreamReader(gzipStream);
            csvParser = new CSVParser(decoder, CSVFormat.DEFAULT.withHeader());
            parserCreated = true;
        } finally {
            // The GZIP and parser constructors both read from the stream and can
            // throw; without this the raw file handle would be leaked.
            if (!parserCreated) {
                fileStream.close();
            }
        }
    } else {
        File csvFile = new File(filename);
        csvParser = CSVParser.parse(csvFile, Charset.defaultCharset(), CSVFormat.DEFAULT.withHeader());
    }

    // NOTE(review): closing assumes nothing else reads from the csvParser field
    // after this method returns - confirm against the rest of the class.
    try {
        schema = csvParser.getHeaderMap();
        Iterator<CSVRecord> rawIterator = csvParser.iterator();
        int rowCount = 0;

        List<RowSet.Row> rows = Lists.newArrayList();
        while (rawIterator.hasNext() && rowCount < limit) {
            CSVRecord record = rawIterator.next();
            List<ColumnValue> columnValues = Lists.newArrayList();

            if (includeRow(record, preds)) {
                for (Map.Entry<String, Integer> se : schema.entrySet()) {
                    columnValues.add(new ColumnValue(se.getKey(),record.get(se.getValue())));
                }

                rows.add(new RowSet.Row(columnValues));
                rowCount++;
            }
        }
        return new RowSet(rows);
    } finally {
        // Iteration may stop early at the limit, so the file must be released
        // explicitly rather than relying on the parser reaching end of input.
        csvParser.close();
    }
}

Source File: Step8bTaskValidationGoldAnnotator.java From argument-reasoning-comprehension-task with Apache License 2.0

4 votes

/**
 * Loads corrected instances from two hard-coded RFC 4180 CSV files.
 *
 * <p>Records are consumed in groups of three. In the first record, column 0 is
 * the instance id and column 1 the correct label; the group is skipped when
 * that label is {@code "x"} or empty. In the second and third records, column 2
 * is split once on the first non-word character into a numeric label and a text.
 * The text whose label equals the correct label becomes the original warrant,
 * the other one the alternative warrant.
 *
 * <p>A file whose record count is not a multiple of three makes the iterator's
 * {@code next()} call fail part-way through a group.
 *
 * @return corrected instances keyed by instance id, sorted by id
 * @throws IOException if a file cannot be read or closed
 */
public static Map<String, CorrectedInstance> loadCorrectedInstancesFromCSV()
            throws IOException
    {
        Map<String, CorrectedInstance> result = new TreeMap<>();
        // read corrections
        List<String> fileNames = Arrays.asList("mturk/annotation-task/97-post-validation.csv",
                "mturk/annotation-task/97-post-validation2.csv");
        for (String fileName : fileNames) {
            // try-with-resources: each file's parser holds an open file handle
            // that was previously never released.
            try (CSVParser csvParser = CSVParser
                    .parse(new File(fileName), Charset.forName("utf-8"), CSVFormat.RFC4180)) {

                Iterator<CSVRecord> iterator = csvParser.iterator();

                while (iterator.hasNext()) {
                    CSVRecord firstLine = iterator.next();
                    CSVRecord secondLine = iterator.next();
                    CSVRecord thirdLine = iterator.next();

                    String id = firstLine.get(0);
                    boolean skipRecord = "x".equals(firstLine.get(1)) || firstLine.get(1).isEmpty();

                    if (!skipRecord) {
                        int correctLabel = Integer.parseInt(firstLine.get(1));

                        // Split each candidate once into "<label>" and "<text>".
                        String[] secondLineParts = secondLine.get(2).split("\\W", 2);
                        int secondLineLabel = Integer.parseInt(secondLineParts[0]);
                        String secondLineText = secondLineParts[1];

                        String[] thirdLineParts = thirdLine.get(2).split("\\W", 2);
                        int thirdLineLabel = Integer.parseInt(thirdLineParts[0]);
                        String thirdLineText = thirdLineParts[1];

                        System.out.println(correctLabel);
                        System.out.println(secondLineLabel + ", " + secondLineText);
                        System.out.println(thirdLineLabel + ", " + thirdLineText);

                        String originalWarrant;
                        String alternativeWarrant;
                        if (correctLabel == secondLineLabel) {
                            originalWarrant = secondLineText;
                            alternativeWarrant = thirdLineText;
                        }
                        else {
                            originalWarrant = thirdLineText;
                            alternativeWarrant = secondLineText;
                        }

                        CorrectedInstance correctedInstance = new CorrectedInstance(originalWarrant,
                                alternativeWarrant);

                        result.put(id, correctedInstance);
                    }
                }
            }

            System.out.println(result.size());
        }
        return result;
    }

Source File: DelimitedTextInputFormat.java From marklogic-contentpump with Apache License 2.0

4 votes

/**
 * Computes the input splits and, when split-input mode is on, wraps each one in
 * a {@code DelimitedSplit} that carries the header row of its file.
 *
 * <p>The header is read only from splits that start at offset 0 and is then
 * reused for the following splits until another offset-0 split replaces it.
 * NOTE(review): this presumably relies on the parent class returning a file's
 * splits contiguously with the offset-0 split first - confirm.
 *
 * @param job the job whose configuration supplies delimiter and encoding
 * @return the plain splits when split-input mode is off or the split count
 *         reaches {@code SPLIT_COUNT_LIMIT}; otherwise header-carrying splits
 * @throws IOException if a file cannot be opened or its header cannot be read
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    //if delimSplit is true, size of each split is determined by 
    //Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size()>= SPLIT_COUNT_LIMIT) {
        //if #splits > 1 million, there is enough parallelism
        //therefore no point to split
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:"
            + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }
    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter =0;
    ArrayList<Text> hlist = new ArrayList<Text>();
    for (InputSplit file: splits) {
        FileSplit fsplit = ((FileSplit)file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);
        
        if (fsplit.getStart() == 0) {
        // parse the inSplit, get the header
            String delimStr = conf.get(ConfigConstants.CONF_DELIMITER,
                ConfigConstants.DEFAULT_DELIMITER);
            if (delimStr.length() == 1) {
                delimiter = delimStr.charAt(0);
            } else {
                // Report the rejected configuration value, not the delimiter
                // char left over from a previous file (or the initial 0).
                LOG.error("Incorrect delimitor: " + delimStr
                    + ". Expects single character.");
            }
            String encoding = conf.get(
                MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);

            FSDataInputStream fileIn = fs.open(path);
            try {
                InputStreamReader instream = new InputStreamReader(fileIn, encoding);
                CSVParser parser = new CSVParser(instream, CSVParserFormatter.
                        getFormat(delimiter, DelimitedTextReader.encapsulator,
                                true, true));
                Iterator<CSVRecord> it = parser.iterator();

                if (it.hasNext()) {
                    CSVRecord record = it.next();
                    int recordSize = record.size();
                    String[] header = new String[recordSize];
                    for (int i = 0; i < recordSize; i++) {
                        header[i] = record.get(i);
                    }

                    EncodingUtil.handleBOMUTF8(header, 0);

                    hlist.clear();
                    for (String s : header) {
                        hlist.add(new Text(s));
                    }
                }
            } finally {
                // Close the underlying stream even when the encoding is
                // unsupported or parsing fails, so the file handle is released.
                fileIn.close();
            }
        }
        
        DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(
            hlist.toArray(new Text[hlist.size()])), path,
            fsplit.getStart(), fsplit.getLength(),
            fsplit.getLocations());
        populatedSplits.add(ds);
    }
    
    return populatedSplits;
}

Source File: CSVIngester.java From macrobase with Apache License 2.0

4 votes

/**
 * Returns the data stream, first loading the configured CSV input file into it
 * when {@code loaded} is false.
 *
 * <p>Loading registers every header column with the encoder (1-based index),
 * then parses each record with {@code parseRecord}. Records that raise a
 * {@code NumberFormatException} are counted in {@code badRows} and skipped.
 *
 * <p>NOTE(review): this method never sets {@code loaded}; unless it is set
 * elsewhere, each call reloads the file and appends to the same stream - confirm.
 *
 * @return the stream holding all successfully parsed rows
 * @throws Exception if the file cannot be opened, read or closed
 */
@Override
public MBStream<Datum> getStream() throws Exception {
    if(!loaded) {
        filename = conf.getString(MacroBaseConf.CSV_INPUT_FILE);
        Compression compression = conf.getCsvCompression();

        if (compression == Compression.GZIP) {
            InputStream fileStream = new FileInputStream(filename);
            boolean parserCreated = false;
            try {
                InputStream gzipStream = new GZIPInputStream(fileStream);
                Reader decoder = new InputStreamReader(gzipStream);
                csvParser = new CSVParser(decoder, CSVFormat.DEFAULT.withHeader());
                parserCreated = true;
            } finally {
                // The GZIP and parser constructors both read from the stream and
                // can throw; without this the raw file handle would be leaked.
                if (!parserCreated) {
                    fileStream.close();
                }
            }
        } else {
            File csvFile = new File(filename);
            csvParser = CSVParser.parse(csvFile, Charset.defaultCharset(), CSVFormat.DEFAULT.withHeader());
        }

        try {
            schema = csvParser.getHeaderMap(); //equal to resultSet.getmetadata or smt

            for (Map.Entry<String, Integer> se : schema.entrySet()) {
                conf.getEncoder().recordAttributeName(se.getValue() + 1, se.getKey()); //numbering off each column for encoding
            }

            // Load all records into memory to filter out rows with missing data
            Iterator<CSVRecord> rawIterator = csvParser.iterator();

            int numRows = 0;
            while (rawIterator.hasNext()) {
                try {
                    CSVRecord record = rawIterator.next();
                    Datum curRow = parseRecord(record);
                    dataStream.add(curRow);
                    numRows++;
                } catch (NumberFormatException e) {
                    badRows++;
                }
            }
            log.info("{}/{} rows successfully parsed ({} malformed rows)", numRows, numRows + badRows, badRows);
        } finally {
            // Everything needed is now in memory, so release the file handle.
            csvParser.close();
        }
    }

    return dataStream;
}

Source File: TestDataProvider.java From preDict with GNU Lesser General Public License v3.0

3 votes

/**
 * Loads test data from a colon-delimited csv resource on the classpath. Each
 * record is expected to match the following format:
 * 
 * <pre>
 * 0 = correct word
 * 1 = true if this is a desired match,
 *     false if this is a false-positive match
 * 2 = comma separated list of similar words
 * </pre>
 * 
 * Column 0 is added to {@code baseWords} and every entry of column 2 to
 * {@code queries}; column 1 is not read by this constructor.
 * 
 * @param resourceName
 *            classpath-relative name of the csv resource
 * @throws IOException
 *             if the resource cannot be read or closed
 */
public TestDataProvider(String resourceName) throws IOException {
	URL resourceUrl = this.getClass().getClassLoader().getResource(resourceName);
	CSVParser parser = CSVParser.parse(resourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(':'));
	try {
		for (CSVRecord csvRecord : parser) {
			baseWords.add(csvRecord.get(0));
			queries.addAll(Arrays.asList(csvRecord.get(2).split(",")));
		}
	} finally {
		// The parser owns the stream opened on the resource URL; release it
		// even when a record is malformed.
		parser.close();
	}
}

Java Code Examples for org.apache.commons.csv.CSVParser#iterator()