org.apache.flink.api.common.io.ParseException Java Examples

Source File: RowCsvInputFormat.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Parses one delimited record into {@code holders}, field by field.
 *
 * <p>Fields flagged in {@code fieldIncluded} are handed to their {@link FieldParser} and the
 * result is stored at the position given by {@code fieldPosMap}; all other fields are skipped.
 * A field whose parser returns a negative position, or an empty column while
 * {@code emptyColumnAsNull} is set, is stored as {@code null}.
 *
 * <p>Every error message decodes the offending row with {@code getCharset()} so that the
 * reported text does not depend on the platform default charset.
 *
 * @param holders value holders; overwritten with the parsed field values
 * @param bytes buffer containing the record
 * @param offset index of the first byte of the record in {@code bytes}
 * @param numBytes length of the record in bytes
 * @return {@code true} if every field was processed, {@code false} if the row is too short
 *         and the format is lenient
 * @throws ParseException if the row is too short or a parser reports an error other than
 *         {@code EMPTY_COLUMN} while the format is not lenient, or if skipping a field
 *         yields a negative position
 */
@Override
protected boolean parseRecord(Object[] holders, byte[] bytes, int offset, int numBytes) throws ParseException {
	byte[] fieldDelimiter = this.getFieldDelimiter();
	boolean[] fieldIncluded = this.fieldIncluded;

	int startPos = offset;
	int limit = offset + numBytes;

	int field = 0;
	int output = 0;
	while (field < fieldIncluded.length) {

		// check valid start position: only the last field may start exactly at the limit
		if (startPos > limit || (startPos == limit && field != fieldIncluded.length - 1)) {
			if (isLenient()) {
				return false;
			} else {
				throw new ParseException("Row too short: " + new String(bytes, offset, numBytes, getCharset()));
			}
		}

		if (fieldIncluded[field]) {
			// parse field
			FieldParser<Object> parser = (FieldParser<Object>) this.getFieldParsers()[fieldPosMap[output]];
			// remembered so the field can be skipped from its start if parsing fails
			int latestValidPos = startPos;
			startPos = parser.resetErrorStateAndParse(
				bytes,
				startPos,
				limit,
				fieldDelimiter,
				holders[fieldPosMap[output]]);

			// in strict mode every error state except NONE and EMPTY_COLUMN aborts the record
			if (!isLenient()
					&& parser.getErrorState() != FieldParser.ParseErrorState.NONE
					&& parser.getErrorState() != FieldParser.ParseErrorState.EMPTY_COLUMN) {
				throw new ParseException(String.format("Parsing error for column %1$s of row '%2$s' originated by %3$s: %4$s.",
					field + 1, new String(bytes, offset, numBytes, getCharset()), parser.getClass().getSimpleName(), parser.getErrorState()));
			}
			holders[fieldPosMap[output]] = parser.getLastResult();

			// check parse result:
			// the result is null if it is invalid
			// or empty with emptyColumnAsNull enabled
			if (startPos < 0 ||
				(emptyColumnAsNull && (parser.getErrorState().equals(FieldParser.ParseErrorState.EMPTY_COLUMN)))) {
				holders[fieldPosMap[output]] = null;
				startPos = skipFields(bytes, latestValidPos, limit, fieldDelimiter);
			}
			output++;
		} else {
			// skip field
			startPos = skipFields(bytes, startPos, limit, fieldDelimiter);
		}

		// check if something went wrong
		if (startPos < 0) {
			throw new ParseException(String.format("Unexpected parser position for column %1$s of row '%2$s'",
				field + 1, new String(bytes, offset, numBytes, getCharset())));
		}
		else if (startPos == limit
				&& field != fieldIncluded.length - 1
				&& !FieldParser.endsWithDelimiter(bytes, startPos - 1, fieldDelimiter)) {
			// We are at the end of the record, but not all fields have been read
			// and the end is not a field delimiter indicating an empty last field.
			if (isLenient()) {
				return false;
			} else {
				throw new ParseException("Row too short: " + new String(bytes, offset, numBytes, getCharset()));
			}
		}

		field++;
	}
	return true;
}

Source File: CsvInputFormatTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Reads a three-column, comma-delimited file whose rows progressively lose their trailing
 * values. Rows that still contain both delimiters are expected to yield empty strings for
 * the missing values; the final row, which lacks a column, is expected to raise a
 * {@link ParseException}.
 */
@Test
public void testTailingEmptyFields() throws Exception {
	final String csv = "aa,bb,cc\n" + // ok
			"aa,bb,\n" +  // the last field is empty
			"aa,,\n" +    // the last two fields are empty
			",,\n" +      // all fields are empty
			"aa,bb";      // row too short

	// expected values of the four well-formed rows, in file order
	final String[][] expectedRows = {
			{"aa", "bb", "cc"},
			{"aa", "bb", ""},
			{"aa", "", ""},
			{"", "", ""}};

	final FileInputSplit inputSplit = createTempFile(csv);

	final TupleTypeInfo<Tuple3<String, String, String>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class, String.class);
	final CsvInputFormat<Tuple3<String, String, String>> inputFormat =
			new TupleCsvInputFormat<Tuple3<String, String, String>>(PATH, tupleType);
	inputFormat.setFieldDelimiter(",");
	inputFormat.configure(new Configuration());
	inputFormat.open(inputSplit);

	Tuple3<String, String, String> tuple = new Tuple3<String, String, String>();
	for (String[] expected : expectedRows) {
		tuple = inputFormat.nextRecord(tuple);
		assertNotNull(tuple);
		assertEquals(expected[0], tuple.f0);
		assertEquals(expected[1], tuple.f1);
		assertEquals(expected[2], tuple.f2);
	}

	// the remaining row has only two columns and must be rejected
	try {
		inputFormat.nextRecord(tuple);
		fail("Parse Exception was not thrown! (Row too short)");
	} catch (ParseException rowTooShort) {
		// expected outcome, nothing to verify on the exception itself
	}
}

Source File: RowCsvInputFormatTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Reads a three-column file delimited by the two-character sequence {@code "|-"} whose rows
 * progressively lose their trailing values. The first five rows are expected to parse, with
 * empty strings for the missing values; the last row, which lacks a column, is expected to
 * raise a {@link ParseException}.
 */
@Test
public void testTailingEmptyFields() throws Exception {
	String content = "abc|-def|-ghijk\n" +
			"abc|-def|-\n" +
			"abc|-|-\n" +
			"|-|-|-\n" +
			"|-|-\n" +
			"abc|-def\n";

	// expected values of the five well-formed rows, in file order
	String[][] expectedRows = {
			{"abc", "def", "ghijk"},
			{"abc", "def", ""},
			{"abc", "", ""},
			{"", "", ""},
			{"", "", ""}};

	FileInputSplit inputSplit = createTempFile(content);

	TypeInformation[] columnTypes = {
			BasicTypeInfo.STRING_TYPE_INFO,
			BasicTypeInfo.STRING_TYPE_INFO,
			BasicTypeInfo.STRING_TYPE_INFO};

	// the "|" delimiter passed to the constructor is replaced by "|-" right below
	RowCsvInputFormat inputFormat = new RowCsvInputFormat(PATH, columnTypes, "\n", "|");
	inputFormat.setFieldDelimiter("|-");
	inputFormat.configure(new Configuration());
	inputFormat.open(inputSplit);

	Row row = new Row(3);
	for (String[] expected : expectedRows) {
		row = inputFormat.nextRecord(row);
		assertNotNull(row);
		for (int column = 0; column < expected.length; column++) {
			assertEquals(expected[column], row.getField(column));
		}
	}

	// the remaining row has only two columns and must be rejected
	try {
		inputFormat.nextRecord(row);
		fail("Parse Exception was not thrown! (Row too short)");
	} catch (ParseException rowTooShort) {
		// expected outcome, nothing to verify on the exception itself
	}
}

Source File: RowCsvInputFormat.java From flink with Apache License 2.0

4 votes

/**
 * Parses one delimited record into {@code holders}, field by field.
 *
 * <p>Fields flagged in {@code fieldIncluded} are handed to their {@link FieldParser} and the
 * result is stored at the position given by {@code fieldPosMap}; all other fields are skipped.
 * A field whose parser returns a negative position, or an empty column while
 * {@code emptyColumnAsNull} is set, is stored as {@code null}.
 *
 * <p>Every error message decodes the offending row with {@code getCharset()} so that the
 * reported text does not depend on the platform default charset.
 *
 * @param holders value holders; overwritten with the parsed field values
 * @param bytes buffer containing the record
 * @param offset index of the first byte of the record in {@code bytes}
 * @param numBytes length of the record in bytes
 * @return {@code true} if every field was processed, {@code false} if the row is too short
 *         and the format is lenient
 * @throws ParseException if the row is too short or a parser reports an error other than
 *         {@code EMPTY_COLUMN} while the format is not lenient, or if skipping a field
 *         yields a negative position
 */
@Override
protected boolean parseRecord(Object[] holders, byte[] bytes, int offset, int numBytes) throws ParseException {
	byte[] fieldDelimiter = this.getFieldDelimiter();
	boolean[] fieldIncluded = this.fieldIncluded;

	int startPos = offset;
	int limit = offset + numBytes;

	int field = 0;
	int output = 0;
	while (field < fieldIncluded.length) {

		// check valid start position: only the last field may start exactly at the limit
		if (startPos > limit || (startPos == limit && field != fieldIncluded.length - 1)) {
			if (isLenient()) {
				return false;
			} else {
				throw new ParseException("Row too short: " + new String(bytes, offset, numBytes, getCharset()));
			}
		}

		if (fieldIncluded[field]) {
			// parse field
			FieldParser<Object> parser = (FieldParser<Object>) this.getFieldParsers()[fieldPosMap[output]];
			// remembered so the field can be skipped from its start if parsing fails
			int latestValidPos = startPos;
			startPos = parser.resetErrorStateAndParse(
				bytes,
				startPos,
				limit,
				fieldDelimiter,
				holders[fieldPosMap[output]]);

			// in strict mode every error state except NONE and EMPTY_COLUMN aborts the record
			if (!isLenient()
					&& parser.getErrorState() != FieldParser.ParseErrorState.NONE
					&& parser.getErrorState() != FieldParser.ParseErrorState.EMPTY_COLUMN) {
				throw new ParseException(String.format("Parsing error for column %1$s of row '%2$s' originated by %3$s: %4$s.",
					field + 1, new String(bytes, offset, numBytes, getCharset()), parser.getClass().getSimpleName(), parser.getErrorState()));
			}
			holders[fieldPosMap[output]] = parser.getLastResult();

			// check parse result:
			// the result is null if it is invalid
			// or empty with emptyColumnAsNull enabled
			if (startPos < 0 ||
				(emptyColumnAsNull && (parser.getErrorState().equals(FieldParser.ParseErrorState.EMPTY_COLUMN)))) {
				holders[fieldPosMap[output]] = null;
				startPos = skipFields(bytes, latestValidPos, limit, fieldDelimiter);
			}
			output++;
		} else {
			// skip field
			startPos = skipFields(bytes, startPos, limit, fieldDelimiter);
		}

		// check if something went wrong
		if (startPos < 0) {
			throw new ParseException(String.format("Unexpected parser position for column %1$s of row '%2$s'",
				field + 1, new String(bytes, offset, numBytes, getCharset())));
		}
		else if (startPos == limit
				&& field != fieldIncluded.length - 1
				&& !FieldParser.endsWithDelimiter(bytes, startPos - 1, fieldDelimiter)) {
			// We are at the end of the record, but not all fields have been read
			// and the end is not a field delimiter indicating an empty last field.
			if (isLenient()) {
				return false;
			} else {
				throw new ParseException("Row too short: " + new String(bytes, offset, numBytes, getCharset()));
			}
		}

		field++;
	}
	return true;
}

Source File: CsvInputFormatTest.java From flink with Apache License 2.0

4 votes

/**
 * Reads a three-column, comma-delimited file whose rows progressively lose their trailing
 * values. Rows that still contain both delimiters are expected to yield empty strings for
 * the missing values; the final row, which lacks a column, is expected to raise a
 * {@link ParseException}.
 */
@Test
public void testTailingEmptyFields() throws Exception {
	final String csv = "aa,bb,cc\n" + // ok
			"aa,bb,\n" +  // the last field is empty
			"aa,,\n" +    // the last two fields are empty
			",,\n" +      // all fields are empty
			"aa,bb";      // row too short

	// expected values of the four well-formed rows, in file order
	final String[][] expectedRows = {
			{"aa", "bb", "cc"},
			{"aa", "bb", ""},
			{"aa", "", ""},
			{"", "", ""}};

	final FileInputSplit inputSplit = createTempFile(csv);

	final TupleTypeInfo<Tuple3<String, String, String>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class, String.class);
	final CsvInputFormat<Tuple3<String, String, String>> inputFormat =
			new TupleCsvInputFormat<Tuple3<String, String, String>>(PATH, tupleType);
	inputFormat.setFieldDelimiter(",");
	inputFormat.configure(new Configuration());
	inputFormat.open(inputSplit);

	Tuple3<String, String, String> tuple = new Tuple3<String, String, String>();
	for (String[] expected : expectedRows) {
		tuple = inputFormat.nextRecord(tuple);
		assertNotNull(tuple);
		assertEquals(expected[0], tuple.f0);
		assertEquals(expected[1], tuple.f1);
		assertEquals(expected[2], tuple.f2);
	}

	// the remaining row has only two columns and must be rejected
	try {
		inputFormat.nextRecord(tuple);
		fail("Parse Exception was not thrown! (Row too short)");
	} catch (ParseException rowTooShort) {
		// expected outcome, nothing to verify on the exception itself
	}
}

Source File: RowCsvInputFormatTest.java From flink with Apache License 2.0

4 votes

/**
 * Reads a three-column file delimited by the two-character sequence {@code "|-"} whose rows
 * progressively lose their trailing values. The first five rows are expected to parse, with
 * empty strings for the missing values; the last row, which lacks a column, is expected to
 * raise a {@link ParseException}.
 */
@Test
public void testTailingEmptyFields() throws Exception {
	String content = "abc|-def|-ghijk\n" +
			"abc|-def|-\n" +
			"abc|-|-\n" +
			"|-|-|-\n" +
			"|-|-\n" +
			"abc|-def\n";

	// expected values of the five well-formed rows, in file order
	String[][] expectedRows = {
			{"abc", "def", "ghijk"},
			{"abc", "def", ""},
			{"abc", "", ""},
			{"", "", ""},
			{"", "", ""}};

	FileInputSplit inputSplit = createTempFile(content);

	TypeInformation[] columnTypes = {
			BasicTypeInfo.STRING_TYPE_INFO,
			BasicTypeInfo.STRING_TYPE_INFO,
			BasicTypeInfo.STRING_TYPE_INFO};

	// the "|" delimiter passed to the constructor is replaced by "|-" right below
	RowCsvInputFormat inputFormat = new RowCsvInputFormat(PATH, columnTypes, "\n", "|");
	inputFormat.setFieldDelimiter("|-");
	inputFormat.configure(new Configuration());
	inputFormat.open(inputSplit);

	Row row = new Row(3);
	for (String[] expected : expectedRows) {
		row = inputFormat.nextRecord(row);
		assertNotNull(row);
		for (int column = 0; column < expected.length; column++) {
			assertEquals(expected[column], row.getField(column));
		}
	}

	// the remaining row has only two columns and must be rejected
	try {
		inputFormat.nextRecord(row);
		fail("Parse Exception was not thrown! (Row too short)");
	} catch (ParseException rowTooShort) {
		// expected outcome, nothing to verify on the exception itself
	}
}

Source File: RowCsvInputFormat.java From flink with Apache License 2.0

4 votes

/**
 * Parses one delimited record into {@code holders}, field by field.
 *
 * <p>Fields flagged in {@code fieldIncluded} are handed to their {@link FieldParser} and the
 * result is stored at the position given by {@code fieldPosMap}; all other fields are skipped.
 * A field whose parser returns a negative position, or an empty column while
 * {@code emptyColumnAsNull} is set, is stored as {@code null}.
 *
 * <p>Every error message decodes the offending row with {@code getCharset()} so that the
 * reported text does not depend on the platform default charset.
 *
 * @param holders value holders; overwritten with the parsed field values
 * @param bytes buffer containing the record
 * @param offset index of the first byte of the record in {@code bytes}
 * @param numBytes length of the record in bytes
 * @return {@code true} if every field was processed, {@code false} if the row is too short
 *         and the format is lenient
 * @throws ParseException if the row is too short or a parser reports an error other than
 *         {@code EMPTY_COLUMN} while the format is not lenient, or if skipping a field
 *         yields a negative position
 */
@Override
protected boolean parseRecord(Object[] holders, byte[] bytes, int offset, int numBytes) throws ParseException {
	byte[] fieldDelimiter = this.getFieldDelimiter();
	boolean[] fieldIncluded = this.fieldIncluded;

	int startPos = offset;
	int limit = offset + numBytes;

	int field = 0;
	int output = 0;
	while (field < fieldIncluded.length) {

		// check valid start position: only the last field may start exactly at the limit
		if (startPos > limit || (startPos == limit && field != fieldIncluded.length - 1)) {
			if (isLenient()) {
				return false;
			} else {
				throw new ParseException("Row too short: " + new String(bytes, offset, numBytes, getCharset()));
			}
		}

		if (fieldIncluded[field]) {
			// parse field
			FieldParser<Object> parser = (FieldParser<Object>) this.getFieldParsers()[fieldPosMap[output]];
			// remembered so the field can be skipped from its start if parsing fails
			int latestValidPos = startPos;
			startPos = parser.resetErrorStateAndParse(
				bytes,
				startPos,
				limit,
				fieldDelimiter,
				holders[fieldPosMap[output]]);

			// in strict mode every error state except NONE and EMPTY_COLUMN aborts the record
			if (!isLenient()
					&& parser.getErrorState() != FieldParser.ParseErrorState.NONE
					&& parser.getErrorState() != FieldParser.ParseErrorState.EMPTY_COLUMN) {
				throw new ParseException(String.format("Parsing error for column %1$s of row '%2$s' originated by %3$s: %4$s.",
					field + 1, new String(bytes, offset, numBytes, getCharset()), parser.getClass().getSimpleName(), parser.getErrorState()));
			}
			holders[fieldPosMap[output]] = parser.getLastResult();

			// check parse result:
			// the result is null if it is invalid
			// or empty with emptyColumnAsNull enabled
			if (startPos < 0 ||
				(emptyColumnAsNull && (parser.getErrorState().equals(FieldParser.ParseErrorState.EMPTY_COLUMN)))) {
				holders[fieldPosMap[output]] = null;
				startPos = skipFields(bytes, latestValidPos, limit, fieldDelimiter);
			}
			output++;
		} else {
			// skip field
			startPos = skipFields(bytes, startPos, limit, fieldDelimiter);
		}

		// check if something went wrong
		if (startPos < 0) {
			throw new ParseException(String.format("Unexpected parser position for column %1$s of row '%2$s'",
				field + 1, new String(bytes, offset, numBytes, getCharset())));
		}
		else if (startPos == limit
				&& field != fieldIncluded.length - 1
				&& !FieldParser.endsWithDelimiter(bytes, startPos - 1, fieldDelimiter)) {
			// We are at the end of the record, but not all fields have been read
			// and the end is not a field delimiter indicating an empty last field.
			if (isLenient()) {
				return false;
			} else {
				throw new ParseException("Row too short: " + new String(bytes, offset, numBytes, getCharset()));
			}
		}

		field++;
	}
	return true;
}

Source File: CsvInputFormatTest.java From flink with Apache License 2.0

4 votes

/**
 * Reads a three-column, comma-delimited file whose rows progressively lose their trailing
 * values. Rows that still contain both delimiters are expected to yield empty strings for
 * the missing values; the final row, which lacks a column, is expected to raise a
 * {@link ParseException}.
 */
@Test
public void testTailingEmptyFields() throws Exception {
	final String csv = "aa,bb,cc\n" + // ok
			"aa,bb,\n" +  // the last field is empty
			"aa,,\n" +    // the last two fields are empty
			",,\n" +      // all fields are empty
			"aa,bb";      // row too short

	// expected values of the four well-formed rows, in file order
	final String[][] expectedRows = {
			{"aa", "bb", "cc"},
			{"aa", "bb", ""},
			{"aa", "", ""},
			{"", "", ""}};

	final FileInputSplit inputSplit = createTempFile(csv);

	final TupleTypeInfo<Tuple3<String, String, String>> tupleType =
			TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class, String.class);
	final CsvInputFormat<Tuple3<String, String, String>> inputFormat =
			new TupleCsvInputFormat<Tuple3<String, String, String>>(PATH, tupleType);
	inputFormat.setFieldDelimiter(",");
	inputFormat.configure(new Configuration());
	inputFormat.open(inputSplit);

	Tuple3<String, String, String> tuple = new Tuple3<String, String, String>();
	for (String[] expected : expectedRows) {
		tuple = inputFormat.nextRecord(tuple);
		assertNotNull(tuple);
		assertEquals(expected[0], tuple.f0);
		assertEquals(expected[1], tuple.f1);
		assertEquals(expected[2], tuple.f2);
	}

	// the remaining row has only two columns and must be rejected
	try {
		inputFormat.nextRecord(tuple);
		fail("Parse Exception was not thrown! (Row too short)");
	} catch (ParseException rowTooShort) {
		// expected outcome, nothing to verify on the exception itself
	}
}

Source File: RowCsvInputFormatTest.java From flink with Apache License 2.0

4 votes

/**
 * Reads a three-column file delimited by the two-character sequence {@code "|-"} whose rows
 * progressively lose their trailing values. The first five rows are expected to parse, with
 * empty strings for the missing values; the last row, which lacks a column, is expected to
 * raise a {@link ParseException}.
 */
@Test
public void testTailingEmptyFields() throws Exception {
	String content = "abc|-def|-ghijk\n" +
			"abc|-def|-\n" +
			"abc|-|-\n" +
			"|-|-|-\n" +
			"|-|-\n" +
			"abc|-def\n";

	// expected values of the five well-formed rows, in file order
	String[][] expectedRows = {
			{"abc", "def", "ghijk"},
			{"abc", "def", ""},
			{"abc", "", ""},
			{"", "", ""},
			{"", "", ""}};

	FileInputSplit inputSplit = createTempFile(content);

	TypeInformation[] columnTypes = {
			BasicTypeInfo.STRING_TYPE_INFO,
			BasicTypeInfo.STRING_TYPE_INFO,
			BasicTypeInfo.STRING_TYPE_INFO};

	// the "|" delimiter passed to the constructor is replaced by "|-" right below
	RowCsvInputFormat inputFormat = new RowCsvInputFormat(PATH, columnTypes, "\n", "|");
	inputFormat.setFieldDelimiter("|-");
	inputFormat.configure(new Configuration());
	inputFormat.open(inputSplit);

	Row row = new Row(3);
	for (String[] expected : expectedRows) {
		row = inputFormat.nextRecord(row);
		assertNotNull(row);
		for (int column = 0; column < expected.length; column++) {
			assertEquals(expected[column], row.getField(column));
		}
	}

	// the remaining row has only two columns and must be rejected
	try {
		inputFormat.nextRecord(row);
		fail("Parse Exception was not thrown! (Row too short)");
	} catch (ParseException rowTooShort) {
		// expected outcome, nothing to verify on the exception itself
	}
}

org.apache.flink.api.common.io.ParseException Java Examples