Skip to content

Commit

Permalink
implement skipLines
Browse files Browse the repository at this point in the history
  • Loading branch information
osiegmar committed Nov 3, 2024
1 parent 931bea8 commit 4b20383
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 33 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Nothing yet
### Added
- Add `skipLines(int)` and `skipLines(Predicate<String>, int)` to `CsvReader` to skip lines before the actual CSV data starts

## [3.3.1] - 2024-09-23
### Fixed
Expand Down
13 changes: 7 additions & 6 deletions docs/src/content/docs/guides/Examples/skip-non-csv-head.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,14 @@ Strictly speaking, such a file **is not a valid CSV file** as defined by the CSV
The main problem with those files is:

- An exception would be thrown unless the options `ignoreDifferentFieldCount()` and `skipEmptyLines()` are set.
- When working with named fields, the very first line (*This is an example of a CSV file that contains*)
- When working with named fields, the very first line (`This is an example of a CSV file that contains`)
would be interpreted as the actual header line.

FastCSV itself currently does not provide a way to skip non-CSV head lines when reading a CSV file.
However, you can skip non-CSV head lines by reading the file line by line and only hand over to FastCSV
when the actual CSV data starts. This could be done based on a fixed number of lines or by detecting
the actual CSV data.
FastCSV comes with two features to handle such files:

- `skipLines(int lineCount)`: Skip a specific number of lines (`lineCount`) regardless of their content.
- `skipLines(Predicate<String> predicate, int maxLines)`: Skip lines until a specific line (e.g., the header) is found.
Stop skipping after a specific number of lines (`maxLines`).

:::note
Do not mix this with comments in CSV files. Comments are lines that start with a specific character
Expand All @@ -40,4 +41,4 @@ See the example on how to [handle comments](/guides/examples/handle-comments/) f

This example demonstrates how to skip non-CSV head lines when reading such a CSV file with FastCSV.

<SourceExample filename="ExampleCsvReaderWithNonCsvAtStart.java"/>
<SourceExample filename="ExampleCsvReaderWithNonCsvAtStart.java" highlights={'skipLines'}/>
1 change: 1 addition & 0 deletions docs/src/content/docs/index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ The main features of FastCSV include:
- Configurable data validation
- Supports optional header records (access fields by name)
- Supports skipping empty lines
- Supports skipping a non-CSV header (either by a fixed number of lines or by peeking data)
- Supports commented lines (skipping & reading) with configurable comment character
- Configurable field modifiers (e.g., to trim fields)
- Flexible callback handlers (e.g., to directly map to domain objects)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
package example;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.function.Predicate;

import de.siegmar.fastcsv.reader.CsvReader;
import de.siegmar.fastcsv.reader.NamedCsvRecordHandler;

/**
* Example for reading CSV data with non-CSV data before the actual CSV header.
Expand All @@ -15,7 +12,7 @@ public class ExampleCsvReaderWithNonCsvAtStart {

private static final String DATA = """
Your CSV file contains some non-CSV data before the actual CSV header?
And you don't want to misinterpret them as CSV header? No problem!
And you don't want to (mis)interpret them as CSV header? No problem!
header1,header2
foo,bar
Expand All @@ -28,36 +25,30 @@ public static void main(final String[] args) throws IOException {

private static void alternative1() throws IOException {
System.out.println("Alternative 1 - ignore specific number of lines");
final CsvReader.CsvReaderBuilder builder = CsvReader.builder()
.ignoreDifferentFieldCount(false);

try (var br = new BufferedReader(new StringReader(DATA))) {
// ignore the first 3 lines
br.lines().limit(3).forEach(r -> { });
try (var csv = CsvReader.builder().ofNamedCsvRecord(DATA)) {
// Skip the first 3 lines
System.out.println("Skipping the first 3 lines");
csv.skipLines(3);

builder.ofNamedCsvRecord(br)
.forEach(System.out::println);
// Read the CSV data
csv.forEach(System.out::println);
}
}

private static void alternative2() throws IOException {
System.out.println("Alternative 2 - wait for a specific line");
final CsvReader.CsvReaderBuilder builder = CsvReader.builder()
.ignoreDifferentFieldCount(false);

try (var br = new BufferedReader(new StringReader(DATA))) {
// Look for the CSV header but read at most 100 lines
final List<String> header = br.lines()
.limit(100)
.filter(l -> l.contains("header1,header2"))
.findFirst()
.map(line -> builder.ofCsvRecord(line).stream().findFirst()
.orElseThrow(() -> new IllegalStateException("Illegal header: " + line))
.getFields())
.orElseThrow(() -> new IllegalStateException("No CSV header found"));
final Predicate<String> isHeader = line ->
line.contains("header1");

builder.build(new NamedCsvRecordHandler(header), br)
.forEach(System.out::println);
try (var csv = CsvReader.builder().ofNamedCsvRecord(DATA)) {
// Skip until the header line is found, but not more than 10 lines
final int actualSkipped = csv.skipLines(isHeader, 10);
System.out.println("Found header line after skipping " + actualSkipped + " lines");

// Read the CSV data
csv.forEach(System.out::println);
}
}

Expand Down
42 changes: 42 additions & 0 deletions lib/src/main/java/de/siegmar/fastcsv/reader/CsvParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,48 @@ public void close() throws IOException {
csvBuffer.close();
}

/**
 * Returns the upcoming line without consuming it.
 * <p>
 * Scans forward from {@code csvBuffer.pos} until a CR or LF is seen or no more data can be
 * fetched, builds a string from {@code csvBuffer.begin} up to (not including) the stop position
 * and then sets {@code csvBuffer.pos} back to the value it had on entry.
 * <p>
 * NOTE(review): the string starts at {@code begin} while the scan starts at {@code pos}, so this
 * presumably expects {@code pos == begin} on entry - confirm against callers.
 * NOTE(review): if {@code fetchData()} can move buffered data (changing {@code begin}/{@code pos}),
 * the saved position may be stale after the scan - verify against CsvBuffer.
 *
 * @return the characters of the upcoming line, excluding the line terminator.
 * @throws IOException if {@code csvBuffer.fetchData()} fails.
 */
String peekLine() throws IOException {
// Remember the cursor so the scan below leaves no trace in pos
final int savedPos = csvBuffer.pos;

// Advance to the first CR/LF; fetchData() is only invoked once the buffered chars are exhausted
for (; csvBuffer.pos < csvBuffer.len || csvBuffer.fetchData(); csvBuffer.pos++) {
final char c = csvBuffer.buf[csvBuffer.pos];
if (c == CR || c == LF) {
break;
}
}

// The terminator itself (if any) is not part of the returned string
final String s = new String(csvBuffer.buf, csvBuffer.begin, csvBuffer.pos - csvBuffer.begin);
csvBuffer.pos = savedPos;
return s;
}

/**
 * Consumes the rest of the current line including its terminator (CR, LF or CRLF).
 * <p>
 * When something was consumed, {@code csvBuffer.begin} is moved up to {@code csvBuffer.pos}
 * and {@code startingLineNumber} is incremented.
 *
 * @param numCharsToSkip number of characters to jump over before searching for the line terminator
 *     (the visible callers pass {@code 0} or the length of a line returned by {@code peekLine()}).
 * @return {@code true} if {@code pos} ended up beyond {@code begin} (i.e. a line was skipped),
 *     {@code false} otherwise.
 * @throws IOException if {@code csvBuffer.fetchData()} fails.
 */
boolean skipLine(final int numCharsToSkip) throws IOException {
// Skip chars that have been peeked already
csvBuffer.pos += numCharsToSkip;

// Consume chars up to and including the first line terminator
while (csvBuffer.pos < csvBuffer.len || csvBuffer.fetchData()) {
final char c = csvBuffer.buf[csvBuffer.pos++];
if (c == CR) {
// Look ahead one char (fetching more data if needed) to treat CRLF as a single terminator
if ((csvBuffer.pos < csvBuffer.len || csvBuffer.fetchData())
&& csvBuffer.buf[csvBuffer.pos] == LF) {
// CRLF
csvBuffer.pos++;
}
break;
} else if (c == LF) {
break;
}
}

// Only report success (and count a line) if the cursor actually moved past begin
if (csvBuffer.begin < csvBuffer.pos) {
csvBuffer.begin = csvBuffer.pos;
startingLineNumber++;
return true;
}

return false;
}

@SuppressWarnings("checkstyle:visibilitymodifier")
private static class CsvBuffer implements Closeable {

Expand Down
72 changes: 72 additions & 0 deletions lib/src/main/java/de/siegmar/fastcsv/reader/CsvReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import java.util.Spliterator;
import java.util.StringJoiner;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

Expand Down Expand Up @@ -76,6 +77,77 @@ public static CsvReaderBuilder builder() {
return new CsvReaderBuilder();
}

/**
 * Skips a fixed number of lines.
 * <p>
 * This method is not influenced by the setting {@link CsvReaderBuilder#skipEmptyLines(boolean)}.
 *
 * @param lineCount how many lines to skip.
 * @throws IllegalArgumentException if {@code lineCount} is negative.
 * @throws UncheckedIOException if an I/O error occurs while skipping.
 * @throws CsvParseException if fewer than {@code lineCount} lines could be skipped.
 */
public void skipLines(final int lineCount) {
    if (lineCount < 0) {
        throw new IllegalArgumentException("lineCount must be non-negative");
    }

    int skipped = 0;
    try {
        while (skipped < lineCount) {
            final boolean lineSkipped = csvParser.skipLine(0);
            if (!lineSkipped) {
                // The parser could not skip another line; report how far we got
                throw new CsvParseException("Not enough lines to skip. Skipped only " + skipped + " line(s).");
            }
            skipped++;
        }
    } catch (final IOException e) {
        throw new UncheckedIOException(e);
    }
}

/**
 * Skips lines until one is accepted by the given predicate.
 * The accepted line itself is left in place (it is not skipped).
 * <p>
 * If {@code maxLines} is {@code 0}, the method returns {@code 0} immediately without
 * testing any line.
 * <p>
 * This method is not influenced by the setting {@link CsvReaderBuilder#skipEmptyLines(boolean)}.
 *
 * @param predicate the test applied to each upcoming line.
 * @param maxLines the maximum number of lines to skip.
 * @return the number of lines that were actually skipped.
 * @throws NullPointerException if predicate is {@code null}.
 * @throws IllegalArgumentException if {@code maxLines} is negative.
 * @throws UncheckedIOException if an I/O error occurs.
 * @throws CsvParseException if no matching line is found within the limit of {@code maxLines}
 *     or before the end of the data.
 */
public int skipLines(final Predicate<String> predicate, final int maxLines) {
    Objects.requireNonNull(predicate, "predicate must not be null");
    if (maxLines < 0) {
        throw new IllegalArgumentException("maxLines must be non-negative");
    }

    if (maxLines == 0) {
        return 0;
    }

    int skipped = 0;
    try {
        while (skipped < maxLines) {
            // Look at the next line without consuming it
            final String candidate = csvParser.peekLine();
            if (predicate.test(candidate)) {
                return skipped;
            }

            // Not a match: consume the line, passing the number of chars already peeked
            final boolean consumed = csvParser.skipLine(candidate.length());
            if (!consumed) {
                final String message = String.format(
                    "No matching line found. Skipped %d line(s) before reaching end of data.", skipped);
                throw new CsvParseException(message);
            }
            skipped++;
        }
    } catch (final IOException e) {
        throw new UncheckedIOException(e);
    }

    final String message = String.format(
        "No matching line found within the maximum limit of %d lines.", maxLines);
    throw new CsvParseException(message);
}

/**
* Returns an iterator over elements of type {@link CsvRecord}.
* <p>
Expand Down

0 comments on commit 4b20383

Please sign in to comment.