001 /*
002 * Read files in comma separated value format.
003 * Copyright (C) 2001-2004 Stephen Ostermiller
004 * http://ostermiller.org/contact.pl?regarding=Java+Utilities
005 *
006 * This program is free software; you can redistribute it and/or modify
007 * it under the terms of the GNU General Public License as published by
008 * the Free Software Foundation; either version 2 of the License, or
009 * (at your option) any later version.
010 *
011 * This program is distributed in the hope that it will be useful,
012 * but WITHOUT ANY WARRANTY; without even the implied warranty of
013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014 * GNU General Public License for more details.
015 *
016 * See COPYING.TXT for details.
017 */
018
019 //package com.Ostermiller.util;
020 package net.sf.logdistiller.util.csv;
021
022 import java.io.*;
023 import java.util.*;
024
025 /**
026 * Read files in comma separated value format. More information about this class is available from <a target="_top"
027 * href= "http://ostermiller.org/utils/CSVLexer.html">ostermiller.org</a>. CSV is a file format used as a portable
028 * representation of a database. Each line is one entry or record and the fields in a record are separated by commas.
029 * Commas may be preceded or followed by arbitrary space and/or tab characters which are ignored.
030 * <P>
031 * If a field includes a comma or a new line, the whole field must be surrounded with double quotes. When the field is in
032 * quotes, any quote literals must be escaped by \" Backslash literals must be escaped by \\. Otherwise a backslash and
033 * the character following will be treated as the following character, IE. "\n" is equivalent to "n". Other escape
034 * sequences may be set using the setEscapes() method. Text that comes after quotes that have been closed but come
035 * before the next comma will be ignored.
036 * <P>
037 * Empty fields are returned as a String of length zero: "". The following line has three empty fields and three
038 * non-empty fields in it. There is an empty field on each end, and one in the middle. One token is returned as a space.
039 * <br>
040 *
041 * <pre>
042 * ,second,," ",fifth,
043 * </pre>
044 * <P>
045 * Blank lines are always ignored. Other lines will be ignored if they start with a comment character as set by the
046 * setCommentStart() method.
047 * <P>
048 * An example of how CSVParser might be used:
049 *
050 * <pre>
051 * CSVParser shredder = new CSVParser( System.in );
052 * shredder.setCommentStart( "#;!" );
053 * shredder.setEscapes( "nrtf", "\n\r\t\f" );
054 * String t;
055 * while ( ( t = shredder.nextValue() ) != null )
056 * {
057 * System.out.println( "" + shredder.lastLineNumber() + " " + t );
058 * }
059 * </pre>
060 * <P>
061 * Some applications do not output CSV according to the generally accepted standards and this parser may not be able to
062 * handle it. One such application is the Microsoft Excel spreadsheet. A separate class must be used to read <a
063 * href="http://ostermiller.org/utils/ExcelCSV.html">Excel CSV</a>.
064 *
065 * @see com.Ostermiller.util.ExcelCSVParser
066 * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
067 * @since ostermillerutils 1.00.00
068 */
069 public class CSVParser
070 implements CSVParse
071 {
072
073 /**
074 * InputStream on which this parser is based.
075 *
076 * @since ostermillerutils 1.02.22
077 */
078 private InputStream inStream;
079
080 /**
081 * Reader on which this parser is based.
082 *
083 * @since ostermillerutils 1.02.22
084 */
085 private Reader inReader;
086
087 /**
088 * Does all the dirty work. Calls for new tokens are routed through this object.
089 *
090 * @since ostermillerutils 1.00.00
091 */
092 private CSVLexer lexer;
093
094 /**
095 * Token cache. Used for when we request a token from the lexer but can't return it because its on the next line.
096 *
097 * @since ostermillerutils 1.00.00
098 */
099 private String tokenCache;
100
101 /**
102 * Line cache. The line number that goes along with the tokenCache. Not valid if the tokenCache is null.
103 *
104 * @since ostermillerutils 1.00.00
105 */
106 private int lineCache;
107
108 /**
109 * The line number the last token came from, or -1 if no tokens have been returned.
110 *
111 * @since ostermillerutils 1.00.00
112 */
113 private int lastLine = -1;
114
115 /**
116 * Create a parser to parse comma separated values from an InputStream.
117 * <p>
118 * Byte to character conversion is done using the platform default locale.
119 *
120 * @param in stream that contains comma separated values.
121 * @since ostermillerutils 1.00.00
122 */
123 public CSVParser( InputStream in )
124 {
125 inStream = in;
126 lexer = new CSVLexer( in );
127 }
128
129 /**
130 * Create a parser to parse delimited values from an InputStream.
131 * <p>
132 * Byte to character conversion is done using the platform default locale.
133 *
134 * @param in stream that contains comma separated values.
135 * @param delimiter record separator
136 * @throws BadDelimiterException if the specified delimiter cannot be used
137 * @since ostermillerutils 1.02.24
138 */
139 public CSVParser( InputStream in, char delimiter )
140 throws BadDelimiterException
141 {
142 inStream = in;
143 lexer = new CSVLexer( in );
144 changeDelimiter( delimiter );
145 }
146
147 /**
148 * Create a parser to parse comma separated values from a Reader.
149 *
150 * @param in reader that contains comma separated values.
151 * @since ostermillerutils 1.00.00
152 */
153 public CSVParser( Reader in )
154 {
155 inReader = in;
156 lexer = new CSVLexer( in );
157 }
158
159 /**
160 * Create a parser to parse delimited values from a Reader.
161 *
162 * @param in reader that contains comma separated values.
163 * @param delimiter record separator
164 * @throws BadDelimiterException if the specified delimiter cannot be used
165 * @since ostermillerutils 1.02.24
166 */
167 public CSVParser( Reader in, char delimiter )
168 throws BadDelimiterException
169 {
170 inReader = in;
171 lexer = new CSVLexer( in );
172 changeDelimiter( delimiter );
173 }
174
175 /**
176 * Create a parser to parse delimited values from an InputStream.
177 * <p>
178 * Byte to character conversion is done using the platform default locale.
179 *
180 * @param in stream that contains comma separated values.
181 * @param escapes a list of characters that will represent escape sequences.
182 * @param replacements the list of replacement characters for those escape sequences.
183 * @param commentDelims list of characters a comment line may start with.
184 * @param delimiter record separator
185 * @throws BadDelimiterException if the specified delimiter cannot be used
186 * @since ostermillerutils 1.02.24
187 */
188 public CSVParser( InputStream in, char delimiter, String escapes, String replacements, String commentDelims )
189 throws BadDelimiterException
190 {
191 inStream = in;
192 lexer = new CSVLexer( in );
193 setEscapes( escapes, replacements );
194 setCommentStart( commentDelims );
195 changeDelimiter( delimiter );
196 }
197
198 /**
199 * Create a parser to parse comma separated values from an InputStream.
200 * <p>
201 * Byte to character conversion is done using the platform default locale.
202 *
203 * @param in stream that contains comma separated values.
204 * @param escapes a list of characters that will represent escape sequences.
205 * @param replacements the list of replacement characters for those escape sequences.
206 * @param commentDelims list of characters a comment line may start with.
207 * @since ostermillerutils 1.00.00
208 */
209 public CSVParser( InputStream in, String escapes, String replacements, String commentDelims )
210 {
211 inStream = in;
212 lexer = new CSVLexer( in );
213 setEscapes( escapes, replacements );
214 setCommentStart( commentDelims );
215 }
216
217 /**
218 * Create a parser to parse delimited values from a Reader.
219 *
220 * @param in reader that contains comma separated values.
221 * @param escapes a list of characters that will represent escape sequences.
222 * @param replacements the list of replacement characters for those escape sequences.
223 * @param commentDelims list of characters a comment line may start with.
224 * @param delimiter record separator
225 * @throws BadDelimiterException if the specified delimiter cannot be used
226 * @since ostermillerutils 1.02.24
227 */
228 public CSVParser( Reader in, char delimiter, String escapes, String replacements, String commentDelims )
229 throws BadDelimiterException
230 {
231 inReader = in;
232 lexer = new CSVLexer( in );
233 setEscapes( escapes, replacements );
234 setCommentStart( commentDelims );
235 changeDelimiter( delimiter );
236 }
237
238 /**
239 * Create a parser to parse comma separated values from a Reader.
240 *
241 * @param in reader that contains comma separated values.
242 * @param escapes a list of characters that will represent escape sequences.
243 * @param replacements the list of replacement characters for those escape sequences.
244 * @param commentDelims list of characters a comment line may start with.
245 * @since ostermillerutils 1.00.00
246 */
247 public CSVParser( Reader in, String escapes, String replacements, String commentDelims )
248 {
249 inReader = in;
250 lexer = new CSVLexer( in );
251 setEscapes( escapes, replacements );
252 setCommentStart( commentDelims );
253 }
254
255 /**
256 * Close any stream upon which this parser is based.
257 *
258 * @since ostermillerutils 1.02.22
259 * @throws IOException if an error occurs while closing the stream.
260 */
261 public void close()
262 throws IOException
263 {
264 if ( inStream != null )
265 inStream.close();
266 if ( inReader != null )
267 inReader.close();
268 }
269
270 /**
271 * get the next value.
272 *
273 * @return the next value or null if there are no more values.
274 * @throws IOException if an error occurs while reading.
275 * @since ostermillerutils 1.00.00
276 */
277 public String nextValue()
278 throws IOException
279 {
280 if ( tokenCache == null )
281 {
282 tokenCache = lexer.getNextToken();
283 lineCache = lexer.getLineNumber();
284 }
285 lastLine = lineCache;
286 String result = tokenCache;
287 tokenCache = null;
288 return result;
289 }
290
291 /**
292 * Get the line number that the last token came from.
293 * <p>
294 * New line breaks that occur in the middle of a token are no counted in the line number count.
295 *
296 * @return line number or -1 if no tokens have been returned yet.
297 * @since ostermillerutils 1.00.00
298 */
299 public int lastLineNumber()
300 {
301 return lastLine;
302 }
303
304 /**
305 * Get all the values from a line.
306 * <p>
307 * If the line has already been partially read, only the values that have not already been read will be included.
308 *
309 * @return all the values from the line or null if there are no more values.
310 * @throws IOException if an error occurs while reading.
311 * @since ostermillerutils 1.00.00
312 */
313 public String[] getLine()
314 throws IOException
315 {
316 int lineNumber = -1;
317 ArrayList v = new ArrayList();
318 if ( tokenCache != null )
319 {
320 v.add( tokenCache );
321 lineNumber = lineCache;
322 }
323 while ( ( tokenCache = lexer.getNextToken() ) != null
324 && ( lineNumber == -1 || lexer.getLineNumber() == lineNumber ) )
325 {
326 v.add( tokenCache );
327 lineNumber = lexer.getLineNumber();
328 }
329 if ( v.size() == 0 )
330 {
331 return null;
332 }
333 lastLine = lineNumber;
334 lineCache = lexer.getLineNumber();
335 String[] result = new String[v.size()];
336 return ( (String[]) v.toArray( result ) );
337 }
338
339 /**
340 * Get all the values from the file.
341 * <p>
342 * If the file has already been partially read, only the values that have not already been read will be included.
343 * <p>
344 * Each line of the file that has at least one value will be represented. Comments and empty lines are ignored.
345 * <p>
346 * The resulting double array may be jagged.
347 *
348 * @return all the values from the file or null if there are no more values.
349 * @throws IOException if an error occurs while reading.
350 * @since ostermillerutils 1.00.00
351 */
352 public String[][] getAllValues()
353 throws IOException
354 {
355 ArrayList v = new ArrayList();
356 String[] line;
357 while ( ( line = getLine() ) != null )
358 {
359 v.add( line );
360 }
361 if ( v.size() == 0 )
362 {
363 return null;
364 }
365 String[][] result = new String[v.size()][];
366 return ( (String[][]) v.toArray( result ) );
367 }
368
369 /**
370 * Specify escape sequences and their replacements. Escape sequences set here are in addition to \\ and \". \\ and
371 * \" are always valid escape sequences. This method allows standard escape sequenced to be used. For example "\n"
372 * can be set to be a newline rather than an 'n'. A common way to call this method might be:<br>
373 * <code>setEscapes("nrtf", "\n\r\t\f");</code><br>
374 * which would set the escape sequences to be the Java escape sequences. Characters that follow a \ that are not
375 * escape sequences will still be interpreted as that character.<br>
376 * The two arguments to this method must be the same length. If they are not, the longer of the two will be
377 * truncated.
378 *
379 * @param escapes a list of characters that will represent escape sequences.
380 * @param replacements the list of replacement characters for those escape sequences.
381 * @since ostermillerutils 1.00.00
382 */
383 public void setEscapes( String escapes, String replacements )
384 {
385 lexer.setEscapes( escapes, replacements );
386 }
387
388 /**
389 * Change this parser so that it uses a new delimiter.
390 * <p>
391 * The initial character is a comma, the delimiter cannot be changed to a quote or other character that has special
392 * meaning in CSV.
393 *
394 * @param newDelim delimiter to which to switch.
395 * @throws BadDelimiterException if the character cannot be used as a delimiter.
396 * @since ostermillerutils 1.02.08
397 */
398 public void changeDelimiter( char newDelim )
399 throws BadDelimiterException
400 {
401 lexer.changeDelimiter( newDelim );
402 }
403
404 /**
405 * Change this parser so that it uses a new character for quoting.
406 * <p>
407 * The initial character is a double quote ("), the delimiter cannot be changed to a comma or other character that
408 * has special meaning in CSV.
409 *
410 * @param newQuote character to use for quoting.
411 * @throws BadQuoteException if the character cannot be used as a quote.
412 * @since ostermillerutils 1.02.16
413 */
414 public void changeQuote( char newQuote )
415 throws BadQuoteException
416 {
417 lexer.changeQuote( newQuote );
418 }
419
420 /**
421 * Set the characters that indicate a comment at the beginning of the line. For example if the string "#;!" were
422 * passed in, all of the following lines would be comments:<br>
423 *
424 * <pre>
425 * # Comment
426 * ; Another Comment
427 * ! Yet another comment
428 * </pre>
429 *
430 * By default there are no comments in CVS files. Commas and quotes may not be used to indicate comment lines.
431 *
432 * @param commentDelims list of characters a comment line may start with.
433 * @since ostermillerutils 1.00.00
434 */
435 public void setCommentStart( String commentDelims )
436 {
437 lexer.setCommentStart( commentDelims );
438 }
439
440 /**
441 * Get the number of the line from which the last value was retrieved.
442 *
443 * @return line number or -1 if no tokens have been returned.
444 * @since ostermillerutils 1.00.00
445 */
446 public int getLastLineNumber()
447 {
448 return lastLine;
449 }
450
451 /**
452 * Get the number of chars that have been read from the beginning.
453 *
454 * @since added in LogDistiller
455 */
456 public int getLastCharCount()
457 {
458 return lexer.yychar();
459 }
460 }