001    /*
002     * Read files in comma separated value format.
003     * Copyright (C) 2001-2004 Stephen Ostermiller
004     * http://ostermiller.org/contact.pl?regarding=Java+Utilities
005     *
006     * This program is free software; you can redistribute it and/or modify
007     * it under the terms of the GNU General Public License as published by
008     * the Free Software Foundation; either version 2 of the License, or
009     * (at your option) any later version.
010     *
011     * This program is distributed in the hope that it will be useful,
012     * but WITHOUT ANY WARRANTY; without even the implied warranty of
013     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014     * GNU General Public License for more details.
015     *
016     * See COPYING.TXT for details.
017     */
018    
019    //package com.Ostermiller.util;
020    package net.sf.logdistiller.util.csv;
021    
022    import java.io.*;
023    import java.util.*;
024    
025    /**
026     * Read files in comma separated value format. More information about this class is available from <a target="_top"
027     * href= "http://ostermiller.org/utils/CSVLexer.html">ostermiller.org</a>. CSV is a file format used as a portable
028     * representation of a database. Each line is one entry or record and the fields in a record are separated by commas.
029     * Commas may be preceded or followed by arbitrary space and/or tab characters which are ignored.
030     * <P>
031     * If field includes a comma or a new line, the whole field must be surrounded with double quotes. When the field is in
032     * quotes, any quote literals must be escaped by \" Backslash literals must be escaped by \\. Otherwise a backslash and
033     * the character following will be treated as the following character, IE. "\n" is equivalent to "n". Other escape
034     * sequences may be set using the setEscapes() method. Text that comes after quotes that have been closed but come
035     * before the next comma will be ignored.
036     * <P>
037     * Empty fields are returned as as String of length zero: "". The following line has three empty fields and three
038     * non-empty fields in it. There is an empty field on each end, and one in the middle. One token is returned as a space.
039     * <br>
040     *
041     * <pre>
042     * ,second,,&quot; &quot;,fifth,
043     * </pre>
044     * <P>
045     * Blank lines are always ignored. Other lines will be ignored if they start with a comment character as set by the
046     * setCommentStart() method.
047     * <P>
048     * An example of how CVSLexer might be used:
049     *
050     * <pre>
051     * CSVParser shredder = new CSVParser( System.in );
052     * shredder.setCommentStart( &quot;#;!&quot; );
053     * shredder.setEscapes( &quot;nrtf&quot;, &quot;\n\r\t\f&quot; );
054     * String t;
055     * while ( ( t = shredder.nextValue() ) != null )
056     * {
057     *     System.out.println( &quot;&quot; + shredder.lastLineNumber() + &quot; &quot; + t );
058     * }
059     * </pre>
060     * <P>
061     * Some applications do not output CSV according to the generally accepted standards and this parse may not be able to
062     * handle it. One such application is the Microsoft Excel spreadsheet. A separate class must be use to read <a
063     * href="http://ostermiller.org/utils/ExcelCSV.html">Excel CSV</a>.
064     *
065     * @see com.Ostermiller.util.ExcelCSVParser
066     * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
067     * @since ostermillerutils 1.00.00
068     */
069    public class CSVParser
070        implements CSVParse
071    {
072    
073        /**
074         * InputStream on which this parser is based.
075         *
076         * @since ostermillerutils 1.02.22
077         */
078        private InputStream inStream;
079    
080        /**
081         * Reader on which this parser is based.
082         *
083         * @since ostermillerutils 1.02.22
084         */
085        private Reader inReader;
086    
087        /**
088         * Does all the dirty work. Calls for new tokens are routed through this object.
089         *
090         * @since ostermillerutils 1.00.00
091         */
092        private CSVLexer lexer;
093    
094        /**
095         * Token cache. Used for when we request a token from the lexer but can't return it because its on the next line.
096         *
097         * @since ostermillerutils 1.00.00
098         */
099        private String tokenCache;
100    
101        /**
102         * Line cache. The line number that goes along with the tokenCache. Not valid if the tokenCache is null.
103         *
104         * @since ostermillerutils 1.00.00
105         */
106        private int lineCache;
107    
108        /**
109         * The line number the last token came from, or -1 if no tokens have been returned.
110         *
111         * @since ostermillerutils 1.00.00
112         */
113        private int lastLine = -1;
114    
115        /**
116         * Create a parser to parse comma separated values from an InputStream.
117         * <p>
118         * Byte to character conversion is done using the platform default locale.
119         *
120         * @param in stream that contains comma separated values.
121         * @since ostermillerutils 1.00.00
122         */
123        public CSVParser( InputStream in )
124        {
125            inStream = in;
126            lexer = new CSVLexer( in );
127        }
128    
129        /**
130         * Create a parser to parse delimited values from an InputStream.
131         * <p>
132         * Byte to character conversion is done using the platform default locale.
133         *
134         * @param in stream that contains comma separated values.
135         * @param delimiter record separator
136         * @throws BadDelimiterException if the specified delimiter cannot be used
137         * @since ostermillerutils 1.02.24
138         */
139        public CSVParser( InputStream in, char delimiter )
140            throws BadDelimiterException
141        {
142            inStream = in;
143            lexer = new CSVLexer( in );
144            changeDelimiter( delimiter );
145        }
146    
147        /**
148         * Create a parser to parse comma separated values from a Reader.
149         *
150         * @param in reader that contains comma separated values.
151         * @since ostermillerutils 1.00.00
152         */
153        public CSVParser( Reader in )
154        {
155            inReader = in;
156            lexer = new CSVLexer( in );
157        }
158    
159        /**
160         * Create a parser to parse delimited values from a Reader.
161         *
162         * @param in reader that contains comma separated values.
163         * @param delimiter record separator
164         * @throws BadDelimiterException if the specified delimiter cannot be used
165         * @since ostermillerutils 1.02.24
166         */
167        public CSVParser( Reader in, char delimiter )
168            throws BadDelimiterException
169        {
170            inReader = in;
171            lexer = new CSVLexer( in );
172            changeDelimiter( delimiter );
173        }
174    
175        /**
176         * Create a parser to parse delimited values from an InputStream.
177         * <p>
178         * Byte to character conversion is done using the platform default locale.
179         *
180         * @param in stream that contains comma separated values.
181         * @param escapes a list of characters that will represent escape sequences.
182         * @param replacements the list of replacement characters for those escape sequences.
183         * @param commentDelims list of characters a comment line may start with.
184         * @param delimiter record separator
185         * @throws BadDelimiterException if the specified delimiter cannot be used
186         * @since ostermillerutils 1.02.24
187         */
188        public CSVParser( InputStream in, char delimiter, String escapes, String replacements, String commentDelims )
189            throws BadDelimiterException
190        {
191            inStream = in;
192            lexer = new CSVLexer( in );
193            setEscapes( escapes, replacements );
194            setCommentStart( commentDelims );
195            changeDelimiter( delimiter );
196        }
197    
198        /**
199         * Create a parser to parse comma separated values from an InputStream.
200         * <p>
201         * Byte to character conversion is done using the platform default locale.
202         *
203         * @param in stream that contains comma separated values.
204         * @param escapes a list of characters that will represent escape sequences.
205         * @param replacements the list of replacement characters for those escape sequences.
206         * @param commentDelims list of characters a comment line may start with.
207         * @since ostermillerutils 1.00.00
208         */
209        public CSVParser( InputStream in, String escapes, String replacements, String commentDelims )
210        {
211            inStream = in;
212            lexer = new CSVLexer( in );
213            setEscapes( escapes, replacements );
214            setCommentStart( commentDelims );
215        }
216    
217        /**
218         * Create a parser to parse delimited values from a Reader.
219         *
220         * @param in reader that contains comma separated values.
221         * @param escapes a list of characters that will represent escape sequences.
222         * @param replacements the list of replacement characters for those escape sequences.
223         * @param commentDelims list of characters a comment line may start with.
224         * @param delimiter record separator
225         * @throws BadDelimiterException if the specified delimiter cannot be used
226         * @since ostermillerutils 1.02.24
227         */
228        public CSVParser( Reader in, char delimiter, String escapes, String replacements, String commentDelims )
229            throws BadDelimiterException
230        {
231            inReader = in;
232            lexer = new CSVLexer( in );
233            setEscapes( escapes, replacements );
234            setCommentStart( commentDelims );
235            changeDelimiter( delimiter );
236        }
237    
238        /**
239         * Create a parser to parse comma separated values from a Reader.
240         *
241         * @param in reader that contains comma separated values.
242         * @param escapes a list of characters that will represent escape sequences.
243         * @param replacements the list of replacement characters for those escape sequences.
244         * @param commentDelims list of characters a comment line may start with.
245         * @since ostermillerutils 1.00.00
246         */
247        public CSVParser( Reader in, String escapes, String replacements, String commentDelims )
248        {
249            inReader = in;
250            lexer = new CSVLexer( in );
251            setEscapes( escapes, replacements );
252            setCommentStart( commentDelims );
253        }
254    
255        /**
256         * Close any stream upon which this parser is based.
257         *
258         * @since ostermillerutils 1.02.22
259         * @throws IOException if an error occurs while closing the stream.
260         */
261        public void close()
262            throws IOException
263        {
264            if ( inStream != null )
265                inStream.close();
266            if ( inReader != null )
267                inReader.close();
268        }
269    
270        /**
271         * get the next value.
272         *
273         * @return the next value or null if there are no more values.
274         * @throws IOException if an error occurs while reading.
275         * @since ostermillerutils 1.00.00
276         */
277        public String nextValue()
278            throws IOException
279        {
280            if ( tokenCache == null )
281            {
282                tokenCache = lexer.getNextToken();
283                lineCache = lexer.getLineNumber();
284            }
285            lastLine = lineCache;
286            String result = tokenCache;
287            tokenCache = null;
288            return result;
289        }
290    
291        /**
292         * Get the line number that the last token came from.
293         * <p>
294         * New line breaks that occur in the middle of a token are no counted in the line number count.
295         *
296         * @return line number or -1 if no tokens have been returned yet.
297         * @since ostermillerutils 1.00.00
298         */
299        public int lastLineNumber()
300        {
301            return lastLine;
302        }
303    
304        /**
305         * Get all the values from a line.
306         * <p>
307         * If the line has already been partially read, only the values that have not already been read will be included.
308         *
309         * @return all the values from the line or null if there are no more values.
310         * @throws IOException if an error occurs while reading.
311         * @since ostermillerutils 1.00.00
312         */
313        public String[] getLine()
314            throws IOException
315        {
316            int lineNumber = -1;
317            ArrayList v = new ArrayList();
318            if ( tokenCache != null )
319            {
320                v.add( tokenCache );
321                lineNumber = lineCache;
322            }
323            while ( ( tokenCache = lexer.getNextToken() ) != null
324                && ( lineNumber == -1 || lexer.getLineNumber() == lineNumber ) )
325            {
326                v.add( tokenCache );
327                lineNumber = lexer.getLineNumber();
328            }
329            if ( v.size() == 0 )
330            {
331                return null;
332            }
333            lastLine = lineNumber;
334            lineCache = lexer.getLineNumber();
335            String[] result = new String[v.size()];
336            return ( (String[]) v.toArray( result ) );
337        }
338    
339        /**
340         * Get all the values from the file.
341         * <p>
342         * If the file has already been partially read, only the values that have not already been read will be included.
343         * <p>
344         * Each line of the file that has at least one value will be represented. Comments and empty lines are ignored.
345         * <p>
346         * The resulting double array may be jagged.
347         *
348         * @return all the values from the file or null if there are no more values.
349         * @throws IOException if an error occurs while reading.
350         * @since ostermillerutils 1.00.00
351         */
352        public String[][] getAllValues()
353            throws IOException
354        {
355            ArrayList v = new ArrayList();
356            String[] line;
357            while ( ( line = getLine() ) != null )
358            {
359                v.add( line );
360            }
361            if ( v.size() == 0 )
362            {
363                return null;
364            }
365            String[][] result = new String[v.size()][];
366            return ( (String[][]) v.toArray( result ) );
367        }
368    
369        /**
370         * Specify escape sequences and their replacements. Escape sequences set here are in addition to \\ and \". \\ and
371         * \" are always valid escape sequences. This method allows standard escape sequenced to be used. For example "\n"
372         * can be set to be a newline rather than an 'n'. A common way to call this method might be:<br>
373         * <code>setEscapes("nrtf", "\n\r\t\f");</code><br>
374         * which would set the escape sequences to be the Java escape sequences. Characters that follow a \ that are not
375         * escape sequences will still be interpreted as that character.<br>
376         * The two arguments to this method must be the same length. If they are not, the longer of the two will be
377         * truncated.
378         *
379         * @param escapes a list of characters that will represent escape sequences.
380         * @param replacements the list of replacement characters for those escape sequences.
381         * @since ostermillerutils 1.00.00
382         */
383        public void setEscapes( String escapes, String replacements )
384        {
385            lexer.setEscapes( escapes, replacements );
386        }
387    
388        /**
389         * Change this parser so that it uses a new delimiter.
390         * <p>
391         * The initial character is a comma, the delimiter cannot be changed to a quote or other character that has special
392         * meaning in CSV.
393         *
394         * @param newDelim delimiter to which to switch.
395         * @throws BadDelimiterException if the character cannot be used as a delimiter.
396         * @since ostermillerutils 1.02.08
397         */
398        public void changeDelimiter( char newDelim )
399            throws BadDelimiterException
400        {
401            lexer.changeDelimiter( newDelim );
402        }
403    
404        /**
405         * Change this parser so that it uses a new character for quoting.
406         * <p>
407         * The initial character is a double quote ("), the delimiter cannot be changed to a comma or other character that
408         * has special meaning in CSV.
409         *
410         * @param newQuote character to use for quoting.
411         * @throws BadQuoteException if the character cannot be used as a quote.
412         * @since ostermillerutils 1.02.16
413         */
414        public void changeQuote( char newQuote )
415            throws BadQuoteException
416        {
417            lexer.changeQuote( newQuote );
418        }
419    
420        /**
421         * Set the characters that indicate a comment at the beginning of the line. For example if the string "#;!" were
422         * passed in, all of the following lines would be comments:<br>
423         *
424         * <pre>
425         * # Comment
426         * ; Another Comment
427         * ! Yet another comment
428         * </pre>
429         *
430         * By default there are no comments in CVS files. Commas and quotes may not be used to indicate comment lines.
431         *
432         * @param commentDelims list of characters a comment line may start with.
433         * @since ostermillerutils 1.00.00
434         */
435        public void setCommentStart( String commentDelims )
436        {
437            lexer.setCommentStart( commentDelims );
438        }
439    
440        /**
441         * Get the number of the line from which the last value was retrieved.
442         *
443         * @return line number or -1 if no tokens have been returned.
444         * @since ostermillerutils 1.00.00
445         */
446        public int getLastLineNumber()
447        {
448            return lastLine;
449        }
450    
451        /**
452         * Get the number of chars that have been read from the beginning.
453         *
454         * @since added in LogDistiller
455         */
456        public int getLastCharCount()
457        {
458            return lexer.yychar();
459        }
460    }