001 /* 002 * Read files in comma separated value format. 003 * Copyright (C) 2001-2004 Stephen Ostermiller 004 * http://ostermiller.org/contact.pl?regarding=Java+Utilities 005 * 006 * This program is free software; you can redistribute it and/or modify 007 * it under the terms of the GNU General Public License as published by 008 * the Free Software Foundation; either version 2 of the License, or 009 * (at your option) any later version. 010 * 011 * This program is distributed in the hope that it will be useful, 012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 014 * GNU General Public License for more details. 015 * 016 * See COPYING.TXT for details. 017 */ 018 019 //package com.Ostermiller.util; 020 package net.sf.logdistiller.util.csv; 021 022 import java.io.*; 023 import java.util.*; 024 025 /** 026 * Read files in comma separated value format. More information about this class is available from <a target="_top" 027 * href= "http://ostermiller.org/utils/CSVLexer.html">ostermiller.org</a>. CSV is a file format used as a portable 028 * representation of a database. Each line is one entry or record and the fields in a record are separated by commas. 029 * Commas may be preceded or followed by arbitrary space and/or tab characters which are ignored. 030 * <P> 031 * If field includes a comma or a new line, the whole field must be surrounded with double quotes. When the field is in 032 * quotes, any quote literals must be escaped by \" Backslash literals must be escaped by \\. Otherwise a backslash and 033 * the character following will be treated as the following character, IE. "\n" is equivalent to "n". Other escape 034 * sequences may be set using the setEscapes() method. Text that comes after quotes that have been closed but come 035 * before the next comma will be ignored. 036 * <P> 037 * Empty fields are returned as as String of length zero: "". The following line has three empty fields and three 038 * non-empty fields in it. There is an empty field on each end, and one in the middle. One token is returned as a space. 039 * <br> 040 * 041 * <pre> 042 * ,second,," ",fifth, 043 * </pre> 044 * <P> 045 * Blank lines are always ignored. Other lines will be ignored if they start with a comment character as set by the 046 * setCommentStart() method. 047 * <P> 048 * An example of how CVSLexer might be used: 049 * 050 * <pre> 051 * CSVParser shredder = new CSVParser( System.in ); 052 * shredder.setCommentStart( "#;!" ); 053 * shredder.setEscapes( "nrtf", "\n\r\t\f" ); 054 * String t; 055 * while ( ( t = shredder.nextValue() ) != null ) 056 * { 057 * System.out.println( "" + shredder.lastLineNumber() + " " + t ); 058 * } 059 * </pre> 060 * <P> 061 * Some applications do not output CSV according to the generally accepted standards and this parse may not be able to 062 * handle it. One such application is the Microsoft Excel spreadsheet. A separate class must be use to read <a 063 * href="http://ostermiller.org/utils/ExcelCSV.html">Excel CSV</a>. 064 * 065 * @see com.Ostermiller.util.ExcelCSVParser 066 * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities 067 * @since ostermillerutils 1.00.00 068 */ 069 public class CSVParser 070 implements CSVParse 071 { 072 073 /** 074 * InputStream on which this parser is based. 075 * 076 * @since ostermillerutils 1.02.22 077 */ 078 private InputStream inStream; 079 080 /** 081 * Reader on which this parser is based. 082 * 083 * @since ostermillerutils 1.02.22 084 */ 085 private Reader inReader; 086 087 /** 088 * Does all the dirty work. Calls for new tokens are routed through this object. 089 * 090 * @since ostermillerutils 1.00.00 091 */ 092 private CSVLexer lexer; 093 094 /** 095 * Token cache. Used for when we request a token from the lexer but can't return it because its on the next line. 096 * 097 * @since ostermillerutils 1.00.00 098 */ 099 private String tokenCache; 100 101 /** 102 * Line cache. The line number that goes along with the tokenCache. Not valid if the tokenCache is null. 103 * 104 * @since ostermillerutils 1.00.00 105 */ 106 private int lineCache; 107 108 /** 109 * The line number the last token came from, or -1 if no tokens have been returned. 110 * 111 * @since ostermillerutils 1.00.00 112 */ 113 private int lastLine = -1; 114 115 /** 116 * Create a parser to parse comma separated values from an InputStream. 117 * <p> 118 * Byte to character conversion is done using the platform default locale. 119 * 120 * @param in stream that contains comma separated values. 121 * @since ostermillerutils 1.00.00 122 */ 123 public CSVParser( InputStream in ) 124 { 125 inStream = in; 126 lexer = new CSVLexer( in ); 127 } 128 129 /** 130 * Create a parser to parse delimited values from an InputStream. 131 * <p> 132 * Byte to character conversion is done using the platform default locale. 133 * 134 * @param in stream that contains comma separated values. 135 * @param delimiter record separator 136 * @throws BadDelimiterException if the specified delimiter cannot be used 137 * @since ostermillerutils 1.02.24 138 */ 139 public CSVParser( InputStream in, char delimiter ) 140 throws BadDelimiterException 141 { 142 inStream = in; 143 lexer = new CSVLexer( in ); 144 changeDelimiter( delimiter ); 145 } 146 147 /** 148 * Create a parser to parse comma separated values from a Reader. 149 * 150 * @param in reader that contains comma separated values. 151 * @since ostermillerutils 1.00.00 152 */ 153 public CSVParser( Reader in ) 154 { 155 inReader = in; 156 lexer = new CSVLexer( in ); 157 } 158 159 /** 160 * Create a parser to parse delimited values from a Reader. 161 * 162 * @param in reader that contains comma separated values. 163 * @param delimiter record separator 164 * @throws BadDelimiterException if the specified delimiter cannot be used 165 * @since ostermillerutils 1.02.24 166 */ 167 public CSVParser( Reader in, char delimiter ) 168 throws BadDelimiterException 169 { 170 inReader = in; 171 lexer = new CSVLexer( in ); 172 changeDelimiter( delimiter ); 173 } 174 175 /** 176 * Create a parser to parse delimited values from an InputStream. 177 * <p> 178 * Byte to character conversion is done using the platform default locale. 179 * 180 * @param in stream that contains comma separated values. 181 * @param escapes a list of characters that will represent escape sequences. 182 * @param replacements the list of replacement characters for those escape sequences. 183 * @param commentDelims list of characters a comment line may start with. 184 * @param delimiter record separator 185 * @throws BadDelimiterException if the specified delimiter cannot be used 186 * @since ostermillerutils 1.02.24 187 */ 188 public CSVParser( InputStream in, char delimiter, String escapes, String replacements, String commentDelims ) 189 throws BadDelimiterException 190 { 191 inStream = in; 192 lexer = new CSVLexer( in ); 193 setEscapes( escapes, replacements ); 194 setCommentStart( commentDelims ); 195 changeDelimiter( delimiter ); 196 } 197 198 /** 199 * Create a parser to parse comma separated values from an InputStream. 200 * <p> 201 * Byte to character conversion is done using the platform default locale. 202 * 203 * @param in stream that contains comma separated values. 204 * @param escapes a list of characters that will represent escape sequences. 205 * @param replacements the list of replacement characters for those escape sequences. 206 * @param commentDelims list of characters a comment line may start with. 207 * @since ostermillerutils 1.00.00 208 */ 209 public CSVParser( InputStream in, String escapes, String replacements, String commentDelims ) 210 { 211 inStream = in; 212 lexer = new CSVLexer( in ); 213 setEscapes( escapes, replacements ); 214 setCommentStart( commentDelims ); 215 } 216 217 /** 218 * Create a parser to parse delimited values from a Reader. 219 * 220 * @param in reader that contains comma separated values. 221 * @param escapes a list of characters that will represent escape sequences. 222 * @param replacements the list of replacement characters for those escape sequences. 223 * @param commentDelims list of characters a comment line may start with. 224 * @param delimiter record separator 225 * @throws BadDelimiterException if the specified delimiter cannot be used 226 * @since ostermillerutils 1.02.24 227 */ 228 public CSVParser( Reader in, char delimiter, String escapes, String replacements, String commentDelims ) 229 throws BadDelimiterException 230 { 231 inReader = in; 232 lexer = new CSVLexer( in ); 233 setEscapes( escapes, replacements ); 234 setCommentStart( commentDelims ); 235 changeDelimiter( delimiter ); 236 } 237 238 /** 239 * Create a parser to parse comma separated values from a Reader. 240 * 241 * @param in reader that contains comma separated values. 242 * @param escapes a list of characters that will represent escape sequences. 243 * @param replacements the list of replacement characters for those escape sequences. 244 * @param commentDelims list of characters a comment line may start with. 245 * @since ostermillerutils 1.00.00 246 */ 247 public CSVParser( Reader in, String escapes, String replacements, String commentDelims ) 248 { 249 inReader = in; 250 lexer = new CSVLexer( in ); 251 setEscapes( escapes, replacements ); 252 setCommentStart( commentDelims ); 253 } 254 255 /** 256 * Close any stream upon which this parser is based. 257 * 258 * @since ostermillerutils 1.02.22 259 * @throws IOException if an error occurs while closing the stream. 260 */ 261 public void close() 262 throws IOException 263 { 264 if ( inStream != null ) 265 inStream.close(); 266 if ( inReader != null ) 267 inReader.close(); 268 } 269 270 /** 271 * get the next value. 272 * 273 * @return the next value or null if there are no more values. 274 * @throws IOException if an error occurs while reading. 275 * @since ostermillerutils 1.00.00 276 */ 277 public String nextValue() 278 throws IOException 279 { 280 if ( tokenCache == null ) 281 { 282 tokenCache = lexer.getNextToken(); 283 lineCache = lexer.getLineNumber(); 284 } 285 lastLine = lineCache; 286 String result = tokenCache; 287 tokenCache = null; 288 return result; 289 } 290 291 /** 292 * Get the line number that the last token came from. 293 * <p> 294 * New line breaks that occur in the middle of a token are no counted in the line number count. 295 * 296 * @return line number or -1 if no tokens have been returned yet. 297 * @since ostermillerutils 1.00.00 298 */ 299 public int lastLineNumber() 300 { 301 return lastLine; 302 } 303 304 /** 305 * Get all the values from a line. 306 * <p> 307 * If the line has already been partially read, only the values that have not already been read will be included. 308 * 309 * @return all the values from the line or null if there are no more values. 310 * @throws IOException if an error occurs while reading. 311 * @since ostermillerutils 1.00.00 312 */ 313 public String[] getLine() 314 throws IOException 315 { 316 int lineNumber = -1; 317 ArrayList v = new ArrayList(); 318 if ( tokenCache != null ) 319 { 320 v.add( tokenCache ); 321 lineNumber = lineCache; 322 } 323 while ( ( tokenCache = lexer.getNextToken() ) != null 324 && ( lineNumber == -1 || lexer.getLineNumber() == lineNumber ) ) 325 { 326 v.add( tokenCache ); 327 lineNumber = lexer.getLineNumber(); 328 } 329 if ( v.size() == 0 ) 330 { 331 return null; 332 } 333 lastLine = lineNumber; 334 lineCache = lexer.getLineNumber(); 335 String[] result = new String[v.size()]; 336 return ( (String[]) v.toArray( result ) ); 337 } 338 339 /** 340 * Get all the values from the file. 341 * <p> 342 * If the file has already been partially read, only the values that have not already been read will be included. 343 * <p> 344 * Each line of the file that has at least one value will be represented. Comments and empty lines are ignored. 345 * <p> 346 * The resulting double array may be jagged. 347 * 348 * @return all the values from the file or null if there are no more values. 349 * @throws IOException if an error occurs while reading. 350 * @since ostermillerutils 1.00.00 351 */ 352 public String[][] getAllValues() 353 throws IOException 354 { 355 ArrayList v = new ArrayList(); 356 String[] line; 357 while ( ( line = getLine() ) != null ) 358 { 359 v.add( line ); 360 } 361 if ( v.size() == 0 ) 362 { 363 return null; 364 } 365 String[][] result = new String[v.size()][]; 366 return ( (String[][]) v.toArray( result ) ); 367 } 368 369 /** 370 * Specify escape sequences and their replacements. Escape sequences set here are in addition to \\ and \". \\ and 371 * \" are always valid escape sequences. This method allows standard escape sequenced to be used. For example "\n" 372 * can be set to be a newline rather than an 'n'. A common way to call this method might be:<br> 373 * <code>setEscapes("nrtf", "\n\r\t\f");</code><br> 374 * which would set the escape sequences to be the Java escape sequences. Characters that follow a \ that are not 375 * escape sequences will still be interpreted as that character.<br> 376 * The two arguments to this method must be the same length. If they are not, the longer of the two will be 377 * truncated. 378 * 379 * @param escapes a list of characters that will represent escape sequences. 380 * @param replacements the list of replacement characters for those escape sequences. 381 * @since ostermillerutils 1.00.00 382 */ 383 public void setEscapes( String escapes, String replacements ) 384 { 385 lexer.setEscapes( escapes, replacements ); 386 } 387 388 /** 389 * Change this parser so that it uses a new delimiter. 390 * <p> 391 * The initial character is a comma, the delimiter cannot be changed to a quote or other character that has special 392 * meaning in CSV. 393 * 394 * @param newDelim delimiter to which to switch. 395 * @throws BadDelimiterException if the character cannot be used as a delimiter. 396 * @since ostermillerutils 1.02.08 397 */ 398 public void changeDelimiter( char newDelim ) 399 throws BadDelimiterException 400 { 401 lexer.changeDelimiter( newDelim ); 402 } 403 404 /** 405 * Change this parser so that it uses a new character for quoting. 406 * <p> 407 * The initial character is a double quote ("), the delimiter cannot be changed to a comma or other character that 408 * has special meaning in CSV. 409 * 410 * @param newQuote character to use for quoting. 411 * @throws BadQuoteException if the character cannot be used as a quote. 412 * @since ostermillerutils 1.02.16 413 */ 414 public void changeQuote( char newQuote ) 415 throws BadQuoteException 416 { 417 lexer.changeQuote( newQuote ); 418 } 419 420 /** 421 * Set the characters that indicate a comment at the beginning of the line. For example if the string "#;!" were 422 * passed in, all of the following lines would be comments:<br> 423 * 424 * <pre> 425 * # Comment 426 * ; Another Comment 427 * ! Yet another comment 428 * </pre> 429 * 430 * By default there are no comments in CVS files. Commas and quotes may not be used to indicate comment lines. 431 * 432 * @param commentDelims list of characters a comment line may start with. 433 * @since ostermillerutils 1.00.00 434 */ 435 public void setCommentStart( String commentDelims ) 436 { 437 lexer.setCommentStart( commentDelims ); 438 } 439 440 /** 441 * Get the number of the line from which the last value was retrieved. 442 * 443 * @return line number or -1 if no tokens have been returned. 444 * @since ostermillerutils 1.00.00 445 */ 446 public int getLastLineNumber() 447 { 448 return lastLine; 449 } 450 451 /** 452 * Get the number of chars that have been read from the beginning. 453 * 454 * @since added in LogDistiller 455 */ 456 public int getLastCharCount() 457 { 458 return lexer.yychar(); 459 } 460 }