import java.io.*;
import java.net.URL;
import org.xml.sax.*;
import org.xml.sax.helpers.*;

/**
 * A utility class that parses a Comma Separated Values (CSV) file
 * and outputs its contents using SAX2 events. The format of CSV that
 * this class reads is identical to the export format for Microsoft
 * Excel. For simple values, the CSV file may look like this:
 * <pre>
 * a,b,c
 * d,e,f
 * </pre>
 * Quotes are used as delimiters when the values contain commas:
 * <pre>
 * a,"b,c",d
 * e,"f,g","h,i"
 * </pre>
* And double quotes are used when the values contain quotes. This parser
 * is smart enough to trim spaces around commas, as well.
 *
 * @author Eric M. Burke
 */
public class CSVXMLReader extends AbstractXMLReader {

    // an empty attribute for use with SAX
    private static final Attributes EMPTY_ATTR = new AttributesImpl();

    /**
     * Parse a CSV file. SAX events are delivered to the ContentHandler
     * that was registered via setContentHandler; if no handler is
     * registered the method returns without reading anything.
     *
     * <p>The emitted document has a single <code>csvFile</code> root
     * element containing one <code>line</code> element per non-blank
     * input line. Lines are trimmed before processing, and lines that
     * are empty after trimming are skipped.
     *
     * <p>The input is taken from the first available of: the character
     * stream, the byte stream, or the URL named by the system ID. Bytes
     * are decoded with <code>input.getEncoding()</code> when it is set,
     * otherwise with the platform default charset. Only a stream that
     * this method opened itself (from the system ID) is closed here;
     * streams supplied by the caller are left open.
     *
     * @param input the comma separated values file to parse.
     * @throws IOException if the input cannot be opened, decoded or read.
     * @throws SAXException if <code>input</code> provides no character
     *         stream, byte stream or system ID, or if the content
     *         handler raises one.
     */
    public void parse(InputSource input) throws IOException, SAXException {
        // if no handler is registered to receive events, don't bother
        // to parse the CSV file
        ContentHandler ch = getContentHandler();
        if (ch == null) {
            return;
        }

        // set only when we open the stream ourselves, so that we know
        // we are responsible for closing it
        InputStream openedHere = null;
        try {
            // convert the InputSource into a BufferedReader
            Reader source = input.getCharacterStream();
            if (source == null) {
                InputStream bytes = input.getByteStream();
                if (bytes == null) {
                    if (input.getSystemId() == null) {
                        throw new SAXException("Invalid InputSource object");
                    }
                    openedHere = new URL(input.getSystemId()).openStream();
                    bytes = openedHere;
                }
                // honor the encoding declared on the InputSource instead
                // of silently assuming the platform default
                String encoding = input.getEncoding();
                source = (encoding != null)
                        ? new InputStreamReader(bytes, encoding)
                        : new InputStreamReader(bytes);
            }
            BufferedReader br = new BufferedReader(source);

            ch.startDocument();

            // emit <csvFile>
            ch.startElement("", "csvFile", "csvFile", EMPTY_ATTR);

            // read each line of the file until EOF is reached
            String curLine = null;
            while ((curLine = br.readLine()) != null) {
                curLine = curLine.trim();
                if (curLine.length() > 0) {
                    // create the <line> element
                    ch.startElement("", "line", "line", EMPTY_ATTR);
                    // output data from this line
                    parseLine(curLine, ch);
                    // close the </line> element
                    ch.endElement("", "line", "line");
                }
            }

            // emit </csvFile>
            ch.endElement("", "csvFile", "csvFile");
            ch.endDocument();
        } finally {
            if (openedHere != null) {
                try {
                    openedHere.close();
                } catch (IOException ignored) {
                    // the stream was only read from; a failure to close
                    // it must not mask an exception thrown by the parse
                }
            }
        }
    }

    // Break an individual line into tokens.
This is a recursive function // that extracts the first token, then recursively parses the // remainder of the line. private void parseLine(String curLine, ContentHandler ch) throws IOException, SAXException { String firstToken = null; String remainderOfLine = null; int commaIndex = locateFirstDelimiter(curLine); if (commaIndex > -1) { firstToken = curLine.substring(0, commaIndex).trim(); remainderOfLine = curLine.substring(commaIndex+1).trim(); } else { // no commas, so the entire line is the token firstToken = curLine; } // remove redundant quotes firstToken = cleanupQuotes(firstToken); // emit the element // ch.startElement("","","value",EMPTY_ATTR); ch.startElement("","value","value",EMPTY_ATTR); ch.characters(firstToken.toCharArray(), 0, firstToken.length()); // ch.endElement("","","value"); ch.endElement("","value","value"); // recursively process the remainder of the line if (remainderOfLine != null) { parseLine(remainderOfLine, ch); } } // locate the position of the comma, taking into account that // a quoted token may contain ignorable commas. private int locateFirstDelimiter(String curLine) { if (curLine.startsWith("\"")) { boolean inQuote = true; int numChars = curLine.length(); for (int i=1; i