【发布时间】:2014-07-22 14:21:17
【问题描述】:
我有一个 35 GB 的 XML 文件(是的,有些组织会这样做,但我无法控制它),我想对它进行 SAX 解析。我在这里找到了一个例子:
http://www.java2s.com/Code/Java/XML/SAXDemo.htm
关于如何运行 SAX 解析器并避免加载所有内容。但是,我立即收到内存不足错误。为什么会发生这种情况?如何让这段代码完美地适应任何 XML 文件大小?
这是我的代码:
import org.apache.log4j.Logger;
import org.xml.sax.AttributeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
public class XMLSAXTools extends org.xml.sax.helpers.DefaultHandler {
/**
* Logging facility
*/
static Logger logger = Logger.getLogger(XMLSAXTools.class);
private String fileName = "C:/Data/hugefile.xml";
private int counter = 0;
/** The main method sets things up for parsing */
public void test() throws IOException, SAXException,
ParserConfigurationException {
// Create a JAXP "parser factory" for creating SAX parsers
javax.xml.parsers.SAXParserFactory spf = SAXParserFactory.newInstance();
// Configure the parser factory for the type of parsers we require
spf.setValidating(false); // No validation required
// Now use the parser factory to create a SAXParser object
// Note that SAXParser is a JAXP class, not a SAX class
javax.xml.parsers.SAXParser sp = spf.newSAXParser();
// Create a SAX input source for the file argument
org.xml.sax.InputSource input = new InputSource(new FileReader(fileName));
// Give the InputSource an absolute URL for the file, so that
// it can resolve relative URLs in a <!DOCTYPE> declaration, e.g.
input.setSystemId("file://" + new File(fileName).getAbsolutePath());
// Create an instance of this class; it defines all the handler methods
XMLSAXTools handler = new XMLSAXTools();
// Finally, tell the parser to parse the input and notify the handler
sp.parse(input, handler);
// Instead of using the SAXParser.parse() method, which is part of the
// JAXP API, we could also use the SAX1 API directly. Note the
// difference between the JAXP class javax.xml.parsers.SAXParser and
// the SAX1 class org.xml.sax.Parser
//
// org.xml.sax.Parser parser = sp.getParser(); // Get the SAX parser
// parser.setDocumentHandler(handler); // Set main handler
// parser.setErrorHandler(handler); // Set error handler
// parser.parse(input); // Parse!
}
StringBuffer accumulator = new StringBuffer(); // Accumulate parsed text
String servletName; // The name of the servlet
String servletClass; // The class name of the servlet
String servletId; // Value of id attribute of <servlet> tag
// When the parser encounters plain text (not XML elements), it calls
// this method, which accumulates them in a string buffer
public void characters(char[] buffer, int start, int length) {
accumulator.append(buffer, start, length);
}
// Every time the parser encounters the beginning of a new element, it
// calls this method, which resets the string buffer
public void startElement(String name, AttributeList attributes) {
accumulator.setLength(0); // Ready to accumulate new text
if (name.equals("item")) {
logger.info("item tag opened");
counter++;
}
}
// When the parser encounters the end of an element, it calls this method
public void endElement(String name) {
if (name.equals("item")) {
logger.info("item tag closed. Counter: " + counter);
}
}
/** This method is called when warnings occur */
public void warning(SAXParseException exception) {
System.err.println("WARNING: line " + exception.getLineNumber() + ": "
+ exception.getMessage());
}
/** This method is called when errors occur */
public void error(SAXParseException exception) {
System.err.println("ERROR: line " + exception.getLineNumber() + ": "
+ exception.getMessage());
}
/** This method is called when non-recoverable errors occur. */
public void fatalError(SAXParseException exception) throws SAXException {
System.err.println("FATAL: line " + exception.getLineNumber() + ": "
+ exception.getMessage());
throw (exception);
}
public static void main(String[] args){
XMLSAXTools t = new XMLSAXTools();
try {
t.test();
} catch (Exception e){
logger.error("Exception in XMLSAXTools: " + e.getMessage());
e.printStackTrace();
}
}
}
【问题讨论】:
-
您可能需要检查 StAX 解析。 docs.oracle.com/cd/E17802_01/webservices/webservices/docs/1.6/…
-
对于 35GB 的文件,我也认为是 StAX。试试这个答案有更多信息:stackoverflow.com/a/9382164/458008