How to Parse Big (50 GB) XML Files in Java
Currently I'm trying to use a SAX parser, but about 3/4 of the way through the file it just freezes up. I have tried allocating more memory, etc., but I'm not getting any improvements.

Is there any way to speed this up? Or a better method?

I've stripped it down to the bare bones and have the following code; when running it from the command line it still doesn't go as fast as I'd like.

Running it with "java -Xms4096m -Xmx8192m -jar reader.jar" I get a "GC overhead limit exceeded" error around article 700,000.
Main:

public class Read {
    public static void main(String[] args) {
        ArrayList<Page> pages = XMLManager.getPages();
    }
}

XMLManager:
public class XMLManager {

    public static ArrayList<Page> getPages() {
        ArrayList<Page> pages = null;
        SAXParserFactory factory = SAXParserFactory.newInstance();
        try {
            SAXParser parser = factory.newSAXParser();
            File file = new File("..\\enwiki-20140811-pages-articles.xml");
            PageHandler pageHandler = new PageHandler();
            parser.parse(file, pageHandler);
            pages = pageHandler.getPages();
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return pages;
    }
}

PageHandler:
public class PageHandler extends DefaultHandler {

    private ArrayList<Page> pages = new ArrayList<>();
    private Page page;
    private StringBuilder stringBuilder;
    private boolean idSet = false;

    public PageHandler() {
        super();
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        stringBuilder = new StringBuilder();
        if (qName.equals("page")) {
            page = new Page();
            idSet = false;
        } else if (qName.equals("redirect")) {
            if (page != null) {
                page.setRedirecting(true);
            }
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (page != null && !page.isRedirecting()) {
            if (qName.equals("title")) {
                page.setTitle(stringBuilder.toString());
            } else if (qName.equals("id")) {
                if (!idSet) {
                    page.setId(Integer.parseInt(stringBuilder.toString()));
                    idSet = true;
                }
            } else if (qName.equals("text")) {
                String articleText = stringBuilder.toString();
                articleText = articleText.replaceAll("(?s)<ref(.+?)</ref>", " ");    // remove references
                articleText = articleText.replaceAll("(?s)\\{\\{(.+?)\\}\\}", " ");  // remove links underneath headings
                articleText = articleText.replaceAll("(?s)==See also==.+", " ");     // remove everything after "See also"
                articleText = articleText.replaceAll("\\|", " ");                    // separate multiple links
                articleText = articleText.replaceAll("\\n", " ");                    // remove new lines
                articleText = articleText.replaceAll("[^a-zA-Z0-9- \\s]", " ");      // remove non-alphanumeric characters except dashes and spaces
                articleText = articleText.trim().replaceAll(" +", " ");              // collapse multiple spaces into one
                Pattern pattern = Pattern.compile("([\\S]+\\s*){1,75}");             // get the first 75 words of the text
                Matcher matcher = pattern.matcher(articleText);
                matcher.find();
                try {
                    page.setSummaryText(matcher.group());
                } catch (IllegalStateException se) {
                    page.setSummaryText("none");
                }
                page.setText(articleText);
            } else if (qName.equals("page")) {
                pages.add(page);
                page = null;
            }
        } else {
            page = null;
        }
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        stringBuilder.append(ch, start, length);
    }

    public ArrayList<Page> getPages() {
        return pages;
    }
}
Your parsing code is working fine; the problem is that the volume of data you're loading is simply too large to hold in memory in an ArrayList.

You need some sort of pipeline that passes the data on to its actual destination without ever storing it all in memory at once.

What I've done in that sort of situation is similar to the following.

Create an interface for processing a single element:
public interface PageProcessor {
    void process(Page page);
}

Supply an implementation to the PageHandler through its constructor:
public class Read {
    public static void main(String[] args) {
        XMLManager.load(new PageProcessor() {
            @Override
            public void process(Page page) {
                // You'll want to do something other than just printing,
                // but I don't know what that is...
                System.out.println(page);
            }
        });
    }
}

public class XMLManager {

    public static void load(PageProcessor processor) {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        try {
            SAXParser parser = factory.newSAXParser();
            File file = new File("pages-articles.xml");
            PageHandler pageHandler = new PageHandler(processor);
            parser.parse(file, pageHandler);
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

And send the data to the processor instead of adding it to the list:
public class PageHandler extends DefaultHandler {

    private final PageProcessor processor;
    private Page page;
    private StringBuilder stringBuilder;
    private boolean idSet = false;

    public PageHandler(PageProcessor processor) {
        this.processor = processor;
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        // Unchanged implementation
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        // Unchanged implementation
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        // ... elided code that doesn't need to change ...
            } else if (qName.equals("page")) {
                processor.process(page);   // hand the page off instead of adding it to a list
                page = null;
            }
        } else {
            page = null;
        }
    }
}

Of course, you can make the interface handle chunks of multiple records rather than one at a time, and have the PageHandler collect pages locally in a smaller list, periodically send that list off for processing, and clear the list (see the sketch right after this paragraph).
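A minimal sketch of that chunked variant, assuming a hypothetical PageChunkProcessor interface and an arbitrary chunk size of 1,000 (both made up for illustration); only the parts of the handler that change are shown, the rest stays as above:

import java.util.ArrayList;
import java.util.List;

import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

// Hypothetical chunk-oriented interface, not part of the answer above.
interface PageChunkProcessor {
    void process(List<Page> chunk);
}

public class ChunkingPageHandler extends DefaultHandler {

    private static final int CHUNK_SIZE = 1000;   // arbitrary batch size for illustration

    private final PageChunkProcessor processor;
    private final List<Page> buffer = new ArrayList<>();

    public ChunkingPageHandler(PageChunkProcessor processor) {
        this.processor = processor;
    }

    // startElement, characters and the rest of endElement stay exactly as before;
    // only the branch that used to call processor.process(page) changes:
    //
    //     } else if (qName.equals("page")) {
    //         buffer.add(page);
    //         if (buffer.size() >= CHUNK_SIZE) {
    //             flush();
    //         }
    //         page = null;
    //     }

    @Override
    public void endDocument() throws SAXException {
        flush();   // forward the final partial chunk when parsing finishes
    }

    private void flush() {
        if (!buffer.isEmpty()) {
            processor.process(new ArrayList<>(buffer));   // hand off a copy of the chunk
            buffer.clear();
        }
    }
}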
Or (perhaps better) implement the PageProcessor interface defined here and build the logic in there that buffers the data and sends it on for further handling in chunks; a sketch of that follows as well.
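For example, a minimal sketch of that buffering idea, keeping the handler unchanged and doing the batching in a PageProcessor implementation. It assumes Java 8+; the chunk size and the downstream Consumer are made-up illustrations, not part of the original answer:

import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;

// Collects single pages and forwards them downstream in chunks.
public class BufferingPageProcessor implements PageProcessor {

    private final int chunkSize;
    private final Consumer<List<Page>> downstream;
    private final List<Page> buffer = new ArrayList<>();

    public BufferingPageProcessor(int chunkSize, Consumer<List<Page>> downstream) {
        this.chunkSize = chunkSize;
        this.downstream = downstream;
    }

    @Override
    public void process(Page page) {
        buffer.add(page);
        if (buffer.size() >= chunkSize) {
            flush();
        }
    }

    // Call this once after parsing completes so the last partial chunk isn't lost.
    public void flush() {
        if (!buffer.isEmpty()) {
            downstream.accept(new ArrayList<>(buffer));
            buffer.clear();
        }
    }
}

Usage would then look something like this:

BufferingPageProcessor processor = new BufferingPageProcessor(1000,
        chunk -> System.out.println("Handling a chunk of " + chunk.size() + " pages"));
XMLManager.load(processor);
processor.flush();   // push out whatever is left in the buffer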