SAX-based XML pretty printer
Russell Bateman |
--because I'm tired of wrestling with the broken org.dom4j stuff, which can't handle problems such as missing namespace declarations in the in-coming XML and which, besides, is far more memory-heavy than SAX.
package com.windofkeltia.prettyprint;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * Pretty-prints an XML document by running it through a SAX parser whose
 * events are rendered, one indented line at a time, by {@link Handler}.
 *
 * Instances are created through {@link Builder}:
 *
 *   String pretty = new PrettyPrintXml.Builder()
 *                         .inputStream( xml )
 *                         .tabWidth( "    " )
 *                         .build()
 *                         .parse();
 */
public class PrettyPrintXml
{
  /*
   * Created with the JAXP defaults; no parser feature is set on it.
   * NOTE(review): if documents can come from an untrusted source, consider
   * disabling DTD and external-entity processing on this factory.
   */
  private static final SAXParserFactory factory = SAXParserFactory.newInstance();

  private final SAXParser   parser;
  private final Handler     handler;
  private final InputSource source;
  private final String      tabWidth;

  /**
   * Create the SAX parser and the output handler from what the builder collected.
   * The builder's tab string is handed to the handler so that
   * {@link Builder#tabWidth(String)} actually controls the indentation.
   *
   * @param builder carries the XML input and the indentation string.
   * @throws ParserConfigurationException if no SAX parser can be created.
   * @throws SAXException for SAX errors raised while creating the parser.
   */
  protected PrettyPrintXml( Builder builder ) throws ParserConfigurationException, SAXException
  {
    source   = builder.source;
    tabWidth = builder.tabWidth;
    parser   = factory.newSAXParser();
    handler  = new Handler();
    handler.setTab( tabWidth );
  }

  /**
   * Execute the parsing and generate the output.
   *
   * The handler's byte buffer is turned back into a string with the platform
   * default charset, which is also what the handler uses to fill it.
   *
   * @return the indented rendering of the document.
   * @throws IOException on failure to read the input.
   * @throws SAXException if the input is not well-formed XML.
   * @throws IllegalArgumentException if no input was given to the builder.
   */
  public String parse() throws IOException, SAXException
  {
    parser.parse( source, handler );
    return handler.getOutput().toString();
  }

  /** Collects the XML input and the formatting options for a {@link PrettyPrintXml}. */
  public static class Builder
  {
    private InputSource source;
    private String      tabWidth = "  ";   // two spaces, the same default the handler documents

    /**
     * Use XML held in a string. The text is given to the parser as a character
     * stream, so it never passes through the platform charset and an encoding
     * declaration inside the document cannot disagree with the bytes parsed.
     */
    public Builder inputStream( final String input )
    {
      this.source = new InputSource( new StringReader( input ) );
      return this;
    }

    /** Use XML held in a byte array; the parser detects the encoding itself. */
    public Builder inputStream( final byte[] input )
    {
      this.source = new InputSource( new ByteArrayInputStream( input ) );
      return this;
    }

    /** Use XML read from a stream; the parser detects the encoding itself. */
    public Builder inputStream( InputStream inputStream )
    {
      // a null stream stays "no input" so that parse() rejects it as it always did
      this.source = ( inputStream == null ) ? null : new InputSource( inputStream );
      return this;
    }

    /**
     * Set the string written once per nesting level.
     *
     * @param tabWidth the indentation unit; null leaves the current setting alone.
     */
    public Builder tabWidth ( String tabWidth )
    {
      if( tabWidth != null )
        this.tabWidth = tabWidth;
      return this;
    }

    /**
     * @return a pretty printer ready to {@link PrettyPrintXml#parse()}.
     * @throws ParserConfigurationException if no SAX parser can be created.
     * @throws SAXException for SAX errors raised while creating the parser.
     */
    public PrettyPrintXml build() throws ParserConfigurationException, SAXException
    {
      return new PrettyPrintXml( this );
    }
  }
}
package com.windofkeltia.prettyprint; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import static java.util.Objects.isNull; import static java.util.Objects.nonNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; import com.windofkeltia.utilities.StringUtilities; /** * This SAX-based printer requires no stack because * it prints as it goes. */ @SuppressWarnings( { "DanglingJavadoc" } ) public class Handler extends DefaultHandler { private static final Logger logger = LoggerFactory.getLogger( Handler.class ); private int indentation = 0; // indentation is a count (of) tabs private String tab = " "; // default width is two spaces private ByteArrayOutputStream output = new ByteArrayOutputStream(); public void setTab ( final String tab ) { this.tab = tab; } public ByteArrayOutputStream getOutput() { return output; } protected Handler() { super(); } public void startDocument() { } public void startElement( String uri, String localName, String elementName, Attributes attributes ) { StringBuilder sb = new StringBuilder(); sb.append( indent() ) .append( '<' ).append( elementName ) .append( attributesAsString( getAttributesAsMap( attributes ) ) ) .append( ">\n" ); write( sb.toString() ); indentation++; } public void endElement( String uri, String localName, String elementName ) { indentation--; StringBuilder sb = new StringBuilder(); sb.append( indent() ) .append( "</" ).append( elementName ) .append( ">\n" ); write( sb.toString() ); } public void characters( char[] ch, int start, int length ) { String characters = new String( ch, start, length ).trim(); if( !StringUtilities.isEmpty( characters ) ) doCharacters( characters ); } public void comment( char[] ch, int start, int 
length ) { String comment = new String( ch, start, length ); if( !StringUtilities.isEmpty( comment ) ) doCharacters( comment ); } public void doCharacters( String input ) { StringBuilder sb = new StringBuilder(); sb.append( indent() ) .append( input.trim() ) .append( '\n' ); write( sb.toString() ); } private String indent() { StringBuilder sb = new StringBuilder(); for( int level = 0; level < indentation; level++ ) sb.append( tab ); return sb.toString(); } private void write( byte[] bytes ) { try { output.write( bytes ); } catch( IOException e ) { throw new RuntimeException( e ); } } private void write( String string ) { try { output.write( string.getBytes() ); } catch( IOException e ) { throw new RuntimeException( e ); } } /** Do some post-processing of this handler's output to clean up stuff like: * * <!-- a change here: --> * <realmCode code="US"> → <realmCode code="US" /> * </realmCode> * * But, don't go wild and clean by folding the last two lines here: * * <!-- no change here: --> * <csmk:div csmk:class="demographic" npmk:nlp="off"> * <csmk:div csmk:class="extended_data" npmk:nlp="off"> * full_name: BEITEL, ELISE * </csmk:div> * </csmk:div> */ public void endDocument() { ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); ByteArrayInputStream inputStream = new ByteArrayInputStream( output.toByteArray() ); try { BufferedReader bufferedReader = new BufferedReader( new InputStreamReader( inputStream ) ); String firstLine = null, secondLine = null; while( true ) { try { if( isNull( firstLine ) ) firstLine = bufferedReader.readLine(); secondLine = bufferedReader.readLine(); if( isNull( secondLine ) ) { // end of the input stream... 
firstLine += '\n'; outputStream.write( firstLine.getBytes() ); lineNumber++; break; } int firstLineIndent = HandlerUtilities.countLeadingSpaces( firstLine ); int secondLineIndent = HandlerUtilities.countLeadingSpaces( secondLine ); boolean startsWithOpeningElement = HandlerUtilities.startWithOpeningElement( firstLine, secondLine ); if( firstLineIndent != secondLineIndent || !startsWithOpeningElement ) { // append "\n" to first line and copy it to the output as is... firstLine += '\n'; outputStream.write( firstLine.getBytes() ); lineNumber++; firstLine = secondLine; secondLine = null; continue; } List< String > firstList = HandlerUtilities.tokenize( firstLine.trim() ); List< String > secondList = HandlerUtilities.tokenize( secondLine.trim() ); String firstElement = firstList.get( 0 ); String secondElement = secondList.get( 0 ); if( firstElement.equals( secondElement ) ) { // special rebuild modifying first line and copying it to the output... firstLine = HandlerUtilities.indent( firstLine ) + "<" + firstElement + " />\n"; outputStream.write( firstLine.getBytes() ); lineNumber++; // skip second line; we're dropping it... firstLine = null; secondLine = null; } else { // append "\n" to first line and copy it to the output as is... firstLine += '\n'; outputStream.write( firstLine.getBytes() ); lineNumber++; firstLine = secondLine; } secondLine = null; } catch( Exception e ) { logger.warn( "Stopped I/O, failed tokenization and/or line folding: {}", e.getMessage() ); if( nonNull( firstLine ) ) { firstLine += '\n'; outputStream.write( firstLine.getBytes() ); } break; } } output = outputStream; } catch ( Exception e ) { logger.warn( "Failed stream I/O: {}", e.getMessage() ); } } /** * Here's how to make SAX attributes "Java-useful." If we had uri (namespaces) defined, * we'd have to get a lot more serious about how to use uri, localName and qName. 
*/ private Map< String, String > getAttributesAsMap( Attributes saxAttributes ) { int attrLength = saxAttributes.getLength(); Map< String, String > javaAttributes = new HashMap<>( attrLength ); for( int attr = 0; attr < attrLength; attr++ ) { String attribute = saxAttributes.getQName( attr ); String value = saxAttributes.getValue( attr ); javaAttributes.put( attribute, value ); } return javaAttributes; } /** * Format XML element attributes canonically for concatenating * to their element name. */ private String attributesAsString( Map< String, String > javaAttributes ) { if( javaAttributes.isEmpty() ) return ""; StringBuilder sb = new StringBuilder(); sb.append( ' ' ); for( Map.Entry< String, String > attribute : javaAttributes.entrySet() ) sb.append( attribute.getKey() ) .append( "=\"" ) .append( StringUtilities.smash( attribute.getValue() ) ) .append( "\", " ); sb.setLength( sb.length()-2 ); return sb.toString(); } }
package com.windofkeltia.prettyprint; import java.util.ArrayList; import java.util.List; import java.util.StringTokenizer; /** * These utilities are geared toward the particular output format * of the pretty-printer SAX parser and output generated. */ public class HandlerUtilities { private static final int MAXTOKENCOUNT = 2; private static final String XML_DELIMITERS = " <>/"; /** * Tokenize this line using the space character as delimiter. As, for * our purposes, we only need the first two tokens (we're looking for * '<' and an XML element name), don't spend any more time on this * than needed to accomplish that. This is an example of what we want * to solve: opening element with no characters (text) followed by * a closing element. * * <realmCode code="US"> → <realmCode code="US" /> * </realmCode> */ public static List< String > tokenize( String line ) { List< String > tokens = new ArrayList<>(); StringTokenizer tokenizer = new StringTokenizer( line, XML_DELIMITERS ); while( tokenizer.hasMoreElements() ) tokens.add( tokenizer.nextToken() ); return tokens; } /** * Allows us to short-circuit work above even though two consecutive * line might match once their leading (and trailing) spaces are * trimmed. Trimming will produce inaccurate matches because the * number of leading spaces (indentation) was different. E.g.: * * <csmk:div csmk:class="demographic" npmk:nlp="off"> * <csmk:div csmk:class="extended_data" npmk:nlp="off"> * full_name: BEITEL, ELISE * </csmk:div> * </csmk:div> */ public static int countLeadingSpaces( String line ) { int count = 0; for( char ch : line.toCharArray() ) { if( ch == ' ' ) { count++; continue; } break; } return count; } /** Count leading spaces and create indentation of the same amount. 
*/ public static String indent( String line ) { StringBuilder tab = new StringBuilder(); int width = countLeadingSpaces( line ); for( int ch = 0; ch < width; ch++ ) tab.append( ' ' ); return tab.toString(); } public static boolean startWithOpeningElement( String line1, String line2 ) { line1 = line1.trim(); line2 = line2.trim(); return( line1.startsWith( "<" ) && line2.startsWith( "<" ) ); } }
package com.windofkeltia.prettyprint; import java.io.IOException; import javax.xml.parsers.ParserConfigurationException; import org.xml.sax.SAXException; import org.junit.After; import org.junit.Before; import org.junit.Rule; import org.junit.Test; public class PrettyPrintXmlTest { @Test public void test() throws ParserConfigurationException, SAXException, IOException { final String CCD_BODY = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<record>\n" + " <justforfun>\n" + " ...\n" + " </justforfun>\n" + " <ccdmessage>\n" + "<ClinicalDocument xmlns=\"urn:hl7-org:v3\" xmlns:sdtc=\"urn:hl7-org:sdtc\"\n" + " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n" + " xsi:schemaLocation=\"urn:hl7-org:v3\n" + " http://xreg2.nist.gov:8080/hitspValidation/schema/cdar2c32/infrastructure/cda/C32_CDA.xsd\">\n" + " <realmCode code=\"US\"/>\n" + " <typeId root=\"2.16.840.1.113883.1.3\" extension=\"POCD_HD000040\"/>\n" + " <component>\n" + " <stuff>\n" + " ...\n" + " </stuff>\n" + " </component>\n" + "</ClinicalDocument>\n" + " </ccdmessage>\n" + "</record>\n"; PrettyPrintXml prettyPrintXml = new PrettyPrintXml.Builder().inputStream( CCD_BODY ).build(); String OUTPUT = prettyPrintXml.parse(); System.out.println( OUTPUT ); } @Test public void testWithNameSpaces() throws ParserConfigurationException, SAXException, IOException { final String NAMESPACE_CONTENT = "" + "<record>\n" + " <csmk:div csmk:class=\"demographic\" npmk:nlp=\"off\">\n" + " <csmk:div csmk:class=\"extended_data\" npmk:nlp=\"off\">\n" + " full_name: BEITEL, ELISE\n" + " </csmk:div>\n" + " </csmk:div>\n" + "</record>\n"; PrettyPrintXml prettyPrintXml = new PrettyPrintXml.Builder().inputStream( NAMESPACE_CONTENT ).build(); String OUTPUT = prettyPrintXml.parse(); System.out.println( OUTPUT ); } @Test public void testWithAdditionalWiggle() throws ParserConfigurationException, SAXException, IOException { final String WIGGLE = "" + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<test>test\n" + " <br />test\n" 
+ "</test>\n" + ""; PrettyPrintXml prettyPrintXml = new PrettyPrintXml.Builder().inputStream( WIGGLE ).build(); String OUTPUT = prettyPrintXml.parse(); System.out.println( OUTPUT ); } }