SAX-based XML pretty printer

Russell Bateman
May 2024
last update:




—because I'm tired of messing with the broken org.dom4j stuff, which can't handle missing namespaces, etc. in the incoming XML and which is, besides, super memory-heavy compared to SAX.

PrettyPrintXml.java:
package com.windofkeltia.prettyprint;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.SAXException;

/**
 * Pretty-prints an XML document by running it through a SAX parser whose
 * {@link Handler} re-emits what it is told about, indented by nesting depth.
 * Instances are obtained through {@link Builder}.
 */
public class PrettyPrintXml
{
  private static final SAXParserFactory factory = SAXParserFactory.newInstance();

  private final SAXParser   parser;
  private final Handler     handler;
  private final InputStream inputStream;
  private final String      tabWidth;

  /**
   * Creates the SAX parser and the output handler, and applies the builder's
   * settings to them.
   *
   * @param builder carries the input stream to parse and the indentation unit.
   * @throws ParserConfigurationException if the factory cannot create a parser.
   * @throws SAXException if the factory fails for a SAX-related reason.
   */
  protected PrettyPrintXml( Builder builder ) throws ParserConfigurationException, SAXException
  {
    inputStream = builder.inputStream;
    tabWidth    = builder.tabWidth;
    parser      = factory.newSAXParser();
    handler     = new Handler();

    // Without this the builder's tabWidth() setting would never reach the
    // handler; a null setting leaves the handler's own default in place.
    if( tabWidth != null )
      handler.setTab( tabWidth );
  }

  /**
   * Execute the parsing and generate the output.
   *
   * @return the text accumulated by the handler during the parse.
   * @throws IOException on failure to read the input stream.
   * @throws SAXException if the parser rejects the input.
   */
  public String parse() throws IOException, SAXException
  {
    parser.parse( inputStream, handler );
    return handler.getOutput().toString();
  }

  /** Collects the input and the indentation unit, then builds the printer. */
  public static class Builder
  {
    private InputStream inputStream;
    private String      tabWidth = "  ";

    /**
     * Use this string as the XML input. It is encoded as UTF-8, the encoding an
     * XML parser assumes when the document does not declare another one.
     */
    public Builder inputStream( final String input )      { this.inputStream = new ByteArrayInputStream( input.getBytes( StandardCharsets.UTF_8 ) ); return this; }
    /** Use these bytes, as they are, as the XML input. */
    public Builder inputStream( final byte[] input )      { this.inputStream = new ByteArrayInputStream( input ); return this; }
    /** Use this stream as the XML input. */
    public Builder inputStream( InputStream inputStream ) { this.inputStream = inputStream; return this; }
    /** Set the string emitted once per nesting level (default is two spaces). */
    public Builder tabWidth   ( String tabWidth )         { this.tabWidth    = tabWidth;    return this; }

    /**
     * @return a printer configured from this builder.
     * @throws ParserConfigurationException if no SAX parser can be created.
     * @throws SAXException if parser creation fails for a SAX-related reason.
     */
    public PrettyPrintXml build() throws ParserConfigurationException, SAXException
    {
      return new PrettyPrintXml( this );
    }
  }
}
Handler.java:
package com.windofkeltia.prettyprint;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import static java.util.Objects.isNull;
import static java.util.Objects.nonNull;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

import com.windofkeltia.utilities.StringUtilities;

/**
 * This SAX-based printer requires no stack because it prints as it goes:
 * every opening tag, closing tag and run of text is written on its own line,
 * indented by the current nesting depth. When the document ends, the output
 * is post-processed to fold an opening tag that is immediately followed by
 * its own closing tag into a single empty-element tag.
 *
 * Text is written to and read back from the byte buffer using the platform's
 * default charset, which is also how {@code ByteArrayOutputStream.toString()}
 * decodes the result returned by {@link #getOutput()}.
 */
@SuppressWarnings( { "DanglingJavadoc" } )
public class Handler extends DefaultHandler
{
  private static final Logger logger = LoggerFactory.getLogger( Handler.class );

  private int                   indentation  = 0;              // indentation is a count (of) tabs
  private String                tab          = "  ";           // default width is two spaces
  private ByteArrayOutputStream output       = new ByteArrayOutputStream();
  private final StringBuilder   pendingText  = new StringBuilder(); // text seen since the last tag

  public void                  setTab   ( final String tab ) { this.tab = tab; }
  public ByteArrayOutputStream getOutput()                   { return output; }

  protected Handler() { super(); }

  @Override
  public void startDocument() { }

  /** Writes any buffered text, then the opening tag with its attributes, and goes one level deeper. */
  @Override
  public void startElement( String uri, String localName, String elementName, Attributes attributes )
  {
    flushText();
    write( indent() + '<' + elementName + attributesAsString( attributes ) + ">\n" );
    indentation++;
  }

  /** Writes any buffered text at the element's inner level, then the closing tag one level up. */
  @Override
  public void endElement( String uri, String localName, String elementName )
  {
    flushText();      // must precede the decrement so the text stays indented as content
    indentation--;
    write( indent() + "</" + elementName + ">\n" );
  }

  /**
   * Buffers character data. A parser is free to deliver one run of text in
   * several calls, so nothing is printed until the next tag shows the run is
   * complete; otherwise a single run could be split across output lines.
   */
  @Override
  public void characters( char[] ch, int start, int length )
  {
    pendingText.append( ch, start, length );
  }

  /**
   * Prints the given characters on a line of their own.
   *
   * NOTE(review): this method is not declared by DefaultHandler and nothing in
   * this class calls it; presumably it was meant to receive comments through
   * a lexical handler, which would have to be registered with the parser
   * separately -- confirm. The text is written without comment delimiters.
   */
  public void comment( char[] ch, int start, int length )
  {
    flushText();
    String comment = new String( ch, start, length );
    if( !StringUtilities.isEmpty( comment ) )
      doCharacters( comment );
  }

  /** Writes the trimmed input, as given, on its own line at the current indentation. */
  public void doCharacters( String input )
  {
    write( indent() + input.trim() + '\n' );
  }

  /** Emits the text buffered by characters(), trimmed and escaped, unless it is only whitespace. */
  private void flushText()
  {
    String text = pendingText.toString().trim();
    pendingText.setLength( 0 );
    if( !StringUtilities.isEmpty( text ) )
      doCharacters( escape( text, false ) );
  }

  /** Builds the leading whitespace for the current nesting depth. */
  private String indent()
  {
    StringBuilder sb = new StringBuilder();
    for( int level = 0; level < indentation; level++ )
      sb.append( tab );
    return sb.toString();
  }

  /** Appends the string to the output buffer using the platform's default charset. */
  private void write( String string )
  {
    byte[] bytes = string.getBytes();
    output.write( bytes, 0, bytes.length );
  }

  /** Appends the line plus a newline to the given buffer using the platform's default charset. */
  private static void writeLine( ByteArrayOutputStream target, String line )
  {
    byte[] bytes = ( line + '\n' ).getBytes();
    target.write( bytes, 0, bytes.length );
  }

  /**
   * Replaces the characters that have a meaning in markup with their
   * predefined entities so that what is written can be parsed again.
   *
   * @param text        what to escape; null yields an empty string.
   * @param inAttribute when true, double quotes are escaped as well because
   *                    attribute values are written between double quotes.
   */
  private static String escape( String text, boolean inAttribute )
  {
    if( isNull( text ) )
      return "";

    StringBuilder sb = new StringBuilder( text.length() );
    for( int i = 0; i < text.length(); i++ )
    {
      char ch = text.charAt( i );
      switch( ch )
      {
        case '&' : sb.append( "&amp;" ); break;
        case '<' : sb.append( "&lt;" );  break;
        case '>' : sb.append( "&gt;" );  break;
        case '"' :
          if( inAttribute )
            sb.append( "&quot;" );
          else
            sb.append( ch );
          break;
        default  : sb.append( ch ); break;
      }
    }
    return sb.toString();
  }

  /** Do some post-processing of this handler's output to clean up stuff like:
   *
   *   <!-- a change here: -->
   *   <realmCode code="US">      →      <realmCode code="US" />
   *   </realmCode>
   *
   * But, don't go wild and clean by folding the last two lines here:
   *
   *   <!-- no change here: -->
   *   <csmk:div csmk:class="demographic" npmk:nlp="off">
   *     <csmk:div csmk:class="extended_data" npmk:nlp="off">
   *       full_name: BEITEL, ELISE
   *     </csmk:div>
   *   </csmk:div>
   *
   * The folded tag keeps everything the opening tag carried, attributes
   * included. Should the folding fail, the unfolded output is left in place.
   */
  @Override
  public void endDocument()
  {
    ByteArrayOutputStream folded = new ByteArrayOutputStream();

    try( BufferedReader reader = new BufferedReader(
                                   new InputStreamReader(
                                     new ByteArrayInputStream( output.toByteArray() ) ) ) )
    {
      String current = reader.readLine();

      while( nonNull( current ) )
      {
        String next = reader.readLine();

        if( nonNull( next ) && isEmptyElement( current, next ) )
        {
          // turn the final '>' of the opening tag into " />" and drop the closing tag...
          writeLine( folded, current.substring( 0, current.lastIndexOf( '>' ) ) + " />" );
          current = reader.readLine();
        }
        else
        {
          writeLine( folded, current );
          current = next;
        }
      }

      output = folded;
    }
    catch( IOException | RuntimeException e )
    {
      logger.warn( "Failed line folding; keeping the unfolded output", e );
    }
  }

  /**
   * Tells whether the first line is an opening tag and the second one the
   * closing tag of the same element at the same indentation. Only that pair
   * may be folded; a closing tag followed by a sibling's opening tag of the
   * same name, for instance, must be left alone.
   */
  private static boolean isEmptyElement( String first, String second )
  {
    if( HandlerUtilities.countLeadingSpaces( first ) != HandlerUtilities.countLeadingSpaces( second ) )
      return false;

    String opening = first.trim();
    String closing = second.trim();

    if( !opening.startsWith( "<" ) || opening.startsWith( "</" ) || !opening.endsWith( ">" ) )
      return false;
    if( !closing.startsWith( "</" ) )
      return false;

    List< String > openingTokens = HandlerUtilities.tokenize( opening );
    List< String > closingTokens = HandlerUtilities.tokenize( closing );

    return !openingTokens.isEmpty()
        && !closingTokens.isEmpty()
        && openingTokens.get( 0 ).equals( closingTokens.get( 0 ) );
  }

  /**
   * Format XML element attributes for concatenating to their element name:
   * each one is preceded by a single space, kept in the order SAX reports
   * them, and its value is escaped and enclosed in double quotes. If we had
   * uri (namespaces) defined, we'd have to get a lot more serious about how
   * to use uri, localName and qName.
   *
   * @return the formatted attributes, or an empty string when there are none.
   */
  private String attributesAsString( Attributes saxAttributes )
  {
    StringBuilder sb         = new StringBuilder();
    int           attrLength = saxAttributes.getLength();

    for( int attr = 0; attr < attrLength; attr++ )
    {
      // smash() is a project helper whose result type is not visible from
      // here, hence String.valueOf() before escaping.
      String value = String.valueOf( StringUtilities.smash( saxAttributes.getValue( attr ) ) );
      sb.append( ' ' )
        .append( saxAttributes.getQName( attr ) )
        .append( "=\"" )
        .append( escape( value, true ) )
        .append( '"' );
    }

    return sb.toString();
  }
}
HandlerUtilities.java:
package com.windofkeltia.prettyprint;

import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

/**
 * Line-oriented helpers written for the specific text layout produced by
 * the pretty-printer's SAX handler.
 */
public class HandlerUtilities
{
  // not referenced by any method of this class
  private static final int    MAXTOKENCOUNT  = 2;
  // characters that separate tokens: space, '<', '>' and '/'
  private static final String XML_DELIMITERS = " <>/";

  /**
   * Splits a line wherever a space, '<', '>' or '/' occurs and returns the
   * pieces in order; the delimiters themselves are discarded. For a line
   * holding an opening or a closing tag, the first piece is therefore the
   * element name, which is what makes this pair recognizable:
   *
   *   <realmCode code="US">      →      <realmCode code="US" />
   *   </realmCode>
   *
   * @param line the text to split.
   * @return every token found, possibly none.
   */
  public static List< String > tokenize( String line )
  {
    StringTokenizer splitter = new StringTokenizer( line, XML_DELIMITERS );
    List< String >  pieces   = new ArrayList<>();

    while( splitter.hasMoreTokens() )
    {
      String piece = splitter.nextToken();
      pieces.add( piece );
    }

    return pieces;
  }

  /**
   * Counts the space characters at the very start of the line, stopping at
   * the first character that is anything else. Comparing this count for two
   * consecutive lines keeps lines that look alike once trimmed, but sit at
   * different depths, from being mistaken for a pair. E.g.:
   *
   *   <csmk:div csmk:class="demographic" npmk:nlp="off">
   *     <csmk:div csmk:class="extended_data" npmk:nlp="off">
   *       full_name: BEITEL, ELISE
   *     </csmk:div>
   *   </csmk:div>
   *
   * @param line the text to examine.
   * @return how many spaces the line begins with.
   */
  public static int countLeadingSpaces( String line )
  {
    final int length   = line.length();
    int       position = 0;

    while( position < length && line.charAt( position ) == ' ' )
      position++;

    return position;
  }

  /**
   * Produces a string of spaces as long as the line's leading run of spaces.
   *
   * @param line the text whose indentation is wanted.
   * @return the indentation, empty if the line does not begin with a space.
   */
  public static String indent( String line )
  {
    // the leading run consists of spaces only, so that prefix is the answer
    return line.substring( 0, countLeadingSpaces( line ) );
  }

  /**
   * Tells whether both lines, once trimmed, begin with '<'.
   *
   * @param line1 the first line.
   * @param line2 the line that follows it.
   * @return true only if each of the two begins with '<' after trimming.
   */
  public static boolean startWithOpeningElement( String line1, String line2 )
  {
    boolean firstIsMarkup  = line1.trim().startsWith( "<" );
    boolean secondIsMarkup = line2.trim().startsWith( "<" );

    return firstIsMarkup && secondIsMarkup;
  }
}
PrettyPrintXmlTest.java:
package com.windofkeltia.prettyprint;

import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;

import org.xml.sax.SAXException;

import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;

/**
 * Smoke tests for the pretty-printer: each one runs a sample document
 * through it and prints what comes back. Nothing is asserted; a test only
 * fails if building the printer or parsing the sample throws.
 */
public class PrettyPrintXmlTest
{
  /** A clinical document wrapped in a record, with empty elements and multi-line attributes. */
  @Test
  public void test() throws ParserConfigurationException, SAXException, IOException
  {
    final String CCD_BODY = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
      + "<record>\n"
      + "  <justforfun>\n"
      + "    ...\n"
      + "  </justforfun>\n"
      + "  <ccdmessage>\n"
      + "<ClinicalDocument xmlns=\"urn:hl7-org:v3\" xmlns:sdtc=\"urn:hl7-org:sdtc\"\n"
      + "                  xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
      + "                  xsi:schemaLocation=\"urn:hl7-org:v3\n"
      + "                                       http://xreg2.nist.gov:8080/hitspValidation/schema/cdar2c32/infrastructure/cda/C32_CDA.xsd\">\n"
      + "  <realmCode code=\"US\"/>\n"
      + "  <typeId root=\"2.16.840.1.113883.1.3\" extension=\"POCD_HD000040\"/>\n"
      + "  <component>\n"
      + "    <stuff>\n"
      + "      ...\n"
      + "    </stuff>\n"
      + "  </component>\n"
      + "</ClinicalDocument>\n"
      + "  </ccdmessage>\n"
      + "</record>\n";

    printPretty( CCD_BODY );
  }

  /** Nested elements of the same name whose prefixes are never declared. */
  @Test
  public void testWithNameSpaces() throws ParserConfigurationException, SAXException, IOException
  {
    final String NAMESPACE_CONTENT = ""
      + "<record>\n"
      + "  <csmk:div csmk:class=\"demographic\" npmk:nlp=\"off\">\n"
      + "    <csmk:div csmk:class=\"extended_data\" npmk:nlp=\"off\">\n"
      + "      full_name: BEITEL, ELISE\n"
      + "    </csmk:div>\n"
      + "  </csmk:div>\n"
      + "</record>\n";

    printPretty( NAMESPACE_CONTENT );
  }

  /** Mixed content: text both before and after an empty child element. */
  @Test
  public void testWithAdditionalWiggle() throws ParserConfigurationException, SAXException, IOException
  {
    final String WIGGLE = ""
      + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
      + "<test>test\n"
      + "  <br />test\n"
      + "</test>\n"
      + "";

    printPretty( WIGGLE );
  }

  /** Builds a printer over the given document, parses it and prints the result to standard output. */
  private static void printPretty( String xml ) throws ParserConfigurationException, SAXException, IOException
  {
    PrettyPrintXml printer   = new PrettyPrintXml.Builder().inputStream( xml ).build();
    String         formatted = printer.parse();
    System.out.println( formatted );
  }
}