package HTML::Packer; use 5.008009; use strict; use warnings; use Carp; use Regexp::RegGrp; use Digest::SHA qw(sha256_base64 sha384_base64 sha512_base64); use Safe::Isa; # ----------------------------------------------------------------------------- our $VERSION = '2.11'; our @BOOLEAN_ACCESSORS = ( 'remove_comments', 'remove_comments_aggressive', 'remove_newlines', 'no_compress_comment', 'html5', ); our @JAVASCRIPT_OPTS = ( 'clean', 'obfuscate', 'shrink', 'best' ); our @CSS_OPTS = ( 'minify', 'pretty' ); our @CSP_OPTS = ( 'sha256', 'sha384', 'sha512' ); our $REQUIRED_JAVASCRIPT_PACKER = '1.002001'; our $REQUIRED_CSS_PACKER = '1.000001'; our @SAVE_SPACE_ELEMENTS = ( 'a', 'abbr', 'acronym', 'address', 'b', 'bdo', 'big', 'button', 'cite', 'del', 'dfn', 'em', 'font', 'i', 'input', 'ins', 'kbd', 'label', 'q', 's', 'samp', 'select', 'small', 'strike', 'strong', 'sub', 'sup', 'u', 'var' ); our @VOID_ELEMENTS = ( 'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr' ); # Some regular expressions are from HTML::Clean our $COMMENT = '((?>\s*))()((?>\s*))'; our $COMMENT_SAFE = '((?>\s*))()((?>\s*))'; our $PACKER_COMMENT = ''; our $DOCTYPE = '<\!DOCTYPE[^>]*>'; our $DONT_CLEAN = '(<\s*(pre|code|textarea|script|style)[^>]*>)(.*?)(<\s*\/\2[^>]*>)'; our $WHITESPACES = [ { regexp => qr/^\s*/s, replacement => '' }, { regexp => qr/\s*$/s, replacement => '' }, { regexp => '^\s*', replacement => '', modifier => 'm' }, { regexp => '[^\S\n]*$', replacement => '', modifier => 'm' }, { regexp => qr/(?<=>)[^<>]*(?=<)/sm, replacement => sub { my $match = $_[0]->{match}; $match =~ s/[^\S\n]{2,}/ /sg; $match =~ s/\s*\n+\s*/\n/sg; return $match; } }, { regexp => '<\s*(\/)?\s*', replacement => sub { return sprintf( '<%s', $_[0]->{submatches}->[0] ); }, modifier => 's' }, { regexp => '\s*(\/)?\s*>', replacement => sub { return sprintf( '%s>', $_[0]->{submatches}->[0] ); }, modifier => 's' } ]; our $NEWLINES_TAGS = [ { regexp => '(\s*)(<\s*\/?\s*(?:' . join( '|', @SAVE_SPACE_ELEMENTS ) . ')\b[^>]*>)(\s*)', replacement => sub { return sprintf( '%s%s%s', $_[0]->{submatches}->[0] ? ' ' : '', $_[0]->{submatches}->[1], $_[0]->{submatches}->[2] ? ' ' : '' ); }, modifier => 'is' } ]; our $NEWLINES = [ { regexp => '(.)\n(.)', replacement => sub { my ( $pre, $post ) = @{$_[0]->{submatches}}; my $ret; if ( $pre eq '>' and $post eq '<' ) { $ret = $pre . $post; } elsif ( $pre eq '-' and $post =~ /[\w]/ ) { $ret = $pre . $post; } else { $ret = $pre . ' ' . $post; } return $ret; } } ]; our @REGGRPS = ( 'newlines', 'newlines_tags', 'whitespaces', 'void_elements' ); our $GLOBAL_REGGRP = 'global'; ########################################################################################## { no strict 'refs'; foreach my $field ( @BOOLEAN_ACCESSORS ) { next if defined *{ __PACKAGE__ . '::' . $field }{CODE}; *{ __PACKAGE__ . '::' . $field} = sub { my ( $self, $value ) = @_; $self->{'_' . $field} = $value ? 1 : undef if ( defined( $value ) ); return $self->{'_' . $field}; }; } foreach my $reggrp ( @REGGRPS, $GLOBAL_REGGRP ) { next if defined *{ __PACKAGE__ . '::reggrp_' . $reggrp }{CODE}; *{ __PACKAGE__ . '::reggrp_' . $reggrp } = sub { my ( $self ) = shift; return $self->{ '_reggrp_' . $reggrp }; }; } } sub do_javascript { my ( $self, $value ) = @_; if ( defined( $value ) ) { if ( grep( $value eq $_, @JAVASCRIPT_OPTS ) ) { $self->{_do_javascript} = $value; } elsif ( ! $value ) { $self->{_do_javascript} = undef; } } return $self->{_do_javascript}; } sub do_stylesheet { my ( $self, $value ) = @_; if ( defined( $value ) ) { if ( grep( $value eq $_, @CSS_OPTS ) ) { $self->{_do_stylesheet} = $value; } elsif ( ! $value ) { $self->{_do_stylesheet} = undef; } } return $self->{_do_stylesheet}; } sub do_csp { my ( $self, $value ) = @_; if ( defined( $value ) ) { if ( grep( $value eq $_, @CSP_OPTS ) ) { $self->{_do_csp} = $value; } elsif ( ! $value ) { $self->{_do_csp} = undef; } } return $self->{_do_csp}; } # these variables are used in the closures defined in the init function # below - we have to use globals as using $self within the closures leads # to a reference cycle and thus memory leak, and we can't scope them to # the init method as they may change. they are set by the minify sub our $remove_comments; our $remove_comments_aggressive; our $remove_newlines; our $html5; our $do_javascript; our $do_stylesheet; our $do_csp; our $js_packer; our $css_packer; our %csp; our $reggrp_ws; sub init { my $class = shift; my $self = {}; bless( $self, $class ); $self->{whitespaces}->{reggrp_data} = $WHITESPACES; $self->{newlines}->{reggrp_data} = $NEWLINES; $self->{newlines_tags}->{reggrp_data} = $NEWLINES_TAGS; $self->{global}->{reggrp_data} = [ { regexp => $DOCTYPE, replacement => sub { return ''; }, store => sub { my $doctype = $_[0]->{match}; $doctype =~ s/\s+/ /gsm; return $doctype; } }, { # this is using a variable that won't be initialized until after we have # called ->minify so we endup calling ->init again (see FIXME) regexp => $remove_comments_aggressive ? $COMMENT : $COMMENT_SAFE, replacement => sub { return $remove_comments ? ( $remove_newlines ? ' ' : ( ( $_[0]->{submatches}->[0] =~ /\n/s or $_[0]->{submatches}->[2] =~ /\n/s ) ? "\n" : '' ) ) : ''; }, store => sub { my $ret = $remove_comments ? '' : ( ( ( not $remove_newlines and $_[0]->{submatches}->[0] =~ /\n/s ) ? "\n" : '' ) . $_[0]->{submatches}->[1] . ( ( not $remove_newlines and $_[0]->{submatches}->[2] =~ /\n/s ) ? "\n" : '' ) ); return $ret; } }, { regexp => $DONT_CLEAN, replacement => sub { return ''; }, store => sub { my ( $opening, undef, $content, $closing ) = @{$_[0]->{submatches}}; if ( $content ) { my $opening_script_re = '<\s*script' . ( $html5 ? '[^>]*>' : '[^>]*(?:java|ecma)script[^>]*>' ); my $opening_style_re = '<\s*style' . ( $html5 ? '[^>]*>' : '[^>]*text\/css[^>]*>' ); my $js_type_re = q{type=['"]((((application|text)/){0,1}(x-){0,1}(java|ecma)script)|module)['"]}; if ( $opening =~ /$opening_script_re/i && ( $opening =~ /$js_type_re/i || $opening !~ /type/i ) ) { $opening =~ s/ type="(text\/)?(java|ecma)script"//i if ( $html5 ); if ( $js_packer and $do_javascript ) { $js_packer->minify( \$content, { compress => $do_javascript } ); unless ( $html5 ) { $content = '/**/'; } } if ( $do_csp ) { no strict 'refs'; push @{ $csp{'script-src'} }, &{ "${do_csp}_base64" } ( $content ); } } elsif ( $opening =~ /$opening_style_re/i ) { $opening =~ s/ type="text\/css"//i if ( $html5 ); if ( $css_packer and $do_stylesheet ) { $css_packer->minify( \$content, { compress => $do_stylesheet } ); $content = "\n" . $content if ( $do_stylesheet eq 'pretty' ); } if ( $do_csp ) { no strict 'refs'; push @{ $csp{'style-src'} }, &{ "${do_csp}_base64" } ( $content ); } } } else { $content = ''; } $reggrp_ws->exec( \$opening ); $reggrp_ws->exec( \$closing ); return $opening . $content . $closing; }, modifier => 'ism' } ]; $self->{void_elements}->{reggrp_data} = [ { regexp => '<\s*((?:' . join( '|', @VOID_ELEMENTS ) . ')\b[^>]*)\s*\/>', replacement => sub { return '<' . $_[0]->{submatches}->[0] . '>'; }, modifier => 'ism' } ]; foreach ( @HTML::Packer::REGGRPS ) { $self->{ '_reggrp_' . $_ } = Regexp::RegGrp->new( { reggrp => $self->{$_}->{reggrp_data} } ); } $self->{ '_reggrp_' . $GLOBAL_REGGRP } = Regexp::RegGrp->new( { reggrp => $self->{$GLOBAL_REGGRP}->{reggrp_data}, restore_pattern => qr// } ); return $self; } sub minify { my ( $self, $input, $opts ); unless ( ref( $_[0] ) and $_[0]->$_isa( __PACKAGE__ ) ) { $self = __PACKAGE__->init(); shift( @_ ) unless ( ref( $_[0] ) ); ( $input, $opts ) = @_; } else { ( $self, $input, $opts ) = @_; } if ( ref( $input ) ne 'SCALAR' ) { carp( 'First argument must be a scalarref!' ); return undef; } my $html; my $cont = 'void'; if ( defined( wantarray ) ) { my $tmp_input = ref( $input ) ? ${$input} : $input; $html = \$tmp_input; $cont = 'scalar'; } else { $html = ref( $input ) ? $input : \$input; } if ( ref( $opts ) eq 'HASH' ) { foreach my $field ( @BOOLEAN_ACCESSORS ) { $self->$field( $opts->{$field} ) if ( defined( $opts->{$field} ) ); } $self->do_javascript( $opts->{do_javascript} ) if ( defined( $opts->{do_javascript} ) ); $self->do_stylesheet( $opts->{do_stylesheet} ) if ( defined( $opts->{do_stylesheet} ) ); $self->do_csp( $opts->{do_csp} ) if ( defined( $opts->{do_csp} ) ); } if ( not $self->no_compress_comment() and ${$html} =~ /$PACKER_COMMENT/s ) { my $compress = $1; if ( $compress eq '_no_compress_' ) { return ( $cont eq 'scalar' ) ? ${$html} : undef; } } # (re)initialize variables used in the closures $remove_comments = $self->remove_comments || $self->remove_comments_aggressive; $remove_comments_aggressive = $self->remove_comments_aggressive; $remove_newlines = $self->remove_newlines; $html5 = $self->html5; $do_javascript = $self->do_javascript; $do_stylesheet = $self->do_stylesheet; $do_csp = $self->do_csp; $js_packer = $self->javascript_packer; $css_packer = $self->css_packer; $reggrp_ws = $self->reggrp_whitespaces; # blank out the CSP hash before populating it again %csp = (); # FIXME: hacky way to get around ->init being called before ->minify $self = ref( $self )->init if $remove_comments_aggressive; $self->reggrp_global()->exec( $html ); $self->reggrp_whitespaces()->exec( $html ); if ( $self->remove_newlines() ) { $self->reggrp_newlines_tags()->exec( $html ); $self->reggrp_newlines()->exec( $html ); } if ( $self->html5() ) { $self->reggrp_void_elements()->exec( $html ); } $self->reggrp_global()->restore_stored( $html ); return ${$html} if ( $cont eq 'scalar' ); } sub javascript_packer { my $self = shift; unless ( $self->{_checked_javascript_packer} ) { eval "use JavaScript::Packer $REQUIRED_JAVASCRIPT_PACKER;"; unless ( $@ ) { $self->{_javascript_packer} = eval { JavaScript::Packer->init(); }; } $self->{_checked_javascript_packer} = 1; } return $self->{_javascript_packer}; } sub css_packer { my $self = shift; unless ( $self->{_checked_css_packer} ) { eval "use CSS::Packer $REQUIRED_CSS_PACKER;"; unless ( $@ ) { $self->{_css_packer} = eval { CSS::Packer->init(); }; } $self->{_checked_css_packer} = 1; } return $self->{_css_packer}; } sub csp { my $self = shift; return 'script-src' => [ ], 'style-src' => [ ] unless $do_csp and %csp; return 'script-src' => [ map "'$do_csp-$_='", @{ $csp{'script-src'} } ], 'style-src' => [ map "'$do_csp-$_='", @{ $csp{'style-src'} } ], ; } 1; __END__ =head1 NAME HTML::Packer - Another HTML code cleaner =for html Build Status Coverage Status =head1 VERSION Version 2.11 =head1 DESCRIPTION A HTML Compressor. =head1 SYNOPSIS use HTML::Packer; my $packer = HTML::Packer->init(); $packer->minify( $scalarref, $opts ); To return a scalar without changing the input simply use (e.g. example 2): my $ret = $packer->minify( $scalarref, $opts ); For backward compatibility it is still possible to call 'minify' as a function: HTML::Packer::minify( $scalarref, $opts ); First argument must be a scalarref of HTML-Code. Second argument must be a hashref of options. Possible options are =over 4 =item remove_comments HTML-Comments will be removed if 'remove_comments' has a true value. Comments starting with C< Is not set by default. =item html5 If set to a true value closing slashes will be removed from void elements. =item csp If C is set to C<'sha256'>, returns a hash that looks like this: ( 'script-src' => [qw( sha256-...= sha256-...= )], 'style-src' => [qw( sha256-...= sha256-...= )], ) with each element of the Crefs containing a CSP-friendly hash for a CscriptE> or CstyleE> tag. =back =head1 AUTHOR Merten Falk, C<< >>. Now maintained by Lee Johnson (LEEJO) with contributions from: Alexander Krizhanovsky Bas Bloemsaat girst Ankit Pati (ANKITPATI) =head1 BUGS Please report any bugs or feature requests through the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =head1 SUPPORT You can find documentation for this module with the perldoc command. perldoc HTML::Packer =head1 COPYRIGHT & LICENSE Copyright 2009 - 2011 Merten Falk, all rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO L =cut