package HTML::Packer;

use 5.008009;
use strict;
use warnings;
use Carp;
use Regexp::RegGrp;
use Digest::SHA qw(sha256_base64 sha384_base64 sha512_base64);
use Safe::Isa;

# -----------------------------------------------------------------------------

our $VERSION = '2.11';

our @BOOLEAN_ACCESSORS = (
    'remove_comments',
    'remove_comments_aggressive',
    'remove_newlines',
    'no_compress_comment',
    'html5',
);

our @JAVASCRIPT_OPTS    = ( 'clean', 'obfuscate', 'shrink', 'best' );
our @CSS_OPTS           = ( 'minify', 'pretty' );
our @CSP_OPTS           = ( 'sha256', 'sha384', 'sha512' );

our $REQUIRED_JAVASCRIPT_PACKER = '1.002001';
our $REQUIRED_CSS_PACKER        = '1.000001';

our @SAVE_SPACE_ELEMENTS = (
    'a', 'abbr', 'acronym', 'address', 'b', 'bdo', 'big', 'button', 'cite',
    'del', 'dfn', 'em', 'font', 'i', 'input', 'ins', 'kbd', 'label', 'q',
    's', 'samp', 'select', 'small', 'strike', 'strong', 'sub', 'sup', 'u', 'var'
);

our @VOID_ELEMENTS = (
    'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
);

# Some regular expressions are from HTML::Clean

our $COMMENT        = '((?>\s*))(<!--(?:.*?)?-->)((?>\s*))';
our $COMMENT_SAFE   = '((?>\s*))(<!--(?:(?![#\[]| google_ad_section_).*?)?-->)((?>\s*))';

our $PACKER_COMMENT = '<!--\s*HTML::Packer\s*(\w+)\s*-->';

our $DOCTYPE        = '<\!DOCTYPE[^>]*>';

our $DONT_CLEAN     = '(<\s*(pre|code|textarea|script|style)[^>]*>)(.*?)(<\s*\/\2[^>]*>)';

our $WHITESPACES    = [
    {
        regexp      => qr/^\s*/s,
        replacement => ''
    },
    {
        regexp      => qr/\s*$/s,
        replacement => ''
    },
    {
        regexp      => '^\s*',
        replacement => '',
        modifier    => 'm'
    },
    {
        regexp      => '[^\S\n]*$',
        replacement => '',
        modifier    => 'm'
    },
    {
        regexp      => qr/(?<=>)[^<>]*(?=<)/sm,
        replacement => sub {
            my $match = $_[0]->{match};

            $match =~ s/[^\S\n]{2,}/ /sg;
            $match =~ s/\s*\n+\s*/\n/sg;

            return $match;
        }
    },
    {
        regexp      => '<\s*(\/)?\s*',
        replacement => sub {
            return sprintf( '<%s', $_[0]->{submatches}->[0] );
        },
        modifier    => 's'
    },
    {
        regexp      => '\s*(\/)?\s*>',
        replacement => sub {
            return sprintf( '%s>', $_[0]->{submatches}->[0] );
        },
        modifier    => 's'
    }
];

our $NEWLINES_TAGS = [
    {
        regexp      => '(\s*)(<\s*\/?\s*(?:' . join( '|', @SAVE_SPACE_ELEMENTS ) . ')\b[^>]*>)(\s*)',
        replacement => sub {
            return sprintf( '%s%s%s', $_[0]->{submatches}->[0] ? ' ' : '', $_[0]->{submatches}->[1], $_[0]->{submatches}->[2] ? ' ' : '' );
        },
        modifier    => 'is'
    }
];

our $NEWLINES = [
    {
        regexp      => '(.)\n(.)',
        replacement => sub {
            my ( $pre, $post ) = @{$_[0]->{submatches}};

            my $ret;

            if ( $pre eq '>' and $post eq '<' ) {
                $ret = $pre . $post;
            }
            elsif ( $pre eq '-' and $post =~ /[\w]/ ) {
                $ret = $pre . $post;
            }
            else {
                $ret = $pre . ' ' . $post;
            }

            return $ret;
        }
    }
];

our @REGGRPS        = ( 'newlines', 'newlines_tags', 'whitespaces', 'void_elements' );

our $GLOBAL_REGGRP  = 'global';

##########################################################################################

{
    no strict 'refs';

    foreach my $field ( @BOOLEAN_ACCESSORS ) {
        next if defined *{ __PACKAGE__ . '::' . $field }{CODE};

        *{ __PACKAGE__ . '::' . $field} = sub {
            my ( $self, $value ) = @_;

            $self->{'_' . $field} = $value ? 1 : undef if ( defined( $value ) );

            return $self->{'_' . $field};
        };
    }

    foreach my $reggrp ( @REGGRPS, $GLOBAL_REGGRP ) {
        next if defined *{ __PACKAGE__ . '::reggrp_' . $reggrp }{CODE};

        *{ __PACKAGE__ . '::reggrp_' . $reggrp } = sub {
            my ( $self ) = shift;

            return $self->{ '_reggrp_' . $reggrp };
        };
    }
}

sub do_javascript {
    my ( $self, $value ) = @_;

    if ( defined( $value ) ) {
        if ( grep( $value eq $_, @JAVASCRIPT_OPTS ) ) {
            $self->{_do_javascript} = $value;
        }
        elsif ( ! $value ) {
            $self->{_do_javascript} = undef;
        }
    }

    return $self->{_do_javascript};
}

sub do_stylesheet {
    my ( $self, $value ) = @_;

    if ( defined( $value ) ) {
        if ( grep( $value eq $_, @CSS_OPTS ) ) {
            $self->{_do_stylesheet} = $value;
        }
        elsif ( ! $value ) {
            $self->{_do_stylesheet} = undef;
        }
    }

    return $self->{_do_stylesheet};
}

sub do_csp {
    my ( $self, $value ) = @_;

    if ( defined( $value ) ) {
        if ( grep( $value eq $_, @CSP_OPTS ) ) {
            $self->{_do_csp} = $value;
        }
        elsif ( ! $value ) {
            $self->{_do_csp} = undef;
        }
    }

    return $self->{_do_csp};
}

# these variables are used in the closures defined in the init function
# below - we have to use globals as using $self within the closures leads
# to a reference cycle and thus memory leak, and we can't scope them to
# the init method as they may change. they are set by the minify sub
our $remove_comments;
our $remove_comments_aggressive;
our $remove_newlines;
our $html5;
our $do_javascript;
our $do_stylesheet;
our $do_csp;
our $js_packer;
our $css_packer;
our %csp;
our $reggrp_ws;

sub init {
    my $class = shift;
    my $self  = {};

    bless( $self, $class );

    $self->{whitespaces}->{reggrp_data}   = $WHITESPACES;
    $self->{newlines}->{reggrp_data}      = $NEWLINES;
    $self->{newlines_tags}->{reggrp_data} = $NEWLINES_TAGS;
    $self->{global}->{reggrp_data}        = [
        {
            regexp      => $DOCTYPE,
            replacement => sub {
                return '<!--~' . $_[0]->{store_index} . '~-->';
            },
            store => sub {
                my $doctype = $_[0]->{match};

                $doctype =~ s/\s+/ /gsm;

                return $doctype;
            }
        },
        {
			# this is using a variable that won't be initialized until after we have
			# called ->minify so we endup calling ->init again (see FIXME)
            regexp      => $remove_comments_aggressive ? $COMMENT : $COMMENT_SAFE,
            replacement => sub {
                return $remove_comments ? (
                    $remove_newlines ? ' ' : (
                        ( $_[0]->{submatches}->[0] =~ /\n/s or $_[0]->{submatches}->[2] =~ /\n/s ) ? "\n" : ''
                    )
                ) : '<!--~' . $_[0]->{store_index} . '~-->';
            },
            store => sub {
                my $ret = $remove_comments ? '' : (
                     ( ( not $remove_newlines and $_[0]->{submatches}->[0] =~ /\n/s ) ? "\n" : '' ) .
                     $_[0]->{submatches}->[1] .
                     ( ( not $remove_newlines and $_[0]->{submatches}->[2] =~ /\n/s ) ? "\n" : '' )
                );

                return $ret;
            }
        },
        {
            regexp      => $DONT_CLEAN,
            replacement => sub {
                return '<!--~' . $_[0]->{store_index} . '~-->';
            },
            store => sub {
                my ( $opening, undef, $content, $closing )  = @{$_[0]->{submatches}};

                if ( $content ) {
                    my $opening_script_re   = '<\s*script' . ( $html5 ? '[^>]*>' : '[^>]*(?:java|ecma)script[^>]*>' );
                    my $opening_style_re    = '<\s*style' . ( $html5 ? '[^>]*>' : '[^>]*text\/css[^>]*>' );
					my $js_type_re          = q{type=['"]((((application|text)/){0,1}(x-){0,1}(java|ecma)script)|module)['"]};

                    if (
						$opening =~ /$opening_script_re/i
						&& ( $opening =~ /$js_type_re/i || $opening !~ /type/i )
					) {
                        $opening =~ s/ type="(text\/)?(java|ecma)script"//i if ( $html5 );

                        if ( $js_packer and $do_javascript ) {
                            $js_packer->minify( \$content, { compress => $do_javascript } );

                            unless ( $html5 ) {
                                $content = '/*<![CDATA[*/' . $content . '/*]]>*/';
                            }
                        }

                        if ( $do_csp ) {
                            no strict 'refs';
                            push @{ $csp{'script-src'} }, &{ "${do_csp}_base64" } ( $content );
                        }
                    }
                    elsif ( $opening =~ /$opening_style_re/i ) {
                        $opening =~ s/ type="text\/css"//i if ( $html5 );

                        if ( $css_packer and $do_stylesheet ) {
                            $css_packer->minify( \$content, { compress => $do_stylesheet } );
                            $content = "\n" . $content if ( $do_stylesheet eq 'pretty' );
                        }

                        if ( $do_csp ) {
                            no strict 'refs';
                            push @{ $csp{'style-src'} }, &{ "${do_csp}_base64" } ( $content );
                        }
                    }
                }
                else {
                    $content = '';
                }

                $reggrp_ws->exec( \$opening );
                $reggrp_ws->exec( \$closing );

                return $opening . $content . $closing;
            },
            modifier    => 'ism'
        }
    ];

    $self->{void_elements}->{reggrp_data} = [
        {
            regexp      => '<\s*((?:' . join( '|', @VOID_ELEMENTS ) . ')\b[^>]*)\s*\/>',
            replacement => sub {
                return '<' . $_[0]->{submatches}->[0] . '>';
            },
            modifier    => 'ism'
        }
    ];

    foreach ( @HTML::Packer::REGGRPS ) {
        $self->{ '_reggrp_' . $_ } = Regexp::RegGrp->new( { reggrp => $self->{$_}->{reggrp_data} } );
    }

    $self->{ '_reggrp_' . $GLOBAL_REGGRP } = Regexp::RegGrp->new(
        {
            reggrp          => $self->{$GLOBAL_REGGRP}->{reggrp_data},
            restore_pattern => qr/<!--~(\d+)~-->/
        }
    );

    return $self;
}

sub minify {
    my ( $self, $input, $opts );

    unless (
        ref( $_[0] ) and
        $_[0]->$_isa( __PACKAGE__ )
    ) {
        $self = __PACKAGE__->init();

        shift( @_ ) unless ( ref( $_[0] ) );

        ( $input, $opts ) = @_;
    }
    else {
        ( $self, $input, $opts ) = @_;
    }

    if ( ref( $input ) ne 'SCALAR' ) {
        carp( 'First argument must be a scalarref!' );
        return undef;
    }

    my $html;
    my $cont    = 'void';

    if ( defined( wantarray ) ) {
        my $tmp_input = ref( $input ) ? ${$input} : $input;

        $html   = \$tmp_input;
        $cont   = 'scalar';
    }
    else {
        $html = ref( $input ) ? $input : \$input;
    }

    if ( ref( $opts ) eq 'HASH' ) {
        foreach my $field ( @BOOLEAN_ACCESSORS ) {
            $self->$field( $opts->{$field} ) if ( defined( $opts->{$field} ) );
        }

        $self->do_javascript( $opts->{do_javascript} ) if ( defined( $opts->{do_javascript} ) );
        $self->do_stylesheet( $opts->{do_stylesheet} ) if ( defined( $opts->{do_stylesheet} ) );
        $self->do_csp( $opts->{do_csp} ) if ( defined( $opts->{do_csp} ) );
    }

    if ( not $self->no_compress_comment() and ${$html} =~ /$PACKER_COMMENT/s ) {
        my $compress = $1;
        if ( $compress eq '_no_compress_' ) {
            return ( $cont eq 'scalar' ) ? ${$html} : undef;
        }
    }

	# (re)initialize variables used in the closures
	$remove_comments = $self->remove_comments || $self->remove_comments_aggressive;
	$remove_comments_aggressive = $self->remove_comments_aggressive;
	$remove_newlines = $self->remove_newlines;
	$html5           = $self->html5;
	$do_javascript   = $self->do_javascript;
	$do_stylesheet   = $self->do_stylesheet;
	$do_csp          = $self->do_csp;
	$js_packer       = $self->javascript_packer;
	$css_packer      = $self->css_packer;
	$reggrp_ws       = $self->reggrp_whitespaces;

    # blank out the CSP hash before populating it again
    %csp = ();

	# FIXME: hacky way to get around ->init being called before ->minify
	$self = ref( $self )->init if $remove_comments_aggressive;

    $self->reggrp_global()->exec( $html );
    $self->reggrp_whitespaces()->exec( $html );
    if ( $self->remove_newlines() ) {
        $self->reggrp_newlines_tags()->exec( $html );
        $self->reggrp_newlines()->exec( $html );
    }
    if ( $self->html5() ) {
        $self->reggrp_void_elements()->exec( $html );
    }

    $self->reggrp_global()->restore_stored( $html );

    return ${$html} if ( $cont eq 'scalar' );
}

sub javascript_packer {
    my $self = shift;

    unless ( $self->{_checked_javascript_packer} ) {
        eval "use JavaScript::Packer $REQUIRED_JAVASCRIPT_PACKER;";

        unless ( $@ ) {
            $self->{_javascript_packer} = eval {
                JavaScript::Packer->init();
            };
        }

        $self->{_checked_javascript_packer} = 1;
    }

    return $self->{_javascript_packer};
}

sub css_packer {
    my $self = shift;

    unless ( $self->{_checked_css_packer} ) {
        eval "use CSS::Packer $REQUIRED_CSS_PACKER;";

        unless ( $@ ) {
            $self->{_css_packer} = eval {
                CSS::Packer->init();
            };
        }

        $self->{_checked_css_packer} = 1;
    }

    return $self->{_css_packer};
}

sub csp {
    my $self = shift;

    return 'script-src' => [ ], 'style-src' => [ ] unless $do_csp and %csp;

    return
        'script-src' => [ map "'$do_csp-$_='", @{ $csp{'script-src'} } ],
        'style-src' => [ map "'$do_csp-$_='", @{ $csp{'style-src'} } ],
    ;
}

1;

__END__

=head1 NAME

HTML::Packer - Another HTML code cleaner

=for html
<a href='https://travis-ci.org/leejo/html-packer-perl?branch=master'><img src='https://travis-ci.org/leejo/html-packer-perl.svg?branch=master' alt='Build Status' /></a>
<a href='https://coveralls.io/r/leejo/html-packer-perl'><img src='https://coveralls.io/repos/leejo/html-packer-perl/badge.png?branch=master' alt='Coverage Status' /></a>

=head1 VERSION

Version 2.11

=head1 DESCRIPTION

A HTML Compressor.

=head1 SYNOPSIS

    use HTML::Packer;

    my $packer = HTML::Packer->init();

    $packer->minify( $scalarref, $opts );

To return a scalar without changing the input simply use (e.g. example 2):

    my $ret = $packer->minify( $scalarref, $opts );

For backward compatibility it is still possible to call 'minify' as a function:

    HTML::Packer::minify( $scalarref, $opts );

First argument must be a scalarref of HTML-Code.
Second argument must be a hashref of options. Possible options are

=over 4

=item remove_comments

HTML-Comments will be removed if 'remove_comments' has a true value.  Comments starting with C<<!--#>,
C<<!--[> or C<<!-- google_ad_section_> will be preserved unless 'remove_comments_aggressive' has a true value. 

=item remove_comments_aggressive

See 'remove_comments'.

=item remove_newlines

ALL newlines will be removed if 'remove_newlines' has a true value.

=item do_javascript

Defines compression level for javascript. Possible values are 'clean', 'obfuscate', 'shrink' and 'best'.
Default is no compression for javascript.
This option only takes effect if L<JavaScript::Packer> is installed.

=item do_stylesheet

Defines compression level for CSS. Possible values are 'minify' and 'pretty'.
Default is no compression for CSS.
This option only takes effect if L<CSS::Packer> is installed.

=item do_csp

Defines hash algorithm for C<Content-Security-Policy>, or CSP, hashes of
embedded C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> tags.

Allowed values are C<'sha256'>, C<'sha384'>, C<'sha512'>.

It may be left blank or set to a Perl false value to indicate that hashes
should not be calculated, if performance is a concern.

=item no_compress_comment

If not set to a true value it is allowed to set a HTML comment that prevents the input being packed.

    <!-- HTML::Packer _no_compress_ -->

Is not set by default.

=item html5

If set to a true value closing slashes will be removed from void elements.

=item csp

If C<do_csp> is set to C<'sha256'>, returns a hash that looks like this:

    (
        'script-src' => [qw( sha256-...= sha256-...= )],
        'style-src'  => [qw( sha256-...= sha256-...= )],
    )

with each element of the C<ARRAY>refs containing a CSP-friendly hash for a
C<E<lt>scriptE<gt>> or C<E<lt>styleE<gt>> tag.

=back

=head1 AUTHOR

Merten Falk, C<< <nevesenin at cpan.org> >>. Now maintained by Lee
Johnson (LEEJO) with contributions from:

	Alexander Krizhanovsky <[email protected]>
	Bas Bloemsaat <[email protected]>
	girst <[email protected]>
	Ankit Pati (ANKITPATI) <[email protected]>

=head1 BUGS

Please report any bugs or feature requests through
the web interface at L<https://github.com/leejo/html-packer-perl/issues>. I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

perldoc HTML::Packer

=head1 COPYRIGHT & LICENSE

Copyright 2009 - 2011 Merten Falk, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 SEE ALSO

L<HTML::Clean>

=cut