package MetaCPAN::Document::File; use strict; use warnings; use utf8; use Moose; use ElasticSearchX::Model::Document; use List::Util qw( any ); use MetaCPAN::Document::Module (); use MetaCPAN::Types qw( ESBool Module ); use MetaCPAN::Types::TypeTiny qw( ArrayRef Int Maybe Num ScalarRef Stat Str ); use MetaCPAN::Util qw(numify_version true false); use Plack::MIME (); use Pod::Text (); use Try::Tiny qw( catch try ); Plack::MIME->add_type( '.t' => 'text/x-script.perl' ); Plack::MIME->add_type( '.pod' => 'text/x-pod' ); Plack::MIME->add_type( '.xs' => 'text/x-c' ); my @NOT_PERL_FILES = qw(SIGNATURE); sub BUILD { my $self = shift; # force building of `mime` $self->_build_mime; } =head1 PROPERTIES =head2 deprecated Indicates file deprecation (the abstract contains "DEPRECATED" or "DEPRECIATED", or the x_deprecated flag is included in the corresponding "provides" entry in distribution metadata); it is also set if the entire release is marked deprecated (see L). =cut has deprecated => ( is => 'ro', isa => ESBool, default => sub {false}, writer => '_set_deprecated', ); =head2 abstract Abstract of the documentation (if any). This is built by parsing the C section. It also sets L if it succeeds. =cut has section => ( is => 'ro', isa => Maybe [Str], lazy => 1, builder => '_build_section', property => 0, ); my $RE_SECTION = qr/^\s*(\S+)((\h+-+\h+(.+))|(\r?\n\h*\r?\n\h*(.+)))?/ms; sub _build_section { my $self = shift; my $text = ${ $self->content }; my $section = MetaCPAN::Util::extract_section( $text, 'NAME' ); # if it's a POD file without a name section, let's try to generate # an abstract and name based on filename if ( !$section && $self->path =~ /\.pod$/ ) { $section = $self->path; $section =~ s{^(lib|pod|docs)/}{}; $section =~ s{\.pod$}{}; $section =~ s{/}{::}g; } return undef unless ($section); $section =~ s/^=\w+.*$//mg; $section =~ s/X<.*?>//mg; return $section; } has abstract => ( required => 1, is => 'ro', isa => Maybe [Str], lazy => 1, builder => '_build_abstract', index => 'analyzed', ); sub _build_abstract { my $self = shift; return undef unless ( $self->is_perl_file ); my $section = $self->section; return undef unless $section; my $abstract; if ( $section =~ $RE_SECTION ) { chomp( $abstract = $4 || $6 ) if ( $4 || $6 ); } if ($abstract) { $abstract =~ s/^=\w+.*$//xms; $abstract =~ s{\r?\n\h*\r?\n\h*.*$}{}xms; $abstract =~ s{\n}{ }gxms; $abstract =~ s{\s+$}{}gxms; $abstract =~ s{(\s)+}{$1}gxms; $abstract = MetaCPAN::Util::strip_pod($abstract); } return $abstract; } =head2 id Unique identifier of the release. Consists of the L's pauseid, the release L, and the file path. See L. =cut has id => ( is => 'ro', id => [qw(author release path)], ); =head2 module An ArrayRef of L objects, that represent modules defined in that class (i.e. package declarations). =cut has module => ( is => 'ro', isa => Module, type => 'nested', include_in_root => 1, coerce => 1, clearer => 'clear_module', writer => '_set_module', lazy => 1, default => sub { [] }, ); =head2 download_url B Download URL of the release =cut has download_url => ( is => 'ro', required => 1 ); =head2 date B Release date (i.e. C of the archive file). =cut has date => ( is => 'ro', required => 1, isa => 'DateTime', ); =head2 description Contains the C section of the POD if any. Will be stripped from whitespaces and POD commands. =cut has description => ( required => 1, is => 'ro', lazy => 1, builder => '_build_description', index => 'not_analyzed', ); sub _build_description { my $self = shift; return undef unless ( $self->is_perl_file ); my $section = MetaCPAN::Util::extract_section( ${ $self->content }, 'DESCRIPTION' ); return undef unless ($section); my $parser = Pod::Text->new; my $text = q{}; $parser->output_string( \$text ); try { $parser->parse_string_document("=pod\n\n$section"); } catch { warn $_; }; return undef unless $text; $text =~ s/\s+/ /g; $text =~ s/^\s+//; $text =~ s/\s+$//; return $text; } =head2 distribution =head2 distribution.analyzed =head2 distribution.camelcase Name of the distribution (e.g. C). =head2 author PAUSE ID of the author. =head2 status Valid values are C, C, and C. The most recent upload of a distribution is tagged as C as long as it's not a developer release, unless there are only developer releases. Everything else is tagged C. Once a release is deleted from PAUSE it is tagged as C. =cut has status => ( is => 'ro', required => 1, default => 'cpan' ); =head2 binary File is binary or not. =cut has binary => ( is => 'ro', isa => ESBool, required => 1, default => sub {false}, ); =head2 authorized See L. =cut has authorized => ( required => 1, is => 'ro', isa => ESBool, default => sub {true}, writer => '_set_authorized', ); =head2 maturity Maturity of the release. This can either be C or C. See L. =cut has maturity => ( is => 'ro', required => 1, default => 'released', ); =head2 directory Return true if this object represents a directory. =cut has directory => ( is => 'ro', required => 1, isa => ESBool, default => sub {false}, ); =head2 documentation Holds the name for the documentation in this file. If the file L, the name is derived from the C section. If the file L and the name from the C section matches one of the modules in L, it returns the name. Otherwise it returns the name of the first module in L. If there are no modules in the file the documentation is set to C. =cut has documentation => ( is => 'ro', isa => Maybe [Str], lazy => 1, index => 'analyzed', builder => '_build_documentation', predicate => 'has_documentation', analyzer => [qw(standard camelcase lowercase edge edge_camelcase)], clearer => 'clear_documentation', writer => '_set_documentation', ); sub _build_documentation { my $self = shift; return undef unless ( $self->is_perl_file ); my $section = $self->section; return undef unless $section; my $documentation; if ( $section =~ $RE_SECTION ) { my $name = MetaCPAN::Util::strip_pod($1); $documentation = $name if ( $name =~ /^[\w\.:\-_']+$/ ); } $documentation = MetaCPAN::Util::strip_pod($documentation) if $documentation; return undef unless length $documentation; # Modules to be indexed my @indexed = grep { $_->indexed } @{ $self->module || [] }; # This is a Pod file, return its name if ( $documentation && $self->is_pod_file ) { return $documentation; } # OR: found an indexed module with the same name if ( $documentation && grep { $_->name eq $documentation } @indexed ) { return $documentation; } # OR: found an indexed module with a name if ( my ($mod) = grep { defined $_->name } @indexed ) { return $mod->name; } # OR: we have a parsed documentation return $documentation if defined $documentation; # OR: found ANY module with a name (better than nothing) if ( my ($mod) = grep { defined $_->name } @{ $self->module || [] } ) { return $mod->name; } return undef; } has documentation_length => ( is => 'ro', isa => Maybe [Int], lazy => 1, builder => '_build_documentation_length', ); sub _build_documentation_length { my ($self) = @_; return length( $self->documentation ); } =head2 suggest Autocomplete info for documentation. =cut has suggest => ( is => 'ro', # isa => Maybe [HashRef], # remarked: breaks the suggester lazy => 1, builder => '_build_suggest', ); sub _build_suggest { my $self = shift; my $doc = $self->documentation; # return +{} unless $doc; # remarked because of 'isa' return unless $doc; my $weight = 1000 - length($doc); $weight = 0 if $weight < 0; return +{ input => [$doc], weight => $weight, }; } =head2 indexed B Indicates whether the file should be included in the search index or not. See L for a more verbose explanation. =cut has indexed => ( required => 1, is => 'ro', isa => ESBool, lazy => 1, default => sub { my ($self) = @_; return false if $self->is_in_other_files; return false if !$self->metadata->should_index_file( $self->path ); return true; }, writer => '_set_indexed', ); =head2 level Level of this file in the directory tree of the release (i.e. C has a level of C<0>). =cut has level => ( required => 1, is => 'ro', isa => Int, lazy => 1, builder => '_build_level', ); sub _build_level { my $self = shift; my @level = split( /\//, $self->path ); return @level - 1; } =head2 pod Pure text format of the pod (see L). Consecutive whitespaces are removed to save space and for better snippet previews. =cut has pod => ( required => 1, is => 'ro', isa => ScalarRef, lazy => 1, builder => '_build_pod', index => 'analyzed', not_analyzed => 0, store => 'no', term_vector => 'with_positions_offsets', ); sub _build_pod { my $self = shift; return \'' unless ( $self->is_perl_file ); my $parser = Pod::Text->new( sentence => 0, width => 78 ); # We don't need to index pod errors. $parser->no_errata_section(1); my $content = ${ $self->content }; # The pod parser is very liberal and will "start" a pod document when it # sees /^=[a-zA-Z]/ even though it might be binary like /^=F\0?\{/. # So munge any lines that might match but are not usual pod directives # that people would use (we don't need to index non-regular pod). # Also see the test and comments in t/document/file.t for how # bizarre constructs are handled. $content =~ s/ # Pod::Simple::parse_string_document() "supports \r, \n ,\r\n"... (?: \A|\r|\r\n|\n) # beginning of line \K # (keep those characters) ( =[a-zA-Z][a-zA-Z0-9]* # looks like pod (?! # but followed by something that isn't pod: [a-zA-Z0-9] # more pod chars (the star won't be greedy enough) | \s # whitespace ("=head1 NAME\n", "=item\n") | \Z # end of line or end of doc ) ) # Prefix (to hide from Pod parser) instead of removing. /\0$1/gx; my $text = ""; $parser->output_string( \$text ); try { $parser->parse_string_document($content); } catch { warn $_[0]; }; $text =~ s/\s+/ /g; $text =~ s/ \z//; # Remove any markers we put in the text. # Should we remove other non-regular bytes that may come from the source? $text =~ s/\0//g; return \$text; } =head2 pod_lines ArrayRef of ArrayRefs of offset and length of pod blocks. Example: # Two blocks of pod, starting at line 1 and line 15 with length # of 10 lines each [[1,10], [15,10]] =cut has pod_lines => ( is => 'ro', isa => ArrayRef, type => 'integer', lazy => 1, builder => '_build_pod_lines', index => 'no', ); sub _build_pod_lines { my $self = shift; return [] unless ( $self->is_perl_file ); my ( $lines, $slop ) = MetaCPAN::Util::pod_lines( ${ $self->content } ); $self->_set_slop( $slop || 0 ); return $lines; } =head2 sloc Source Lines of Code. Strips empty lines, pod and C section from L and returns the number of lines. =cut has sloc => ( required => 1, is => 'ro', isa => Int, lazy => 1, builder => '_build_sloc', ); # Metrics from Perl::Metrics2::Plugin::Core. sub _build_sloc { my $self = shift; return 0 unless ( $self->is_perl_file ); my @content = split( "\n", ${ $self->content } ); my $pods = 0; # Use pod_lines data to remove pod content from string. map { splice( @content, $_->[0], $_->[1], map {''} 1 .. $_->[1] ) } @{ $self->pod_lines }; my $sloc = 0; while (@content) { my $line = shift @content; last if ( $line =~ /^\s*__(DATA|END)__/s ); $sloc++ if ( $line !~ /^\s*#/ && $line =~ /\S/ ); } return $sloc; } =head2 slop Source Lines of Pod. Returns the number of pod lines using L. =cut has slop => ( required => 1, is => 'ro', isa => Int, lazy => 1, builder => '_build_slop', writer => '_set_slop', ); sub _build_slop { my $self = shift; return 0 unless ( $self->is_perl_file ); $self->_build_pod_lines; # danger! infinite recursion if not set by `_build_pod_lines` # we should probably find a better solution -- Mickey return $self->slop; } =head2 stat L info of the archive file. Contains C, C and C. =cut has stat => ( is => 'ro', isa => Stat, dynamic => 1, ); =head2 version Contains the raw version string. =cut has version => ( is => 'ro', ); =head2 version_numified B Numeric representation of L. Contains 0 if there is no version or the version could not be parsed. =cut has version_numified => ( required => 1, is => 'ro', isa => Num, lazy => 1, builder => '_build_version_numified', ); sub _build_version_numified { my $self = shift; return numify_version( $self->version ); } =head2 mime MIME type of file. Derived using L (for speed). =cut has mime => ( required => 1, is => 'ro', lazy => 1, builder => '_build_mime', ); sub _build_mime { my $self = shift; if ( !$self->directory && $self->name !~ /\./ && grep { $self->name ne $_ } @NOT_PERL_FILES ) { my $content = ${ $self->content }; return "text/x-script.perl" if ( $content =~ /^#!.*?perl/ ); } else { return Plack::MIME->mime_type( $self->name ) || 'text/plain'; } } has [qw(path author name)] => ( is => 'ro', required => 1 ); sub _build_path { my $self = shift; return join( '/', $self->release->name, $self->name ); } has dir => ( is => 'ro', isa => Str, lazy => 1, builder => '_build_dir', index => 'not_analyzed' ); sub _build_dir { my $self = shift; $DB::single = 1; my $dir = $self->path; $dir =~ s{/[^/]+$}{}; return $dir; } has [qw(release distribution)] => ( is => 'ro', required => 1, analyzer => [qw(standard camelcase lowercase)], ); =head1 ATTRIBUTES These attributes are not stored. =head2 content A scalar reference to the content of the file. =cut has content => ( is => 'ro', isa => ScalarRef, lazy => 1, default => sub { \"" }, property => 0, ); =head2 local_path This attribute holds the path to the file on the local filesystem. =cut has local_path => ( is => 'ro', property => 0, ); =head2 metadata Reference to the L object of the release. =cut has metadata => ( is => 'ro', isa => 'CPAN::Meta', lazy => 1, default => sub { die 'meta attribute missing' }, property => 0, ); =head1 METHODS =head2 is_perl_file Return true if the file extension is one of C, C, C, C or if the file has no extension, is not a binary file and its size is less than 131072 bytes. This is an arbitrary limit but it keeps the pod parser happy and the indexer fast. =cut sub is_perl_file { my $self = shift; return 0 if ( $self->directory ); return 1 if ( $self->name =~ /\.(pl|pm|pod|t)$/i ); return 1 if ( $self->mime eq "text/x-script.perl" ); return 1 if ( $self->name !~ /\./ && !( grep { $self->name eq $_ } @NOT_PERL_FILES ) && !$self->binary && $self->stat->{size} < 2**17 ); return 0; } =head2 is_pod_file Returns true if the file extension is C. =cut sub is_pod_file { shift->name =~ /\.pod$/i; } =head2 add_module Requires at least one parameter which can be either a HashRef or an instance of L. =cut sub add_module { my ( $self, @modules ) = @_; $_ = MetaCPAN::Document::Module->new($_) for ( grep { ref $_ eq 'HASH' } @modules ); $self->_set_module( [ @{ $self->module }, @modules ] ); } =head2 is_in_other_files Returns true if the file is one from the list below. =cut sub is_in_other_files { my $self = shift; my @other = qw( AUTHORS Build.PL Changelog ChangeLog CHANGELOG Changes CHANGES CONTRIBUTING CONTRIBUTING.md CONTRIBUTING.pod Copying COPYRIGHT cpanfile CREDITS dist.ini FAQ INSTALL INSTALL.md INSTALL.pod LICENSE Makefile.PL MANIFEST META.json META.yml NEWS README README.md README.pod THANKS Todo ToDo TODO ); return any { $self->path eq $_ } @other; } =head2 set_indexed Expects a C<$meta> parameter which is an instance of L. For each package (L) in the file and based on L it is decided, whether the module should have a true L attribute. If there are any packages with leading underscores, the module gets a false L attribute, because PAUSE doesn't allow this kind of name for packages (https://github.com/andk/pause/blob/master/lib/PAUSE/pmfile.pm#L249). If L returns true but the package declaration uses the I hack, the L property is set to false. package # hide from PAUSE MyTest::Module; # will result in indexed => 0 Once that is done, the L property of the file is determined by searching the list of L for a module that matches the value of L. If there is no such module, the L property is set to false. If the file does not include any modules, the L property is true. =cut sub set_indexed { my ( $self, $meta ) = @_; # modules explicitly listed in 'provides' should be indexed foreach my $mod ( @{ $self->module } ) { if ( exists $meta->provides->{ $mod->name } and $self->path eq $meta->provides->{ $mod->name }{file} ) { $mod->_set_indexed(true); return; } } # files listed under 'other files' are not shown in a search if ( $self->is_in_other_files() ) { foreach my $mod ( @{ $self->module } ) { $mod->_set_indexed(false); } $self->_set_indexed(false); return; } # files under no_index directories should not be indexed foreach my $dir ( @{ $meta->no_index->{directory} } ) { if ( $self->path eq $dir or $self->path =~ /^$dir\// ) { $self->_set_indexed(false); return; } } foreach my $mod ( @{ $self->module } ) { if ( $mod->name !~ /^[A-Za-z]/ or !$meta->should_index_package( $mod->name ) ) { $mod->_set_indexed(false); next; } $mod->_set_indexed( $mod->hide_from_pause( ${ $self->content }, $self->name ) ? false : true ); } if ( my $doc_name = $self->documentation ) { # normalize the documentation name for comparison the same way module # names are normalized my $normalized_doc_name = $doc_name =~ s{'}{::}gr; $self->_set_indexed( ( # .pm file with no package declaration but pod should be indexed !@{ $self->module } || # don't index if the documentation doesn't match any of its modules grep { $normalized_doc_name eq $_->name } @{ $self->module } ) ? true : false ); } } =head2 set_authorized Expects a C<$perms> parameter which is a HashRef. The key is the module name and the value an ArrayRef of author names who are allowed to release that module. The method returns a list of unauthorized, but indexed modules. Unauthorized modules are modules that were uploaded in the name of a different author than stated in the C<06perms.txt.gz> file. One problem with this file is, that it doesn't record historical data. It may very well be that an author was authorized to upload a module at the time. But then his co-maintainer rights might have been revoked, making consecutive uploads of that release unauthorized. However, since this script runs with the latest version of C<06perms.txt.gz>, the former upload will be flagged as unauthorized as well. Same holds the other way round, a previously unauthorized release would be flagged authorized if the co-maintainership was added later on. If a release contains unauthorized modules, the whole release is marked as unauthorized as well. =cut sub set_authorized { my ( $self, $perms ) = @_; if ( $self->distribution eq 'perl' ) { my $allowed = grep $_ eq $self->author, @{ $perms->{perl} }; foreach my $module ( @{ $self->module } ) { $module->_set_authorized( $allowed ? true : false ); } $self->_set_authorized( $allowed ? true : false ); } else { foreach my $module ( @{ $self->module } ) { $module->_set_authorized(false) if ( $perms->{ $module->name } && !grep { $_ eq $self->author } @{ $perms->{ $module->name } } ); } $self->_set_authorized(false) if ( $self->authorized && $self->documentation && $perms->{ $self->documentation } && !grep { $_ eq $self->author } @{ $perms->{ $self->documentation } } ); } return grep { !$_->authorized && $_->indexed } @{ $self->module }; } =head2 full_path Concatenate L, L and L. =cut sub full_path { my $self = shift; return join( '/', $self->author, $self->release, $self->path ); } __PACKAGE__->meta->make_immutable; 1;