#!/usr/bin/perl -w use strict; # # Convert a perl script into an xml file # # usage: # perlxmltok myfile.pl >myfile.xml # perlxmltok myfile.xml # # The script is broken at the line and token level. # # This file is one of the examples distributed with perltidy and demonstrates # using a callback object with Perl::Tidy to walk through a perl file and # process its tokens. It may or may not have any actual usefulness. You can # modify it to suit your own purposes; see sub get_line(). # use Perl::Tidy; use IO::File; use Getopt::Std; use vars qw($opt_h); my $file; my $usage = <outfile EOM getopts('h') or die "$usage"; if ($opt_h) {die $usage} if ( @ARGV == 1 ) { $file = $ARGV[0]; } else { die $usage } my $source; my $fh; if ($file) { $fh = IO::File->new( $file, 'r' ); unless ($fh) { die "cannot open '$file': $!\n" } $source = $fh; } else { $source = '-'; } my $formatter = Perl::Tidy::XmlWriter->new($file); my $dest; # start perltidy, which will start calling our write_line() my $err = perltidy( 'formatter' => $formatter, # callback object 'source' => $source, 'destination' => \$dest, # not really needed 'argv' => "-npro -se", # dont need .perltidyrc # errors to STDOUT ); if ($err) { die "Error calling perltidy\n"; } $fh->close() if $fh; ##################################################################### # # The Perl::Tidy::XmlWriter class writes a copy of the input stream in xml # ##################################################################### package Perl::Tidy::XmlWriter; # class variables use vars qw{ %token_short_names %short_to_long_names $rOpts $missing_html_entities }; # replace unsafe characters with HTML entity representation if HTML::Entities # is available { eval "use HTML::Entities"; $missing_html_entities = $@; } sub new { my ( $class, $input_file ) = @_; my $self = bless { }, $class; $self->print( <<"HEADER"); HEADER unless ( !$input_file || $input_file eq '-' || ref($input_file) ) { $self->print( <<"COMMENT"); COMMENT } $self->print("\n"); return $self; } sub print { my ( $self, $line ) = @_; print $line; } sub write_line { # This routine will be called once perl line by perltidy my $self = shift; my ($line_of_tokens) = @_; my $line_type = $line_of_tokens->{_line_type}; my $input_line = $line_of_tokens->{_line_text}; my $line_number = $line_of_tokens->{_line_number}; chomp $input_line; $self->print(" \n"); $self->print(" \n"); $input_line = my_encode_entities($input_line); $self->print("$input_line\n"); $self->print(" \n"); # markup line of code.. if ( $line_type eq 'CODE' ) { my $xml_line; my $rtoken_type = $line_of_tokens->{_rtoken_type}; my $rtokens = $line_of_tokens->{_rtokens}; if ( $input_line =~ /(^\s*)/ ) { $xml_line = $1; } else { $xml_line = ""; } my $rmarked_tokens = $self->markup_tokens( $rtokens, $rtoken_type ); $xml_line .= join '', @$rmarked_tokens; $self->print(" \n"); $self->print("$xml_line\n"); $self->print(" \n"); } $self->print(" \n"); } BEGIN { # This is the official list of tokens which may be identified by the # user. Long names are used as getopt keys. Short names are # convenient short abbreviations for specifying input. Short names # somewhat resemble token type characters, but are often different # because they may only be alphanumeric, to allow command line # input. Also, note that because of case insensitivity of xml, # this table must be in a single case only (I've chosen to use all # lower case). # When adding NEW_TOKENS: update this hash table # short names => long names %short_to_long_names = ( 'n' => 'numeric', 'p' => 'paren', 'q' => 'quote', 's' => 'structure', 'c' => 'comment', 'b' => 'blank', 'v' => 'v-string', 'cm' => 'comma', 'w' => 'bareword', 'co' => 'colon', 'pu' => 'punctuation', 'i' => 'identifier', 'j' => 'label', 'h' => 'here-doc-target', 'hh' => 'here-doc-text', 'k' => 'keyword', 'sc' => 'semicolon', 'm' => 'subroutine', 'pd' => 'pod-text', ); # Now we have to map actual token types into one of the above short # names; any token types not mapped will get 'punctuation' # properties. # The values of this hash table correspond to the keys of the # previous hash table. # The keys of this hash table are token types and can be seen # by running with --dump-token-types (-dtt). # When adding NEW_TOKENS: update this hash table # $type => $short_name %token_short_names = ( '#' => 'c', 'n' => 'n', 'v' => 'v', 'b' => 'b', 'k' => 'k', 'F' => 'k', 'Q' => 'q', 'q' => 'q', 'J' => 'j', 'j' => 'j', 'h' => 'h', 'H' => 'hh', 'w' => 'w', ',' => 'cm', '=>' => 'cm', ';' => 'sc', ':' => 'co', 'f' => 'sc', '(' => 'p', ')' => 'p', 'M' => 'm', 'P' => 'pd', ); # These token types will all be called identifiers for now # FIXME: need to separate user defined modules as separate type my @identifier = qw" i t U C Y Z G :: "; @token_short_names{@identifier} = ('i') x scalar(@identifier); # These token types will be called 'structure' my @structure = qw" { } "; @token_short_names{@structure} = ('s') x scalar(@structure); } sub markup_tokens { my $self = shift; my ( $rtokens, $rtoken_type ) = @_; my ( @marked_tokens, $j, $string, $type, $token ); for ( $j = 0 ; $j < @$rtoken_type ; $j++ ) { $type = $$rtoken_type[$j]; $token = $$rtokens[$j]; #------------------------------------------------------- # Patch : intercept a sub name here and split it # into keyword 'sub' and sub name if ( $type eq 'i' && $token =~ /^(sub\s+)(\w.*)$/ ) { $token = $self->markup_xml_element( $1, 'k' ); push @marked_tokens, $token; $token = $2; $type = 'M'; } # Patch : intercept a package name here and split it # into keyword 'package' and name if ( $type eq 'i' && $token =~ /^(package\s+)(\w.*)$/ ) { $token = $self->markup_xml_element( $1, 'k' ); push @marked_tokens, $token; $token = $2; $type = 'i'; } #------------------------------------------------------- $token = $self->markup_xml_element( $token, $type ); push @marked_tokens, $token; } return \@marked_tokens; } sub my_encode_entities { my ($token) = @_; # escape any characters not allowed in XML content. # ??s/’/'/; if ($missing_html_entities) { $token =~ s/\&/&/g; $token =~ s/\/>/g; $token =~ s/\"/"/g; } else { HTML::Entities::encode_entities($token); } return $token; } sub markup_xml_element { my $self = shift; my ( $token, $type ) = @_; if ($token) { $token = my_encode_entities($token) } # get the short abbreviation for this token type my $short_name = $token_short_names{$type}; if ( !defined($short_name) ) { $short_name = "pu"; # punctuation is default } $token = qq(<$short_name>) . $token . qq(); return $token; } sub finish_formatting { # called after last line my $self = shift; $self->print("\n"); return; }