DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.

Snippets has posted 5883 posts at DZone. View Full User Profile

Text Synonymizer In Perl - Unintelligent Text Rewriter

04.14.2006
| 5916 views |
  • submit to reddit
        Very scrappy and silly, but you get some funny results. It uses the great Lingua::EN::Tagger for part-of-speech (POS) tagging and WordNet::QueryData to look up synonyms.

use strict;
use warnings;

use WordNet::QueryData;
use Lingua::EN::Tagger;

# Unintelligent text rewriter.
#
# Reads the file named by the first command-line argument, tags every word
# with its part of speech, and replaces each noun, adjective and verb with
# the first WordNet synonym (sense #1) that does not itself contain the
# original word. The rewritten text is printed to STDOUT.
#
# Usage: perl <this script> FILE

my $path = $ARGV[0];
die "Usage: $0 FILE\n" unless defined $path && length $path;

my $t  = Lingua::EN::Tagger->new;
my $wn = WordNet::QueryData->new;

# Slurp the whole input; three-argument open with a lexical handle so the
# file name can never be interpreted as an open mode.
open(my $fh, '<', $path) or die "Cannot open '$path': $!\n";
my $text = do { local $/; <$fh> };
close($fh);
$text = '' unless defined $text;    # empty file

my $tagged = $t->add_tags($text);
$tagged = '' unless defined $tagged;    # keep the match below warning-free

# The tagged text is scanned for <tag>word</tag> pairs.
while ($tagged =~ /\<(.+?)\>(\w+)\<.+?\>/g) {
        my ($tag, $word) = ($1, $2);

        # Map the tagger's tag to a WordNet part-of-speech letter.
        # Precedence (verb, then adjective, then noun) mirrors the order
        # in which the original assignments overrode one another.
        my $pos = $tag =~ /vb/i ? "v"
                : $tag =~ /jj/i ? "a"
                : $tag =~ /nn/i ? "n"
                :                 "";
        next unless $pos;

        foreach my $sense ($wn->querySense(join("#", $word, $pos, 1), "syns")) {
                # Entries look like "word#pos#sense"; keep only the word.
                (my $synonym = $sense) =~ s/\#.+//;

                # Skip candidates that merely contain the original word.
                next if index($synonym, $word) >= 0;

                # Replace the first whole-word occurrence only; without the
                # \b anchors "cat" would also rewrite part of "category".
                $text =~ s/\b\Q$word\E\b/$synonym/;
                last;
        }
}

print $text;
exit;

Or, to apply it to a Web page at a given URL, use HTML::Parser (with LWP::Simple to fetch the page) like so:

use strict;
use warnings;

use WordNet::QueryData;
use Lingua::EN::Tagger;
use HTML::Parser;
use LWP::Simple;

# Fetch a Web page and print every substantial text chunk both as found
# and after synonym rewriting.
#
# Usage: perl <this script> [URL]
# The URL defaults to the page the original snippet was hard-wired to.

# File-level lexicals: $t and $wn are used by synonymize() below.
my $t  = Lingua::EN::Tagger->new;
my $wn = WordNet::QueryData->new;
my $p  = HTML::Parser->new( text_h => [\&text, "text"] );

my $url = @ARGV ? $ARGV[0] : "http://www.petercooper.co.uk/";

# LWP::Simple::get returns undef on failure; stop with a message instead
# of handing undef to the parser.
my $html = get($url);
die "Could not fetch $url\n" unless defined $html;

$p->parse($html);
# Signal end of document so text still buffered by the parser is
# delivered to the text handler.
$p->eof;

exit;

# HTML::Parser text handler.
#
# Receives one chunk of document text, collapses every run of whitespace
# to a single space and, if the chunk contains at least five consecutive
# word characters, prints it followed by its synonymized version.
# Shorter chunks are ignored. Returns nothing.
sub text {
        my ($text) = @_;

        $text =~ s/\s+/\ /g;
        return unless $text =~ /\w{5}/;

        print "WAS: " . $text . "\n\n";
        print "BECOMES: " . synonymize($text) . "\n\n\n\n";
        return;
}

# Rewrite a piece of text by swapping nouns, adjectives and verbs for a
# WordNet synonym.
#
# Takes the text as its only argument and returns the rewritten copy.
# Uses the file-level tagger ($t) and WordNet handle ($wn). For every
# tagged word whose tag marks it as a noun, adjective or verb, the first
# sense-#1 synonym that does not contain the original word replaces the
# first whole-word occurrence of that word in the text.
sub synonymize {
        my ($text) = @_;

        my $tagged = $t->add_tags($text);
        # Nothing to scan: hand the text back untouched.
        return $text unless defined $tagged;

        # The tagged text is scanned for <tag>word</tag> pairs.
        while ($tagged =~ /\<(.+?)\>(\w+)\<.+?\>/g) {
                my ($tag, $word) = ($1, $2);

                # Map the tagger's tag to a WordNet part-of-speech letter.
                # Precedence (verb, then adjective, then noun) mirrors the
                # order in which the original assignments overrode one another.
                my $pos = $tag =~ /vb/i ? "v"
                        : $tag =~ /jj/i ? "a"
                        : $tag =~ /nn/i ? "n"
                        :                 "";
                next unless $pos;

                foreach my $sense ($wn->querySense(join("#", $word, $pos, 1), "syns")) {
                        # Entries look like "word#pos#sense"; keep only the word.
                        (my $synonym = $sense) =~ s/\#.+//;

                        # Skip candidates that merely contain the original word.
                        next if index($synonym, $word) >= 0;

                        # Replace the first whole-word occurrence only; without
                        # the \b anchors "cat" would also rewrite part of
                        # "category".
                        $text =~ s/\b\Q$word\E\b/$synonym/;
                        last;
                }
        }

        return $text;
}