FAQ
I have written an interface for m-w.com. I found some scripts on the web but
nothing really robust. Please have a look, make comments, request
functionality, make suggestions, make changes, or anything else you feel
useful. If I have enough time, I will make this fully object-oriented.

I welcome all comments.

Thanks,
Michael

--------------begin mw.pl-----------------------------
#!/usr/bin/perl
use warnings;
use strict;
use LWP::UserAgent;
use HTTP::Request::Common qw(POST);
use Getopt::Std;

# OPTIONS:
# 1. retain tabs a
# 2. double spaced d
# 3. help h
# 4. divider btween words i
# 5. wrap text r
# 6. thesaurus t
# 7. ugly formatting u
# 8. verbose or recursive v
# 9. set width w

local $SIG{__WARN__} = sub {
my $msg = shift;
die $msg if $msg =~ /Unknown option/;
};

getopts('adhi:rtuvw:', \my %opts);
if (@ARGV==0 or $opts{'h'} or
($opts{'w'} and $opts{'w'} !~ /^\d+$/))
{
die <<"HELP";
usage: $0 [-dhrtuv -iC -wN] word1 [word2 [word3 [...]]]
-a Do not convert tabs to single spaces, does not wrap properly
-d Double-spaced, default is single-spaced
-h Prints this usage message
-iC Prints a divider of character C between entries
-r Wrap line at width by inserting \\n, use with -wN
-t Use the thesaurus, default is dictionary
-u Ugly formattting, translate <br> to space instead of \\n
But why would you want to do this?
-v Verbose, print all entries, default is only the first
-wN Set widths of divider and text-wrap to N, default is 80\n
Examples:
# all definitions of harvest and posture separated by 80 _'s
$0 -vi_ harvest posture\n
# all thesaurus entries for hot separated by 60 +'s
$0 -tvi+ -w60 hot\n
# first definitions of cape and toad, double-spaced, separated by 65 \%'s
# retain tabs, wrap text at column 65, ugly formatting
$0 -adrui% -w65 cape toad
HELP
}

my $spacing = $opts{'d'} ? "\n\n" : "\n" ;
my $div_str = $opts{'i'} ? $opts{'i'} : '-' ;
my $book = $opts{'t'} ? 'Thesaurus' : 'Dictionary';
my $width = $opts{'w'} ? $opts{'w'} : 80 ;
my $ugly = $opts{'u'} ? ' ' : "\n" ;
my $divider = join '', substr($div_str x $width,0,$width), $spacing;

my $ua = new LWP::UserAgent or die "Cannot create UserAgent\n";
my $url = 'http://www.m-w.com/cgi-bin/dictionary';
my ($html, %entity);

while ( my $word = shift ) {
$html = get_content([ book => $book, va => $word ]);

if ($html =~ /One entry found for .+?$word/i) {
$html = join '', $&, $';
print_entry();
} elsif ($html =~ /(\d+ entries found for .+?$word)/i) {
$html = $1;
html2text(\$html);
print "$html$spacing";

$html = join '', $&, $';
my($list) = $html =~
/<input type=hidden name=list value="(.+?)">/;
my @entries = split /=|;/, $list;
splice @entries, 0, 2; # this entry already displayed
print_entry();

while ($opts{'v'} and @entries) {
$html = get_content([ book => $book,
hdwd => $word,
listword => $word,
jump => $entries[0],
list => $list
]);
print_entry();
splice @entries, 0, 2;
}
} else {
print "$word not found in the ", lc $book, " at http://www.m-w.com\n";
}
}


sub get_content {
die "trying to POST an empty request" unless @_;
my $request = POST $url, shift;
my $response = $ua->request($request);
$response->content;
}


sub print_entry {
($html) = $html =~ m!((?:Main Entry|Entry Word):.*?)</form>!is;
{ no warnings 'uninitialized';
my $x;
# neat trick that turns
# text 1 a : definition one b : definition two
# into
# text
# 1a: definition one
# 1b: definition two
$html =~
s!<b>(\d+)?
(?:\s+)?
([a-z])?
</b>
!join '', "\n", ($1 and $x=$1 or $x), $2
!xeg;
}

$html =~ s/<br>/$ugly/g unless $opts{'u'};
$html =~ s/\t/ /g unless $opts{'a'};
html2text(\$html);
# expecting \n, but server sent chr(13), that's what you get for
expectations
$html = join $spacing, split /[\n\r\x{0A}\x{0D}]+/, $html;
$html =~ s/^\s+|\s+$//g;
# substitute only if the present line is greater than width
my $wrap = $width + 1;
$html =~ s/(?=.{$wrap})(.{1,$width}) +(?=[^\n])/$1$spacing/g if
$opts{'r'};

print "$html$spacing";
print $divider if $opts{'i'};
}


sub html2text {
# typeglob aliases are supposedly faster than refs to scalars
our $htm;
local *htm = shift;
return unless length $htm;
my($begin, $end) = ('<!--', '-->');

# The three following substitutions and the %entity initialization below
# were brazenly copied from Tom Christiansens's striphtml (striff tummel)
# perl script written back in 1996. It can still be found at
# http://www.perl.com/CPAN-local/authors/Tom_Christiansen/scripts/. I
# slightly changed the first substitution: it now handles embedded
comments.
# Hopefully, tags within comments are still handled properly. Otherwise,
# all else is the same.

# 1. remove embedded comments
1 while $htm =~ s/$begin (?:(?!$begin).)*? $end//gxs;
# 2. remove tags
$htm =~ s/<(?:[^>'"]+ | ".*?" | '.*?')*?>//gxs;
load_entity() unless %entity;
{ no warnings 'uninitialized';
# 3. replace entities with actual characters
$htm =~ s/(&(\w+ | \x23\d+);?)/$entity{$2} || $1/gxe;
}
}


sub load_entity {
# &lt;
%entity = (
lt => '<', # less-than
gt => '>', # greater-than
amp => '&', # ampersand
quot => '"', # verticle double-quote
nbsp => chr 160, # no-break space
iexcl => chr 161, # inverted exclamation mark
cent => chr 162, # cent sign
pound => chr 163, # pound sterling sign CURRENCY NOT WEIGHT
curren => chr 164, # general currency sign
yen => chr 165, # yen sign
brvbar => chr 166, # broken vertical bar
sect => chr 167, # section sign
uml => chr 168, # umlaut (dieresis)
copy => chr 169, # copyright sign
ordf => chr 170, # ordinal indicator, feminine
laquo => chr 171, # angle quotation mark, left
not => chr 172, # not sign
shy => chr 173, # soft hyphen
reg => chr 174, # registered sign
macr => chr 175, # macron
deg => chr 176, # degree sign
plusmn => chr 177, # plus-or-minus sign
sup2 => chr 178, # superscript two
sup3 => chr 179, # superscript three
acute => chr 180, # acute accent
micro => chr 181, # micro sign
para => chr 182, # pilcrow (paragraph sign)
middot => chr 183, # middle dot
cedil => chr 184, # cedilla
sup1 => chr 185, # superscript one
ordm => chr 186, # ordinal indicator, masculine
raquo => chr 187, # angle quotation mark, right
frac14 => chr 188, # fraction one-quarter
frac12 => chr 189, # fraction one-half
frac34 => chr 190, # fraction three-quarters
iquest => chr 191, # inverted question mark
Agrave => chr 192, # capital A, grave accent
Aacute => chr 193, # capital A, acute accent
Acirc => chr 194, # capital A, circumflex accent
Atilde => chr 195, # capital A, tilde
Auml => chr 196, # capital A, dieresis or umlaut mark
Aring => chr 197, # capital A, ring
AElig => chr 198, # capital AE diphthong (ligature)
Ccedil => chr 199, # capital C, cedilla
Egrave => chr 200, # capital E, grave accent
Eacute => chr 201, # capital E, acute accent
Ecirc => chr 202, # capital E, circumflex accent
Euml => chr 203, # capital E, dieresis or umlaut mark
Igrave => chr 204, # capital I, grave accent
Iacute => chr 205, # capital I, acute accent
Icirc => chr 206, # capital I, circumflex accent
Iuml => chr 207, # capital I, dieresis or umlaut mark
ETH => chr 208, # capital Eth, Icelandic
Ntilde => chr 209, # capital N, tilde
Ograve => chr 210, # capital O, grave accent
Oacute => chr 211, # capital O, acute accent
Ocirc => chr 212, # capital O, circumflex accent
Otilde => chr 213, # capital O, tilde
Ouml => chr 214, # capital O, dieresis or umlaut mark
times => chr 215, # multiply sign
Oslash => chr 216, # capital O, slash
Ugrave => chr 217, # capital U, grave accent
Uacute => chr 218, # capital U, acute accent
Ucirc => chr 219, # capital U, circumflex accent
Uuml => chr 220, # capital U, dieresis or umlaut mark
Yacute => chr 221, # capital Y, acute accent
THORN => chr 222, # capital THORN, Icelandic
szlig => chr 223, # small sharp s, German (sz ligature)
agrave => chr 224, # small a, grave accent
aacute => chr 225, # small a, acute accent
acirc => chr 226, # small a, circumflex accent
atilde => chr 227, # small a, tilde
auml => chr 228, # small a, dieresis or umlaut mark
aring => chr 229, # small a, ring
aelig => chr 230, # small ae diphthong (ligature)
ccedil => chr 231, # small c, cedilla
egrave => chr 232, # small e, grave accent
eacute => chr 233, # small e, acute accent
ecirc => chr 234, # small e, circumflex accent
euml => chr 235, # small e, dieresis or umlaut mark
igrave => chr 236, # small i, grave accent
iacute => chr 237, # small i, acute accent
icirc => chr 238, # small i, circumflex accent
iuml => chr 239, # small i, dieresis or umlaut mark
eth => chr 240, # small eth, Icelandic
ntilde => chr 241, # small n, tilde
ograve => chr 242, # small o, grave accent
oacute => chr 243, # small o, acute accent
ocirc => chr 244, # small o, circumflex accent
otilde => chr 245, # small o, tilde
ouml => chr 246, # small o, dieresis or umlaut mark
divide => chr 247, # divide sign
oslash => chr 248, # small o, slash
ugrave => chr 249, # small u, grave accent
uacute => chr 250, # small u, acute accent
ucirc => chr 251, # small u, circumflex accent
uuml => chr 252, # small u, dieresis or umlaut mark
yacute => chr 253, # small y, acute accent
thorn => chr 254, # small thorn, Icelandic
yuml => chr 255, # small y, dieresis or umlaut mark
);

# &#161;
for(0..255){
$entity{'#' . $_} = chr $_;
}
}

Search Discussions

  • Zentara at Jul 8, 2003 at 7:15 pm

    On Mon, 7 Jul 2003 21:19:19 -0400, zeus.odin@verizon.net (Zeus Odin) wrote:

    I have written an interface for m-w.com. I found some scripts on the web but
    nothing really robust. Please have a look, make comments, request
    functionality, make suggestions, make changes, or anything else you feel
    useful. If I have enough time, I will make this fully object-oriented.

    I welcome all comments.
    Nice script......into my ~/bin. :-)

Related Discussions

Discussion Navigation
viewthread | post
Discussion Overview
groupbeginners @
categoriesperl
postedJul 8, '03 at 2:18a
activeJul 8, '03 at 7:15p
posts2
users2
websiteperl.org

2 users in discussion

Zeus Odin: 1 post Zentara: 1 post

People

Translate

site design / logo © 2022 Grokbase