[Date Index][Thread Index]
[Date Prev][Date Next][Thread Prev][Thread Next]
Re: 2nd Try: Page Explorer
- From: nospam@thanx (Ralf S. Engelschall)
- Date: Wed, 14 Jan 1998 17:54:12 +0100 (MET)
Hello Viorel ANGHEL, in a previous mail you wrote:
> > Any more comments?
>
> It's free? I'm interested to use that script local.
Sure, I'm not an PC/Windows-idiot who hacks a 10 KB script and then tries to
make money with it ;_) No problem. I've appended the script to this message
It's self-contained, i.e. it even contains the header image.
Feel free to use it. Only one exception: When you enhance it, don't make your
own program out of it, send _me_ the diffs, please. In other words: It is
freely available and useable, but it's not freeware.
Greetings,
Ralf S. Engelschall
rse@engelschall.com
www.engelschall.com
#!/sw/bin/perl
##
## pe.cgi -- Page Explorer
## Copyright (c) 1998 Ralf S. Engelschall, All Rights Reserved.
##
## This script is a webdesigner's tool to explore how
## real-life webpages are constructed. It works by acting
## as a filtering proxy which converts the webpage and
## its inlined graphics on-the-fly. The amount of
## conversion is controlled by a exploration level.
##
## Disclaimer: I didn't use the CGI.pm module because
## it leads to unexpected errors under the
## combination Apache 1.2.[45] / Solaris 2.6 :-(
##
require 5.004;
# import used modules
use URI::URL;
use HTTP::Headers;
use HTTP::Request;
use LWP::UserAgent;
use Image::Size;
use GD;
# switch to unbuffered I/O
$|++;
# surround with an eval to catch errors[...]
eval {
##
## configuration
## [THIS SHOULD BE THE ONLY PLACE WHERE YOU EDIT SOMETHING]
##
# determine our name
$server_name = $ENV{'SERVER_NAME'};
# the URL to an optionally used proxy ('none' for no proxy)
$proxy_url = 'none';
$proxy_url = 'http://en1.engelschall.com:8080/' if ($server_name =~ m|^en1|);
#$proxy_url = 'http://www.de.engelschall.com:8080/' if ($server_name =~ m|^www\.engelschall\.com$|);
#$proxy_url = 'http://???:8080/' if ($server_name =~ m|^www\.ch\.engelschall\.com$|);
# comma seperated list of domains for
# which no proxy is used
$no_proxy_domains = 'none';
# the URL to ourself for URL rewriting
#
# Note: I use it with a directory style URL through
# the use of the following Apache/mod_rewrite
# ruleset. Just index.cgi will not work! You
# then also have to set $my_url to ../index.cgi!
#
# RewriteEngine on
# RewriteRule ^$ pe.cgi [L]
# RewriteRule ^pe\.cgi - [L]
# RewriteRule ^(.+) pe.cgi/$1 [T=application/x-httpd-cgi,L]
#
$my_url = 'http://'.$server_name.'/sw/wml/pe/';
$my_url_sep = '/' if ($my_url !~ m|/$|);
# the inital URL for the form
$init_url = 'http://'.$server_name.'/sw/wml/';
# the name of this service
$our_name = 'PageExplorer';
$our_vers = '1.0.0';
##
## import of parameters
##
# PATH_INFO
$path_info = $ENV{'PATH_INFO'};
# QUERY_STRING
$query_string = $ENV{'QUERY_STRING'};
if ($ENV{'REQUEST_METHOD'} eq 'POST') {
$query_string = '';
while (<STDIN>) { $query_string .= $_; }
}
%qs = ();
@pairs = split(/&/, $query_string);
foreach $pair (@pairs) {
my ($name, $value) = split(/=/, $pair);
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack('C', hex($1))/eg;
$qs{$name} = $value;
}
##
## helping functions
##
# send a HTTP response for a complete page
sub send_page {
my ($type, $data) = @_;
print "Content-type: $type\n";
print "Content-length: ".sprintf("%d", length($data))."\n";
print "Connection: close\n";
print "Server: $our_name/$our_vers\n";
print "\n";
print $data;
}
# send a HTTP redirect
sub send_redirect {
my ($url) = @_;
print "Status: 302\n";
print "Location: $url\n";
print "URI: $url\n";
print "Server: $our_name/$our_vers\n";
print "Content-type: text/html\n";
print "\n";
print "New URL: <a href=\"$url\">$url</a>\n";
}
sub uudecode {
my($in) = @_;
my($result,$file,$mode);
$result = $mode = $file = '';
while ($in =~ s/(.*?\n)//s) {
my $line = $1;
if ($file eq '' and !$mode){
($mode,$file) = $line =~ /^begin\s+(\d+)\s+(\S+)/ ;
next;
}
next if $file eq '' and !$mode;
last if $line =~ /^end/;
$result .= uudecode_chunk($line);
}
wantarray ? ($result,$file,$mode) : $result;
}
sub uudecode_chunk {
my($chunk) = @_;
return '' if $chunk =~ /^(--|\#|CREATED)/;
my $string = substr($chunk,0,int((((ord($chunk) - 32) & 077) + 2) / 3)*4+1);
return unpack("u", $string);
}
##
## processing
##
#
# display input form on no parameters
#
if ($path_info eq '' and $query_string eq '') {
$data = <<"EOT";
<html>
<head>
<title>Page Explorer</title>
</head>
<body bgcolor="#ffffff">
<p>
<br>
<center>
<p>
<form method="POST" action="$my_url"
enctype="application/x-www-form-urlencoded">
<table cellspacing=0 cellpadding=0 border=0>
<tr>
<td colspan=4>
<img src="${my_url}${my_url_sep}head.gif" alt="PAGE EXPLORER" width=310 height=36>
</td>
</tr>
<tr>
<td colspan=4>
<font size=+1 face="Arial,Helvetica">Copyright (c) 1998 Ralf S. Engelschall</font>
<br><br><p>
</td>
</tr>
<tr>
<td><font size=+2 face="Arial,Helvetica"><b>URL: </b></font></td>
<td colspan=2><input type="text" size=40 name="url" value="$init_url"></td>
</tr>
<tr>
<td><font size=+2 face="Arial,Helvetica"><b>Level: </b></font></td>
<td>
<select name="level">
<option value="0">0 [+just URL rewriting]
<option value="1">1 [+visible 1pt-images]
<option value="2" selected>2 [+visible borders]
<option value="3">3 [+substituted images]
<option value="4">4 [+removed colors]
</select>
</td>
<td><font face="Arial,Helvetica" size=+2 color="#cc0000"><b><input type="submit" name="submit" value="explore!"></b></font></td>
</tr>
</table>
</form>
</center>
</body>
</html>
EOT
&send_page('text/html', $data);
exit(0);
}
#
# send out the header image
#
if ($path_info eq '/head.gif') {
$head_gif = <<'EOT';
begin 664 head.gif
M1TE&.#EA-@$D`,0``/____\``,S,S&9F9C,S,__,S/]F9O\S,\S__\S,_YF9
MF<S_S)F9S/_,____S/^9S/^9F6:9F?_,F?]FF0`S,_\`,S-F9O\S`&8S9@``
M`,PS,_\S9O]F,S,`,V9FF0``,R'^%D=)1B!3;6%R=%-A=F5R(%9E<C$N,6$`
M+``````V`20```7^("".9&F>:*JN;.N^<"S/=&W?>*[O?.__P*!P2"P:C\BD
M\E=H.I_0%[0)*Q@.AX`V<#`44M.PV"4HF\_HM%J@6KO3*[5,,2!D[AG"0,%.
MO=%Q?VLC@F8R!5N)BHE=$"H0BP%?+!!9D5L'CB8&EYU;+0)XHJ.D=P)VI`,H
M`Z4"K*6P>GTDI`0P"JBP&0.S);JELB:AO[I]N<1Y>RZ(GI<'DR:6B@8LTLU<
MF]>7H,BP9=["I:JOW:($O;5DQ\2J)N6CYR3#[WC&]'<$"BS,VHN:)/P6J2A@
MK1\T`)SZ*>)VSQ0`!>),K,LC@ES#6>E8S'O7CE;#._I$;*1G[V-'%`'^%6[Y
M)R)A))8E"BJ$YE)E`(8714S,T&ND0P`6<P+(V.;C+E]&>8HT6O)C2)0V%1TH
MX8G:B9HVI[:,J@7G/8Q$`4SL&/1>N[`H=I9[.C1I.Y_EFGX<N,@J0*PK1T#R
M="(E)@B3"NRMNU418#%/O-[A)8@$1%)]'ILC4;:7R+)W1*`U@7G7+%RP/$XN
M(0`TJ:6C&/\9,5&8Z5%L2_BU*SN27;R&H[U$4=!+X40';?@\Z:*LK;:C>E5&
M(5D4F\U((9_`W!$Z`+CZAL-HG385&,(H\&H%L*C@>!%^`\",J>49";S!:VB7
M48J.=\K2NZ=&CN<X\U8I8#:"=4!Y-Y\+W)W^T!P^WTW3(",B#+:%7P>)MP)!
M\<&GPX&WU++9<JMX1Z!8]Z50GV;P!&A@B2TD&$Z*4#D8(X0`]%:CC"+(M)X+
M&N;`(0R=.<=9?M.)"",*`*I0UEE'#KG?CRNX2!J!LZ4@87L`^.4(5N=%$A^/
M=1D@YIAD^J98,@.DJ>::L8D&"W$5$>FD**H0Z)-_*2Q($7\,ZB=*=K6L*6B:
M;$F)WWXS;D$;>P[BA=XBT$12`V[-+.I'0WB6H&=_(29GHG0$Z@GGBZ-9MREJ
M].!I**I_/JAH>+O9F*.,?IV''A:XYHJ%591610:F2H)3I*?"",CGGL/2V4(I
M*)J3AGWWP46,JI#^H0%MDR;46N85SF3ISPB.>BM5MLUHU6LGEJ(@[2^9EM"9
M92,L.:A:3(Z6+!ZCNG/:L22Q6@ZU1K4)$%=YA1OAM[62ZXFY4:5[PKJZM#O"
MIGW.^5$?!"ZY[+Y)S0)Q+"2HA8S``Q-L%7@C$)9P;0O_AHFN,._X<"T$U&SS
MS7I\JLNH08X\(+:'*LL"L_PB$P\A-..,\TDB1PROP@T?G-M[`J4'=;<(0;HA
MBT`B`V_/O[!EYXC^[GG/T?)P'>4])%_=CYE9*Q+<E>J1IS5`,(\;-W!;(QH#
MQ4"#':B^]I(J"@NB-OO.S'ZW^-4RY<'<!=PI1ZYK>3?BB,*5#,O=M]#^,;02
M4=#]*:T''T@"[28>;>M$2DAA:3QEXRRXJ*?$?:%,"<&2X/;E".:YK,7O,D!9
MNW>;*B>G"R,:IT+RBE=\79)(T[[V*"5,U/H(5;H@DT(&T(W-YICO/>'G^/Y-
MU$1X@@C#B)OF"Y=_:#F?MO4J2'GJ/KK3Q?M4WW-/"7#3.;[Y2&U%(1)<R+*\
MC17N!#N!D[2>@A;LW`]TCL->]A#(LD0X+!N\DX3XVD.F3A104=M*(>489XY!
MN9`7)*(=9OK@OA<`PW0UX\/.U*06/'TH+'=ZX:#88"CH^<^#W@N3"O%"C>\I
MY(0V(5[9.&+$D&6DALRCQS@ZMCID50\V7VS@B"I6-98+]2]1GW#5)]+S1.&I
M1(K3,PIU%!01+#JP'.T0W(FZ*#%CQ=$D,3R<X>IQQ%>Q@(DKT)&XM($%&IVK
M&7#\V-E4]#H[#DV+\7I<=#AU@M%)<F>!Q(.?.(G&`'R0!%Y:`=VL0I!K4*,F
M4'QC'))"O4'BJX&7Y(AC_O4TZ\SPCV(,9698V*I2GE)J-"ID&B/D1#/!THTS
MF24M'V@"P!'K?9C4E,A@F#IJ_BQ%GWR3,#.0@ITLH096$!-@SLG.=H*"34]S
<ISSG2<]ZVO.>^,RG/O?)SW[Z\Y\`#:A`7P``.P``
`
end
EOT
&send_page('image/gif', &uudecode($head_gif));
exit(0);
}
#
# do a redirection for the form data to
# get the inital canonical form of the URL
#
if ($path_info eq '' and $qs{'url'} ne '' and $qs{'level'} ne '') {
$path = $qs{'url'};
$path =~ s|^http://||;
$level = $qs{'level'};
&send_redirect($my_url.$my_url_sep."l=$level/$path");
exit(0);
}
#
# do the actual filtering for a particular URL
# (this URL can be either a webpage itself or
# some of the inlined images, etc.)
#
# define an own user-agent which has the
# resolving of HTTP redirections disabled
# because redirects have to go through us, too.
package LWP::MyUA;
@ISA = qw(LWP::UserAgent);
sub redirect_ok { return 0; }
package main;
# create an user-agent object
$useragent = new LWP::MyUA;
$useragent->agent("$our_name/$our_vers");
$useragent->proxy('http', $proxy_url) if ($proxy_url ne 'none');
$useragent->no_proxy(split(/,/, $no_proxy_domains)) if ($no_proxy_domains ne 'none');
# calculate the URL which has ourself as the prefix
$path_info =~ m|/l=(\d)/(.+)$|;
$level = $1;
$url = 'http://' . $2;
$url .= "?$query_string" if ($query_string ne '');
$url = new URI::URL($url);
# create the HTTP request
$headers = new HTTP::Headers;
$request = new HTTP::Request('GET', $url, $headers);
# perfrom the HTTP request
$response = $useragent->request($request, undef, undef);
# parse the HTTP response
$urlbase = $response->base;
$contents = $response->content;
$type = $response->content_type;
# if a redirect was forced we perform it
# but with an adjusted URL which again has
# ourself as the prefix
if ($response->is_redirect) {
$path = $response->header("Location");
$path =~ s|^http://||;
&send_redirect($my_url.$my_url_sep."l=$level/".$path);
exit(0);
}
#
# now do the actual filtering
# (Note 1: Level 0 is always needed!)
# (Note 2: It's an `if' for each level, not an `elsif'!)
#
$isnewimage = 0;
if ($level >= 0) {
#
# rewrite all URLs in HTML files
# to make sure we are always the prefix
#
if ($type eq 'text/html') {
$contents =~ s/(<(?:img|frame)\s+)(.+?)(>)/$1.&fixattr_src($2).$3/isge;
$contents =~ s/(<a\s+)(.+?)(>)/$1.&fixattr_href($2).$3/isge;
$contents =~ s/(<script\s+[^>]+?javascript.+?>)(.+?)(<\/script>)/$1.&fixattr_js($2).$3/isge;
sub fixattr_src {
my ($attr) = @_;
$attr =~ s|(src\s*=\s*")([^"]+)(")|$1.&fixurl($2).$3|isge;
$attr =~ s|(src\s*=\s*)([^"]\S+)|$1.&fixurl($2)|isge;
return $attr;
}
sub fixattr_href {
my ($attr) = @_;
$attr =~ s|(href\s*=\s*")([^"]+)(")|$1.&fixurl($2).$3|isge;
$attr =~ s|(href\s*=\s*)([^"]\S+)|$1.&fixurl($2)|isge;
return $attr;
}
sub fixattr_js {
my ($attr) = @_;
$attr =~ s/(\.src\s*=\s*')([^']+\.(?:gif|jpg))(')/$1.&fixurl($2).$3/isge;
$attr =~ s/(\.src\s*=\s*")([^"]+\.(?:gif|jpg))(")/$1.&fixurl($2).$3/isge;
return $attr;
}
sub fixurl {
my ($url) = @_;
my $u = new URI::URL $url, $urlbase;
$url = $u->abs->as_string;
$url =~ s|^http://||;
$url = $my_url.$my_url_sep."l=$level/".$url;
return $url;
}
}
}
if ($level >= 1) {
#
# replace all transparent 1pt dot-images with a red image
#
if ($type eq 'image/gif') {
($w, $h, $t) = Image::Size::imgsize(\$contents);
if ($w*$h == 1) {
# read image into GD
$tmpfile = "/tmp/pe.tmp.$$";
unlink($tmpfile);
open(TMP, ">$tmpfile");
print TMP $contents;
close(TMP);
open(TMP, "<$tmpfile");
$tmpimg = newFromGif GD::Image(TMP);
close(TMP);
unlink($tmpfile);
if ($tmpimg->transparent != -1) {
my $im = new GD::Image($w, $h);
($r1, $g1, $b1) = (255, 0, 0);
($r2, $g2, $b2) = (255, 0, 0);
my $col1 = $im->colorAllocate($r1, $g1, $b1);
my $col2 = $im->colorAllocate($r2, $r2, $b2);
$contents = $im->gif;
$isnewimage = 1;
}
}
}
}
if ($level >= 2) {
#
# replace all border=0 attributes with border=1
#
if ($type eq 'text/html') {
$contents =~ s|border\s*=\s*"?0"?|border=1|sgi;
sub fix_table_border {
my ($str) = @_;
if ($str !~ m|border\s*=|) {
$str = $str."border=1";
}
return $str;
}
$contents =~ s/(<table\s+)(.+?)(>)/$1.&fix_table_border($2).$3/isge;
}
}
if ($level >= 3) {
#
# remove any background images
#
if ($type eq 'text/html') {
sub fix_bg {
my ($str) = @_;
$str =~ s/(background\s*=\s*')([^']+\.(?:gif|jpg))(')/''/isge;
$str =~ s/(background\s*=\s*")([^"]+\.(?:gif|jpg))(")/''/isge;
return $str;
}
$contents =~ s/(<body\s+)(.+?)(>)/$1.&fix_bg($2).$3/isge;
}
#
# replace all non-transparent images with a blank one
#
if ($type =~ m|^image/.*| and not $isnewimage) {
($w, $h, $t) = Image::Size::imgsize(\$contents);
my $im = new GD::Image($w, $h);
($r1, $g1, $b1) = (200, 200, 200);
($r2, $g2, $b2) = (100, 100, 100);
($r3, $g3, $b3) = (0, 0, 0);
my $col1 = $im->colorAllocate($r1, $g1, $b1);
my $col2 = $im->colorAllocate($r2, $g2, $b2);
my $col3 = $im->colorAllocate($r3, $g3, $b3);
$im->rectangle(0, 0, $w-1, $h-1, $col2);
$im->string(gdSmallFont,1,1, sprintf("%dx%d", $w, $h), $col3);
$im->string(gdSmallFont,1,12, $t, $col3);
$contents = $im->gif;
$type = 'image/gif';
}
}
if ($level >= 4) {
#
# remove all color attributes
#
if ($type eq 'text/html') {
$contents =~ s|bgcolor="?#?[0-9a-hA-H]{6}"?||isg;
$contents =~ s|color="?#?[0-9a-hA-H]{6}"?||isg;
$contents =~ s|text="?#?[0-9a-hA-H]{6}"?||isg;
$contents =~ s|link="?#?[0-9a-hA-H]{6}"?||isg;
$contents =~ s|vlink="?#?[0-9a-hA-H]{6}"?||isg;
$contents =~ s|alink="?#?[0-9a-hA-H]{6}"?||isg;
}
}
#
# Puhhh... now the filtering is done.
# All we now have to do is to send the
# stuff to the user...
#
&send_page($type, $contents);
# die gracefully
exit(0);
# ...error handler:
};
if ($@) {
my $text = $@;
print "Content-type: text/html\n";
print "Connection: close\n";
print "\n";
print "<h1>Internal Error</h1>\n";
print $text;
}
##EOF##
______________________________________________________________________
Website META Language (WML) www.engelschall.com/sw/wml/
Official Support Mailing List sw-wml@engelschall.com
Automated List Manager majordomo@engelschall.com