You are on page 1 of 5

http://www.webmasterkb.com/Uwe/Forum.aspx/perl/31396/perl-curl-get-data-from-website

use strict;
use warnings;

use HTML::TableExtract;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST GET);
use LWP::UserAgent;

my $show_content = 0;    # 1 = also print each response body (html)

# Cookie jar shared by every request made through the user agent below.
my $jar = HTTP::Cookies->new();

# User agent: 10 second timeout, cookies enabled, browser-like agent string.
my $ua = LWP::UserAgent->new();
$ua->timeout(10);
$ua->cookie_jar($jar);
$ua->agent("Microsoft Internet Explorer/6.0");

# First request: "get track table framework"
# Note - this will establish a session with the server.
# ---------
# The URL is wrapped over several lines for readability: qw{} splits it on
# whitespace and join '' glues the pieces back together, so the line breaks
# (and the blank before "&id=") do not end up in the request.
my $main_url = join '', qw{
http://www.bangkokflightservices.com/TrackTrace/showc_track.php?m_prefix=176&m_s
n=75064953&h_prefix=HWB&h_sn=&ecy=e076438db64c6190f7b9689a379b7f7093368f1652d14d
b65fee1ab916713f3f5f4030f53369cb1f669614312c4748899c272f4d976a2b299274a21ad80fc0
72b1bab2ab1c181d08c670188722e51ec162f9ae337e3f2f132c88d249133815558d241ce8a4e9b3
fa75c144268b9e901037c2c7257142ee42ff9b2bf2767f57ed62b94fd938ea4dd2b28c53fea6af74
be&ch=%A0%A0%A0%A0 &id=1.2405164500620218
};

# Note - the response is just Java Script/Ajax laced
# html document with a skeleton table. One of the table's element <td> has
# an Id = "output" that receives the real table data from the next request.
# Apparently this establishes a cookie.
# The body itself is not needed afterwards, so the return value is ignored.
_fetch_content( $ua, $main_url, "\nHtml main Content .. OK\n\n",
    'Html main Content' );
print '=' x 20, "\n\n";

# Second request: "get track table body"
# Note - When running as an html document, JS/Ajax are used
# to dynamically load table data (html) to put in <td id="output" ..>
# already loaded with the first request (the main html).
# The html that is returned is Dynamic Html fragment. This contains
# the table data for a single prefix/serial no.
# ---------

# Way Bill Numbers to look up, as [ prefix, serial ] pairs. A list (rather
# than a hash keyed by prefix) keeps the lookups in the order written here
# and allows several serial numbers under the same prefix.
my @waybills = (
    [ '176', '75064953' ],
    [ '081', '75133844' ],
);

for my $waybill (@waybills) {
    my ( $WBNprefix, $WBN ) = @$waybill;

    # No blank before the "?": inside a double-quoted string a space would
    # become part of the requested URL instead of being dropped.
    my $fragment_url = join '', (
        "http://www.bangkokflightservices.com/TrackTrace/search_awb.php?",
        "m_prefix=$WBNprefix",
        "&m_sn=$WBN",
        "&h_prefix=HWB",
        "&h_sn=&ch=",
    );

    my $fragment = _fetch_content( $ua, $fragment_url,
        "\nWay Bill fragment .. OK\n", 'Way Bill html fragment Content' );

    print "Way Bill ($WBNprefix - $WBN) Content tables:\n", '-' x 20, "\n\n";
    print_tables($fragment);
    print "\n";
}

print '=' x 20, "\n\n";

print "Done!\n\n\n";

exit;

## Issue a GET for $url through $agent and return the response body.
##
##   $agent     - LWP::UserAgent used to send the request
##   $url       - URL to fetch
##   $ok_banner - text printed verbatim to STDOUT when the request succeeds
##   $label     - request name used in the failure message
##
## Prints the body as well when $show_content is true. Dies with the
## request label and the HTTP status line when the request does not succeed.
sub _fetch_content {
    my ( $agent, $url, $ok_banner, $label ) = @_;

    my $res = $agent->request( HTTP::Request->new( 'GET' => $url ) );
    if ( !$res->is_success ) {
        die "Request ($label) Failed\n" . $res->status_line . "\n";
    }

    print $ok_banner;
    if ($show_content) {
        print $res->content, "\n\n";
    }
    return $res->content;
}

## Table extract Util from wsp


##
## Parse an HTML document or fragment and print every table found in it.
##
## Takes one argument, the HTML text. For each table a "TABLE n:" line is
## printed, for each row a "ROW n:" line, then the row's cells on a single
## line, each cell followed by "|". Undefined cells print as empty strings.
## Prints nothing when the argument is undefined or empty.
sub print_tables {
    my ($html) = @_;
    return unless defined $html && length $html;

    my $table_extractor = HTML::TableExtract->new();
    $table_extractor->parse($html);
    # Signal end of document so text still buffered by the parser is handled.
    $table_extractor->eof;

    my $tc = 0;
    for my $table ( $table_extractor->table_states ) {
        print "TABLE $tc:\n";
        $tc++;
        my $rc = 0;
        for my $row ( $table->rows ) {
            print "ROW $rc:\n";
            $rc++;
            for my $value (@$row) {
                # Work on a copy: the loop variable aliases the row element,
                # so cleaning it in place would rewrite the extracted data.
                my $cell = defined $value ? $value : '';
                $cell =~ s/\n/ /g;            # fold line breaks into blanks
                $cell =~ s/[ \t]+/ /g;        # collapse runs of blanks/tabs
                $cell =~ s/^[ \t]//;          # at most one blank is left ...
                $cell =~ s/[ \t]$//;          # ... at either end by now
                $cell =~ s/ *<\/td *//g;      # drop stray "</td" remnants
                print "$cell|";
            }
            print "\n";
        }
    }
    return;
}
__END__

Html main Content .. OK

====================

Way Bill fragment .. OK


Way Bill (081 - 75133844) Content tables:
--------------------

TABLE 0:
ROW 0:
�||||
ROW 1:
�|Enter Master Air Waybill (MAWB)|
ROW 2:
Optional (For Import MAWB Only)|
ROW 3:
�||||
ROW 4:
||* Master Air Waybill number example 123 - 12345678 ||
TABLE 1:
ROW 0:

ROW 1:
Item|AWB No|Flight No|Flight Date|Origin|Dest|Status|Pieces|Weight|Time|
ROW 2:
1|081-75133844|JQ 029|Oct 19 2010|MEL|BKK|Delivered|2|1,480.00|Oct 20 2010 - 125
5|

Way Bill fragment .. OK


Way Bill (176 - 75064953) Content tables:
--------------------

TABLE 0:
ROW 0:
�||||
ROW 1:
�|Enter Master Air Waybill (MAWB)|
ROW 2:
Optional (For Import MAWB Only)|
ROW 3:
�||||
ROW 4:
||* Master Air Waybill number example 123 - 12345678 ||
TABLE 1:
ROW 0:

ROW 1:
Item|AWB No|Flight No|Flight Date|Origin|Dest|ULD No|Status|Pieces|Weight|Time|
ROW 2:
1|176-75064953|EK 419|Oct 15 2010|BKK|DXB|Flight Change�|Export Transshipment|3|
743.00|Oct 14 2010 5:37PM|
ROW 3:
2|176-75064953|EK 419|Oct 15 2010|BKK|DXB|�|Accepted|3|743.00|Oct 14 2010 5:37PM

ROW 4:
3|176-75064953|EK 373|Oct 15 2010|BKK|DXB|Flight Change�|Export Transshipment|3|
743.00|Oct 14 2010 6:12PM|
ROW 5:
4|176-75064953|EK 373|Oct 15 2010|BKK|DXB|SHC�|Export Transshipment|3|743.00|Oct
14 2010 6:12PM|
ROW 6:
5|176-75064953|EK 373|Oct 14 2010|BKK|DXB|Flight Change�|Export Transshipment|3|
743.00|Oct 14 2010 6:42PM|
ROW 7:
6|176-75064953|EK 373|Oct 14 2010|BKK|DXB|PMC31131EK�|Manifested|3|743.00|Oct 14
2010 6:57PM|
ROW 8:
7|176-75064953|EK 373|Oct 14 2010|BKK|DXB|�|Departed|3|743.00|Oct 14 2010 9:54PM

--------------------------------------------------------
-------------Get Data from Home Page---------------
--------------------------------------------------------

#!/usr/bin/perl
# Fetch the page at the URL given as the first command-line argument and
# print every start tag, end tag and text chunk that HTML::Parser reports.
use strict;
use warnings;

use LWP::Simple;
use HTML::Parser;
use Data::Dumper;

my $url = shift @ARGV;
die "No URL specified on command line." unless (defined $url);

my $content = get($url);    # put site html in $content.
die "get failed" if (!defined $content);

# Create parser object; each handler spec lists the values HTML::Parser
# passes to the callback for that event.
my $parser = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&startTag, 'tag, attr' ],
    end_h       => [ \&endTag,   'tag' ],
    text_h      => [ \&textElem, 'text' ],
);

# Parse the document, then signal end of input so that anything the parser
# is still buffering is reported to the handlers as well.
$parser->parse($content);
$parser->eof;
# Start-tag callback: prints the tag name and a Data::Dumper dump of its
# attribute hash, followed by a separator line.
#   $tag      - tag name
#   $attrHash - hash reference of the tag's attributes
sub startTag
{
    my ($tag, $attrHash) = @_;
    print "TAG: $tag \n";
    # Dumper is a list operator: without the parentheses it would also take
    # the "\n" as an argument and dump it as a second value ($VAR2).
    print "ATTR HASH: ", Dumper($attrHash), "\n";
    print "-----\n";
}
# End-tag callback: reports the tag it was given, then a separator line.
#   first argument - the tag text to report
sub endTag
{
    my ($closing_tag) = @_;
    my $separator = '-' x 5;

    print "END TAG: ${closing_tag} \n";
    print "$separator\n";
}
# Text callback: reports the text chunk it was given, then a separator line.
#   first argument - the text to report
sub textElem
{
    my ($chunk) = @_;
    my $separator = '-' x 5;

    print "TEXT: ${chunk} \n";
    print "$separator\n";
}

You might also like