Professional Documents
Culture Documents
eq, size;
-- For every (dateiname, proj) group keep only the single record with the
-- highest request count (req).
-- Relation b is expected to be defined by the preceding statements.
c = group b by (dateiname, proj);
d = foreach c {
    -- Inside the nested block, b refers to the bag of records of this group.
    sorted = order b by req desc;
    lim = limit sorted 1;
    generate flatten(lim);
};
=============================================================================
-- Top-k records of one project, ordered by request count.
-- Uses parameter substitution, so run it as a script, e.g.:
--   pig -param proj=en -param k=10 <script>.pig
--
-- '-tagFile' prepends the name of the source file as the first field
-- (dateiname). The field delimiter is a blank, as in every other LOAD of this
-- data set in these notes; with ',' the whole record would end up in proj and
-- the filter below could not match (assuming blank-separated input).
a = LOAD '/user/cloudera/2015/2015-01/' USING PigStorage(' ','-tagFile')
    as (dateiname:chararray, proj:chararray, url:chararray, req:long, size:long);
b = filter a by proj == '$proj';
c = order b by req desc;
d = limit c $k;
dump d;
============================================================================
Aufgabe 2:
-- Aufgabe 2: total number of requests per (calendar week, url).
-- '-tagFile' prepends the source file name as first field (dateiname).
a = LOAD '/user/cloudera/2015/2015-01/' USING PigStorage(' ','-tagFile')
    as (dateiname:chararray, proj:chararray, url:chararray, req:long, size:long);

-- Characters 11..25 of the file name are taken as the timestamp
-- (presumably names look like 'pagecounts-yyyyMMdd-HHmmss...'; verify against the data).
b = foreach a generate SUBSTRING(dateiname,11,26) as datum, proj, url, req, size;

-- Pattern letters are case-sensitive: yyyy = year, MM = month, dd = day of
-- month, HH = hour of day (0-23). Upper-case 'DD' means day of YEAR and 'YYYY'
-- is not the plain calendar year, so 'YYYYMMDD' only happens to work for
-- January dates.
c = foreach b generate ToDate(datum,'yyyyMMdd-HHmmss') as (dt:datetime), proj, url, req, size;
-- Earlier attempt, kept for reference:
--foreach b generate ToDate(SUBSTRING(dateiname,11,19),'YYYYMMDD-HHmmss') as (dt:datetime), proj, url, req, size;

d = foreach c generate GetWeek(dt) as week, proj, url, req, size;
e = group d by (week, url);
-- COUNT would only count the NUMBER of records (rows) in each group; the
-- request counts have to be added up, hence SUM.
f = foreach e generate group, SUM(d.req);
STORE f INTO 'Aufgabe2OUT.txt';
-- Aufgabe 2 (second run): requests per (calendar week, url), plus the bonus
-- task that lists the 100 pairs with the most requests.
a = LOAD '/user/cloudera/2015/2015-01/' USING PigStorage(' ','-tagFile')
    as (dateiname:chararray, proj:chararray, url:chararray, req:long, size:long);

-- First attempt, superseded by the statement below: positions 11..18 yield
-- only 8 characters, too short for the 15-character pattern used in ToDate.
-- b = foreach a generate SUBSTRING(dateiname,11,19) as datum, proj, url, req, size;
b = foreach a generate SUBSTRING(dateiname,11,26) as datum, proj, url, req, size;

-- HH has to be used for the hours so that the range 0-23 is covered
-- (hh is the 1-12 am/pm clock hour, kk would be 1-24).
-- yyyy/dd are required as well: upper-case 'DD' is day of year and 'YYYY' is
-- not the plain calendar year.
c = foreach b generate ToDate(datum,'yyyyMMdd-HHmmss') as (dt:datetime), proj, url, req, size;
d = foreach c generate GetWeek(dt) as week, proj, url, req, size;
e = group d by (week, url);
f = foreach e generate group, SUM(d.req) as Zugriffe;

-- BONUS: the 100 (week, url) groups with the highest request totals.
g = order f by Zugriffe desc;
h = limit g 100;
================================================================================
=========
-- Keep the rows of f whose month lies less than three years in the past.
-- NOTE(review): date2 is assumed to be a chararray of the form 'yyyy MM' on
-- relation f; it is not defined anywhere in these notes - confirm before use.
-- Pig's '+' is an arithmetic operator only; strings are joined with CONCAT.
g = FILTER f BY YearsBetween(CurrentTime(), ToDate(CONCAT(date2, ' 01'), 'yyyy MM dd')) < 3;
================================================================================
=========
Aufgabe 3:
-- Load the January 2015 files, blank-separated; '-tagFile' prepends the
-- source file name as first field (dateiname).
a = LOAD '/user/cloudera/2015/2015-01/' USING PigStorage(' ','-tagFile')
    as (dateiname:chararray, proj:chararray, url:chararray, req:long, size:long);
Aufgabe 4:
-- Load everything below /user/cloudera/2015, blank-separated; '-tagFile'
-- prepends the source file name as first field (dateiname).
a = LOAD '/user/cloudera/2015' USING PigStorage(' ','-tagFile')
    as (dateiname:chararray, proj:chararray, url:chararray, req:long, size:long);
2016-04-04 05:55:39,681 [main] INFO org.apache.hadoop.conf.Configuration.deprec
ation - fs.default.name is deprecated. Instead, use fs.defaultFS
2016-04-04 05:55:39,681 [main] INFO org.apache.hadoop.conf.Configuration.deprec
ation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.addr
ess
-- Aufgabe 4: the 50 (day, url, project) combinations of mobile projects with
-- the most requests. Relation a comes from the LOAD statement above.

-- 'matches' has to match the WHOLE string, so '.*.mw' (ends with) and
-- '.*.mw.*' (contains) are not the same pattern. [.] makes the dot literal;
-- a bare '.' would accept any character in front of "mw".
b = filter a by (proj matches '.*[.]mw');
-- Earlier variant, kept for reference:
--b = filter a by (proj matches '.*.mw.*');

-- Characters 11..18 of the file name are taken as the day
-- (presumably 'yyyyMMdd'; verify against the file names).
c = foreach b generate SUBSTRING(dateiname,11,19) as datumTag, proj, url, req, size;

-- proj is part of the group key. The former
--   generate flatten(group), flatten(c.proj), SUM(c.req)
-- flattened the bag of projects next to the aggregate and therefore emitted
-- one identical row per input record, filling the top 50 with duplicates.
-- The output columns stay (datumTag, url, proj, zugriffe).
d = group c by (datumTag, url, proj);
e = foreach d generate flatten(group), SUM(c.req) as zugriffe;
f = order e by zugriffe desc;
g = limit f 50;
STORE g INTO 'Aufgabe4OUT.txt';
---------------------------------------------------------------------------------------
-- The LOAD has its own line: appended to the rule above (which starts with
-- '--') the statement would be swallowed by the comment.
-- '-tagFile' prepends the source file name as first field (dateiname).
a = LOAD '/user/cloudera/2015' USING PigStorage(' ','-tagFile')
    as (dateiname:chararray, proj:chararray, url:chararray, req:long, size:long);
2016-04-04 06:23:13,212 [main] INFO org.apache.hadoop.conf.Configuration.deprec
ation - fs.default.name is deprecated. Instead, use fs.defaultFS
2016-04-04 06:23:13,212 [main] INFO org.apache.hadoop.conf.Configuration.deprec
ation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.addr
ess
-- Aufgabe 4 (final version): request totals per (day, url, project) for
-- mobile projects, ordered by descending total.
-- Relation a comes from the LOAD statement above.

-- Keep projects whose code contains ".mw"; [.] makes the dot literal, whereas
-- a bare '.' would accept any character in front of "mw".
b = filter a by (proj matches '.*[.]mw.*');

-- Characters 11..18 of the file name are taken as the day
-- (presumably 'yyyyMMdd'; verify against the file names).
c = foreach b generate SUBSTRING(dateiname,11,19) as datumTag, proj, url, req, size;

d = group c by (datumTag, url, proj);
e = foreach d generate flatten(group), SUM(c.req) as zugriffe;
f = order e by zugriffe desc;
STORE f INTO 'AUFGABE4_ERGEBNIS';
================================================================================
==============
Aufgabe 5:
-- Aufgabe 5: number of distinct urls overall and number of distinct urls in
-- mobile projects, brought together in one row via a constant join key.
a = LOAD '/user/cloudera/2015' USING PigStorage(' ')
    as (proj:chararray, url:chararray, req:long, size:long);

-- Count of all distinct urls.
-- PARALLEL is placed on DISTINCT, where the reduce work can be spread out;
-- GROUP ... ALL builds a single group, so additional reducers there would
-- receive no data.
b = foreach a generate url;
c = distinct b parallel 10;
d = group c ALL;
e = FOREACH d GENERATE 'same' AS key, (DOUBLE)COUNT(c.url) as (alleArtikel:double);

-- Count of distinct urls in mobile projects. [.] makes the dot literal; a
-- bare '.' would accept any character in front of "mw".
f = filter a by (proj matches '.*[.]mw.*');
g = FOREACH f GENERATE url;
h = distinct g parallel 10;
i = group h ALL;
j = FOREACH i GENERATE 'same' AS key, (DOUBLE)COUNT(h.url) as (mobileArtikel:double);

-- Both relations hold exactly one row with key 'same'; the join puts the two
-- counts side by side.
joined = JOIN e BY key, j BY key;