Grokbase Groups Pig user May 2010
Looks like attachments are not coming through. Here is the script from
Corbin inline.

One thing you might want to try is switching your COGROUPs to skewed
joins, to see if that solves the issue:


-- Name the job after the output location it writes to.
-- NOTE(review): the archived text read `set 'Generate ...'` with no key, which
-- is not a valid SET statement; `job.name` is the key that takes a free-text
-- label like this one and was presumably dropped by the archive -- confirm.
set job.name 'Generate topurl reports for $out_file1'

-- Overridable parameters. $in_file1 and $out_file1 are used below but have no
-- %default in this script, so they must be supplied by the caller.
%default dir_prefix '../..'
%default storage 'BinStorage()'
%default tynt_udfs 'tynt-udfs.jar'
-- Number of rows kept per site in each top-N list.
%default topN '20'
/* default to 30 days time period so that alltime report will get
14*30=420 min page views*/
%default timeperiod '30'
%default min_page_views_per_day '14'

-- UDF jars, resolved relative to $dir_prefix.
register $dir_prefix/udfs/target/$tynt_udfs
register $dir_prefix/udfs/lib/piggybank.jar

---------------------summarize address bar
-- Address-bar stats rows: (site, url, guid, cnt).
addbar_stats = LOAD '$in_file1/addbarstats' USING $storage AS
    (site:chararray, url:chararray, guid:chararray, cnt:long);

-- One row per (site, url): addbarcnt = number of input rows in the group,
-- addbarvisits = sum of their cnt values.
grouped_addbar_by_url = GROUP addbar_stats BY (site, url) PARALLEL 180;
addbar_stats_by_url = FOREACH grouped_addbar_by_url GENERATE
    FLATTEN(group) AS (site, url),
    COUNT(addbar_stats) AS addbarcnt,
    SUM(addbar_stats.cnt) AS addbarvisits;
-- The archived statement stopped at a dangling `USING`; $storage is the storage
-- function used by every other LOAD/STORE in this script.
STORE addbar_stats_by_url INTO '$out_file1/addbarstatsbyurl' USING $storage;

-- Roll the per-url rows up to one row per site.
-- NOTE(review): the archived GROUP line had no terminator; a trailing clause
-- (e.g. PARALLEL n) may have been lost along with the `;` -- confirm.
grouped_addbar_stats_by_site = GROUP addbar_stats_by_url BY site;
addbar_stats_by_site = FOREACH grouped_addbar_stats_by_site GENERATE
    group AS site,
    SUM(addbar_stats_by_url.addbarcnt) AS addbarcnt,
    SUM(addbar_stats_by_url.addbarvisits) AS addbarvisits;
STORE addbar_stats_by_site INTO '$out_file1/addbarstatsbysite' USING $storage;

-- Per-url click stats. Only site, url and cnt are used further down.
-- NOTE(review): the archived schema was cut off after `upcnt:long,` and never
-- closed. `uwcnt:long` is added by analogy with tracecnt/tcnt/pcnt/wcnt ->
-- utracecnt/utcnt/upcnt/...; verify the field list against the job that
-- writes clickstatsbyurl.
clickstatsbyurl = LOAD '$in_file1/clickstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, cnt:long, tracecnt:long, tcnt:long,
     pcnt:long, wcnt:long, utracecnt:long, utcnt:long, upcnt:long,
     uwcnt:long);

-- Per-url page-view stats.
viewstatsbyurl = LOAD '$in_file1/viewstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, title:chararray, cnt:long, etcnt:long,
     et1cnt:long, et2cnt:long, et3cnt:long, et6cnt:long, et7cnt:long);

-- Project down to the columns the COGROUPs need before shuffling.
light_clickstatsbyurl = FOREACH clickstatsbyurl GENERATE site, url, cnt;
light_viewstatsbyurl_noisy = FOREACH viewstatsbyurl GENERATE site, url,
    title, cnt, etcnt;

-- Drop rows whose url is the literal '-'.
light_viewstatsbyurl = FILTER light_viewstatsbyurl_noisy BY url != '-';

-- Disabled variant that also cogroups the address-bar visits per url.
-- A `--` comment ends at the newline, so every wrapped continuation line needs
-- its own `--`; in the archived text only the first line of each statement had
-- one, which left the continuation lines as live (and invalid) code.
-- NOTE(review): the middle of the last statement was lost in the archive. The
-- `clickcnt` alias and the addbar MAX(...) expression are reconstructed from
-- the surviving `ts)) as addbarcnt;` tail -- confirm before re-enabling.
--light_addbarstatsbyurl = FOREACH addbar_stats_by_url GENERATE site, url, addbarvisits;
--joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url) INNER,
--    light_clickstatsbyurl BY (site, url) OUTER,
--    light_addbarstatsbyurl BY (site, url) OUTER;
--flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
--    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
--    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as clickcnt,
--    (IsEmpty(light_addbarstatsbyurl)?0:MAX(light_addbarstatsbyurl.addbarvisits)) as addbarcnt;

-- Attach the click count to every view row. Views are the INNER side, so a
-- (site, url) with no view rows produces no output; a missing click side
-- yields clickcnt = 0.
joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER;
-- The archived statement ended at a dangling `as`; the alias must be
-- `clickcnt` because ratio_by_url reads that name.
flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as clickcnt;

-- Per-url ratio of generated traffic to total traffic.
-- The braces around the nested block were missing in the archived text.
ratio_by_url = FOREACH flattened_stats_for_ratio {
    generated_traffic = clickcnt+etcnt;
    total_traffic = cnt;
    -- NOTE(review): the right-hand side of `ti =` was lost in the archive.
    -- generated/total as doubles is a reconstruction based on the `ratio`
    -- alias and the `ti > 1` check below -- confirm against the original.
    ti = (double)generated_traffic / (double)total_traffic;
    -- A ratio above 1 is emitted negated; presumably this pushes implausible
    -- rows to the bottom of the ratio DESC ordering used later.
    GENERATE site, url, title, ((ti>1)?(-ti):ti) AS ratio,
        generated_traffic AS gviews, total_traffic AS views;
}

------------------------combined with
-- Per-url copy stats. Only lcnt, scnt and icnt are used below.
-- NOTE(review): the archived schema was cut off after `icnt:long,` and never
-- closed. It is closed here after icnt so the statement parses; any trailing
-- fields that were lost must be restored from the job that writes
-- copystatsbyurl.
copystatsbyurl = LOAD '$in_file1/copystatsbyurl' USING $storage AS
    (site:chararray, url:chararray, lcnt:long, scnt:long, icnt:long);
-- Collapse the three copy counters into a single cnt.
light_copystatsbyurl = FOREACH copystatsbyurl GENERATE site, url,
    lcnt+scnt+icnt AS cnt;

-- Attach the copy count to every ratio row; urls with no copy stats get 0.
all_stats_by_url = COGROUP ratio_by_url BY (site, url) INNER,
    light_copystatsbyurl BY (site, url) OUTER PARALLEL 62;
-- The archived statement ended at a dangling `as`; the alias must be `copies`
-- because the top-N projections and `ORDER ... BY copies` read that name.
all_urls = FOREACH all_stats_by_url GENERATE
    FLATTEN(ratio_by_url) AS (site, url, title, ratio, gviews, views),
    (IsEmpty(light_copystatsbyurl)?0:MAX(light_copystatsbyurl.cnt)) as copies;

-- Everything below ranks urls within a site, so group once and reuse it.
grouped_urls_by_site = GROUP all_urls BY site;

-- Each relation below keeps, per site, the $topN urls by one metric as a bag
-- named `tops`. The braces delimiting the nested blocks were missing in the
-- archived text and are restored here.

-- Top urls by ratio, restricted to urls with enough page views.
top_ratios = FOREACH grouped_urls_by_site {
    -- NOTE(review): the predicate was cut off after `BY views`. The threshold
    -- follows the "14*30=420 min page views" comment on the %default
    -- declarations; `>=` is an assumption -- confirm against the original.
    filtered_by_minpageviews = FILTER all_urls BY views >= ($timeperiod * $min_page_views_per_day);
    order_by_ratio = ORDER filtered_by_minpageviews BY ratio DESC;
    top_by_ratio = LIMIT order_by_ratio $topN;
    GENERATE group AS site,
        top_by_ratio.(url, title, ratio, gviews, views, copies) AS tops;
}

-- Top urls by generated views.
top_gviews = FOREACH grouped_urls_by_site {
    order_by_gviews = ORDER all_urls BY gviews DESC;
    top_by_gviews = LIMIT order_by_gviews $topN;
    GENERATE group AS site,
        top_by_gviews.(url, title, ratio, gviews, views, copies) AS tops;
}

-- Top urls by total views.
top_views = FOREACH grouped_urls_by_site {
    order_by_views = ORDER all_urls BY views DESC;
    top_by_views = LIMIT order_by_views $topN;
    GENERATE group AS site,
        top_by_views.(url, title, ratio, gviews, views, copies) AS tops;
}

-- Top urls by copy count.
top_copies = FOREACH grouped_urls_by_site {
    order_by_copies = ORDER all_urls BY copies DESC;
    top_by_copies = LIMIT order_by_copies $topN;
    GENERATE group AS site,
        top_by_copies.(url, title, ratio, gviews, views, copies) AS tops;
}

-- Line the four per-site top-N lists up side by side. This is an inner join on
-- site, so a site appears only if it is present in all four inputs.
grouped_tops = JOIN
    top_ratios BY site,
    top_gviews BY site,
    top_views BY site,
    top_copies BY site;

-- Keep a single site column followed by the four `tops` bags, in the order
-- ratio, gviews, views, copies.
top_urls = FOREACH grouped_tops GENERATE
    top_ratios::site AS site,
    top_ratios::tops,
    top_gviews::tops,
    top_views::tops,
    top_copies::tops;

STORE top_urls INTO '$out_file1/topurls' USING $storage;

-----Original Message-----
From: Corbin Hoenes
Sent: Thursday, May 06, 2010 11:57 AM
To: Olga Natkovich
Subject: Re: SpillableMemoryManager - low memory handler called

I have attached the script... please let me know if you have more questions.

Search Discussions

Discussion Posts


Follow ups

Related Discussions

Discussion Navigation
viewthread | post
posts ‹ prev | 2 of 9 | next ›
Discussion Overview
group: user @
categories: pig, hadoop
posted: May 6, '10 at 9:18p
active: Oct 19, '11 at 7:00p



site design / logo © 2021 Grokbase