Grokbase Groups Pig user May 2010
FAQ
-----Original Message-----
From: Corbin Hoenes
Sent: Thursday, May 06, 2010 11:57 AM
To: Olga Natkovich
Subject: Re: SpillableMemoryManager - low memory handler called

I have attached the script... please let me know if you have more
questions.

Search Discussions

  • Olga Natkovich at May 6, 2010 at 9:25 pm
    Looks like attachments are not coming through. Here is the script from
    Corbin inline.

    One thing you might want to try is to switch your cogroups to skewed
    join and see if that solves the issue:

    http://hadoop.apache.org/pig/docs/r0.6.0/piglatin_ref1.html#Skewed+Joins

    Olga

    --------------------------------------------topurl.pig------------------
    -------------------------------------------
    set job.name 'Generate topurl reports for $out_file1'

    %default dir_prefix '../..'
    %default storage 'BinStorage()'
    %default tynt_udfs 'tynt-udfs.jar'
    %default topN '20'
    /* default to 30 days time period so that alltime report will get
    14*30=420 min page views*/
    %default timeperiod '30'
    %default min_page_views_per_day '14'

    register $dir_prefix/udfs/target/$tynt_udfs
    register $dir_prefix/udfs/lib/piggybank.jar

    ---------------------summarize address bar
    stats-----------------------------------
    addbar_stats = LOAD '$in_file1/addbarstats' USING $storage AS
    (site:chararray, url:chararray, guid:chararray, cnt:long);
    grouped_addbar_by_url = GROUP addbar_stats BY (site, url) PARALLEL 180;
    addbar_stats_by_url = FOREACH grouped_addbar_by_url GENERATE
    FLATTEN(group) AS (site, url), COUNT(addbar_stats) AS addbarcnt,
    SUM(addbar_stats.cnt) AS addbarvisits;
    STORE addbar_stats_by_url INTO '$out_file1/addbarstatsbyurl' USING
    $storage;

    grouped_addbar_stats_by_site = GROUP addbar_stats_by_url BY site
    PARALLEL 180;
    addbar_stats_by_site = FOREACH grouped_addbar_stats_by_site GENERATE
    group AS site, SUM(addbar_stats_by_url.addbarcnt) AS addbarcnt,
    SUM(addbar_stats_by_url.addbarvisits) AS addbarvisits;
    STORE addbar_stats_by_site INTO '$out_file1/addbarstatsbysite' USING
    $storage;

    ----------------------calculate
    ratio------------------------------------------
    clickstatsbyurl = LOAD '$in_file1/clickstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, cnt:long, tracecnt:long, tcnt:long,
    pcnt:long, wcnt:long, utracecnt:long, utcnt:long, upcnt:long,
    uwcnt:long);
    viewstatsbyurl = LOAD '$in_file1/viewstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, title:chararray, cnt:long, etcnt:long,
    et1cnt:long, et2cnt:long, et3cnt:long, et6cnt:long, et7cnt:long);

    light_clickstatsbyurl = FOREACH clickstatsbyurl GENERATE site, url, cnt;
    light_viewstatsbyurl_noisy = FOREACH viewstatsbyurl GENERATE site, url,
    title, cnt, etcnt;

    light_viewstatsbyurl = FILTER light_viewstatsbyurl_noisy BY url != '-';

    --light_addbarstatsbyurl = FOREACH addbar_stats_by_url GENERATE site,
    url, addbarvisits;
    --joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER,
    light_addbarstatsbyurl BY (site, url) OUTER;
    --flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
    --
    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt,
    --
    (IsEmpty(light_addbarstatsbyurl)?0:MAX(light_addbarstatsbyurl.addbarvisi
    ts)) as addbarcnt;

    joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER;
    flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),

    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt;

    ratio_by_url = FOREACH flattened_stats_for_ratio
    {
    generated_traffic = clickcnt+etcnt;
    total_traffic = cnt;
    ti =
    ((float)(generated_traffic))/((float)total_traffic);
    GENERATE site, url, title, ((ti>1)?(-ti):ti) AS
    ratio, generated_traffic AS gviews, total_traffic AS views;
    }

    ------------------------combined with
    #copies----------------------------------------
    copystatsbyurl = LOAD '$in_file1/copystatsbyurl' USING $storage AS
    (site:chararray, url:chararray, lcnt:long, scnt:long, icnt:long,
    acnt:long);
    light_copystatsbyurl = FOREACH copystatsbyurl GENERATE site, url,
    lcnt+scnt+icnt AS cnt;

    all_stats_by_url = COGROUP ratio_by_url BY (site, url) INNER,
    light_copystatsbyurl BY (site, url) OUTER PARALLEL 62;
    all_urls = FOREACH all_stats_by_url GENERATE FLATTEN(ratio_by_url) AS
    (site, url, title, ratio, gviews, views),
    (IsEmpty(light_copystatsbyurl)?0:MAX(light_copystatsbyurl.cnt)) as
    copies;

    grouped_urls_by_site = GROUP all_urls BY site;

    top_ratios = FOREACH grouped_urls_by_site
    {
    filtered_by_minpageviews = FILTER all_urls BY views >
    ($min_page_views_per_day*$timeperiod);
    order_by_ratio = ORDER filtered_by_minpageviews BY
    ratio DESC;
    top_by_ratio = LIMIT order_by_ratio $topN;
    GENERATE group AS site, top_by_ratio.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_gviews = FOREACH grouped_urls_by_site
    {
    order_by_gviews = ORDER all_urls BY gviews DESC;
    top_by_gviews = LIMIT order_by_gviews $topN;
    GENERATE group AS site, top_by_gviews.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_views = FOREACH grouped_urls_by_site
    {
    order_by_views = ORDER all_urls BY views DESC;
    top_by_views = LIMIT order_by_views $topN;
    GENERATE group AS site, top_by_views.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_copies = FOREACH grouped_urls_by_site
    {
    order_by_copies = ORDER all_urls BY copies DESC;
    top_by_copies = LIMIT order_by_copies $topN;
    GENERATE group AS site, top_by_copies.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    grouped_tops = JOIN top_ratios BY site, top_gviews BY site, top_views BY
    site, top_copies BY site;

    top_urls = FOREACH grouped_tops GENERATE top_ratios::site AS site,
    top_ratios::tops, top_gviews::tops, top_views::tops, top_copies::tops;

    store top_urls into '$out_file1/topurls' USING $storage;



    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:57 AM
    To: Olga Natkovich
    Subject: Re: SpillableMemoryManager - low memory handler called

    I have attached the script... please let me know if you have more
    questions.
  • Corbin Hoenes at May 6, 2010 at 9:50 pm
    Wondering if when we do a group like this:

    grouped_urls_by_site = GROUP all_urls BY site;

    if certain site has a lot of urls would they all have to be processed by the same mapper (e.g. a single key?) Could this account for why we have 8GB in one map and not many in others?
    On May 6, 2010, at 3:24 PM, Olga Natkovich wrote:

    Looks like attachments are not coming through. Here is the script from
    Corbin inline.

    One thing you might want to try is to switch your cogroups to skewed
    join and see if that solves the issue:

    http://hadoop.apache.org/pig/docs/r0.6.0/piglatin_ref1.html#Skewed+Joins

    Olga

    --------------------------------------------topurl.pig------------------
    -------------------------------------------
    set job.name 'Generate topurl reports for $out_file1'

    %default dir_prefix '../..'
    %default storage 'BinStorage()'
    %default tynt_udfs 'tynt-udfs.jar'
    %default topN '20'
    /* default to 30 days time period so that alltime report will get
    14*30=420 min page views*/
    %default timeperiod '30'
    %default min_page_views_per_day '14'

    register $dir_prefix/udfs/target/$tynt_udfs
    register $dir_prefix/udfs/lib/piggybank.jar

    ---------------------summarize address bar
    stats-----------------------------------
    addbar_stats = LOAD '$in_file1/addbarstats' USING $storage AS
    (site:chararray, url:chararray, guid:chararray, cnt:long);
    grouped_addbar_by_url = GROUP addbar_stats BY (site, url) PARALLEL 180;
    addbar_stats_by_url = FOREACH grouped_addbar_by_url GENERATE
    FLATTEN(group) AS (site, url), COUNT(addbar_stats) AS addbarcnt,
    SUM(addbar_stats.cnt) AS addbarvisits;
    STORE addbar_stats_by_url INTO '$out_file1/addbarstatsbyurl' USING
    $storage;

    grouped_addbar_stats_by_site = GROUP addbar_stats_by_url BY site
    PARALLEL 180;
    addbar_stats_by_site = FOREACH grouped_addbar_stats_by_site GENERATE
    group AS site, SUM(addbar_stats_by_url.addbarcnt) AS addbarcnt,
    SUM(addbar_stats_by_url.addbarvisits) AS addbarvisits;
    STORE addbar_stats_by_site INTO '$out_file1/addbarstatsbysite' USING
    $storage;

    ----------------------calculate
    ratio------------------------------------------
    clickstatsbyurl = LOAD '$in_file1/clickstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, cnt:long, tracecnt:long, tcnt:long,
    pcnt:long, wcnt:long, utracecnt:long, utcnt:long, upcnt:long,
    uwcnt:long);
    viewstatsbyurl = LOAD '$in_file1/viewstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, title:chararray, cnt:long, etcnt:long,
    et1cnt:long, et2cnt:long, et3cnt:long, et6cnt:long, et7cnt:long);

    light_clickstatsbyurl = FOREACH clickstatsbyurl GENERATE site, url, cnt;
    light_viewstatsbyurl_noisy = FOREACH viewstatsbyurl GENERATE site, url,
    title, cnt, etcnt;

    light_viewstatsbyurl = FILTER light_viewstatsbyurl_noisy BY url != '-';

    --light_addbarstatsbyurl = FOREACH addbar_stats_by_url GENERATE site,
    url, addbarvisits;
    --joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER,
    light_addbarstatsbyurl BY (site, url) OUTER;
    --flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
    --
    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt,
    --
    (IsEmpty(light_addbarstatsbyurl)?0:MAX(light_addbarstatsbyurl.addbarvisi
    ts)) as addbarcnt;

    joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER;
    flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),

    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt;

    ratio_by_url = FOREACH flattened_stats_for_ratio
    {
    generated_traffic = clickcnt+etcnt;
    total_traffic = cnt;
    ti =
    ((float)(generated_traffic))/((float)total_traffic);
    GENERATE site, url, title, ((ti>1)?(-ti):ti) AS
    ratio, generated_traffic AS gviews, total_traffic AS views;
    }

    ------------------------combined with
    #copies----------------------------------------
    copystatsbyurl = LOAD '$in_file1/copystatsbyurl' USING $storage AS
    (site:chararray, url:chararray, lcnt:long, scnt:long, icnt:long,
    acnt:long);
    light_copystatsbyurl = FOREACH copystatsbyurl GENERATE site, url,
    lcnt+scnt+icnt AS cnt;

    all_stats_by_url = COGROUP ratio_by_url BY (site, url) INNER,
    light_copystatsbyurl BY (site, url) OUTER PARALLEL 62;
    all_urls = FOREACH all_stats_by_url GENERATE FLATTEN(ratio_by_url) AS
    (site, url, title, ratio, gviews, views),
    (IsEmpty(light_copystatsbyurl)?0:MAX(light_copystatsbyurl.cnt)) as
    copies;

    grouped_urls_by_site = GROUP all_urls BY site;

    top_ratios = FOREACH grouped_urls_by_site
    {
    filtered_by_minpageviews = FILTER all_urls BY views >
    ($min_page_views_per_day*$timeperiod);
    order_by_ratio = ORDER filtered_by_minpageviews BY
    ratio DESC;
    top_by_ratio = LIMIT order_by_ratio $topN;
    GENERATE group AS site, top_by_ratio.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_gviews = FOREACH grouped_urls_by_site
    {
    order_by_gviews = ORDER all_urls BY gviews DESC;
    top_by_gviews = LIMIT order_by_gviews $topN;
    GENERATE group AS site, top_by_gviews.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_views = FOREACH grouped_urls_by_site
    {
    order_by_views = ORDER all_urls BY views DESC;
    top_by_views = LIMIT order_by_views $topN;
    GENERATE group AS site, top_by_views.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_copies = FOREACH grouped_urls_by_site
    {
    order_by_copies = ORDER all_urls BY copies DESC;
    top_by_copies = LIMIT order_by_copies $topN;
    GENERATE group AS site, top_by_copies.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    grouped_tops = JOIN top_ratios BY site, top_gviews BY site, top_views BY
    site, top_copies BY site;

    top_urls = FOREACH grouped_tops GENERATE top_ratios::site AS site,
    top_ratios::tops, top_gviews::tops, top_views::tops, top_copies::tops;

    store top_urls into '$out_file1/topurls' USING $storage;



    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:57 AM
    To: Olga Natkovich
    Subject: Re: SpillableMemoryManager - low memory handler called

    I have attached the script... please let me know if you have more
    questions.

    On May 6, 2010, at 12:36 PM, Olga Natkovich wrote:

    This is just a warning saying that your job is spilling to the disk.
    Please, if you can, post a script that is causing this issue. In 0.6.0
    we moved large chunk of the code away from using SpillableMemoryManager
    but it is still used in some places. More changes are coming in 0.7.0 as
    well.

    Olga

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:31 AM
    To: pig-user@hadoop.apache.org
    Subject: Re: SpillableMemoryManager - low memory handler called

    0.6

    Sent from my iPhone

    On May 6, 2010, at 12:16 PM, "Olga Natkovich" <olgan@yahoo-inc.com>
    wrote:
    Which version of Pig are you using?

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 10:29 AM
    To: pig-user@hadoop.apache.org
    Subject: SpillableMemoryManager - low memory handler called

    Hi Piggers - Seeing an issue with a particular script where our job is
    taking 6hrs 42min to complete.

    syslogs are showing loads of these:
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used =
    283443200(276800K) committed = 357957632(349568K) max =
    357957632(349568K)
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used =
    267128840(260868K) committed = 357957632(349568K) max =
    357957632(349568K)
    One interesting thing is it's the map phase that is slow and one of the
    mappers is getting 8GB of input while the other 2000 or so mappers are
    getting MBs and hundreds of MBs of data.

    Any where I can start looking?
  • Olga Natkovich at May 6, 2010 at 10:18 pm
    Groups are processed on the reduce side with combiner pre-aggregating on
    the map side. A single map by default gets a fixed chunk of data and not
    the entire key. The only way I can see a huge map happening is if you
    have one really huge record somewhere.

    Olga

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 2:31 PM
    To: pig-user@hadoop.apache.org
    Subject: Re: SpillableMemoryManager - low memory handler called

    Wondering if when we do a group like this:

    grouped_urls_by_site = GROUP all_urls BY site;

    if certain site has a lot of urls would they all have to be processed by
    the same mapper (e.g. a single key?) Could this account for why we have
    8GB in one map and not many in others?
    On May 6, 2010, at 3:24 PM, Olga Natkovich wrote:

    Looks like attachments are not coming through. Here is the script from
    Corbin inline.

    One thing you might want to try is to switch your cogroups to skewed
    join and see if that solves the issue:

    http://hadoop.apache.org/pig/docs/r0.6.0/piglatin_ref1.html#Skewed+Joins
    Olga

    --------------------------------------------topurl.pig------------------
    -------------------------------------------
    set job.name 'Generate topurl reports for $out_file1'

    %default dir_prefix '../..'
    %default storage 'BinStorage()'
    %default tynt_udfs 'tynt-udfs.jar'
    %default topN '20'
    /* default to 30 days time period so that alltime report will get
    14*30=420 min page views*/
    %default timeperiod '30'
    %default min_page_views_per_day '14'

    register $dir_prefix/udfs/target/$tynt_udfs
    register $dir_prefix/udfs/lib/piggybank.jar

    ---------------------summarize address bar
    stats-----------------------------------
    addbar_stats = LOAD '$in_file1/addbarstats' USING $storage AS
    (site:chararray, url:chararray, guid:chararray, cnt:long);
    grouped_addbar_by_url = GROUP addbar_stats BY (site, url) PARALLEL 180;
    addbar_stats_by_url = FOREACH grouped_addbar_by_url GENERATE
    FLATTEN(group) AS (site, url), COUNT(addbar_stats) AS addbarcnt,
    SUM(addbar_stats.cnt) AS addbarvisits;
    STORE addbar_stats_by_url INTO '$out_file1/addbarstatsbyurl' USING
    $storage;

    grouped_addbar_stats_by_site = GROUP addbar_stats_by_url BY site
    PARALLEL 180;
    addbar_stats_by_site = FOREACH grouped_addbar_stats_by_site GENERATE
    group AS site, SUM(addbar_stats_by_url.addbarcnt) AS addbarcnt,
    SUM(addbar_stats_by_url.addbarvisits) AS addbarvisits;
    STORE addbar_stats_by_site INTO '$out_file1/addbarstatsbysite' USING
    $storage;

    ----------------------calculate
    ratio------------------------------------------
    clickstatsbyurl = LOAD '$in_file1/clickstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, cnt:long, tracecnt:long, tcnt:long,
    pcnt:long, wcnt:long, utracecnt:long, utcnt:long, upcnt:long,
    uwcnt:long);
    viewstatsbyurl = LOAD '$in_file1/viewstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, title:chararray, cnt:long, etcnt:long,
    et1cnt:long, et2cnt:long, et3cnt:long, et6cnt:long, et7cnt:long);

    light_clickstatsbyurl = FOREACH clickstatsbyurl GENERATE site, url, cnt;
    light_viewstatsbyurl_noisy = FOREACH viewstatsbyurl GENERATE site, url,
    title, cnt, etcnt;

    light_viewstatsbyurl = FILTER light_viewstatsbyurl_noisy BY url != '-';
    --light_addbarstatsbyurl = FOREACH addbar_stats_by_url GENERATE site,
    url, addbarvisits;
    --joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER,
    light_addbarstatsbyurl BY (site, url) OUTER;
    --flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
    --
    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt,
    --
    (IsEmpty(light_addbarstatsbyurl)?0:MAX(light_addbarstatsbyurl.addbarvisi
    ts)) as addbarcnt;

    joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER;
    flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),

    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt;

    ratio_by_url = FOREACH flattened_stats_for_ratio
    {
    generated_traffic = clickcnt+etcnt;
    total_traffic = cnt;
    ti =
    ((float)(generated_traffic))/((float)total_traffic);
    GENERATE site, url, title, ((ti>1)?(-ti):ti) AS
    ratio, generated_traffic AS gviews, total_traffic AS views;
    }

    ------------------------combined with
    #copies----------------------------------------
    copystatsbyurl = LOAD '$in_file1/copystatsbyurl' USING $storage AS
    (site:chararray, url:chararray, lcnt:long, scnt:long, icnt:long,
    acnt:long);
    light_copystatsbyurl = FOREACH copystatsbyurl GENERATE site, url,
    lcnt+scnt+icnt AS cnt;

    all_stats_by_url = COGROUP ratio_by_url BY (site, url) INNER,
    light_copystatsbyurl BY (site, url) OUTER PARALLEL 62;
    all_urls = FOREACH all_stats_by_url GENERATE FLATTEN(ratio_by_url) AS
    (site, url, title, ratio, gviews, views),
    (IsEmpty(light_copystatsbyurl)?0:MAX(light_copystatsbyurl.cnt)) as
    copies;

    grouped_urls_by_site = GROUP all_urls BY site;

    top_ratios = FOREACH grouped_urls_by_site
    {
    filtered_by_minpageviews = FILTER all_urls BY views >
    ($min_page_views_per_day*$timeperiod);
    order_by_ratio = ORDER filtered_by_minpageviews BY
    ratio DESC;
    top_by_ratio = LIMIT order_by_ratio $topN;
    GENERATE group AS site, top_by_ratio.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_gviews = FOREACH grouped_urls_by_site
    {
    order_by_gviews = ORDER all_urls BY gviews DESC;
    top_by_gviews = LIMIT order_by_gviews $topN;
    GENERATE group AS site, top_by_gviews.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_views = FOREACH grouped_urls_by_site
    {
    order_by_views = ORDER all_urls BY views DESC;
    top_by_views = LIMIT order_by_views $topN;
    GENERATE group AS site, top_by_views.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_copies = FOREACH grouped_urls_by_site
    {
    order_by_copies = ORDER all_urls BY copies DESC;
    top_by_copies = LIMIT order_by_copies $topN;
    GENERATE group AS site, top_by_copies.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    grouped_tops = JOIN top_ratios BY site, top_gviews BY site, top_views BY
    site, top_copies BY site;

    top_urls = FOREACH grouped_tops GENERATE top_ratios::site AS site,
    top_ratios::tops, top_gviews::tops, top_views::tops, top_copies::tops;
    store top_urls into '$out_file1/topurls' USING $storage;



    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:57 AM
    To: Olga Natkovich
    Subject: Re: SpillableMemoryManager - low memory handler called

    I have attached the script... please let me know if you have more
    questions.

    On May 6, 2010, at 12:36 PM, Olga Natkovich wrote:

    This is just a warning saying that your job is spilling to the disk.
    Please, if you can, post a script that is causing this issue. In
    0.6.0
    we moved large chunk of the code away from using
    SpillableMemoryManager
    but it is still used in some places. More changes are coming in 0.7.0
    as
    well.

    Olga

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:31 AM
    To: pig-user@hadoop.apache.org
    Subject: Re: SpillableMemoryManager - low memory handler called

    0.6

    Sent from my iPhone

    On May 6, 2010, at 12:16 PM, "Olga Natkovich" <olgan@yahoo-inc.com>
    wrote:
    Which version of Pig are you using?

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 10:29 AM
    To: pig-user@hadoop.apache.org
    Subject: SpillableMemoryManager - low memory handler called

    Hi Piggers - Seeing an issue with a particular script where our job
    is
    taking 6hrs 42min to complete.

    syslogs are showing loads of these:
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used
    =
    283443200(276800K) committed = 357957632(349568K) max =
    357957632(349568K)
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used
    =
    267128840(260868K) committed = 357957632(349568K) max =
    357957632(349568K)
    One interesting thing is it's the map phase that is slow and one of
    the
    mappers is getting 8GB of input while the other 2000 or so mappers
    are
    getting MBs and hundreds of MBs of data.

    Any where I can start looking?
  • Corbin Hoenes at May 7, 2010 at 2:02 pm
    Okay we did some filtering on the all_urls in the nested FOREACH and that seems to fix the performance issue. There are still some mappers that get 8GB of data but the job went down to 2 hours.

    From Dmitry's reply, it sounds like the low memory handler output in the logs is misleading.
    On May 6, 2010, at 3:30 PM, Corbin Hoenes wrote:

    Wondering if when we do a group like this:

    grouped_urls_by_site = GROUP all_urls BY site;

    if certain site has a lot of urls would they all have to be processed by the same mapper (e.g. a single key?) Could this account for why we have 8GB in one map and not many in others?
    On May 6, 2010, at 3:24 PM, Olga Natkovich wrote:

    Looks like attachments are not coming through. Here is the script from
    Corbin inline.

    One thing you might want to try is to switch your cogroups to skewed
    join and see if that solves the issue:

    http://hadoop.apache.org/pig/docs/r0.6.0/piglatin_ref1.html#Skewed+Joins

    Olga

    --------------------------------------------topurl.pig------------------
    -------------------------------------------
    set job.name 'Generate topurl reports for $out_file1'

    %default dir_prefix '../..'
    %default storage 'BinStorage()'
    %default tynt_udfs 'tynt-udfs.jar'
    %default topN '20'
    /* default to 30 days time period so that alltime report will get
    14*30=420 min page views*/
    %default timeperiod '30'
    %default min_page_views_per_day '14'

    register $dir_prefix/udfs/target/$tynt_udfs
    register $dir_prefix/udfs/lib/piggybank.jar

    ---------------------summarize address bar
    stats-----------------------------------
    addbar_stats = LOAD '$in_file1/addbarstats' USING $storage AS
    (site:chararray, url:chararray, guid:chararray, cnt:long);
    grouped_addbar_by_url = GROUP addbar_stats BY (site, url) PARALLEL 180;
    addbar_stats_by_url = FOREACH grouped_addbar_by_url GENERATE
    FLATTEN(group) AS (site, url), COUNT(addbar_stats) AS addbarcnt,
    SUM(addbar_stats.cnt) AS addbarvisits;
    STORE addbar_stats_by_url INTO '$out_file1/addbarstatsbyurl' USING
    $storage;

    grouped_addbar_stats_by_site = GROUP addbar_stats_by_url BY site
    PARALLEL 180;
    addbar_stats_by_site = FOREACH grouped_addbar_stats_by_site GENERATE
    group AS site, SUM(addbar_stats_by_url.addbarcnt) AS addbarcnt,
    SUM(addbar_stats_by_url.addbarvisits) AS addbarvisits;
    STORE addbar_stats_by_site INTO '$out_file1/addbarstatsbysite' USING
    $storage;

    ----------------------calculate
    ratio------------------------------------------
    clickstatsbyurl = LOAD '$in_file1/clickstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, cnt:long, tracecnt:long, tcnt:long,
    pcnt:long, wcnt:long, utracecnt:long, utcnt:long, upcnt:long,
    uwcnt:long);
    viewstatsbyurl = LOAD '$in_file1/viewstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, title:chararray, cnt:long, etcnt:long,
    et1cnt:long, et2cnt:long, et3cnt:long, et6cnt:long, et7cnt:long);

    light_clickstatsbyurl = FOREACH clickstatsbyurl GENERATE site, url, cnt;
    light_viewstatsbyurl_noisy = FOREACH viewstatsbyurl GENERATE site, url,
    title, cnt, etcnt;

    light_viewstatsbyurl = FILTER light_viewstatsbyurl_noisy BY url != '-';

    --light_addbarstatsbyurl = FOREACH addbar_stats_by_url GENERATE site,
    url, addbarvisits;
    --joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER,
    light_addbarstatsbyurl BY (site, url) OUTER;
    --flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
    --
    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt,
    --
    (IsEmpty(light_addbarstatsbyurl)?0:MAX(light_addbarstatsbyurl.addbarvisi
    ts)) as addbarcnt;

    joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER;
    flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),

    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt;

    ratio_by_url = FOREACH flattened_stats_for_ratio
    {
    generated_traffic = clickcnt+etcnt;
    total_traffic = cnt;
    ti =
    ((float)(generated_traffic))/((float)total_traffic);
    GENERATE site, url, title, ((ti>1)?(-ti):ti) AS
    ratio, generated_traffic AS gviews, total_traffic AS views;
    }

    ------------------------combined with
    #copies----------------------------------------
    copystatsbyurl = LOAD '$in_file1/copystatsbyurl' USING $storage AS
    (site:chararray, url:chararray, lcnt:long, scnt:long, icnt:long,
    acnt:long);
    light_copystatsbyurl = FOREACH copystatsbyurl GENERATE site, url,
    lcnt+scnt+icnt AS cnt;

    all_stats_by_url = COGROUP ratio_by_url BY (site, url) INNER,
    light_copystatsbyurl BY (site, url) OUTER PARALLEL 62;
    all_urls = FOREACH all_stats_by_url GENERATE FLATTEN(ratio_by_url) AS
    (site, url, title, ratio, gviews, views),
    (IsEmpty(light_copystatsbyurl)?0:MAX(light_copystatsbyurl.cnt)) as
    copies;

    grouped_urls_by_site = GROUP all_urls BY site;

    top_ratios = FOREACH grouped_urls_by_site
    {
    filtered_by_minpageviews = FILTER all_urls BY views >
    ($min_page_views_per_day*$timeperiod);
    order_by_ratio = ORDER filtered_by_minpageviews BY
    ratio DESC;
    top_by_ratio = LIMIT order_by_ratio $topN;
    GENERATE group AS site, top_by_ratio.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_gviews = FOREACH grouped_urls_by_site
    {
    order_by_gviews = ORDER all_urls BY gviews DESC;
    top_by_gviews = LIMIT order_by_gviews $topN;
    GENERATE group AS site, top_by_gviews.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_views = FOREACH grouped_urls_by_site
    {
    order_by_views = ORDER all_urls BY views DESC;
    top_by_views = LIMIT order_by_views $topN;
    GENERATE group AS site, top_by_views.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_copies = FOREACH grouped_urls_by_site
    {
    order_by_copies = ORDER all_urls BY copies DESC;
    top_by_copies = LIMIT order_by_copies $topN;
    GENERATE group AS site, top_by_copies.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    grouped_tops = JOIN top_ratios BY site, top_gviews BY site, top_views BY
    site, top_copies BY site;

    top_urls = FOREACH grouped_tops GENERATE top_ratios::site AS site,
    top_ratios::tops, top_gviews::tops, top_views::tops, top_copies::tops;

    store top_urls into '$out_file1/topurls' USING $storage;



    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:57 AM
    To: Olga Natkovich
    Subject: Re: SpillableMemoryManager - low memory handler called

    I have attached the script... please let me know if you have more
    questions.

    On May 6, 2010, at 12:36 PM, Olga Natkovich wrote:

    This is just a warning saying that your job is spilling to the disk.
    Please, if you can, post a script that is causing this issue. In 0.6.0
    we moved large chunk of the code away from using SpillableMemoryManager
    but it is still used in some places. More changes are coming in 0.7.0 as
    well.

    Olga

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:31 AM
    To: pig-user@hadoop.apache.org
    Subject: Re: SpillableMemoryManager - low memory handler called

    0.6

    Sent from my iPhone

    On May 6, 2010, at 12:16 PM, "Olga Natkovich" <olgan@yahoo-inc.com>
    wrote:
    Which version of Pig are you using?

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 10:29 AM
    To: pig-user@hadoop.apache.org
    Subject: SpillableMemoryManager - low memory handler called

    Hi Piggers - Seeing an issue with a particular script where our job is
    taking 6hrs 42min to complete.

    syslogs are showing loads of these:
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used =
    283443200(276800K) committed = 357957632(349568K) max =
    357957632(349568K)
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used =
    267128840(260868K) committed = 357957632(349568K) max =
    357957632(349568K)
    One interesting thing is it's the map phase that is slow and one of the
    mappers is getting 8GB of input while the other 2000 or so mappers are
    getting MBs and hundreds of MBs of data.

    Anywhere I can start looking?
  • Mridul Muralidharan at May 7, 2010 at 9:46 pm
    Hi,


    Do you know which snippet in the script is causing the issue ?
    There are multiple MR jobs which will be executed, what is causing the
    exact issue ?


    Map side spills is strange - are you sure it is not in the reducer ?

    If it really is in the map side, I guess it is pointing to the case
    where a single record (which is being read) is so large that it is
    triggering spills : this could theoretically be possible if you store
    and then load it again - where reducers are set to higher memory than
    mappers. Though I did not see something like this happening here ...



    Regards,
    Mridul



    On Friday 07 May 2010 03:00 AM, Corbin Hoenes wrote:
    Wondering if when we do a group like this:

    grouped_urls_by_site = GROUP all_urls BY site;

    if certain site has a lot of urls would they all have to be processed by the same mapper (e.g. a single key?) Could this account for why we have 8GB in one map and not many in others?
    On May 6, 2010, at 3:24 PM, Olga Natkovich wrote:

    Looks like attachments are not coming through. Here is the script from
    Corbin inline.

    One thing you might want to try is to switch your cogroups to skewed
    join and see if that solves the issue:

    http://hadoop.apache.org/pig/docs/r0.6.0/piglatin_ref1.html#Skewed+Joins

    Olga

    --------------------------------------------topurl.pig------------------
    -------------------------------------------
    set job.name 'Generate topurl reports for $out_file1'

    %default dir_prefix '../..'
    %default storage 'BinStorage()'
    %default tynt_udfs 'tynt-udfs.jar'
    %default topN '20'
    /* default to 30 days time period so that alltime report will get
    14*30=420 min page views*/
    %default timeperiod '30'
    %default min_page_views_per_day '14'

    register $dir_prefix/udfs/target/$tynt_udfs
    register $dir_prefix/udfs/lib/piggybank.jar

    ---------------------summarize address bar
    stats-----------------------------------
    addbar_stats = LOAD '$in_file1/addbarstats' USING $storage AS
    (site:chararray, url:chararray, guid:chararray, cnt:long);
    grouped_addbar_by_url = GROUP addbar_stats BY (site, url) PARALLEL 180;
    addbar_stats_by_url = FOREACH grouped_addbar_by_url GENERATE
    FLATTEN(group) AS (site, url), COUNT(addbar_stats) AS addbarcnt,
    SUM(addbar_stats.cnt) AS addbarvisits;
    STORE addbar_stats_by_url INTO '$out_file1/addbarstatsbyurl' USING
    $storage;

    grouped_addbar_stats_by_site = GROUP addbar_stats_by_url BY site
    PARALLEL 180;
    addbar_stats_by_site = FOREACH grouped_addbar_stats_by_site GENERATE
    group AS site, SUM(addbar_stats_by_url.addbarcnt) AS addbarcnt,
    SUM(addbar_stats_by_url.addbarvisits) AS addbarvisits;
    STORE addbar_stats_by_site INTO '$out_file1/addbarstatsbysite' USING
    $storage;

    ----------------------calculate
    ratio------------------------------------------
    clickstatsbyurl = LOAD '$in_file1/clickstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, cnt:long, tracecnt:long, tcnt:long,
    pcnt:long, wcnt:long, utracecnt:long, utcnt:long, upcnt:long,
    uwcnt:long);
    viewstatsbyurl = LOAD '$in_file1/viewstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, title:chararray, cnt:long, etcnt:long,
    et1cnt:long, et2cnt:long, et3cnt:long, et6cnt:long, et7cnt:long);

    light_clickstatsbyurl = FOREACH clickstatsbyurl GENERATE site, url, cnt;
    light_viewstatsbyurl_noisy = FOREACH viewstatsbyurl GENERATE site, url,
    title, cnt, etcnt;

    light_viewstatsbyurl = FILTER light_viewstatsbyurl_noisy BY url != '-';

    --light_addbarstatsbyurl = FOREACH addbar_stats_by_url GENERATE site,
    url, addbarvisits;
    --joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER,
    light_addbarstatsbyurl BY (site, url) OUTER;
    --flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
    --
    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt,
    --
    (IsEmpty(light_addbarstatsbyurl)?0:MAX(light_addbarstatsbyurl.addbarvisi
    ts)) as addbarcnt;

    joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER;
    flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),

    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt;

    ratio_by_url = FOREACH flattened_stats_for_ratio
    {
    generated_traffic = clickcnt+etcnt;
    total_traffic = cnt;
    ti =
    ((float)(generated_traffic))/((float)total_traffic);
    GENERATE site, url, title, ((ti>1)?(-ti):ti) AS
    ratio, generated_traffic AS gviews, total_traffic AS views;
    }

    ------------------------combined with
    #copies----------------------------------------
    copystatsbyurl = LOAD '$in_file1/copystatsbyurl' USING $storage AS
    (site:chararray, url:chararray, lcnt:long, scnt:long, icnt:long,
    acnt:long);
    light_copystatsbyurl = FOREACH copystatsbyurl GENERATE site, url,
    lcnt+scnt+icnt AS cnt;

    all_stats_by_url = COGROUP ratio_by_url BY (site, url) INNER,
    light_copystatsbyurl BY (site, url) OUTER PARALLEL 62;
    all_urls = FOREACH all_stats_by_url GENERATE FLATTEN(ratio_by_url) AS
    (site, url, title, ratio, gviews, views),
    (IsEmpty(light_copystatsbyurl)?0:MAX(light_copystatsbyurl.cnt)) as
    copies;

    grouped_urls_by_site = GROUP all_urls BY site;

    top_ratios = FOREACH grouped_urls_by_site
    {
    filtered_by_minpageviews = FILTER all_urls BY views >
    ($min_page_views_per_day*$timeperiod);
    order_by_ratio = ORDER filtered_by_minpageviews BY
    ratio DESC;
    top_by_ratio = LIMIT order_by_ratio $topN;
    GENERATE group AS site, top_by_ratio.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_gviews = FOREACH grouped_urls_by_site
    {
    order_by_gviews = ORDER all_urls BY gviews DESC;
    top_by_gviews = LIMIT order_by_gviews $topN;
    GENERATE group AS site, top_by_gviews.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_views = FOREACH grouped_urls_by_site
    {
    order_by_views = ORDER all_urls BY views DESC;
    top_by_views = LIMIT order_by_views $topN;
    GENERATE group AS site, top_by_views.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_copies = FOREACH grouped_urls_by_site
    {
    order_by_copies = ORDER all_urls BY copies DESC;
    top_by_copies = LIMIT order_by_copies $topN;
    GENERATE group AS site, top_by_copies.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    grouped_tops = JOIN top_ratios BY site, top_gviews BY site, top_views BY
    site, top_copies BY site;

    top_urls = FOREACH grouped_tops GENERATE top_ratios::site AS site,
    top_ratios::tops, top_gviews::tops, top_views::tops, top_copies::tops;

    store top_urls into '$out_file1/topurls' USING $storage;



    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:57 AM
    To: Olga Natkovich
    Subject: Re: SpillableMemoryManager - low memory handler called

    I have attached the script... please let me know if you have more
    questions.

    On May 6, 2010, at 12:36 PM, Olga Natkovich wrote:

    This is just a warning saying that your job is spilling to the disk.
    Please, if you can, post a script that is causing this issue. In 0.6.0
    we moved large chunk of the code away from using SpillableMemoryManager
    but it is still used in some places. More changes are coming in 0.7.0 as
    well.

    Olga

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:31 AM
    To: pig-user@hadoop.apache.org
    Subject: Re: SpillableMemoryManager - low memory handler called

    0.6

    Sent from my iPhone

    On May 6, 2010, at 12:16 PM, "Olga Natkovich"<olgan@yahoo-inc.com>
    wrote:
    Which version of Pig are you using?

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 10:29 AM
    To: pig-user@hadoop.apache.org
    Subject: SpillableMemoryManager - low memory handler called

    Hi Piggers - Seeing an issue with a particular script where our job is
    taking 6hrs 42min to complete.

    syslogs are showing loads of these:
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used =
    283443200(276800K) committed = 357957632(349568K) max =
    357957632(349568K)
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used =
    267128840(260868K) committed = 357957632(349568K) max =
    357957632(349568K)
    One interesting thing is it's the map phase that is slow and one of the
    mappers is getting 8GB of input while the other 2000 or so mappers are
    getting MBs and hundreds of MBs of data.

    Anywhere I can start looking?
  • Dmitriy Ryaboy at May 7, 2010 at 9:57 pm
    Mridul,
    like I said earlier -- unfortunately at the moment those log lines
    only mean that the Java garbage collector is invoked, not that
    anything is actually spilled. This gets printed out even if there are
    no spillable objects the Manager is aware of. An 8G map will certainly
    trigger the GC.

    On Fri, May 7, 2010 at 2:44 PM, Mridul Muralidharan
    wrote:
    Hi,


    Do you know which snippet in the script is causing the issue ?
    There are multiple MR jobs which will be executed, what is causing the exact
    issue ?


    Map side spills is strange - are you sure it is not in the reducer ?

    If it really is in the map side, I guess it is pointing to the case where a
    single record (which is being read) is so large that it is triggering spills
    : this could theoretically be possible if you store and then load it again -
    where reducers are set to higher memory than mappers. Though I did not see
    something like this happening here ...



    Regards,
    Mridul



    On Friday 07 May 2010 03:00 AM, Corbin Hoenes wrote:

    Wondering if when we do a group like this:

    grouped_urls_by_site = GROUP all_urls BY site;

    if certain site has a lot of urls would they all have to be processed by
    the same mapper (e.g. a single key?)  Could this account for why we have 8GB
    in one map and not many in others?
    On May 6, 2010, at 3:24 PM, Olga Natkovich wrote:

    Looks like attachments are not coming through. Here is the script from
    Corbin inline.

    One thing you might want to try is to switch your cogroups to skewed
    join and see if that solves the issue:

    http://hadoop.apache.org/pig/docs/r0.6.0/piglatin_ref1.html#Skewed+Joins

    Olga

    --------------------------------------------topurl.pig------------------
    -------------------------------------------
    set job.name 'Generate topurl reports for $out_file1'

    %default dir_prefix '../..'
    %default storage 'BinStorage()'
    %default tynt_udfs 'tynt-udfs.jar'
    %default topN '20'
    /* default to 30 days time period so that alltime report will get
    14*30=420 min page views*/
    %default timeperiod '30'
    %default min_page_views_per_day '14'

    register $dir_prefix/udfs/target/$tynt_udfs
    register $dir_prefix/udfs/lib/piggybank.jar

    ---------------------summarize address bar
    stats-----------------------------------
    addbar_stats = LOAD '$in_file1/addbarstats' USING $storage AS
    (site:chararray, url:chararray, guid:chararray, cnt:long);
    grouped_addbar_by_url = GROUP addbar_stats BY (site, url) PARALLEL 180;
    addbar_stats_by_url = FOREACH grouped_addbar_by_url GENERATE
    FLATTEN(group) AS (site, url), COUNT(addbar_stats) AS addbarcnt,
    SUM(addbar_stats.cnt) AS addbarvisits;
    STORE addbar_stats_by_url INTO '$out_file1/addbarstatsbyurl' USING
    $storage;

    grouped_addbar_stats_by_site = GROUP addbar_stats_by_url BY site
    PARALLEL 180;
    addbar_stats_by_site = FOREACH grouped_addbar_stats_by_site GENERATE
    group AS site, SUM(addbar_stats_by_url.addbarcnt) AS addbarcnt,
    SUM(addbar_stats_by_url.addbarvisits) AS addbarvisits;
    STORE addbar_stats_by_site INTO '$out_file1/addbarstatsbysite' USING
    $storage;

    ----------------------calculate
    ratio------------------------------------------
    clickstatsbyurl = LOAD '$in_file1/clickstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, cnt:long, tracecnt:long, tcnt:long,
    pcnt:long, wcnt:long, utracecnt:long, utcnt:long, upcnt:long,
    uwcnt:long);
    viewstatsbyurl = LOAD '$in_file1/viewstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, title:chararray, cnt:long, etcnt:long,
    et1cnt:long, et2cnt:long, et3cnt:long, et6cnt:long, et7cnt:long);

    light_clickstatsbyurl = FOREACH clickstatsbyurl GENERATE site, url, cnt;
    light_viewstatsbyurl_noisy = FOREACH viewstatsbyurl GENERATE site, url,
    title, cnt, etcnt;

    light_viewstatsbyurl = FILTER light_viewstatsbyurl_noisy BY url != '-';

    --light_addbarstatsbyurl = FOREACH addbar_stats_by_url GENERATE site,
    url, addbarvisits;
    --joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER,
    light_addbarstatsbyurl BY (site, url) OUTER;
    --flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
    --
    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt,
    --
    (IsEmpty(light_addbarstatsbyurl)?0:MAX(light_addbarstatsbyurl.addbarvisi
    ts)) as addbarcnt;

    joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER;
    flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),

    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt;

    ratio_by_url = FOREACH flattened_stats_for_ratio
    {
    generated_traffic = clickcnt+etcnt;
    total_traffic = cnt;
    ti =
    ((float)(generated_traffic))/((float)total_traffic);
    GENERATE site, url, title, ((ti>1)?(-ti):ti) AS
    ratio, generated_traffic AS gviews, total_traffic AS views;
    }

    ------------------------combined with
    #copies----------------------------------------
    copystatsbyurl = LOAD '$in_file1/copystatsbyurl' USING $storage AS
    (site:chararray, url:chararray, lcnt:long, scnt:long, icnt:long,
    acnt:long);
    light_copystatsbyurl = FOREACH copystatsbyurl GENERATE site, url,
    lcnt+scnt+icnt AS cnt;

    all_stats_by_url = COGROUP ratio_by_url BY (site, url) INNER,
    light_copystatsbyurl BY (site, url) OUTER PARALLEL 62;
    all_urls = FOREACH all_stats_by_url GENERATE FLATTEN(ratio_by_url) AS
    (site, url, title, ratio, gviews, views),
    (IsEmpty(light_copystatsbyurl)?0:MAX(light_copystatsbyurl.cnt)) as
    copies;

    grouped_urls_by_site = GROUP all_urls BY site;

    top_ratios = FOREACH grouped_urls_by_site
    {
    filtered_by_minpageviews = FILTER all_urls BY views >
    ($min_page_views_per_day*$timeperiod);
    order_by_ratio = ORDER filtered_by_minpageviews BY
    ratio DESC;
    top_by_ratio = LIMIT order_by_ratio $topN;
    GENERATE group AS site, top_by_ratio.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_gviews = FOREACH grouped_urls_by_site
    {
    order_by_gviews = ORDER all_urls BY gviews DESC;
    top_by_gviews = LIMIT order_by_gviews $topN;
    GENERATE group AS site, top_by_gviews.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_views = FOREACH grouped_urls_by_site
    {
    order_by_views = ORDER all_urls BY views DESC;
    top_by_views = LIMIT order_by_views $topN;
    GENERATE group AS site, top_by_views.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_copies = FOREACH grouped_urls_by_site
    {
    order_by_copies = ORDER all_urls BY copies DESC;
    top_by_copies = LIMIT order_by_copies $topN;
    GENERATE group AS site, top_by_copies.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    grouped_tops = JOIN top_ratios BY site, top_gviews BY site, top_views BY
    site, top_copies BY site;

    top_urls = FOREACH grouped_tops GENERATE top_ratios::site AS site,
    top_ratios::tops, top_gviews::tops, top_views::tops, top_copies::tops;

    store top_urls into '$out_file1/topurls' USING $storage;



    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:57 AM
    To: Olga Natkovich
    Subject: Re: SpillableMemoryManager - low memory handler called

    I have attached the script... please let me know if you have more
    questions.

    On May 6, 2010, at 12:36 PM, Olga Natkovich wrote:

    This is just a warning saying that your job is spilling to the disk.
    Please, if you can, post a script that is causing this issue. In 0.6.0
    we moved large chunk of the code away from using SpillableMemoryManager
    but it is still used in some places. More changes are coming in 0.7.0 as
    well.

    Olga

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:31 AM
    To: pig-user@hadoop.apache.org
    Subject: Re: SpillableMemoryManager - low memory handler called

    0.6

    Sent from my iPhone

    On May 6, 2010, at 12:16 PM, "Olga Natkovich"<olgan@yahoo-inc.com>
    wrote:
    Which version of Pig are you using?

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 10:29 AM
    To: pig-user@hadoop.apache.org
    Subject: SpillableMemoryManager - low memory handler called

    Hi Piggers - Seeing an issue with a particular script where our job is
    taking 6hrs 42min to complete.

    syslogs are showing loads of these:
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used =
    283443200(276800K) committed = 357957632(349568K) max =
    357957632(349568K)
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used =
    267128840(260868K) committed = 357957632(349568K) max =
    357957632(349568K)
    One interesting thing is it's the map phase that is slow and one of the
    mappers is getting 8GB of input while the other 2000 or so mappers are
    getting MBs and hundreds of MBs of data.

    Anywhere I can start looking?
  • Mridul Muralidharan at May 10, 2010 at 9:21 am
    The main memory intensive aspect of pig are -
    a) bags with large number of tuples,
    b) udf consuming memory,
    c) very very large tuples.
    d) bugs/memory leaks in pig/udf.

    (a) is the most common case - particularly because of joins, cogroups,
    pig internally using bag's in its pipeline, etc.

    There is nothing much we can do about (c) - since it is a practical
    limitation which affects direct mapreduce too.

    (b) is rare, though possible - and requires user intervention : iirc
    none of the stock pig udf's exhibit this issue.

    (d) is a remote possibility, but hopefully it is not the case !



    It is unusual for mapper to run out of memory except for (b), (c).

    Unless ...
    Now, if the reducer could generate this - but the mapper could not
    consume the output of the reducer, it typically points to very large
    bags such that it fits into reducer's memory, but since mapper has less
    memory than reducer (usual hadoop config atleast in y!), it fails there.

    Regards,
    Mridul

    On Saturday 08 May 2010 03:26 AM, Dmitriy Ryaboy wrote:
    Mridul,
    like I said earlier -- unfortunately at the moment those log lines
    only mean that the Java garbage collector is invoked, not that
    anything is actually spilled. This gets printed out even if there are
    no spillable objects the Manager is aware of. An 8G map will certainly
    trigger the GC.

    On Fri, May 7, 2010 at 2:44 PM, Mridul Muralidharan
    wrote:
    Hi,


    Do you know which snippet in the script is causing the issue ?
    There are multiple MR jobs which will be executed, what is causing the exact
    issue ?


    Map side spills is strange - are you sure it is not in the reducer ?

    If it really is in the map side, I guess it is pointing to the case where a
    single record (which is being read) is so large that it is triggering spills
    : this could theoretically be possible if you store and then load it again -
    where reducers are set to higher memory than mappers. Though I did not see
    something like this happening here ...



    Regards,
    Mridul



    On Friday 07 May 2010 03:00 AM, Corbin Hoenes wrote:

    Wondering if when we do a group like this:

    grouped_urls_by_site = GROUP all_urls BY site;

    if certain site has a lot of urls would they all have to be processed by
    the same mapper (e.g. a single key?) Could this account for why we have 8GB
    in one map and not many in others?
    On May 6, 2010, at 3:24 PM, Olga Natkovich wrote:

    Looks like attachments are not coming through. Here is the script from
    Corbin inline.

    One thing you might want to try is to switch your cogroups to skewed
    join and see if that solves the issue:

    http://hadoop.apache.org/pig/docs/r0.6.0/piglatin_ref1.html#Skewed+Joins

    Olga

    --------------------------------------------topurl.pig------------------
    -------------------------------------------
    set job.name 'Generate topurl reports for $out_file1'

    %default dir_prefix '../..'
    %default storage 'BinStorage()'
    %default tynt_udfs 'tynt-udfs.jar'
    %default topN '20'
    /* default to 30 days time period so that alltime report will get
    14*30=420 min page views*/
    %default timeperiod '30'
    %default min_page_views_per_day '14'

    register $dir_prefix/udfs/target/$tynt_udfs
    register $dir_prefix/udfs/lib/piggybank.jar

    ---------------------summarize address bar
    stats-----------------------------------
    addbar_stats = LOAD '$in_file1/addbarstats' USING $storage AS
    (site:chararray, url:chararray, guid:chararray, cnt:long);
    grouped_addbar_by_url = GROUP addbar_stats BY (site, url) PARALLEL 180;
    addbar_stats_by_url = FOREACH grouped_addbar_by_url GENERATE
    FLATTEN(group) AS (site, url), COUNT(addbar_stats) AS addbarcnt,
    SUM(addbar_stats.cnt) AS addbarvisits;
    STORE addbar_stats_by_url INTO '$out_file1/addbarstatsbyurl' USING
    $storage;

    grouped_addbar_stats_by_site = GROUP addbar_stats_by_url BY site
    PARALLEL 180;
    addbar_stats_by_site = FOREACH grouped_addbar_stats_by_site GENERATE
    group AS site, SUM(addbar_stats_by_url.addbarcnt) AS addbarcnt,
    SUM(addbar_stats_by_url.addbarvisits) AS addbarvisits;
    STORE addbar_stats_by_site INTO '$out_file1/addbarstatsbysite' USING
    $storage;

    ----------------------calculate
    ratio------------------------------------------
    clickstatsbyurl = LOAD '$in_file1/clickstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, cnt:long, tracecnt:long, tcnt:long,
    pcnt:long, wcnt:long, utracecnt:long, utcnt:long, upcnt:long,
    uwcnt:long);
    viewstatsbyurl = LOAD '$in_file1/viewstatsbyurl' USING $storage AS
    (site:chararray, url:chararray, title:chararray, cnt:long, etcnt:long,
    et1cnt:long, et2cnt:long, et3cnt:long, et6cnt:long, et7cnt:long);

    light_clickstatsbyurl = FOREACH clickstatsbyurl GENERATE site, url, cnt;
    light_viewstatsbyurl_noisy = FOREACH viewstatsbyurl GENERATE site, url,
    title, cnt, etcnt;

    light_viewstatsbyurl = FILTER light_viewstatsbyurl_noisy BY url != '-';

    --light_addbarstatsbyurl = FOREACH addbar_stats_by_url GENERATE site,
    url, addbarvisits;
    --joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER,
    light_addbarstatsbyurl BY (site, url) OUTER;
    --flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
    --
    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt,
    --
    (IsEmpty(light_addbarstatsbyurl)?0:MAX(light_addbarstatsbyurl.addbarvisi
    ts)) as addbarcnt;

    joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
    INNER, light_clickstatsbyurl BY (site, url) OUTER;
    flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
    FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),

    (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
    clickcnt;

    ratio_by_url = FOREACH flattened_stats_for_ratio
    {
    generated_traffic = clickcnt+etcnt;
    total_traffic = cnt;
    ti =
    ((float)(generated_traffic))/((float)total_traffic);
    GENERATE site, url, title, ((ti>1)?(-ti):ti) AS
    ratio, generated_traffic AS gviews, total_traffic AS views;
    }

    ------------------------combined with
    #copies----------------------------------------
    copystatsbyurl = LOAD '$in_file1/copystatsbyurl' USING $storage AS
    (site:chararray, url:chararray, lcnt:long, scnt:long, icnt:long,
    acnt:long);
    light_copystatsbyurl = FOREACH copystatsbyurl GENERATE site, url,
    lcnt+scnt+icnt AS cnt;

    all_stats_by_url = COGROUP ratio_by_url BY (site, url) INNER,
    light_copystatsbyurl BY (site, url) OUTER PARALLEL 62;
    all_urls = FOREACH all_stats_by_url GENERATE FLATTEN(ratio_by_url) AS
    (site, url, title, ratio, gviews, views),
    (IsEmpty(light_copystatsbyurl)?0:MAX(light_copystatsbyurl.cnt)) as
    copies;

    grouped_urls_by_site = GROUP all_urls BY site;

    top_ratios = FOREACH grouped_urls_by_site
    {
    filtered_by_minpageviews = FILTER all_urls BY views >
    ($min_page_views_per_day*$timeperiod);
    order_by_ratio = ORDER filtered_by_minpageviews BY
    ratio DESC;
    top_by_ratio = LIMIT order_by_ratio $topN;
    GENERATE group AS site, top_by_ratio.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_gviews = FOREACH grouped_urls_by_site
    {
    order_by_gviews = ORDER all_urls BY gviews DESC;
    top_by_gviews = LIMIT order_by_gviews $topN;
    GENERATE group AS site, top_by_gviews.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_views = FOREACH grouped_urls_by_site
    {
    order_by_views = ORDER all_urls BY views DESC;
    top_by_views = LIMIT order_by_views $topN;
    GENERATE group AS site, top_by_views.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    top_copies = FOREACH grouped_urls_by_site
    {
    order_by_copies = ORDER all_urls BY copies DESC;
    top_by_copies = LIMIT order_by_copies $topN;
    GENERATE group AS site, top_by_copies.(url, title,
    ratio, gviews, views, copies) AS tops;
    }

    grouped_tops = JOIN top_ratios BY site, top_gviews BY site, top_views BY
    site, top_copies BY site;

    top_urls = FOREACH grouped_tops GENERATE top_ratios::site AS site,
    top_ratios::tops, top_gviews::tops, top_views::tops, top_copies::tops;

    store top_urls into '$out_file1/topurls' USING $storage;



    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:57 AM
    To: Olga Natkovich
    Subject: Re: SpillableMemoryManager - low memory handler called

    I have attached the script... please let me know if you have more
    questions.

    On May 6, 2010, at 12:36 PM, Olga Natkovich wrote:

    This is just a warning saying that your job is spilling to the disk.
    Please, if you can, post a script that is causing this issue. In 0.6.0
    we moved large chunk of the code away from using SpillableMemoryManager
    but it is still used in some places. More changes are coming in 0.7.0 as
    well.

    Olga

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 11:31 AM
    To: pig-user@hadoop.apache.org
    Subject: Re: SpillableMemoryManager - low memory handler called

    0.6

    Sent from my iPhone

    On May 6, 2010, at 12:16 PM, "Olga Natkovich"<olgan@yahoo-inc.com>
    wrote:
    Which version of Pig are you using?

    -----Original Message-----
    From: Corbin Hoenes
    Sent: Thursday, May 06, 2010 10:29 AM
    To: pig-user@hadoop.apache.org
    Subject: SpillableMemoryManager - low memory handler called

    Hi Piggers - Seeing an issue with a particular script where our job is
    taking 6hrs 42min to complete.

    syslogs are showing loads of these:
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used =
    283443200(276800K) committed = 357957632(349568K) max =
    357957632(349568K)
    INFO : org.apache.pig.impl.util.SpillableMemoryManager - low memory
    handler called (Usage threshold exceeded) init = 5439488(5312K) used =
    267128840(260868K) committed = 357957632(349568K) max =
    357957632(349568K)
    One interesting thing is it's the map phase that is slow and one of the
    mappers is getting 8GB of input while the other 2000 or so mappers are
    getting MBs and hundreds of MBs of data.

    Anywhere I can start looking?
  • Felix gao at Oct 19, 2011 at 7:00 pm
    sorry to resurrect a old thread. But I am having the same problem here for
    Pig 0.7

    2011-10-19 13:03:22,895 INFO org.apache.pig.impl.util.SpillableMemoryManager:
    low memory handler called (Usage threshold exceeded) init =
    263323648(257152K) used = 502624976(490844K) committed = 617086976(602624K)
    max = 715849728(699072K) 2011-10-19 13:03:58,776 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Collection threshold exceeded) init = 263323648(257152K) used =
    494572568(482981K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:04:01,248 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Usage threshold exceeded) init = 263323648(257152K) used =
    501339160(489589K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:05:01,250 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Collection threshold exceeded) init = 263323648(257152K) used =
    589147376(575339K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:05:42,181 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Collection threshold exceeded) init = 263323648(257152K) used =
    487209296(475790K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:06:07,300 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Usage threshold exceeded) init = 263323648(257152K) used =
    501963568(490198K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:07:11,850 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Collection threshold exceeded) init = 263323648(257152K) used =
    575026232(561549K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:07:53,782 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Collection threshold exceeded) init = 263323648(257152K) used =
    505687176(493835K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:09:02,451 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Collection threshold exceeded) init = 263323648(257152K) used =
    571977464(558571K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:09:44,027 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Collection threshold exceeded) init = 263323648(257152K) used =
    509238248(497302K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:10:55,596 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Collection threshold exceeded) init = 263323648(257152K) used =
    561933048(548762K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:11:40,931 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Collection threshold exceeded) init = 263323648(257152K) used =
    517425112(505297K) committed = 715849728(699072K) max = 715849728(699072K)
    2011-10-19 13:12:48,891 INFO
    org.apache.pig.impl.util.SpillableMemoryManager: low memory handler called
    (Collection threshold exceeded) init = 263323648(257152K) used =
    558586544(545494K) committed = 715849728(699072K) max = 715849728(699072K)

    The job took a while to finish and I am not sure what causes it. It only happens
    in the reduce phase, and I am wondering how to increase the max memory to be
    used by Pig. In my mapreduce.reduce.java.opts, I have set the value to 1GB,
    but it seems the max is only 700MB for whatever reason.

    Thanks,

    Felix

Related Discussions

Discussion Navigation
viewthread | post
Discussion Overview
groupuser @
categoriespig, hadoop
postedMay 6, '10 at 9:18p
activeOct 19, '11 at 7:00p
posts9
users5
websitepig.apache.org

People

Translate

site design / logo © 2021 Grokbase