Home | About | Sematext search-lucene.com search-hadoop.com
 Search Hadoop and all its subprojects:

Switch to Threaded View
Pig, mail # user - SpillableMemoryManager - low memory handler called


Copy link to this message
-
Re: SpillableMemoryManager - low memory handler called
Corbin Hoenes 2010-05-06, 21:30
I'm wondering what happens when we do a group like this:

grouped_urls_by_site = GROUP all_urls BY site;

If a certain site has a lot of URLs, would they all have to be processed by the same mapper (i.e., as a single key)?  Could this account for why we have 8GB in one map and not much in the others?

On May 6, 2010, at 3:24 PM, Olga Natkovich wrote:

> Looks like attachments are not coming through. Here is the script from
> Corbin inline.
>
> One thing you might want to try is to switch your cogroups to skewed
> join and see if that solves the issue:
>
> http://hadoop.apache.org/pig/docs/r0.6.0/piglatin_ref1.html#Skewed+Joins
>
> Olga
>
> --------------------------------------------topurl.pig------------------
> -------------------------------------------
> set job.name 'Generate topurl reports for $out_file1'
>
> %default dir_prefix '../..'
> %default storage 'BinStorage()'
> %default tynt_udfs 'tynt-udfs.jar'
> %default topN '20'
> /* default to 30 days time period so that alltime report will get
> 14*30=420 min page views*/
> %default timeperiod '30'
> %default min_page_views_per_day '14'
>
> register $dir_prefix/udfs/target/$tynt_udfs
> register $dir_prefix/udfs/lib/piggybank.jar
>
> ---------------------summarize address bar
> stats-----------------------------------
> addbar_stats = LOAD '$in_file1/addbarstats' USING $storage AS
> (site:chararray, url:chararray, guid:chararray, cnt:long);
> grouped_addbar_by_url = GROUP addbar_stats BY (site, url) PARALLEL 180;
> addbar_stats_by_url = FOREACH grouped_addbar_by_url GENERATE
> FLATTEN(group) AS (site, url), COUNT(addbar_stats) AS addbarcnt,
> SUM(addbar_stats.cnt) AS addbarvisits;
> STORE addbar_stats_by_url INTO '$out_file1/addbarstatsbyurl' USING
> $storage;
>
> grouped_addbar_stats_by_site = GROUP addbar_stats_by_url BY site
> PARALLEL 180;
> addbar_stats_by_site = FOREACH grouped_addbar_stats_by_site GENERATE
> group AS site, SUM(addbar_stats_by_url.addbarcnt) AS addbarcnt,
> SUM(addbar_stats_by_url.addbarvisits) AS addbarvisits;
> STORE addbar_stats_by_site INTO '$out_file1/addbarstatsbysite' USING
> $storage;
>
> ----------------------calculate
> ratio------------------------------------------
> clickstatsbyurl = LOAD '$in_file1/clickstatsbyurl' USING $storage AS
> (site:chararray, url:chararray, cnt:long, tracecnt:long, tcnt:long,
> pcnt:long, wcnt:long, utracecnt:long, utcnt:long, upcnt:long,
> uwcnt:long);
> viewstatsbyurl = LOAD '$in_file1/viewstatsbyurl' USING $storage AS
> (site:chararray, url:chararray, title:chararray, cnt:long, etcnt:long,
> et1cnt:long, et2cnt:long, et3cnt:long, et6cnt:long, et7cnt:long);
>
> light_clickstatsbyurl = FOREACH clickstatsbyurl GENERATE site, url, cnt;
> light_viewstatsbyurl_noisy = FOREACH viewstatsbyurl GENERATE site, url,
> title, cnt, etcnt;
>
> light_viewstatsbyurl = FILTER light_viewstatsbyurl_noisy BY url != '-';
>
> --light_addbarstatsbyurl = FOREACH addbar_stats_by_url GENERATE site,
> url, addbarvisits;
> --joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
> INNER, light_clickstatsbyurl BY (site, url) OUTER,
> light_addbarstatsbyurl BY (site, url) OUTER;
> --flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
> FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
> --
> (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
> clickcnt,
> --
> (IsEmpty(light_addbarstatsbyurl)?0:MAX(light_addbarstatsbyurl.addbarvisits))
> as addbarcnt;
>
> joined_stats_for_ratio = COGROUP light_viewstatsbyurl BY (site, url)
> INNER, light_clickstatsbyurl BY (site, url) OUTER;
> flattened_stats_for_ratio = FOREACH joined_stats_for_ratio GENERATE
> FLATTEN(light_viewstatsbyurl) AS (site, url, title, cnt, etcnt),
>
> (IsEmpty(light_clickstatsbyurl)?0:MAX(light_clickstatsbyurl.cnt)) as
> clickcnt;
>
> ratio_by_url = FOREACH flattened_stats_for_ratio
>                      {
>                        generated_traffic = clickcnt+etcnt;
>                        total_traffic = cnt;
>                        ti = ((float)(generated_traffic))/((float)total_traffic);