Home | About | Sematext search-lucene.com search-hadoop.com
 Search Hadoop and all its subprojects:

Switch to Threaded View
Pig >> mail # user >> How to optimize my request


Copy link to this message
-
How to optimize my request
Hi,

I want to execute a Pig command from an embedded Java program. For the moment, I am trying
Pig in local mode. My data file is only around 15 MB, but the execution of
this command takes a very long time, so I think my script needs optimization...

My script:

A = LOAD 'data' USING MyUDFLoader('data.xml');
> -- Split the loaded records into three response-time buckets.
> -- Thresholds are presumably milliseconds (1000.0 ~ 1 s) -- confirm against MyUDFLoader's schema.
> -- The ranges are contiguous half-open intervals: (.., 1000), [1000, 2000), [2000, ..),
> -- so every record with a non-null response_time lands in exactly one bucket.
> -- (The middle bucket previously stopped at 1999.0, leaving [1999, 2000) uncounted.)
> filter_response_time_less_than_1_s = FILTER A BY (response_time < 1000.0);
> filter_response_time_between_1_s_and_2_s = FILTER A BY (response_time >= 1000.0 AND response_time < 2000.0);
> filter_response_time_between_greater_than_2_s = FILTER A BY (response_time >= 2000.0);
> -- Aggregate per (date_day, url, date_minute, ret_code, serveur) key.
> -- A and its three response-time subsets are cogrouped on the same key so that each
> -- output row carries the summed response_time, one count per bucket, and the total hit count.
> star__zne_asfo_access_log = FOREACH (
>         COGROUP
>                 A                                             BY (date_day, url, date_minute, ret_code, serveur),
>                 filter_response_time_between_greater_than_2_s BY (date_day, url, date_minute, ret_code, serveur),
>                 filter_response_time_less_than_1_s            BY (date_day, url, date_minute, ret_code, serveur),
>                 filter_response_time_between_1_s_and_2_s      BY (date_day, url, date_minute, ret_code, serveur)
> ) GENERATE
>         -- The group key is flattened and its fields renamed for the output schema.
>         FLATTEN(group) AS (date_day, zne_asfo_url, date_minute, zne_http_code, zne_asfo_server),
>         (long) SUM((bag{tuple(long)}) A.response_time)       AS response_time,
>         COUNT(filter_response_time_less_than_1_s)            AS response_time_less_than_1_s,
>         COUNT(filter_response_time_between_1_s_and_2_s)      AS response_time_between_1_s_and_2_s,
>         COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
>         COUNT(A)                                             AS nb_hit;
> -- Aggregate per (date_day, date_year, date_month) key.
> -- A and its three response-time subsets are cogrouped on the same key so that each
> -- output row carries the summed response_time, one count per bucket, and the total hit count.
> agg__zne_asfo_access_log_ymd = FOREACH (
>         COGROUP
>                 A                                             BY (date_day, date_year, date_month),
>                 filter_response_time_between_greater_than_2_s BY (date_day, date_year, date_month),
>                 filter_response_time_less_than_1_s            BY (date_day, date_year, date_month),
>                 filter_response_time_between_1_s_and_2_s      BY (date_day, date_year, date_month)
> ) GENERATE
>         -- The group key is flattened back into its three date fields.
>         FLATTEN(group) AS (date_day, date_year, date_month),
>         (long) SUM((bag{tuple(long)}) A.response_time)       AS response_time,
>         COUNT(filter_response_time_less_than_1_s)            AS response_time_less_than_1_s,
>         COUNT(filter_response_time_between_1_s_and_2_s)      AS response_time_between_1_s_and_2_s,
>         COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
>         COUNT(A)                                             AS nb_hit;
> -- Aggregate per (date_day, url, date_year, date_month) key.
> -- A and its three response-time subsets are cogrouped on the same key so that each
> -- output row carries the summed response_time, one count per bucket, and the total hit count.
> agg__zne_asfo_access_log_ymd_ret_url = FOREACH (
>         COGROUP
>                 A                                             BY (date_day, url, date_year, date_month),
>                 filter_response_time_between_greater_than_2_s BY (date_day, url, date_year, date_month),
>                 filter_response_time_less_than_1_s            BY (date_day, url, date_year, date_month),
>                 filter_response_time_between_1_s_and_2_s      BY (date_day, url, date_year, date_month)
> ) GENERATE
>         -- The group key is flattened; url is exposed as zne_asfo_url in the output.
>         FLATTEN(group) AS (date_day, zne_asfo_url, date_year, date_month),
>         (long) SUM((bag{tuple(long)}) A.response_time)       AS response_time,
>         COUNT(filter_response_time_less_than_1_s)            AS response_time_less_than_1_s,
>         COUNT(filter_response_time_between_1_s_and_2_s)      AS response_time_between_1_s_and_2_s,
>         COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
>         COUNT(A)                                             AS nb_hit;
> agg__zne_asfo_access_log_ymd_ret_code = FOREACH ( COGROUP A BY (date_day,ret_code,date_year,date_month), filter_response_time_between_greater_than_2_s BY (date_day,ret_code,date_year,date_month), filter_response_time_less_than_1_s BY (date_day,ret_code,date_year,date_month), filter_response_time_between_1_s_and_2_s BY (date_day,ret_code,date_year,date_month) )
> {
>         GENERATE
>                 FLATTEN(group) AS (date_day,zne_http_code,date_year,date_month),
>                 (long)SUM((bag{tuple(long)})A.response_time) AS response_time,
>                 COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s,
>                 COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s,
>                 COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,