Home | About | Sematext search-lucene.com search-hadoop.com
 Search Hadoop and all its subprojects:

Switch to Plain View
Pig >> mail # user >> error storing into hbase


Copy link to this message
-
error storing into hbase
Hi community,

I have the following pig script:

-- Alias FormatMessage to the MessageFormatter UDF, constructed with no arguments.
DEFINE FormatMessage com.cision.hadoop.pig.MessageFormatter();
-- Alternative: construct it with 'false' if the message should have no empty fields.
-- DEFINE FormatMessage com.cision.hadoop.pig.MessageFormatter('false');

-- Read the dedupe output with the piggybank AvroStorage loader (no arguments are passed to it).
dedupe = LOAD '/inflow/out/dedupe_out'
    USING org.apache.pig.piggybank.storage.avro.AvroStorage();

-- Force-remove the staging and result directories used by the MAPREDUCE statement below.
-- rmf does not fail when the path is missing.
rmf /inflow/out/storesearch_tmp
rmf /inflow/out/search_out
-- Run the native MapReduce job packaged in hadoop-0.0.1.jar and expose its output as search.
--   STORE clause: writes dedupe to /inflow/out/storesearch_tmp through AvroStorage using the
--                 inline monitor_enriched_article schema, as input for the job.
--   LOAD clause:  reads /inflow/out/search_out back through SequenceFileLoader as
--                 (prefix: chararray, searchResult: chararray).
--   Backticks:    the argument string handed to the jar (main class, the two paths, and
--                 further job-specific arguments).
-- NOTE(review): the meaning of the arguments after the pipe (SearchAgents, the a:* names, 100,
-- the host list, 5181) is not visible here - presumably an HBase table, its columns, and a
-- ZooKeeper quorum and port. Confirm against LuceneMapReduceMain.
search = MAPREDUCE '/opt/mapr/pig/pig-0.10.0/contrib/hadoop-0.0.1.jar' STORE dedupe INTO '/inflow/out/storesearch_tmp' USING org.apache.pig.piggybank.storage.avro.AvroStorage('schema', '{"type":"record","name":"monitor_enriched_article","fields":[
                {"name":"ssotmonitorid","type":"long"}
                , {"name":"article","type":"string"}
                , {"name":"path","type":"string"}
                , {"name":"htmlcleanedarticle","type":"string"}
                , {"name":"drmfingerprint","type":"int"}
                , {"name":"media_guid","type":["null","string"]}
                , {"name":"outletName","type":["null","string"]}
                , {"name":"outletid","type":["null","string"]}
                , {"name":"mediaId","type":["null","string"]}
                , {"name":"pubdate","type":"string"}
                , {"name":"pubname","type":"string"}
                , {"name":"headline","type":"string"}
                , {"name":"sourceid","type":"string"}
                , {"name":"mark","type":"string"}
                , {"name":"ruleId","type":"string"}
                , {"name":"publicityvalue","type":["null", "string"]}
                , {"name":"arbitronCumeEstimate","type":["null","string"]}
                , {"name":"audience","type":["null","string"]}
                , {"name":"circulation","type":["null","string"]}
                , {"name":"visitorsPerMonth","type":["null","string"]}
                , {"name":"authors","type":["null", "string"]}
                , {"name":"legacyContactId","type":["null", "string"]}
                , {"name":"subscriptionid","type":["null", "string"]}
                , {"name":"customerid","type":["null", "string"]}
                , {"name":"media_type","type":["null", "string"]}
                , {"name":"industries","type":["null", "string"]}
                , {"name":"locations","type":["null", "string"]}
                , {"name":"organizations","type":["null", "string"]}
                , {"name":"people","type":["null", "string"]}
                , {"name":"subject","type":["null", "string"]}
                ]}')
                LOAD '/inflow/out/search_out' USING org.apache.pig.piggybank.storage.SequenceFileLoader() AS (prefix: chararray, searchResult: chararray)
                `com.cision.hadoop.mapreduce.LuceneMapReduceMain /inflow/out/storesearch_tmp /inflow/out/search_out | SearchAgents a:query a:cust_id a:subscription_id a:tags 100 dc1-r1-n6.qwestcolo.local,dc1-r1-n5.qwestcolo.local,dc1-r2-n5.qwestcolo.local 5181`;

-- Partition the search output by key prefix using the project StartsWith filter UDF:
-- rows whose prefix starts with 's_' go to subscriptionIds, rows starting with 'h_' to highlights.
subscriptionIds = FILTER search
    BY com.cision.hadoop.pig.filter.StartsWith(prefix, 's_');
highlights = FILTER search
    BY com.cision.hadoop.pig.filter.StartsWith(prefix, 'h_');

-- Turn each subscriptionIds row into (ssotmonitorid, ids): the id is cut out of prefix and
-- cast to long, and searchResult is passed through under the name ids.
-- NOTE(review): the third argument of Pig SUBSTRING is an exclusive stop index, not a length.
-- With StringSize(prefix) - 2 as the stop index, the last two characters of prefix are dropped
-- in addition to the first two. Confirm that prefix really carries a two-character suffix.
-- If it does not, the stop index should be StringSize(prefix).
subscriptionIds_to_store = FOREACH subscriptionIds GENERATE
                (long)SUBSTRING(prefix, 2, (int)StringSize(prefix) - 2) AS ssotmonitorid
                , searchResult AS ids;

-- Turn each highlights row into (rowkey, fieldid, text): rowkey is cut out of prefix, and
-- searchResult is split on a literal pipe and flattened into fieldid and text.
-- NOTE(review): the third argument of Pig SUBSTRING is an exclusive stop index, not a length,
-- so StringSize(prefix) - 2 also drops the last two characters of prefix. Confirm this is
-- intended.
-- NOTE(review): STRSPLIT is called without a limit. A searchResult containing more than one
-- pipe would yield more fields than the two declared in the AS clause. Confirm the input
-- format, or pass a limit of 2.
highlightsSplit_to_store = FOREACH highlights GENERATE
                SUBSTRING(prefix, 2, (int)StringSize(prefix) - 2) AS rowkey
                , FLATTEN(STRSPLIT(searchResult, '\\|')) AS (fieldid: chararray, text: chararray);

--STORE highlightsSplit_to_store INTO 'HighlightedSearches' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('hs:field hs:text', '-loadKey true -caster HBaseBinaryConverter');

-- Left outer join on ssotmonitorid: every dedupe row is kept, and rows without a match in
-- subscriptionIds_to_store get nulls for its fields. The skewed join strategy is requested.
joined_subscriptions = JOIN
    dedupe BY ssotmonitorid LEFT OUTER,
    subscriptionIds_to_store BY ssotmonitorid
    USING 'skewed';

-- Project the joined rows back to plain article field names. Every field is passed through
-- unchanged except subscriptionid, which is rebuilt from the existing subscriptionid and the
-- ids column coming from the search results. Both are split on a literal pipe and handed to
-- the project UDFs PigCombiner, EliminateDuplicatesInTuple and TupleJoin.
-- NOTE(review): judging by their names these UDFs presumably merge the two lists, drop
-- duplicates and re-join with a pipe - verify against the UDF sources.
merged_articles = FOREACH joined_subscriptions GENERATE
    dedupe::ssotmonitorid AS ssotmonitorid,
    article AS article,
    path AS path,
    htmlcleanedarticle AS htmlcleanedarticle,
    drmfingerprint AS drmfingerprint,
    media_guid AS media_guid,
    outletid AS outletid,
    mediaId AS mediaId,
    outletName AS outletName,
    pubdate AS pubdate,
    pubname AS pubname,
    headline AS headline,
    sourceid AS sourceid,
    mark AS mark,
    ruleId AS ruleId,
    publicityvalue AS publicityvalue,
    arbitronCumeEstimate AS arbitronCumeEstimate,
    audience AS audience,
    circulation AS circulation,
    visitorsPerMonth AS visitorsPerMonth,
    authors AS authors,
    legacyContactId AS legacyContactId,
    com.cision.hadoop.pig.common.TupleJoin(
        '|',
        com.cision.hadoop.pig.common.EliminateDuplicatesInTuple(
            com.cision.hadoop.pig.common.PigCombiner(
                STRSPLIT(subscriptionid, '\\|'),
                STRSPLIT(ids, '\\|')
            )
        )
    ) AS subscriptionid,
    customerid AS customerid,
    media_type AS media_type,
    industries AS industries,
    locations AS locations,
    organizations AS organizations,
    people AS people,
    subject AS subject;

to_store_hbase = FOREACH merged_articles GENERATE
                (chararray)ssotmonitorid
                , industries
                , locations
                , organizations
                , people
                , subject
                , htmlcleanedarticle
                , outletid
                , outletName
                , ruleId
                , publicityvalue
                , arbitronCumeEstimate
                , audience
                , circulation
                , visitorsPerMonth
                , authors
                , legacyContactId
                , media_type
            
+
Bo Vargas 2013-02-04, 15:01