|
|
-
error storing into hbase — Jonas Hartwig, 2013-02-04, 14:36
Hi community,
I have the following pig script: define FormatMessage com.cision.hadoop.pig.MessageFormatter(); --If you want the message to have no empty fields use this --define FormatMessage com.cision.hadoop.pig.MessageFormatter('false'); dedupe = LOAD '/inflow/out/dedupe_out' USING org.apache.pig.piggybank.storage.avro.AvroStorage(); rmf /inflow/out/storesearch_tmp rmf /inflow/out/search_out search = MAPREDUCE '/opt/mapr/pig/pig-0.10.0/contrib/hadoop-0.0.1.jar' STORE dedupe INTO '/inflow/out/storesearch_tmp' USING org.apache.pig.piggybank.storage.avro.AvroStorage('schema', '{"type":"record","name":"monitor_enriched_article","fields":[ {"name":"ssotmonitorid","type":"long"} , {"name":"article","type":"string"} , {"name":"path","type":"string"} , {"name":"htmlcleanedarticle","type":"string"} , {"name":"drmfingerprint","type":"int"} , {"name":"media_guid","type":["null","string"]} , {"name":"outletName","type":["null","string"]} , {"name":"outletid","type":["null","string"]} , {"name":"mediaId","type":["null","string"]} , {"name":"pubdate","type":"string"} , {"name":"pubname","type":"string"} , {"name":"headline","type":"string"} , {"name":"sourceid","type":"string"} , {"name":"mark","type":"string"} , {"name":"ruleId","type":"string"} , {"name":"publicityvalue","type":["null", "string"]} , {"name":"arbitronCumeEstimate","type":["null","string"]} , {"name":"audience","type":["null","string"]} , {"name":"circulation","type":["null","string"]} , {"name":"visitorsPerMonth","type":["null","string"]} , {"name":"authors","type":["null", "string"]} , {"name":"legacyContactId","type":["null", "string"]} , {"name":"subscriptionid","type":["null", "string"]} , {"name":"customerid","type":["null", "string"]} , {"name":"media_type","type":["null", "string"]} , {"name":"industries","type":["null", "string"]} , {"name":"locations","type":["null", "string"]} , {"name":"organizations","type":["null", "string"]} , {"name":"people","type":["null", "string"]} , {"name":"subject","type":["null", 
"string"]} ]}') LOAD '/inflow/out/search_out' USING org.apache.pig.piggybank.storage.SequenceFileLoader() AS (prefix: chararray, searchResult: chararray) `com.cision.hadoop.mapreduce.LuceneMapReduceMain /inflow/out/storesearch_tmp /inflow/out/search_out | SearchAgents a:query a:cust_id a:subscription_id a:tags 100 dc1-r1-n6.qwestcolo.local,dc1-r1-n5.qwestcolo.local,dc1-r2-n5.qwestcolo.local 5181`; subscriptionIds = FILTER search BY com.cision.hadoop.pig.filter.StartsWith(prefix, 's_'); highlights = FILTER search BY com.cision.hadoop.pig.filter.StartsWith(prefix, 'h_'); subscriptionIds_to_store = FOREACH subscriptionIds GENERATE (long)SUBSTRING(prefix, 2, (int)StringSize(prefix) - 2) AS ssotmonitorid , searchResult AS ids; highlightsSplit_to_store = FOREACH highlights GENERATE SUBSTRING(prefix, 2, (int)StringSize(prefix) - 2) AS rowkey , FLATTEN(STRSPLIT(searchResult, '\\|')) AS (fieldid: chararray, text: chararray); --STORE highlightsSplit_to_store INTO 'HighlightedSearches' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('hs:field hs:text', '-loadKey true -caster HBaseBinaryConverter'); joined_subscriptions = JOIN dedupe BY ssotmonitorid LEFT OUTER, subscriptionIds_to_store BY ssotmonitorid USING 'skewed'; merged_articles = FOREACH joined_subscriptions GENERATE dedupe::ssotmonitorid AS ssotmonitorid , article AS article , path AS path , htmlcleanedarticle AS htmlcleanedarticle , drmfingerprint AS drmfingerprint , media_guid AS media_guid , outletid AS outletid , mediaId AS mediaId , outletName AS outletName , pubdate AS pubdate , pubname AS pubname , headline AS headline , sourceid AS sourceid , mark AS mark , ruleId AS ruleId , publicityvalue AS publicityvalue , arbitronCumeEstimate AS arbitronCumeEstimate , audience AS audience , circulation AS circulation , visitorsPerMonth AS visitorsPerMonth , authors AS authors , legacyContactId AS legacyContactId , com.cision.hadoop.pig.common.TupleJoin('|', 
com.cision.hadoop.pig.common.EliminateDuplicatesInTuple(com.cision.hadoop.pig.common.PigCombiner(STRSPLIT(subscriptionid, '\\|'), STRSPLIT(ids, '\\|')))) AS subscriptionid , customerid AS customerid , media_type AS media_type , industries AS industries , locations AS locations , organizations AS organizations , people AS people , subject AS subject; to_store_hbase = FOREACH merged_articles GENERATE (chararray)ssotmonitorid , industries , locations , organizations , people , subject , htmlcleanedarticle , outletid , outletName , ruleId , publicityvalue , arbitronCumeEstimate , audience , circulation , visitorsPerMonth , authors , legacyContactId , media_type |