Error storing into HBase
Hi community,

I have the following Pig script:

define FormatMessage com.cision.hadoop.pig.MessageFormatter();
--If you want the message to have no empty fields use this
--define FormatMessage com.cision.hadoop.pig.MessageFormatter('false');

dedupe = LOAD '/inflow/out/dedupe_out' USING org.apache.pig.piggybank.storage.avro.AvroStorage();

rmf /inflow/out/storesearch_tmp
rmf /inflow/out/search_out
search = MAPREDUCE '/opt/mapr/pig/pig-0.10.0/contrib/hadoop-0.0.1.jar' STORE dedupe INTO '/inflow/out/storesearch_tmp' USING org.apache.pig.piggybank.storage.avro.AvroStorage('schema', '{"type":"record","name":"monitor_enriched_article","fields":[
                {"name":"ssotmonitorid","type":"long"}
                , {"name":"article","type":"string"}
                , {"name":"path","type":"string"}
                , {"name":"htmlcleanedarticle","type":"string"}
                , {"name":"drmfingerprint","type":"int"}
                , {"name":"media_guid","type":["null","string"]}
                , {"name":"outletName","type":["null","string"]}
                , {"name":"outletid","type":["null","string"]}
                , {"name":"mediaId","type":["null","string"]}
                , {"name":"pubdate","type":"string"}
                , {"name":"pubname","type":"string"}
                , {"name":"headline","type":"string"}
                , {"name":"sourceid","type":"string"}
                , {"name":"mark","type":"string"}
                , {"name":"ruleId","type":"string"}
                , {"name":"publicityvalue","type":["null", "string"]}
                , {"name":"arbitronCumeEstimate","type":["null","string"]}
                , {"name":"audience","type":["null","string"]}
                , {"name":"circulation","type":["null","string"]}
                , {"name":"visitorsPerMonth","type":["null","string"]}
                , {"name":"authors","type":["null", "string"]}
                , {"name":"legacyContactId","type":["null", "string"]}
                , {"name":"subscriptionid","type":["null", "string"]}
                , {"name":"customerid","type":["null", "string"]}
                , {"name":"media_type","type":["null", "string"]}
                , {"name":"industries","type":["null", "string"]}
                , {"name":"locations","type":["null", "string"]}
                , {"name":"organizations","type":["null", "string"]}
                , {"name":"people","type":["null", "string"]}
                , {"name":"subject","type":["null", "string"]}
                ]}')
                LOAD '/inflow/out/search_out' USING org.apache.pig.piggybank.storage.SequenceFileLoader() AS (prefix: chararray, searchResult: chararray)
                `com.cision.hadoop.mapreduce.LuceneMapReduceMain /inflow/out/storesearch_tmp /inflow/out/search_out | SearchAgents a:query a:cust_id a:subscription_id a:tags 100 dc1-r1-n6.qwestcolo.local,dc1-r1-n5.qwestcolo.local,dc1-r2-n5.qwestcolo.local 5181`;
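-- For reference, the MAPREDUCE operator used above has the general shape
-- (placeholder names, not part of this script):
--
--   alias = MAPREDUCE 'native.jar'
--       STORE inputAlias INTO 'inputPath' USING storeFunc
--       LOAD 'outputPath' USING loadFunc AS schema
--       `MainClass arg1 arg2 ...`;
--
-- Pig first materializes inputAlias at inputPath, then invokes the jar much
-- like `hadoop jar native.jar MainClass arg1 arg2 ...`, and finally loads the
-- job's output from outputPath into alias.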

subscriptionIds = FILTER search BY com.cision.hadoop.pig.filter.StartsWith(prefix, 's_');
highlights = FILTER search BY com.cision.hadoop.pig.filter.StartsWith(prefix, 'h_');
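-- (The search job's output rows appear to be tagged by key prefix: 's_<id>'
-- rows hold the matching subscription ids for an article, 'h_<id>' rows hold
-- highlight text, which is split into fieldid|text below.)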

subscriptionIds_to_store = FOREACH subscriptionIds GENERATE
                (long)SUBSTRING(prefix, 2, (int)StringSize(prefix) - 2) AS ssotmonitorid
                , searchResult AS ids;

highlightsSplit_to_store = FOREACH highlights GENERATE
                SUBSTRING(prefix, 2, (int)StringSize(prefix) - 2) AS rowkey
                , FLATTEN(STRSPLIT(searchResult, '\\|')) AS (fieldid: chararray, text: chararray);

--STORE highlightsSplit_to_store INTO 'HighlightedSearches' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('hs:field hs:text', '-loadKey true -caster HBaseBinaryConverter');

joined_subscriptions = JOIN dedupe BY ssotmonitorid LEFT OUTER, subscriptionIds_to_store BY ssotmonitorid USING 'skewed';
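-- ('skewed' makes Pig sample the join keys first and spread heavily repeated
-- keys over several reducers, so a handful of very common ssotmonitorid
-- values cannot pile up on a single reducer.)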

merged_articles = FOREACH joined_subscriptions GENERATE
                dedupe::ssotmonitorid AS ssotmonitorid
                , article AS article
                , path AS path
                , htmlcleanedarticle AS htmlcleanedarticle
                , drmfingerprint AS drmfingerprint
                , media_guid AS media_guid
                , outletid AS outletid
                , mediaId AS mediaId
                , outletName AS outletName
                , pubdate AS pubdate
                , pubname AS pubname
                , headline AS headline
                , sourceid AS sourceid
                , mark AS mark
                , ruleId AS ruleId
                , publicityvalue AS publicityvalue
                , arbitronCumeEstimate AS arbitronCumeEstimate
                , audience AS audience
                , circulation AS circulation
                , visitorsPerMonth AS visitorsPerMonth
                , authors AS authors
                , legacyContactId AS legacyContactId
                , com.cision.hadoop.pig.common.TupleJoin('|', com.cision.hadoop.pig.common.EliminateDuplicatesInTuple(com.cision.hadoop.pig.common.PigCombiner(STRSPLIT(subscriptionid, '\\|'), STRSPLIT(ids, '\\|')))) AS subscriptionid
                , customerid AS customerid
                , media_type AS media_type
                , industries AS industries
                , locations AS locations
                , organizations AS organizations
                , people AS people
                , subject AS subject;
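-- (The subscriptionid expression above chains three custom UDFs; judging by
-- their names, it concatenates the article's existing '|'-delimited
-- subscription ids with the ids matched by the search job, drops duplicates,
-- and re-joins the result with '|'.)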

to_store_hbase = FOREACH merged_articles GENERATE
                (chararray)ssotmonitorid
                , industries
                , locations
                , organizations
                , people
                , subject
                , htmlcleanedarticle
                , outletid
                , outletName
                , ruleId
                , publicityvalue
                , arbitronCumeEstimate
                , audience
                , circulation
                , visitorsPerMonth
                , authors
                , legacyContactId
                , media_type
            
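For reference, a STORE into HBase via HBaseStorage (along the lines of the commented-out highlights store above) generally takes the shape below. This is only a sketch: the 'Articles' table name and the 'a' column family are placeholders, not taken from this script. HBaseStorage uses the relation's first field, ssotmonitorid here, as the row key by default, and the column list must name one family:qualifier per remaining field:

STORE to_store_hbase INTO 'Articles' USING
    org.apache.pig.backend.hadoop.hbase.HBaseStorage(
        'a:industries a:locations a:organizations a:people a:subject a:htmlcleanedarticle a:outletid a:outletName a:ruleId a:publicityvalue a:arbitronCumeEstimate a:audience a:circulation a:visitorsPerMonth a:authors a:legacyContactId a:media_type',
        '-caster HBaseBinaryConverter');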
Bo Vargas 2013-02-04, 15:01