Home | About | Sematext search-lucene.com search-hadoop.com
NEW: Monitor These Apps!
elasticsearch, apache solr, apache hbase, hadoop, redis, cassandra, amazon cloudwatch, mysql, memcached, apache kafka, apache zookeeper, apache storm, ubuntu, centOS, red hat, debian, puppet labs, java, senseiDB
 Search Hadoop and all its subprojects:

Switch to Threaded View
Hive >> mail # user >> A GenericUDF Function to Extract a Field From an Array of Structs


Copy link to this message
-
A GenericUDF Function to Extract a Field From an Array of Structs
I am trying to write a GenericUDF function to collect all of a specific struct field(s) within an array for each record, and return them in an array as well.
I wrote the UDF (as below), and it seems to work but:
1) It does not work when I run it against an external table, although it works fine on a managed table. Any idea why?
2) I am having a tough time writing a test for this. I have attached the test I have so far, but it does not work: I always get either 'java.util.ArrayList cannot be cast to org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector' or 'cannot cast String to LazyString'. My question is: how do I supply a list of structs to the evaluate method?
Any help will be greatly appreciated.
Thanks,Peter
The table:
CREATE EXTERNAL TABLE FOO (    TS string,    customerId string,    products array< struct<productCategory:string> >  )  PARTITIONED BY (ds string)  ROW FORMAT SERDE 'some.serde'  WITH SERDEPROPERTIES ('error.ignore'='true')  LOCATION 'some_locations'  ;
A sample row holds: 1340321132000, 'some_company', [{"productCategory":"footwear"},{"productCategory":"eyewear"}]
This is my code:
import org.apache.hadoop.hive.ql.exec.Description;import org.apache.hadoop.hive.ql.exec.UDFArgumentException;import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;import org.apache.hadoop.hive.ql.metadata.HiveException;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import org.apache.hadoop.hive.serde2.lazy.LazyString;import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.StructField;import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;import org.apache.hadoop.io.Text;
import java.util.ArrayList;
@Description(name = "extract_product_category",        value = "_FUNC_( array< struct<sha256:string> > ) - Collect all product category field values inside an array of struct(s), and return the results in an array<string>",        extended = "Example:\n SELECT _FUNC_(array_of_structs_with_product_category_field)")public class GenericUDFExtractProductCategory        extends GenericUDF{    private ArrayList ret;
    private ListObjectInspector listOI;    private StructObjectInspector structOI;    private ObjectInspector prodCatOI;
    @Override    public ObjectInspector initialize(ObjectInspector[] args)            throws UDFArgumentException    {        if (args.length != 1) {            throw new UDFArgumentLengthException("The function extract_product_category() requires exactly one argument.");        }
        if (args[0].getCategory() != Category.LIST) {            throw new UDFArgumentTypeException(0, "Type array<struct> is expected to be the argument for extract_product_category but " + args[0].getTypeName() + " is found instead");        }
        listOI = ((ListObjectInspector) args[0]);        structOI = ((StructObjectInspector) listOI.getListElementObjectInspector());
        if (structOI.getAllStructFieldRefs().size() != 1) {            throw new UDFArgumentTypeException(0, "Incorrect number of fields in the struct, should be one");        }
        StructField productCategoryField = structOI.getStructFieldRef("productCategory");        //If not, throw exception        if (productCategoryField == null) {            throw new UDFArgumentTypeException(0, "NO \"productCategory\" field in input structure");        }
        //Are they of the correct types?        //We store these object inspectors for use in the evaluate() method        prodCatOI = productCategoryField.getFieldObjectInspector();
        //First are they primitives        if (prodCatOI.getCategory() != Category.PRIMITIVE) {            throw new UDFArgumentTypeException(0, "productCategory field must be of string type");        }
        //Are they of the correct primitives?        if (((PrimitiveObjectInspector)prodCatOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {            throw new UDFArgumentTypeException(0, "productCategory field must be of string type");        }
        ret = new ArrayList();
        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);    }
    @Override    public ArrayList evaluate(DeferredObject[] arguments)            throws HiveException    {        ret.clear();
        if (arguments.length != 1) {            return null;        }
        if (arguments[0].get() == null) {         return null;        }
        int numElements = listOI.getListLength(arguments[0].get());
        for (int i = 0; i < numElements; i++) {            LazyString prodCatDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("productCategory")));            Text productCategoryValue = ((StringObjectInspector) prodCatOI).getPrimitiveWritableObject(prodCatDataObject);            ret.add(productCategoryValue);        }        return ret;    }
    @Override    public String getDisplayString(String[] strings)    {        assert (strings.length > 0);        StringBuilder sb = new StringBuilder();        sb.append("extract_product_category(");        sb.append(strings[0]);        sb.append(")");        return sb.toString();    }}

My Test:
import org.apache.hadoop.hive.ql.metadata.HiveException;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObj
NEW: Monitor These Apps!
elasticsearch, apache solr, apache hbase, hadoop, redis, cassandra, amazon cloudwatch, mysql, memcached, apache kafka, apache zookeeper, apache storm, ubuntu, centOS, red hat, debian, puppet labs, java, senseiDB