Home | About | Sematext search-lucene.com search-hadoop.com
 Search Hadoop and all its subprojects:

Switch to Threaded View
Hive >> mail # user >> A GenericUDF Function to Extract a Field From an Array of Structs

Copy link to this message
A GenericUDF Function to Extract a Field From an Array of Structs
I am trying to write a GenericUDF function to collect all of a specific struct field(s) within an array for each record, and return them in an array as well.
I wrote the UDF (as below), and it seems to work but:
1) It does not work when I am performing this on an external table, it works fine on a managed table, any idea?
2) I am having a tough time writing a test on this.  I have attached the test I have so far, and it does not work, always getting 'java.util.ArrayList cannot be cast to org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector' or 'cannot cast String to LazyString'; my question is: how do I supply a list of structs to the evaluate method?
Any help will be greatly appreciated.
The table:
CREATE EXTERNAL TABLE FOO (    TS string,    customerId string,    products array< struct<productCategory:string> >  )  PARTITIONED BY (ds string)  ROW FORMAT SERDE 'some.serde'  WITH SERDEPROPERTIES ('error.ignore'='true')  LOCATION 'some_locations'  ;
A row of record holds:1340321132000, 'some_company', [{"productCategory":"footwear"},{"productCategory":"eyewear"}]
This is my code:
import org.apache.hadoop.hive.ql.exec.Description;import org.apache.hadoop.hive.ql.exec.UDFArgumentException;import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;import org.apache.hadoop.hive.ql.metadata.HiveException;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import org.apache.hadoop.hive.serde2.lazy.LazyString;import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.StructField;import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;import org.apache.hadoop.io.Text;
import java.util.ArrayList;
@Description(name = "extract_product_category",        value = "_FUNC_( array< struct<sha256:string> > ) - Collect all product category field values inside an array of struct(s), and return the results in an array<string>",        extended = "Example:\n SELECT _FUNC_(array_of_structs_with_product_category_field)")public class GenericUDFExtractProductCategory        extends GenericUDF{    private ArrayList ret;
    private ListObjectInspector listOI;    private StructObjectInspector structOI;    private ObjectInspector prodCatOI;
    @Override    public ObjectInspector initialize(ObjectInspector[] args)            throws UDFArgumentException    {        if (args.length != 1) {            throw new UDFArgumentLengthException("The function extract_product_category() requires exactly one argument.");        }
        if (args[0].getCategory() != Category.LIST) {            throw new UDFArgumentTypeException(0, "Type array<struct> is expected to be the argument for extract_product_category but " + args[0].getTypeName() + " is found instead");        }
        listOI = ((ListObjectInspector) args[0]);        structOI = ((StructObjectInspector) listOI.getListElementObjectInspector());
        if (structOI.getAllStructFieldRefs().size() != 1) {            throw new UDFArgumentTypeException(0, "Incorrect number of fields in the struct, should be one");        }
        StructField productCategoryField = structOI.getStructFieldRef("productCategory");        //If not, throw exception        if (productCategoryField == null) {            throw new UDFArgumentTypeException(0, "NO \"productCategory\" field in input structure");        }
        //Are they of the correct types?        //We store these object inspectors for use in the evaluate() method        prodCatOI = productCategoryField.getFieldObjectInspector();
        //First are they primitives        if (prodCatOI.getCategory() != Category.PRIMITIVE) {            throw new UDFArgumentTypeException(0, "productCategory field must be of string type");        }
        //Are they of the correct primitives?        if (((PrimitiveObjectInspector)prodCatOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {            throw new UDFArgumentTypeException(0, "productCategory field must be of string type");        }
        ret = new ArrayList();
        return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector);    }
    @Override    public ArrayList evaluate(DeferredObject[] arguments)            throws HiveException    {        ret.clear();
        if (arguments.length != 1) {            return null;        }
        if (arguments[0].get() == null) {         return null;        }
        int numElements = listOI.getListLength(arguments[0].get());
        for (int i = 0; i < numElements; i++) {            LazyString prodCatDataObject = (LazyString) (structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef("productCategory")));            Text productCategoryValue = ((StringObjectInspector) prodCatOI).getPrimitiveWritableObject(prodCatDataObject);            ret.add(productCategoryValue);        }        return ret;    }
    @Override    public String getDisplayString(String[] strings)    {        assert (strings.length > 0);        StringBuilder sb = new StringBuilder();        sb.append("extract_product_category(");        sb.append(strings[0]);        sb.append(")");        return sb.toString();    }}

My Test:
import org.apache.hadoop.hive.ql.metadata.HiveException;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObj