Home | About | Sematext search-lucene.com search-hadoop.com
NEW: Monitor These Apps!
elasticsearch, apache solr, apache hbase, hadoop, redis, cassandra, amazon cloudwatch, mysql, memcached, apache kafka, apache zookeeper, apache storm, ubuntu, centOS, red hat, debian, puppet labs, java, senseiDB
 Search Hadoop and all its subprojects:

Switch to Threaded View
Pig >> mail # user >> PigStorage


Copy link to this message
-
Re: PigStorage
done. PIG-3057 <https://issues.apache.org/jira/browse/PIG-3057>

On Mon, Nov 19, 2012 at 6:32 PM, Jonathan Coveney <[EMAIL PROTECTED]> wrote:

> Make a JIRA and attach the patch, please.
>
>
> 2012/11/19 pablomar <[EMAIL PROTECTED]>
>
> > hi all,
> >
> > I did it as simply as I could. What about these changes?
> >
> >
> > PigStorage.java
> > original:
> >     private void readField(byte[] buf, int start, int end) {
> >         if (start == end) {
> >             // NULL value
> >             mProtoTuple.add(null);
> >         } else {
> >             mProtoTuple.add(new DataByteArray(buf, start, end));
> >         }
> >     }
> >
> >
> > new:
> >     protected void addToCurrentTuple(DataByteArray data) {
> >             mProtoTuple.add(data);
> >     }
> >
> >     protected void readField(byte[] buf, int start, int end) {
> >         if (start == end) {
> >             // NULL value
> >             addToCurrentTuple(null);
> >         } else {
> >             addToCurrentTuple(new DataByteArray(buf, start, end));
> >         }
> >     }
> >
> >
> >
> > So, what's the advantage?
> > with the new version, if I want to extend PigStorage just to override
> > readField, I do something like:
> >
> > import org.apache.pig.builtin.PigStorage;
> > import org.apache.pig.data.DataByteArray;
> >
> > public class MyLoader extends PigStorage {
> >   public MyLoader() {
> >     this("\t");
> >   }
> >
> >   public MyLoader(String delimiter) {
> >     super(delimiter);
> >   }
> >
> >   @Override
> >   protected void readField(byte[] buf, int start, int end) {
> >     // remove the field name (example: min=, max=)
> >     for (int i=start; i<=end;i++) {
> >       if (buf[i] == '=') {
> >         start = i + 1;
> >         break;
> >       }
> >     }
> >
> >     if (start == end) {
> >       // NULL value
> >       addToCurrentTuple(null);
> >     } else {
> >       addToCurrentTuple(new DataByteArray(buf, start, end));
> >     }
> >   }
> > }
> >
> > Currently, just to override readField, I also need to copy/paste a lot from
> > PigStorage:
> >
> > import java.io.IOException;
> > import java.util.ArrayList;
> > import java.util.Properties;
> > import org.apache.hadoop.io.Text;
> > import org.apache.pig.backend.executionengine.ExecException;
> > import org.apache.pig.builtin.PigStorage;
> > import org.apache.pig.PigException;
> > import org.apache.pig.data.DataByteArray;
> > import org.apache.pig.data.Tuple;
> > import org.apache.pig.data.TupleFactory;
> > import org.apache.pig.impl.util.ObjectSerializer;
> > import org.apache.pig.impl.util.StorageUtil;
> > import org.apache.pig.impl.util.UDFContext;
> >
> > public class MyLoader extends PigStorage {
> >   private byte fieldDel = '\t';
> >   private ArrayList<Object> mProtoTuple = null;
> >   private TupleFactory mTupleFactory = TupleFactory.getInstance();
> >   private boolean mRequiredColumnsInitialized = false;
> >
> >   public MyLoader() {
> >     this("\t");
> >   }
> >
> >   public MyLoader(String delimiter) {
> >     super(delimiter);
> >     fieldDel = StorageUtil.parseFieldDel(delimiter);
> >   }
> >
> >   // exactly the same as in PigStorage, but I needed to modify readField,
> >   // which is private, so I have to copy all of it
> >     @Override
> >     public Tuple getNext() throws IOException {
> >         mProtoTuple = new ArrayList<Object>();
> >         if (!mRequiredColumnsInitialized) {
> >             if (signature!=null) {
> >                 Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
> >                 mRequiredColumns = (boolean[])ObjectSerializer.deserialize(p.getProperty(signature));
> >             }
> >             mRequiredColumnsInitialized = true;
> >         }
> >         try {
> >             boolean notDone = in.nextKeyValue();
> >             if (!notDone) {
> >                 return null;
> >
> > }
> >
> >             Text value = (Text) in.getCurrentValue();
> >             byte[] buf = value.getBytes();
NEW: Monitor These Apps!
elasticsearch, apache solr, apache hbase, hadoop, redis, cassandra, amazon cloudwatch, mysql, memcached, apache kafka, apache zookeeper, apache storm, ubuntu, centOS, red hat, debian, puppet labs, java, senseiDB