Home | About | Sematext search-lucene.com search-hadoop.com
 Search Hadoop and all its subprojects:

Switch to Threaded View
MapReduce >> mail # user >> New to hadoop, trying to write a customary file split


Copy link to this message
-
Re: New to hadoop, trying to write a customary file split
The reason for the two is that it may say
<Foo> ....
or
<Foo attr1="...
- now I suppose you could just look for <Foo which would cover either case

Also note that I am cheating a bit: this will not properly handle tags which
are commented out with
the XML comment <!-- but I doubt it is possible to handle these without
parsing the entire (potentially large) file.
On Mon, Jul 18, 2011 at 9:40 AM, Erik T <[EMAIL PROTECTED]> wrote:

> Hi Steven,
>
> Thank you for the sample. I have one question though.
>
> In MyXMLFileReader, nextKeyValue, is startTag and startTag2 needed?
> Erik
>
>
>
> On 11 July 2011 15:11, Steve Lewis <[EMAIL PROTECTED]> wrote:
>
>> Look at this sample
>> ============================================
>> package org.systemsbiology.hadoop;
>>
>>
>>
>> import org.apache.hadoop.conf.*;
>> import org.apache.hadoop.fs.*;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.io.*;
>> import org.apache.hadoop.io.compress.*;
>> import org.apache.hadoop.mapreduce.*;
>> import org.apache.hadoop.mapreduce.lib.input.*;
>>
>> import java.io.*;
>> import java.util.*;
>>
>> /**
>>  * org.systemsbiology.xtandem.hadoop.XMLTagInputFormat
>>  * Splitter that reads scan tags from an XML file
>>  * No assumption is made about lines but tags and end tags MUST look like
>>  * <MyTag </MyTag> with no embedded spaces
>>  * usually you will subclass and hard code the tag you want to split on
>>  */
>> public class XMLTagInputFormat extends FileInputFormat<Text, Text> {
>>     public static final XMLTagInputFormat[] EMPTY_ARRAY = {};   // zero-length array constant of this type
>>
>>
>>     private static final double SPLIT_SLOP = 1.1;   // 10% slop; not referenced in the visible part of this class
>>
>>
>>     public static final int BUFFER_SIZE = 4096;   // presumably a read-buffer size in bytes - TODO confirm against the reader
>>
>>     private final String m_BaseTag;   // bare tag name as passed to the constructor
>>     private final String m_StartTag;   // "<" + base tag, deliberately without ">" so tags with attributes match too
>>     private final String m_EndTag;   // "</" + base tag + ">"
>>     private String m_Extension;   // optional file-extension filter; null means isPathAcceptable accepts every path
>>
>>     /**
>>      * Builds an input format that splits on the given tag.
>>      * The start marker is "<" followed by the tag name (no closing bracket)
>>      * and the end marker is "</" + tag name + ">".
>>      *
>>      * @param pBaseTag bare tag name to split on
>>      */
>>     public XMLTagInputFormat(final String pBaseTag) {
>>         this.m_BaseTag = pBaseTag;
>>         this.m_StartTag = "<" + pBaseTag;
>>         this.m_EndTag = "</" + pBaseTag + ">";
>>     }
>>
>>     public String getExtension() {
>>         return m_Extension;
>>     }
>>
>>     public void setExtension(final String pExtension) {
>>         m_Extension = pExtension;
>>     }
>>
>>     public boolean isSplitReadable(InputSplit split) {
>>         if (!(split instanceof FileSplit))
>>             return true;
>>         FileSplit fsplit = (FileSplit) split;
>>         Path path1 = fsplit.getPath();
>>         return isPathAcceptable(path1);
>>     }
>>
>>     /**
>>      * Decides whether a path should be read by this input format.
>>      * <p>
>>      * A path is accepted when its file name starts with "part-r-", when no
>>      * extension has been configured, or when the path ends with the
>>      * configured extension, optionally followed by ".gz". All comparisons
>>      * ignore case and are locale independent.
>>      *
>>      * @param pPath1 path to test
>>      * @return true if the path is acceptable
>>      */
>>     protected boolean isPathAcceptable(final Path pPath1) {
>>         // Test the file name, not the whole path: the string form of a path
>>         // includes the parent directories (and possibly a scheme), so it
>>         // would not start with "part-r-".
>>         String name = pPath1.getName().toLowerCase(Locale.ROOT);
>>         if (name.startsWith("part-r-")) {
>>             return true;
>>         }
>>         String extension = getExtension();
>>         if (extension == null) {
>>             return true; // no extension filter configured
>>         }
>>         String path = pPath1.toString().toLowerCase(Locale.ROOT);
>>         String suffix = extension.toLowerCase(Locale.ROOT);
>>         return path.endsWith(suffix) || path.endsWith(suffix + ".gz");
>>     }
>>
>>     public String getStartTag() {
>>         return m_StartTag;
>>     }
>>
>>     public String getBaseTag() {
>>         return m_BaseTag;
>>     }
>>
>>     public String getEndTag() {
>>         return m_EndTag;
>>     }
>>
>>     @Override
>>     public RecordReader<Text, Text> createRecordReader(InputSplit split,
>>                                                        TaskAttemptContext
>> context) {
>>         if (isSplitReadable(split))
>>             return new MyXMLFileReader();
>>         else
>>             return NullRecordReader.INSTANCE; // do not read
>>     }
>>
>>     @Override
>>     protected boolean isSplitable(JobContext context, Path file) {
>>         String fname = file.getName().toLowerCase();
>>         if(fname.endsWith(".gz"))
>>             return false;
>>         return true;
>>     }
>>
>>     /**
>>      * Generate the list of files and make them into FileSplits.
>>      * This needs to be copied to insert a filter on acceptable data
Steven M. Lewis PhD
4221 105th Ave NE
Kirkland, WA 98033
206-384-1340 (cell)
Skype lordjoe_com