HDFS, mail # user - Re: XML parsing in Hadoop


unmesha sreeveni 2013-11-28, 08:52
Vinayakumar B 2013-11-28, 09:10
Re: XML parsing in Hadoop
Adam Kawa 2013-11-29, 10:11
Alternatively, you can try an input format called WholeFileInputFormat
(nicely explained in "Hadoop: The Definitive Guide" by Tom White), where
you process a whole file as a single record in one map() call. Refer to
the book for a code example.
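A minimal sketch of such an input format, assuming Hadoop's newer
org.apache.hadoop.mapreduce API; the class names follow the book's
example, but the code below is a reconstruction, not a quote from it:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.JobContext;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;

    // Each map() call receives one entire file as a single record,
    // so per-file work (like XML parsing) happens exactly once.
    public class WholeFileInputFormat
            extends FileInputFormat<NullWritable, BytesWritable> {

        @Override
        protected boolean isSplitable(JobContext context, Path file) {
            return false; // never split: one file == one record
        }

        @Override
        public RecordReader<NullWritable, BytesWritable> createRecordReader(
                InputSplit split, TaskAttemptContext context) {
            return new WholeFileRecordReader();
        }
    }

    class WholeFileRecordReader
            extends RecordReader<NullWritable, BytesWritable> {

        private FileSplit split;
        private Configuration conf;
        private final BytesWritable value = new BytesWritable();
        private boolean processed = false;

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) {
            this.split = (FileSplit) split;
            this.conf = context.getConfiguration();
        }

        @Override
        public boolean nextKeyValue() throws IOException {
            if (!processed) {
                // Read the whole file into one value.
                byte[] contents = new byte[(int) split.getLength()];
                Path file = split.getPath();
                FileSystem fs = file.getFileSystem(conf);
                FSDataInputStream in = null;
                try {
                    in = fs.open(file);
                    IOUtils.readFully(in, contents, 0, contents.length);
                    value.set(contents, 0, contents.length);
                } finally {
                    IOUtils.closeStream(in);
                }
                processed = true;
                return true;
            }
            return false;
        }

        @Override
        public NullWritable getCurrentKey() { return NullWritable.get(); }

        @Override
        public BytesWritable getCurrentValue() { return value; }

        @Override
        public float getProgress() { return processed ? 1.0f : 0.0f; }

        @Override
        public void close() { }
    }

With job.setInputFormatClass(WholeFileInputFormat.class), each map() call
then receives one complete file as a BytesWritable to parse once.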
2013/11/28 Devaraj K <[EMAIL PROTECTED]>

> Hi,
>
> The map() function is called for every (key, value) pair, i.e. for every
> line of the split in your job, because of TextInputFormat. The XML
> parsing code you have written in map() is therefore executed once for
> every line of your input, which is what is causing the problem.
>
> You can write a custom InputFormat to read the XML file instead of
> parsing it in map(), or you can move the parsing code into run() by
> overriding Mapper.run(Context context), as in the sketch just below.
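A rough sketch of that second suggestion: override Mapper.run() so the
document is parsed once per split rather than once per input line. The
parseAndWrite() helper is hypothetical, standing in for the poster's
XPath/DOM extraction logic:

    import java.io.IOException;

    import javax.xml.parsers.DocumentBuilder;
    import javax.xml.parsers.DocumentBuilderFactory;

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.w3c.dom.Document;

    public class XmlRunMapper extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        public void run(Context context)
                throws IOException, InterruptedException {
            setup(context);
            try {
                // Parse the split's file once, instead of once per line.
                FileSplit split = (FileSplit) context.getInputSplit();
                FileSystem fs = split.getPath()
                        .getFileSystem(context.getConfiguration());
                DocumentBuilder db = DocumentBuilderFactory.newInstance()
                        .newDocumentBuilder();
                Document dom = db.parse(fs.open(split.getPath()));
                parseAndWrite(dom, context);
            } catch (Exception e) {
                throw new IOException(e);
            } finally {
                cleanup(context);
            }
        }

        // Hypothetical placeholder for the XPath/DOM extraction and the
        // context.write(...) calls from the original map() body.
        private void parseAndWrite(Document dom, Context context)
                throws IOException, InterruptedException {
        }
    }

With a small file like the 2 MB one below there is a single split, so the
document is parsed once per task instead of once per line.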
>
>
> On Thu, Nov 28, 2013 at 12:15 PM, Chhaya Vishwakarma <
> [EMAIL PROTECTED]> wrote:
>
>>  Hi,
>>
>> The code below parses an XML file. The output of the code is correct,
>> but the job takes a very long time to complete: it took 20 hours to
>> parse a 2 MB file.
>>
>> Kindly suggest what changes could be made to improve the performance.
>>
>> package xml;
>>
>> import java.io.FileInputStream;
>> import java.io.FileNotFoundException;
>> import java.io.IOException;
>> import java.util.*;
>>
>> import javax.xml.parsers.DocumentBuilder;
>> import javax.xml.parsers.DocumentBuilderFactory;
>> import javax.xml.parsers.ParserConfigurationException;
>> import javax.xml.xpath.XPath;
>> import javax.xml.xpath.XPathConstants;
>> import javax.xml.xpath.XPathExpressionException;
>> import javax.xml.xpath.XPathFactory;
>>
>> import org.apache.hadoop.fs.FSDataInputStream;
>> import org.apache.hadoop.fs.FSInputStream;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.fs.Path;
>>
>> import org.apache.hadoop.conf.*;
>> import org.apache.hadoop.io.*;
>>
>> import org.apache.hadoop.mapred.JobConf;
>> import org.apache.hadoop.mapreduce.*;
>> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>>
>> import org.apache.log4j.Logger;
>> import org.w3c.dom.Document;
>> import org.w3c.dom.Element;
>> import org.w3c.dom.NodeList;
>> import org.xml.sax.SAXException;
>>
>> public class ReadXmlMR
>> {
>>     static Logger log = Logger.getLogger(ReadXmlMR.class.getName());
>>     public static String fileName = new String();
>>     public static Document dom;
>>
>>     public void configure(JobConf job) {
>>         fileName = job.get("map.input.file");
>>     }
>>
>>     public static class Map extends Mapper<LongWritable, Text, Text, Text>
>>     {
>>         public void map(LongWritable key, Text value, Context context)
>>                 throws IOException, InterruptedException
>>         {
>>             try {
>>                 FileSplit fileSplit = (FileSplit) context.getInputSplit();
>>                 Configuration conf = context.getConfiguration();
>>
>>                 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>>
>>                 FSDataInputStream fstream1;
>>                 Path file = fileSplit.getPath();
>>                 FileSystem fs = file.getFileSystem(conf);
>>                 fstream1 = fs.open(fileSplit.getPath());
>>                 DocumentBuilder db = dbf.newDocumentBuilder();