MapReduce, mail # user - XML parsing in Hadoop


Chhaya Vishwakarma 2013-11-28, 06:45
Re: XML parsing in Hadoop
Devaraj K 2013-11-28, 08:09
Hi,

Here the map() function is called for every (key, value) pair, i.e. for
every line of the split in your job, because of TextInputFormat. The XML
parsing code you have written inside map() is therefore executed for every
line of your input, which is what is causing the problem.

You can write a custom InputFormat that reads the XML file itself, instead
of parsing it in map(), or you can move the parsing code into the run()
method by overriding Mapper.run(Context context), so that the file is
parsed once per task rather than once per input line.
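For example, a rough, untested sketch of the Mapper.run() approach could
look like the one below. The class name XmlOncePerSplitMapper and the
"record" element name are only placeholders, not taken from your code; you
would emit whatever your XPath expressions actually extract.

import java.io.IOException;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

public class XmlOncePerSplitMapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    public void run(Context context) throws IOException, InterruptedException {
        setup(context);
        try {
            // Parse the file backing this split exactly once per task,
            // instead of once per input line as in the map() version.
            FileSplit split = (FileSplit) context.getInputSplit();
            FileSystem fs = split.getPath().getFileSystem(context.getConfiguration());
            FSDataInputStream in = fs.open(split.getPath());
            Document dom = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder().parse(in);
            in.close();

            // Emit whatever the parsed document yields; "record" is a
            // placeholder element name, not something from the original code.
            NodeList records = dom.getElementsByTagName("record");
            for (int i = 0; i < records.getLength(); i++) {
                Element e = (Element) records.item(i);
                context.write(new Text(e.getTagName()), new Text(e.getTextContent()));
            }
        } catch (Exception ex) {
            throw new IOException(ex);
        } finally {
            cleanup(context);
        }
    }
}

Note that this still parses the whole file once per split, so for files
larger than one split you would also want to make the input non-splittable
(FileInputFormat.isSplitable()) or go with the custom XML InputFormat
mentioned above.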
On Thu, Nov 28, 2013 at 12:15 PM, Chhaya Vishwakarma <[EMAIL PROTECTED]> wrote:

>  Hi,
>
> The code below parses an XML file. The output of the code is correct, but
> the job takes a very long time to complete.
>
> It took 20 hours to parse a 2 MB file.
>
> Kindly suggest what changes could be made to improve the performance.
>
> package xml;
>
> import java.io.FileInputStream;
> import java.io.FileNotFoundException;
> import java.io.IOException;
> import java.util.*;
>
> import javax.xml.parsers.DocumentBuilder;
> import javax.xml.parsers.DocumentBuilderFactory;
> import javax.xml.parsers.ParserConfigurationException;
> import javax.xml.xpath.XPath;
> import javax.xml.xpath.XPathConstants;
> import javax.xml.xpath.XPathExpressionException;
> import javax.xml.xpath.XPathFactory;
>
> import org.apache.hadoop.fs.FSDataInputStream;
> import org.apache.hadoop.fs.FSInputStream;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
>
> import org.apache.hadoop.conf.*;
> import org.apache.hadoop.io.*;
>
> import org.apache.hadoop.mapred.JobConf;
> import org.apache.hadoop.mapreduce.*;
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
> import org.apache.log4j.Logger;
> import org.w3c.dom.Document;
> import org.w3c.dom.Element;
> import org.w3c.dom.NodeList;
> import org.xml.sax.SAXException;
>
> public class ReadXmlMR
> {
>     static Logger log = Logger.getLogger(ReadXmlMR.class.getName());
>     public static String fileName = new String();
>     public static Document dom;
>
>     public void configure(JobConf job) {
>         fileName = job.get("map.input.file");
>     }
>
>     public static class Map extends Mapper<LongWritable, Text, Text, Text>
>     {
>         public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
>         {
>             try {
>                 FileSplit fileSplit = (FileSplit) context.getInputSplit();
>                 Configuration conf = context.getConfiguration();
>
>                 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>
>                 FSDataInputStream fstream1;
>                 Path file = fileSplit.getPath();
>                 FileSystem fs = file.getFileSystem(conf);
>                 fstream1 = fs.open(fileSplit.getPath());
>
>                 DocumentBuilder db = dbf.newDocumentBuilder();
>                 dom = db.parse(fstream1);
>                 Element docEle = null;
>                 docEle = dom.getDocumentElement();
>
>                 XPath xpath = XPathFactory.newInstance().newXPath();
>
>                 Object

Thanks
Devaraj K
Mirko Kämpf 2013-11-28, 07:23
Sofia Georgiakaki 2013-11-28, 09:28