Home | About | Sematext search-lucene.com search-hadoop.com
 Search Hadoop and all its subprojects:

Switch to Threaded View
Pig, mail # user - Pig + Cassandra Example !!


Copy link to this message
-
Re: Pig + Cassandra Example !!
Dan DeCapria, CivicScienc... 2013-03-18, 17:27
Also,

// ruby script modified for cassandra, from amazon

#!/usr/bin/ruby
require 'hpricot'
require 'tempfile'

# XML prolog written at the top of every generated configuration file: the
# XML declaration on the first line, the stylesheet processing instruction on
# the second. Frozen so the shared constant cannot be mutated in place.
CONFIG_HEADER = [
  '<?xml version="1.0"?>',
  '<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>'
].join("\n").freeze

# Reads a Hadoop-style XML configuration file into a list of property hashes.
#
# config_file_path - path of the XML file to read.
#
# Returns an Array with one Hash per <configuration>/<property> element. Each
# Hash holds :name and :value (the inner HTML of the matching child elements)
# and, only when the <final> child has non-empty content, :final as well.
# When the file does not exist a notice is printed to standard output and an
# empty Array is returned.
def parse_config_file(config_file_path)
  unless File.exist?(config_file_path)
    puts "#{config_file_path} does not exist, assuming empty configuration"
    return []
  end

  # File.open rather than Kernel#open: the latter treats a path that begins
  # with "|" as a command to execute.
  doc = File.open(config_file_path) { |f| Hpricot(f) }

  (doc / "configuration" / "property").map do |property|
    entry = {
      :name  => (property / "name").inner_html,
      :value => (property / "value").inner_html
    }
    final = (property / "final").inner_html
    # A missing or empty <final> element is simply left out of the entry.
    entry[:final] = final unless final == ""
    entry
  end
end

# Writes +config+ to +file_name+ as a Hadoop-style XML configuration file.
#
# file_name - path of the file to create or truncate.
# config    - collection of Hashes with :name and :value, plus an optional
#             :final; a <final> element is emitted only when :final is truthy.
#
# The file starts with CONFIG_HEADER, followed by one <property> element per
# entry wrapped in <configuration>. Names and values are interpolated
# verbatim; no XML escaping is performed here.
def dump_config_file(file_name, config)
  # File.open rather than Kernel#open: the latter treats a name that begins
  # with "|" as a command to execute.
  File.open(file_name, 'w') do |f|
    f.puts CONFIG_HEADER
    f.puts '<configuration>'
    config.each do |entry|
      # The leading newline and single space are part of the emitted layout.
      f.print "\n <property><name>#{entry[:name]}</name><value>#{entry[:value]}</value>"
      f.print "<final>#{entry[:final]}</final>" if entry[:final]
      f.puts '</property>'
    end
    f.puts '</configuration>'
  end
end

# Applies each entry of +overwrite+ onto +default+, mutating +default+.
#
# default   - Array of property Hashes (looked up by :name).
# overwrite - collection of property Hashes to apply, in order.
#
# An entry whose :name is not present in +default+ is appended; an entry that
# matches exactly one existing property replaces that property's contents in
# place. Each action is reported on standard output.
#
# Raises RuntimeError when +default+ holds more than one property with the
# entry's :name.
def merge_config(default, overwrite)
  overwrite.each do |override|
    matches = default.select { |existing| existing[:name] == override[:name] }

    case matches.size
    when 0
      puts "'#{override[:name]}': default does not have key, appending value\n'#{override[:value]}'"
      default << override
    when 1
      current = matches.first
      puts "'#{override[:name]}': new value '#{override[:value]}' overwriting\n'#{current[:value]}'"
      # Hash#replace swaps the contents so the Hash already held by +default+ is updated.
      current.replace(override)
    else
      raise "'#{override[:name]}': default has #{matches.size} keys"
    end
  end
end

# Merges the Cassandra Thrift and partitioner properties into a Hadoop
# mapred-site.xml file.
#
# file - path of the configuration file to rewrite; defaults to
#        /home/hadoop/conf/mapred-site.xml.
#
# The existing properties are parsed, the Cassandra settings are merged over
# them, and the result is written to "<file>.new". A pre-existing file is
# renamed to "<file>.old" before the new file is moved into place.
#
# NOTE(review): "THISIPADDRESS" is written literally; presumably it is a
# placeholder to be substituted with the Cassandra node's address - confirm.
def add_cassandra_settings(file = "/home/hadoop/conf/mapred-site.xml")
  base_settings = [
    ["thrift.address",    "THISIPADDRESS"],
    ["thrift.port",       "9160"],
    ["partitioner.class", "org.apache.cassandra.dht.RandomPartitioner"]
  ]

  # Every setting is written three times: unscoped, for input, and for output
  # (e.g. cassandra.thrift.port, cassandra.input.thrift.port,
  # cassandra.output.thrift.port). Each entry is its own Hash because
  # merge_config stores the Hash objects it is given.
  overrides = base_settings.flat_map do |suffix, value|
    ["", "input.", "output."].map do |scope|
      { :name => "cassandra.#{scope}#{suffix}", :value => value }
    end
  end

  default = parse_config_file(file)
  merge_config(default, overrides)
  dump_config_file(file + ".new", default)

  had_original = File.exist?(file)
  File.rename(file, file + ".old") if had_original
  File.rename(file + ".new", file)

  # Only claim a backup was made when there was an original to back up.
  if had_original
    puts "Saved #{file} with overwrites. Original saved to #{file}.old"
  else
    puts "Saved #{file} with overwrites."
  end
end

# Writes a UTC-timestamped warning line to standard error.
#
# msg - the message to report; it is interpolated, so any object that
#       responds to #to_s is accepted, not only Strings.
#
# Output goes through $stderr so that a reassigned error stream is honoured.
#
# NOTE(review): a top-level method named warn takes precedence over
# Kernel#warn for code in this script - confirm that is intended.
def warn(msg)
  $stderr.puts "#{Time.now.utc} WARN #{msg}"
end

# Script entry point: apply the Cassandra settings when this file is run.
add_cassandra_settings
On Mon, Mar 18, 2013 at 1:24 PM, Dan DeCapria, CivicScience <
[EMAIL PROTECTED]> wrote:

> So it appears that you need to configure Cassandra to run with Hadoop.
> There are a couple of things you will need to do here.
> In my case, I usually bootstrap these onto my Hadoop master and slaves so
> that they have the correct dependencies and the IP touch points Pig uses
> for Cassandra.
>
> // install cassandra everywhere
> echo "deb http://debian.datastax.com/community stable main" > /tmp/cassandra.sources.list
> sudo mv /tmp/cassandra.sources.list /etc/apt/sources.list.d/cassandra.sources.list
> curl -L http://debian.datastax.com/debian/repo_key | sudo apt-key add -
> sudo apt-get update
> sudo apt-get install -y cassandra
> sudo /etc/init.d/cassandra stop
> echo "HADOOP_CLASSPATH=/usr/share/cassandra/*:/usr/share/cassandra/lib/*:$HADOOP_CLASSPATH" >> /home/hadoop/conf/hadoop-user-env.sh
> echo "PIG_INITIAL_ADDRESS=MYIPGOESHERE" >> /home/hadoop/conf/hadoop-user-env.sh

Dan DeCapria
CivicScience, Inc.
Senior Informatics / DM / ML / BI Specialist