Home | About | Sematext search-lucene.com search-hadoop.com
NEW: Monitor These Apps!
elasticsearch, apache solr, apache hbase, hadoop, redis, cassandra, amazon cloudwatch, mysql, memcached, apache kafka, apache zookeeper, apache storm, ubuntu, centOS, red hat, debian, puppet labs, java, senseiDB
 Search Hadoop and all its subprojects:

Switch to Threaded View
Pig >> mail # user >> Pig + Cassandra Example !!


Copy link to this message
-
Re: Pig + Cassandra Example !!
Also,

// ruby script modified for cassandra, from amazon

#!/usr/bin/ruby
require 'hpricot'
require 'tempfile'

# Prolog written at the top of every generated configuration file.
# It is exactly two lines: the XML declaration, then the stylesheet
# processing instruction (kept on a single line).
CONFIG_HEADER = [
  '<?xml version="1.0"?>',
  '<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>'
].join("\n").freeze

# Reads an XML configuration file into a list of property hashes.
#
# Every <property> under <configuration> becomes a Hash with :name and
# :value (the inner HTML of its <name> and <value> elements). A :final key
# is added only when the <final> element's inner HTML is not the empty string.
#
# config_file_path - path of the XML file to read.
#
# Returns an Array of Hashes in document order. When the file does not
# exist, prints a notice to stdout and returns an empty Array.
def parse_config_file(config_file_path)
  unless File.exist?(config_file_path)
    puts "#{config_file_path} does not exist, assuming empty configuration"
    return []
  end

  # File.open instead of Kernel#open: Kernel#open treats a path that starts
  # with "|" as a command to run, which is never wanted for a config path.
  doc = File.open(config_file_path) { |f| Hpricot(f) }

  ret = []
  (doc/"configuration"/"property").each do |property|
    val = {
      :name => (property/"name").inner_html,
      :value => (property/"value").inner_html
    }
    final = (property/"final").inner_html
    val[:final] = final unless final == ""
    ret << val
  end
  ret
end

# Writes a list of property hashes to file_name as an XML configuration file.
#
# Output layout: CONFIG_HEADER, an opening <configuration> tag, one indented
# <property> line per entry, and the closing </configuration> tag. A <final>
# element is written only for entries whose :final value is truthy.
#
# file_name - path to write; the file is truncated or created.
# config    - Array of Hashes with :name, :value and optionally :final.
#
# NOTE: names and values are interpolated verbatim; nothing is XML-escaped
# here.
def dump_config_file(file_name, config)
  # File.open instead of Kernel#open: Kernel#open treats a path that starts
  # with "|" as a command to run.
  File.open(file_name, 'w') do |f|
    f.puts CONFIG_HEADER
    f.puts '<configuration>'
    config.each do |entry|
      # Each property is emitted on exactly one line.
      f.print "  <property><name>#{entry[:name]}</name><value>#{entry[:value]}</value>"
      f.print "<final>#{entry[:final]}</final>" if entry[:final]
      f.puts '</property>'
    end
    f.puts '</configuration>'
  end
end

# Merges the entries of overwrite into default, mutating default in place.
#
# For each overwrite entry, default is searched for entries with the same
# :name:
#   * none found  - the entry is appended to default;
#   * one found   - that hash's contents are replaced wholesale by the entry
#                   (keys absent from the entry, such as :final, are dropped);
#   * several     - a RuntimeError is raised.
# A one-line message describing each action is printed to stdout.
#
# default   - Array of property Hashes to update.
# overwrite - Array of property Hashes to apply.
#
# Raises RuntimeError when default holds more than one entry for a name.
def merge_config(default, overwrite)
  overwrite.each do |entry|
    cells = default.select { |x| x[:name] == entry[:name] }
    case cells.size
    when 0
      puts "'#{entry[:name]}': default does not have key, appending value '#{entry[:value]}'"
      default << entry
    when 1
      puts "'#{entry[:name]}': new value '#{entry[:value]}' overwriting '#{cells[0][:value]}'"
      cells[0].replace(entry)
    else
      raise "'#{entry[:name]}': default has #{cells.size} keys"
    end
  end
end

# Adds the Cassandra connection settings to a Hadoop mapred-site.xml.
#
# The existing file (if any) is parsed, the Cassandra properties below are
# merged over it, and the result is written to "<file>.new". The original is
# then renamed to "<file>.old" (when it exists) and the new file is moved
# into place.
#
# file - path of the configuration file to update; defaults to the
#        mapred-site.xml under /home/hadoop/conf.
#
# "THISIPADDRESS" is a placeholder value written as-is into the address
# properties.
def add_cassandra_settings(file = "/home/hadoop/conf/mapred-site.xml")
  address = "THISIPADDRESS"
  port = "9160"
  partitioner = "org.apache.cassandra.dht.RandomPartitioner"

  overrides = [
    { :name => "cassandra.thrift.address",           :value => address },
    { :name => "cassandra.input.thrift.address",     :value => address },
    { :name => "cassandra.output.thrift.address",    :value => address },
    { :name => "cassandra.thrift.port",              :value => port },
    { :name => "cassandra.input.thrift.port",        :value => port },
    { :name => "cassandra.output.thrift.port",       :value => port },
    { :name => "cassandra.partitioner.class",        :value => partitioner },
    { :name => "cassandra.input.partitioner.class",  :value => partitioner },
    { :name => "cassandra.output.partitioner.class", :value => partitioner }
  ]

  default = parse_config_file(file)
  merge_config(default, overrides)

  # Write to a side file first so the original is only replaced once the
  # new content has been written out completely.
  dump_config_file(file + ".new", default)
  File.rename(file, file + ".old") if File.exist?(file)
  File.rename(file + ".new", file)
  puts "Saved #{file} with overwrites. Original saved to #{file}.old"
end

# Writes msg to standard error, prefixed with the current UTC time and the
# word WARN.
#
# msg - the String to report.
def warn(msg)
  prefix = "#{Time.now.utc} WARN "
  STDERR.puts(prefix + msg)
end

# Script entry point: apply the Cassandra settings when the script runs.
add_cassandra_settings
On Mon, Mar 18, 2013 at 1:24 PM, Dan DeCapria, CivicScience <
[EMAIL PROTECTED]> wrote:

> So it appears that you need to configure Cassandra to run with hadoop.
>  There are a couple of things you will need to do here.
> In my case, I usually bootstrap these for my hadoop master and slaves, for
> the correct dependencies and pig IP touch points for cassandra.
>
> // install cassandra everywhere
> echo "deb http://debian.datastax.com/community stable main" >
> /tmp/cassandra.sources.list
> sudo mv /tmp/cassandra.sources.list
> /etc/apt/sources.list.d/cassandra.sources.list
> curl -L http://debian.datastax.com/debian/repo_key | sudo apt-key add -
> sudo apt-get update
> sudo apt-get install -y cassandra
> sudo /etc/init.d/cassandra stop
> echo
> "HADOOP_CLASSPATH=/usr/share/cassandra/*:/usr/share/cassandra/lib/*:$HADOOP_CLASSPATH"
> >> /home/hadoop/conf/hadoop-user-env.sh
> echo "PIG_INITIAL_ADDRESS=MYIPGOESHERE" >>
> /home/hadoop/conf/hadoop-user-env.sh

Dan DeCapria
CivicScience, Inc.
Senior Informatics / DM / ML / BI Specialist
NEW: Monitor These Apps!
elasticsearch, apache solr, apache hbase, hadoop, redis, cassandra, amazon cloudwatch, mysql, memcached, apache kafka, apache zookeeper, apache storm, ubuntu, centOS, red hat, debian, puppet labs, java, senseiDB