RubyでPageRankの参照

PageRankをRubyで処理したときに使ったスクリプト。使いまわしだしエラー処理してないけど。


「個人ニュースサイトのPageRank - 鯨飲馬食コード」で使ったやつ。

#!/usr/bin/ruby

require 'uri'
require 'net/http'
require "rexml/document"
require 'kconv'

include REXML

# Looks up the Google PageRank and the page title of every URL listed in an
# input file and prints the result as a Hatena-notation table.
#
# Typical use: +read_data+, then +check_pagerank+, then +write_table+.
class PR

  # Starts with no URLs loaded and empty result tables.
  def initialize
    @data = []          # URLs in the order they were read
    @rank_table = {}    # url => PageRank (-1 when unknown)
    @title_table = {}   # url => page title (the URL itself when unknown)
  end

  # Asks the trynt.com PageRank web API for the rank of +target+.
  #
  # target:: URL string to look up.
  #
  # Returns the rank as an Integer, or -1 when the service reports an error,
  # the answer carries no Pagerank element, or the request/parse fails.
  def pagerank(target)
    address = "www.trynt.com"
    # The target is itself a URL, so it must be escaped before it is embedded
    # in the query string; otherwise its own "&", "?" or "#" corrupt the request.
    path = "/google-pagerank-api/v1/?u=#{URI.encode_www_form_component(target)}"
    doc = REXML::Document.new(Net::HTTP.get(address, path))
    return -1 if REXML::XPath.first(doc, "//Error-Code")

    # When several Pagerank elements are present the last one wins.
    element = REXML::XPath.match(doc, "//Pagerank").last
    element ? element.text.to_i : -1
  rescue StandardError => e
    # One unreachable or malformed answer must not abort the whole batch.
    warn "pagerank lookup failed for #{target}: #{e.class}"
    -1
  end

  # Fetches +target+ and extracts the text of its <title> element.
  #
  # target:: URL string of the page.
  #
  # Returns the title with runs of whitespace collapsed to single spaces.
  # Falls back to +target+ itself when the URL is invalid or not HTTP(S),
  # the download fails, or no non-empty title is found.
  def title(target)
    uri = URI.parse(target)
    return target unless uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?

    # Passing the URI object keeps the port, query string and https scheme,
    # and requests "/" when the URL has an empty path.
    found = Net::HTTP.get(uri)[%r{<title[^>]*>(.*?)</title>}im, 1]
    # A title may span several lines; a raw newline would break the table row.
    cleaned = found.to_s.gsub(/\s+/, " ").strip
    cleaned.empty? ? target : cleaned
  rescue StandardError
    target
  end

  # Collects the first http:// or https:// URL of every line of +file+.
  # Lines without such a URL are skipped.
  #
  # file:: path of a text file.
  def read_data(file)
    # File.foreach, unlike IO.foreach, never treats a leading "|" as a command.
    File.foreach(file) do |line|
      url = line[%r{https?://\S+}]
      @data.push(url) if url
    end
  end

  # Looks up rank and title for every loaded URL, printing a running counter
  # to standard error as progress feedback.
  def check_pagerank
    @data.each_with_index do |url, index|
      @rank_table[url] = pagerank(url)
      @title_table[url] = title(url)
      $stderr.print "\r#{index + 1}"
    end
    $stderr.puts
  end

  # Prints the collected results to standard output as a table, ordered by
  # descending PageRank and then by ascending URL.
  def write_table
    rows = @rank_table.sort_by { |url, rank| [-rank, url] }
    puts "|*PageRank|*サイト|"
    rows.each do |url, rank|
      puts "|#{rank}|<a href=\"#{url}\">#{@title_table[url].toutf8}</a>|"
    end
  end

end

# Script entry point: the first command-line argument names a text file that
# lists the URLs to look up. The resulting table is written to standard output.
in_file = ARGV.shift
# Without this check a missing argument reaches read_data as nil and fails
# with an unhelpful TypeError.
abort "usage: #{$0} URL_LIST_FILE" unless in_file

ranker = PR.new
ranker.read_data(in_file)
ranker.check_pagerank
ranker.write_table

「はてなダイアリーのPageRank - 鯨飲馬食コード」で使ったスクリプト

#!/usr/bin/ruby

require 'xmlrpc/client'
require 'uri'
require 'net/http'
Net::HTTP.version_1_2
require "rexml/document"
require 'kconv'

include REXML

# Looks up the Google PageRank, page title, Hatena Diary id and Hatena
# Bookmark count of every URL listed in an input file and prints the result
# as a Hatena-notation table.
#
# Typical use: +read_data+, then +check_pagerank+, then +write_table+.
class PR

  # Starts with no URLs loaded and empty result tables.
  def initialize
    @data = []          # URLs in the order they were read
    @rank_table = {}    # url => PageRank (-1 when unknown)
    @title_table = {}   # url => page title (the URL itself when unknown)
    @id_table = {}      # url => Hatena Diary id (absent for other sites)
    @bm_table = {}      # url => Hatena Bookmark count
  end

  # Asks the trynt.com PageRank web API for the rank of +target+.
  #
  # target:: URL string to look up.
  #
  # Returns the rank as an Integer, or -1 when the service reports an error,
  # the answer carries no Pagerank element, or the request/parse fails.
  def pagerank(target)
    address = "www.trynt.com"
    # The target is itself a URL, so it must be escaped before it is embedded
    # in the query string; otherwise its own "&", "?" or "#" corrupt the request.
    path = "/google-pagerank-api/v1/?u=#{URI.encode_www_form_component(target)}"
    doc = REXML::Document.new(Net::HTTP.get(address, path))
    return -1 if REXML::XPath.first(doc, "//Error-Code")

    # When several Pagerank elements are present the last one wins.
    element = REXML::XPath.match(doc, "//Pagerank").last
    element ? element.text.to_i : -1
  rescue StandardError => e
    # One unreachable or malformed answer must not abort the whole batch.
    warn "pagerank lookup failed for #{target}: #{e.class}"
    -1
  end

  # Asks the Hatena Bookmark XML-RPC endpoint for the bookmark count of
  # +target+ and returns it as an Integer.
  #
  # Errors raised by the XML-RPC client are not handled here and propagate
  # to the caller.
  def b_hatena(target)
    srv = XMLRPC::Client.new2('http://b.hatena.ne.jp/xmlrpc')
    srv.call('bookmark.getTotalCount', target).to_i
  end

  # Fetches +target+ and extracts the text of its <title> element.
  #
  # target:: URL string of the page.
  #
  # Returns the title with runs of whitespace collapsed to single spaces.
  # Falls back to +target+ itself when the URL is invalid or not HTTP(S),
  # the download fails, or no non-empty title is found.
  def title(target)
    uri = URI.parse(target)
    return target unless uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?

    # Passing the URI object keeps the port, query string and https scheme,
    # and requests "/" when the URL has an empty path.
    found = Net::HTTP.get(uri)[%r{<title[^>]*>(.*?)</title>}im, 1]
    # A title may span several lines; a raw newline would break the table row.
    cleaned = found.to_s.gsub(/\s+/, " ").strip
    cleaned.empty? ? target : cleaned
  rescue StandardError
    target
  end

  # Collects the first http:// or https:// URL of every line of +file+.
  # Lines without such a URL are skipped.
  #
  # file:: path of a text file.
  def read_data(file)
    # File.foreach, unlike IO.foreach, never treats a leading "|" as a command.
    File.foreach(file) do |line|
      url = line[%r{https?://\S+}]
      @data.push(url) if url
    end
  end

  # Looks up rank, title, Hatena Diary id and bookmark count for every loaded
  # URL, printing a running counter to standard error as progress feedback.
  def check_pagerank
    @data.each_with_index do |url, index|
      @rank_table[url] = pagerank(url)
      @title_table[url] = title(url)
      # The id is the first path segment of a d.hatena.ne.jp URL, with or
      # without anything after it.
      id = url[%r{\Ahttps?://d\.hatena\.ne\.jp/([^/?#]+)}, 1]
      @id_table[url] = id if id
      @bm_table[url] = b_hatena(url)
      $stderr.print "\r#{index + 1}"
    end
    $stderr.puts
  end

  # Prints the collected results to standard output as a table.
  #
  # Every URL first receives a bookmark position (1 = most bookmarks; ties
  # are broken by ascending URL so the output is reproducible). Rows are then
  # ordered by descending PageRank and, within one rank, by that position.
  # URLs without a Hatena Diary id are printed with an empty id.
  def write_table
    urls = @rank_table.keys
    bm_rank = {}
    urls.sort_by { |url| [-@bm_table[url], url] }.each_with_index do |url, index|
      bm_rank[url] = index + 1
    end

    puts "|*PageRank|*id|*サイト名|*はてブ順位|*はてブ数|"
    urls.sort_by { |url| [-@rank_table[url], bm_rank[url]] }.each do |url|
      rank = @rank_table[url]
      title = @title_table[url].toutf8
      # to_s guards against URLs for which no id was recorded.
      id = @id_table[url].to_s.toutf8
      bm = @bm_table[url]
      puts "|#{rank}|id:#{id}|<a href=\"#{url}\">#{title}</a>|#{bm_rank[url]}位|#{bm}|"
    end
  end

end

# Script entry point: the first command-line argument names a text file that
# lists the URLs to look up. The resulting table is written to standard output.
in_file = ARGV.shift
# Without this check a missing argument reaches read_data as nil and fails
# with an unhelpful TypeError.
abort "usage: #{$0} URL_LIST_FILE" unless in_file

ranker = PR.new
ranker.read_data(in_file)
ranker.check_pagerank
ranker.write_table