Follower数取得スクリプト

前回の記事「『はてなダイアラー』のFollower数ランキング - 鯨飲馬食コード」で用いたスクリプトを、未来の自分のために貼り付けておく。いつもならばRubyで書くのだけれど、少し前にPython + BeautifulSoupで書いたスクリプトがあったので、それを流用した。

以下は、ウェブページをスクレイピングして、TwitterのFollower数・Following数と、はてなアンテナの被登録数を取得するスクリプト。標準入力から「はてなID,TwitterID」形式の行を読み込み、「はてなID,TwitterID,Follower数,Following数,被登録数」の形式で標準出力に書き出す。

import urllib2
import StringIO
import gzip
import re
import sys
from BeautifulSoup import BeautifulSoup

def get_html(url):
  """Download *url* and return the response body, gunzipping it if needed.

  The request advertises gzip support; when the server answers with
  ``Content-Encoding: gzip`` the payload is decompressed before returning.

  Args:
    url: Address of the page to download.

  Returns:
    The response body as a byte string, or "" when opening the URL fails
    with ``urllib2.URLError`` (a diagnostic naming the URL and the cause is
    written to stderr in that case).
  """
  request = urllib2.Request(url)
  request.add_header('Accept-encoding', 'gzip')
  opener = urllib2.build_opener()

  try:
    data_stream = opener.open(request)
  except urllib2.URLError as e:
    # Best effort: report which page failed and why, then let the caller
    # carry on with an empty document.
    sys.stderr.write('ERROR OCCURED: %s (%s)\n' % (url, e))
    return ""

  # Always release the connection, even if reading the body fails.
  try:
    is_gzipped = data_stream.headers.get('Content-Encoding') == 'gzip'
    body = data_stream.read()
  finally:
    data_stream.close()

  if is_gzipped:
    # GzipFile needs a file-like object, so wrap the compressed bytes.
    return gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
  return body
  
def _span_count(soup, span_id):
  """Return the integer shown in ``<span id=span_id>``, or 0 if unavailable.

  Thousands separators (",") are removed before conversion. A missing span,
  or one whose ``.string`` is None (empty or containing nested markup),
  counts as 0.
  """
  span = soup.find('span', {'id': span_id})
  if span is None or span.string is None:
    return 0
  return int(span.string.replace(",", ""))


def count_twitter(id):
  """Scrape the following/follower counts from a Twitter profile page.

  Args:
    id: Twitter screen name; it is appended to ``http://twitter.com/``.

  Returns:
    A dict with integer values under the keys ``"following"`` and
    ``"follower"``. A count that cannot be located on the page (including
    when the download failed and the page is empty) is reported as 0.

  Raises:
    ValueError: If a located count is not a number once commas are removed.
  """
  # The raw response body is handed to BeautifulSoup as-is.
  soup = BeautifulSoup(get_html('http://twitter.com/' + id))
  return {
    "following": _span_count(soup, 'following_count'),
    "follower": _span_count(soup, 'follower_count'),
  }
  
def count_ahatena(id):
  """Scrape the Hatena Antenna registration count for a Hatena Diary user.

  The count is taken from a parenthesised number, e.g. "(123)", found in the
  second child node of the page's first ``<h1>``.

  Args:
    id: Hatena ID; used to build the ``d.hatena.ne.jp/<id>/`` diary URL that
      is looked up on a.hatena.ne.jp.

  Returns:
    The count as an int, or 0 when the page has no ``<h1>`` (for instance
    because the download failed), the ``<h1>`` has fewer than three child
    nodes, or no parenthesised number is present.
  """
  url = 'http://a.hatena.ne.jp/include?http://d.hatena.ne.jp/' + id + '/'
  soup = BeautifulSoup(get_html(url))

  h1 = soup.find('h1')
  # get_html returns "" on failure, so there may be no <h1> at all.
  if h1 is None or len(h1.contents) <= 2:
    return 0

  # NOTE(review): assumes the second child is the text node carrying "(N)";
  # confirm against the current page markup.
  m = re.search(r"\((\d+)\)", h1.contents[1])
  if m:
    return int(m.group(1))
  return 0

# Read "hatena_id,twitter_id" pairs from stdin (one per line) and print one
# "hatena_id,twitter_id,follower,following,ahatena" row per pair to stdout.
# Progress (the Hatena ID being processed) goes to stderr so that stdout
# stays clean for redirection.
for line in sys.stdin:
  # strip() also drops "\r" and stray spaces that would otherwise end up in
  # the IDs and therefore in the request URLs.
  id_list = [field.strip() for field in line.split(",")]
  if len(id_list) < 2:
    # Blank lines are ignored silently; anything else is reported.
    if line.strip():
      sys.stderr.write('SKIPPED (expected "hatena_id,twitter_id"): ' + line.strip() + '\n')
    continue
  hatena_id = id_list[0]
  twitter_id = id_list[1]

  sys.stderr.write(hatena_id+'\n')
  acount = count_ahatena(hatena_id)
  data = count_twitter(twitter_id)

  print(",".join([hatena_id, twitter_id, str(data['follower']), str(data['following']), str(acount)]))

上のスクリプトで得た結果をFollower数の降順にソートして、はてな記法の表として出力するスクリプトが下。

import sys

# Read "hatena_id,twitter_id,follower,following,ahatena" rows from stdin and
# print them as a Hatena-notation table ranked by follower count (highest
# first).
result_list = []
for line in sys.stdin:
  row = line.strip().split(",")
  # Blank or truncated lines cannot be ranked or formatted; skip them
  # instead of failing later with an IndexError.
  if len(row) < 5:
    continue
  result_list.append(row)

# reverse=True keeps rows with equal follower counts in input order, the
# same ordering a negated key would give.
result_list.sort(key=lambda row: int(row[2]), reverse=True)

print("|*Rank|*HatenaID|*TwitterID|*Follower|*Following|*ahatena|")
for rank, row in enumerate(result_list, 1):
  print("|" + str(rank) + "|id:" + row[0] + "|twitter:@" + row[1] + "|" + row[2] + "|" + row[3] + "|" + row[4] + "|")