Follower数取得スクリプト
前回の「「はてなダイアラー」のFollower数ランキング - 鯨飲馬食コード」で用いたスクリプトを未来の自分のために貼り付けておく。いつもならばRubyで書くのだけれど、少し前にPython + BeautifulSoupで書いたスクリプトがあったので、それを流用した。
以下、ウェブページをスクレイピングしてTwitterのFollowerおよびFollowing数とはてなアンテナの被登録数を得るスクリプト。
"""Scrape Twitter follower/following counts and Hatena Antenna subscriber counts.

Reads ``hatena_id,twitter_id`` lines from stdin and prints
``hatena_id,twitter_id,follower,following,antenna_count`` lines to stdout.
Written for Python 2 (urllib2, StringIO, BeautifulSoup 3).
"""
import gzip
import re
import StringIO
import sys
import urllib2

from BeautifulSoup import BeautifulSoup

# Matches a parenthesised number such as "(123)"; group 1 is the digits.
ANTENNA_COUNT_RE = re.compile(r"\((\d+)\)")


def get_html(url):
    """Fetch *url* and return the response body as a string.

    The request advertises gzip support, and the body is decompressed when
    the response carries ``Content-Encoding: gzip``.

    Returns an empty string (after reporting on stderr) when the URL cannot
    be opened, so callers can treat a failed fetch like an empty page.
    """
    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')
    opener = urllib2.build_opener()
    try:
        data_stream = opener.open(request)
    except urllib2.URLError as e:
        sys.stderr.write('ERROR OCCURED: %s: %s\n' % (url, e))
        return ""

    try:
        body = data_stream.read()
        if data_stream.headers.get('Content-Encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
        return body
    finally:
        # Always release the connection, even if reading/decompressing fails.
        data_stream.close()


def _span_count(soup, span_id):
    """Return the integer shown in ``<span id=span_id>``, or 0 if unavailable.

    Thousands separators (",") are removed before conversion. A missing
    span, a span without a single string child, or non-numeric text all
    yield 0 instead of raising.
    """
    span = soup.find('span', {'id': span_id})
    if span is None or span.string is None:
        return 0
    try:
        return int(span.string.replace(",", ""))
    except ValueError:
        return 0


def count_twitter(id):
    """Return ``{"following": int, "follower": int}`` for Twitter user *id*.

    Either count is 0 when the corresponding element cannot be found on the
    fetched page (including when the fetch itself failed).
    """
    soup = BeautifulSoup(get_html('http://twitter.com/' + id))
    return {
        "following": _span_count(soup, 'following_count'),
        "follower": _span_count(soup, 'follower_count'),
    }


def count_ahatena(id):
    """Return the Hatena Antenna subscriber count for Hatena Diary user *id*.

    The count is taken from a "(N)" pattern in the second child node of the
    page's first ``<h1>``. Returns 0 when the page has no ``<h1>``, the
    heading has fewer than three child nodes, or no "(N)" pattern is present.
    """
    url = 'http://a.hatena.ne.jp/include?http://d.hatena.ne.jp/' + id + '/'
    soup = BeautifulSoup(get_html(url))
    h1 = soup.find('h1')
    # A failed fetch yields an empty document, in which case there is no <h1>.
    if h1 is None or len(h1.contents) <= 2:
        return 0
    m = ANTENNA_COUNT_RE.search(h1.contents[1])
    if m:
        return int(m.group(1))
    return 0


def main():
    """Process ``hatena_id,twitter_id`` lines from stdin, one CSV row out each."""
    for line in sys.stdin:
        id_list = [field.strip() for field in line.split(",")]
        if len(id_list) < 2:
            # Blank lines are skipped silently; other short lines are reported.
            if line.strip():
                sys.stderr.write('SKIPPED (expected "hatena_id,twitter_id"): %r\n' % line)
            continue
        hatena_id = id_list[0]
        twitter_id = id_list[1]
        sys.stderr.write(hatena_id + '\n')  # progress indicator
        acount = count_ahatena(hatena_id)
        counts = count_twitter(twitter_id)
        print(",".join([
            hatena_id,
            twitter_id,
            str(counts['follower']),
            str(counts['following']),
            str(acount),
        ]))


if __name__ == "__main__":
    main()
上のスクリプトで得た結果をソートして、はてな記法で出力するスクリプトが下。
"""Sort scraper output by follower count and print it as a Hatena-notation table.

Reads ``hatena_id,twitter_id,follower,following,antenna_count`` lines from
stdin and writes the table rows to stdout.
"""
import sys

TABLE_HEADER = "|*Rank|*HatenaID|*TwitterID|*Follower|*Following|*ahatena|"
# hatena_id, twitter_id, follower, following, antenna_count
FIELD_COUNT = 5


def _is_valid_row(fields):
    """Return True when *fields* has every column and an integer follower count."""
    if len(fields) < FIELD_COUNT:
        return False
    try:
        int(fields[2])
    except ValueError:
        return False
    return True


def parse_rows(lines):
    """Split comma-separated *lines* into rows (lists of strings).

    Line endings ("\\n" or "\\r\\n") are removed. Blank lines are skipped
    silently. Lines with fewer than five fields or a non-integer follower
    count are reported on stderr and skipped, so one bad line does not abort
    the whole run.
    """
    rows = []
    for line in lines:
        text = line.rstrip("\r\n")
        if not text.strip():
            continue
        fields = text.split(",")
        if not _is_valid_row(fields):
            sys.stderr.write("skipping malformed line: %r\n" % text)
            continue
        rows.append(fields)
    return rows


def format_table(rows):
    """Return the table lines for *rows*, ranked by follower count descending.

    The first returned line is the header. Rows with equal follower counts
    keep their input order (the sort is stable, also with ``reverse=True``).
    """
    ranked = sorted(rows, key=lambda row: int(row[2]), reverse=True)
    table = [TABLE_HEADER]
    for rank, row in enumerate(ranked, 1):
        table.append("|%d|id:%s|twitter:@%s|%s|%s|%s|"
                     % (rank, row[0], row[1], row[2], row[3], row[4]))
    return table


def main():
    """Read scraper output from stdin and print the ranking table."""
    for table_line in format_table(parse_rows(sys.stdin)):
        print(table_line)


if __name__ == "__main__":
    main()