Python抓取Torrentkitty磁力链

这个也是在开始学习Python时写的一个小脚本,抓取Torrentkitty中的磁力链接。指定结束日期后,会通过它的历史页面进行抓取,一直抓取到2007-01-01的数据。因其每页分页只有30条内容,所以每个日期有多少分页就开多少个线程,以加快抓取速度。另外这个网站检测了UserAgent,需要伪装一个,伪装完就可以开始抓了。

抓过一晚上,抓回来几十万把(一共就100多万),自己弄个小页面进行搜索,确实快了不少。

这些代码是为了方便自己用简单写的,如真要用,怎么也得稍作修改。

上代码:

1
2
3
4
5
6
7
-- Magnet-link store for the Torrentkitty crawler.
-- The UNIQUE key on `md5` makes duplicate inserts fail, which the crawler
-- relies on for de-duplication (writeFile returns 0 on the violation).
CREATE TABLE `magnet` (
`source` varchar(50) character set utf8 NOT NULL, -- archive page URL the link was scraped from
`name` varchar(250) character set utf8 NOT NULL, -- torrent display name
`magnet` text character set utf8 NOT NULL, -- full magnet URI
`md5` varchar(32) character set utf8 NOT NULL, -- md5 hex digest of the magnet URI (de-dup key)
UNIQUE KEY `md5` (`md5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import re
import urllib2
import sys
import time
import MySQLdb
import hashlib
#import getMagnet
import threading

class getMagnet(threading.Thread):
    """Worker thread: fetch one archive page and store every magnet link in it.

    Each thread opens its own MySQL connection so concurrent workers never
    share a cursor.  Rows that violate the UNIQUE `md5` key are counted as 0
    (already seen) rather than raised.
    """

    # Compiled once for all instances -- the originals were recompiled for
    # every <tr> row of every page.
    _ROW_RE = re.compile(r"<tr>(.*)</tr>")
    _NAME_RE = re.compile(r'<td class="name">(.*)</td><td\sclass="size">')
    _MAGNET_RE = re.compile(r'magnet:(.*)"\stitle=')

    def __init__(self, url):
        """url: archive page to crawl.  Opens a dedicated DB connection."""
        threading.Thread.__init__(self)
        self.url = url
        self.conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='', port=3306)
        self.cur = self.conn.cursor()
        self.cur.execute("set names utf8;")

    def run(self):
        """Thread body: download the page, extract magnets, commit, close."""
        html = getHtml(self.url)
        if html:
            self.getMagnet(html)
        self.conn.commit()
        self.conn.close()

    def stop(self):
        # Cooperative stop flag; run() never checks it (kept for interface
        # compatibility with the original code).
        self.thread_stop = True

    def getMagnet(self, html):
        """Parse every <tr> row of `html` and insert each magnet link found."""
        magnum = 0
        for tr in self._ROW_RE.findall(html):
            names = self._NAME_RE.findall(tr)
            magnets = self._MAGNET_RE.findall(tr)
            # Header/ad rows have no name or magnet cell -- skip them instead
            # of relying on a blanket except around the index access.
            if names and magnets:
                magnum += self.writeFile(names[0], 'magnet:' + magnets[0])
        print("--- Get (" + str(magnum) + ") Magnets On (" + self.url + ")---\n")

    def writeFile(self, name, url):
        """Insert one magnet row; return 1 on success, 0 on duplicate/error."""
        md5url = hashlib.md5(url).hexdigest()
        # Parameterized query: the original concatenated raw strings into the
        # SQL, which breaks on any quote in a torrent name and is injectable.
        sql = "insert into kitty.magnet(source,name,magnet,md5) values(%s,%s,%s,%s)"
        try:
            self.cur.execute(sql, (self.url, name, url, md5url))
            return 1
        except MySQLdb.Error:
            # Almost always the UNIQUE `md5` constraint firing on a duplicate.
            return 0

def getHtml(url):
    """Fetch `url` with a browser User-Agent (the site blocks the default one).

    Retries forever on any error, as the original recursive version did, but
    as a loop: the original called getHtml(url) on failure WITHOUT returning
    its result, so every retried fetch yielded None to the caller.
    """
    USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 '
                  '(KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17')
    while True:
        try:
            print("------Try : Begin Get " + url + "------")
            rep = urllib2.Request(url)
            rep.add_header('User-Agent', USER_AGENT)
            res = urllib2.urlopen(rep)
            try:
                return res.read()
            finally:
                # Close the response even if read() blows up mid-stream.
                res.close()
        except Exception:
            print("++++++Error : Retry Get " + url + "++++++")

def getPage(url, html):
    """Return the list of page URLs for one archive day.

    Parses the pagination <div> out of `html`, takes the highest page number
    linked, and returns [url, url+'2', ..., url+'<max>'] -- the bare `url`
    itself is page 1.  With no pagination, returns just [url].

    Cleanups vs. the original: regexes hoisted out of the loop, the unused
    `pagenumbs` dropped, and the builtin `max` is no longer shadowed.
    """
    div_re = re.compile(r'<div\sclass="pagination">(.*)</div>')
    link_re = re.compile(r'<a\shref="(\d+)">')

    divs = div_re.findall(html)
    # The original overwrote `num` each iteration, so only the LAST
    # pagination div ever mattered; keep that behavior.
    nums = link_re.findall(divs[-1]) if divs else []
    max_page = max([int(n) for n in nums] + [1])

    pages = [url] + [url + str(j) for j in range(2, max_page + 1)]
    page_labels = [str(j) for j in range(1, max_page + 1)]
    print("--- Found (" + str(page_labels) + ") Pages----")
    return pages

def datetime_timestamp(dt):
    """Convert a 'YYYY-MM-DD HH:MM:SS' local-time string to an int Unix timestamp.

    The original parsed `dt` twice -- the first strptime result was simply
    discarded; a single parse is sufficient.
    """
    return int(time.mktime(time.strptime(dt, '%Y-%m-%d %H:%M:%S')))

# Crawl the archive day by day, from 2012-10-16 backwards to 2007-01-01.
dt = datetime_timestamp("2012-10-16 00:00:00")
end = datetime_timestamp("2007-01-01 00:00:00")

while dt > end:
    dttime = time.strftime('%Y-%m-%d', time.gmtime(dt))
    base_url = 'http://www.torrentkitty.com/archive/' + str(dttime) + '/'
    html = getHtml(base_url)
    if html:
        # getPage already returns the base URL as page 1; the original
        # appended it a second time, fetching/parsing page 1 twice per day.
        pages = getPage(base_url, html)
        workers = [getMagnet(u) for u in pages]
        for w in workers:
            w.start()
        # Wait for ALL of the day's threads.  The original polled only the
        # thread started last, letting the others race into the next day.
        for w in workers:
            w.join()
    dt = dt - 86400