解决思路
1,利用post向wordpress提交表单
2,通过wordpress_xmlrpc模块,有轮子不用想干啥
3,通过mysqldb直接插入数据库,有服务器、不需远程,直接把py脚本放在服务器跑
我们这次要用轮子拼一台摩托车!,宝马、、自己动手吧
开始动手:需自行安装 requests 和 python-wordpress-xmlrpc 两个模块;Windows、Linux 下的安装命令如下:
pip install requests
pip install python-wordpress-xmlrpc
caiji.py
# encoding=utf-8
"""Collect article URLs and bodies from a WordPress blog's listing pages and
republish them to another WordPress site through its XML-RPC endpoint.

Requires third-party packages: requests, python-wordpress-xmlrpc.

Reads:   url.txt   -- one previously-collected article URL per line (dedup store)
         daili.txt -- one HTTP proxy "host:port" per line
Appends: url.txt   -- every newly collected article URL

NOTE(review): the target site, WP credentials and both regexes are placeholders
that must be adapted to the actual site being scraped.
"""
import random
import re
import threading
from concurrent.futures import ThreadPoolExecutor

import requests
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import NewPost

# --- WordPress login (one blind retry, as in the original: the XML-RPC
# handshake occasionally fails transiently) --------------------------------
try:
    wp = Client('http://www.example.com/xmlrpc.php', 'wp的账号', 'wp的密码')
except Exception:
    wp = Client('http://www.example.com/xmlrpc.php', 'wp的账号', 'wp的密码')
post = WordPressPost()

# --- dedup store ----------------------------------------------------------
# BUG FIX: the original kept the already-seen URLs in `url_list`, then later
# reassigned `url_list = []` to hold the listing pages, silently destroying
# deduplication.  A dedicated set fixes that (and makes membership O(1)).
try:
    seen_urls = set(line.strip() for line in open('url.txt'))
except IOError:
    seen_urls = set()
f = open('url.txt', 'a')        # append handle for newly collected URLs

daili_list = []                 # proxy pool, lazily filled by ip()
# Guards seen_urls and the url.txt handle only.  BUG FIX: the original held
# this lock around *all* network I/O in getData(), serializing the pool.
mutex = threading.Lock()


def filter_tags(htmlstr):
    """Return htmlstr with CDATA/script/style blocks, tags and HTML comments
    stripped; <br> variants become newlines and blank runs are collapsed."""
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
    re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)
    re_style = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)
    # BUG FIX: the original compiled this general pattern and then immediately
    # shadowed it with a '<br />'-only pattern; keep the general one so
    # <br>, <br/> and <br /> are all converted.
    re_br = re.compile(r'<br\s*?/?>')
    re_h = re.compile(r'</?\w+[^>]*>')      # any remaining HTML tag
    re_comment = re.compile(r'<!--[^>]*-->')
    s = re_cdata.sub('', htmlstr)
    s = re_script.sub('', s)
    s = re_style.sub('', s)
    s = re_br.sub('\n', s)
    s = re_h.sub('', s)
    s = re_comment.sub('', s)
    return re.compile(r'\n+').sub('\n', s)  # collapse empty lines


def getUA():
    """Return a random User-Agent string (crude anti-blocking rotation)."""
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    ]
    return random.choice(uaList)


def search(re_url, html):
    """Return the first match of re_url in html, or the string 'no'."""
    re_Data = re.findall(re_url, html)
    return re_Data[0] if re_Data else 'no'


def ip():
    """Return a random proxy 'host:port' from daili.txt (pool re-read each call)."""
    for x in open('daili.txt'):
        daili_list.append(x.strip())
    return random.choice(daili_list)


def gethtml(url, headers):
    """Fetch url through a random proxy, retrying forever until a clean page.

    Returns the decoded response body.  Retries on proxy failure, non-200
    status, soft-404 markers and captcha ('verify') pages.
    """
    while True:
        try:
            newip = ip().strip()
            proxies = {'http': 'http://%s' % newip}
            # BUG FIX: the original called requests.post(url, headers, proxies,
            # timeout=10) -- positional args bind to data= and json=, so the
            # headers and the proxy were never actually applied.
            pages = requests.post(url, headers=headers, proxies=proxies,
                                  timeout=10)
            html = pages.text
            # BUG FIX: the original tested "'404' '302 Found' in html"
            # (adjacent literals concatenate to '404302 Found') and
            # "code != 200 in html" (broken operator chaining); neither
            # check ever fired as intended.
            if pages.status_code != 200 or '404' in html or '302 Found' in html:
                print('代理失效重试')
                continue
            if 'verify' in html:
                print('出验证码,重试')
                continue
            return html
        except Exception:
            continue


# Extracts article URLs from a listing page -- adjust to the real site.
re_url = re.compile(r'<a href="(http://www\.example\.com/.*?\d+\.html)"')
# Extracts (title, body) from an article page -- adjust to the real site.
re_title_content = re.compile(
    r'<h1 class="entry-title">(.*?)</h1>[\s\S]*?'
    r'<div class="entry-content">([\s\S]*?)<div class="clear">')


def getData(url):
    """Crawl one listing page: collect each new article and publish it."""
    headers = {'User-Agent': getUA()}
    html = gethtml(url, headers)
    for link in re.findall(re_url, html):
        link = link.strip()
        with mutex:
            already_seen = link in seen_urls
        if already_seen:
            print('Noposts updates')
            continue
        page = gethtml(link, headers)
        for n in re.findall(re_title_content, page):
            try:
                title = n[0]
                content = filter_tags(n[1])
            except Exception:
                title = content = 0
            if title and content:
                print(title, content)
                '''发布到wp'''
                # post.title = title
                # post.content = content
                # post.post_status = 'publish'
                # wp.call(NewPost(post))
                with mutex:
                    seen_urls.add(link)
                    f.writelines(link + '\n')
                print('Updates')


def now_time(url):
    """Worker entry point: handle exactly one listing page.

    BUG FIX: the original ignored its argument and iterated the whole global
    URL list, so every pool worker re-crawled every listing page.
    """
    getData(url)


if __name__ == '__main__':
    page_urls = ['http://www.example.com/page/%d' % i for i in range(1, 12)]
    # stdlib ThreadPoolExecutor replaces the abandoned third-party
    # `threadpool` package; 3 workers as in the original.
    with ThreadPoolExecutor(max_workers=3) as pool:
        list(pool.map(now_time, page_urls))
    f.close()
设置采集内容到哪个默认目录,可以在wp后台设置,从代码上做修改也可以;具体可以看看xmlrpc官方文档:http://python-wordpress-xmlrpc.readthedocs.io/en/latest/overview.html
另外可以通过命令crontab -e 让脚本按需自动跑起来!