Crawler: Batch-Scraping CSDN Posts and Importing Them into WordPress
Copyright notice: please credit the source when reposting: http://blog.csdn.net/m0sh1 http://blog.share345.com/ https://blog.csdn.net/m0sh1/article/details/53058195
While learning Python, I wrote a simple little tool: batch-scraping CSDN posts and importing them into WordPress.
Code: https://github.com/ALawating-Rex/csdn_wordpress_posts_import
Original write-up: http://blog.share345.com/2016/10/04/csdn-wordpress-posts-import.html
CSDN post batch scraping and WordPress import
Overview
- A Python practice project
- Can import your own CSDN blog into WordPress, and can scrape other people's blogs as well
- Friendly reminder: if you scrape someone else's posts, please credit the source
- The code is a bit messy; read with care
Usage
- Set the required variables, then run ready/1.py
- Set wp_url and wp_url_tags to your WordPress REST API endpoints
- Set username and password to your WordPress credentials
- Set ready_cate_id to the category the posts should be imported into; you could also assign categories dynamically, the way tags are handled
- WordPress needs the REST API plugin installed (installation is not covered here); a quick connectivity check is sketched after this list
- For a live example, see http://blog.share345.com/category/muti-information/old-pages
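Before running the script, it is worth confirming that the REST API plugin actually responds. Here is a minimal sketch of such a check, assuming the same endpoint as in the script below and using the `requests` library, which the project itself does not use:

```python
# coding=utf-8
# Quick sanity check for the WP REST API plugin -- an illustrative snippet,
# not part of the repository. Adjust wp_url to match your installation.
import requests

wp_url = "http://127.0.0.1/wp-json/wp/v2/posts"

# If the REST API plugin is installed, an unauthenticated GET on /wp/v2/posts
# returns the published posts as a JSON array.
resp = requests.get(wp_url)
if resp.status_code == 200:
    print("REST API reachable, %d posts returned" % len(resp.json()))
else:
    print("REST API check failed: HTTP %d" % resp.status_code)
```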
The full script (ready/1.py, Python 2):

```python
# coding=utf-8
import sys
# import urllib
# import urllib2
# import cookielib
from helper.tools import *
from helper.parse_page import *
from lxml import etree
import json

# TODO: try scrapy instead
reload(sys)
sys.setdefaultencoding('utf8')
sys.setrecursionlimit(2000)

wp_url = "http://127.0.0.1/wp-json/wp/v2/posts"
wp_url_tags = "http://127.0.0.1/wp-json/wp/v2/tags"
username = "simael"
password = "123456"
# ready_cate_id = "4"
ready_cate_id = "11"
wp_data = {}
wp_headers = {}
old_tags = {}

# First fetch all tags that already exist in WordPress, keyed name -> id
res_old_tags = Crawl_helper_tools_url.http_auth_handle_get_tag(wp_url_tags, wp_data)
if res_old_tags == "fail":
    print('failed to fetch existing tags')
    sys.exit()
decode_res_tags = json.loads(res_old_tags)
for res_tag in decode_res_tags:
    old_tags[res_tag['name']] = res_tag['id']
print('existing tags:')
print(old_tags)

url = 'http://blog.csdn.net/m0sh1'
root_url = 'http://blog.csdn.net'
# var = 'this is a var in'
# print(Crawl_helper_tools_url.testfunction(var))
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0',
           'Referer': 'http://www.share345.com'}
crawl_url = Crawl_helper_tools_url(url, root_url)
html = crawl_url.getCurl(url, {}, headers)
if not html:  # covers both '' and None
    print 'failed to fetch html'
    sys.exit()
tree = etree.HTML(html)
crawl_url.parse_html_page_count(tree, '//*[@id="papelist"]/span')
# Multithreading: one thread per page, or one per article link?
# Listing URLs look like http://blog.csdn.net/m0sh1/article/list/1 (1 = first page)
count = crawl_url.getCount()
page = crawl_url.getPage()
i = 1
# TODO: spawn one thread per page
# TODO: parse each listing page and pull out every article's content
while i <= int(page):
    next_page_url = url + '/article/list/' + str(i)
    # fetch a fresh tree for this listing page
    # next_page_crawl_url = crawl_url  # test code; the real fetch is below
    next_page_crawl_url = Crawl_helper_tools_url(next_page_url, root_url)
    try:
        next_page_html = next_page_crawl_url.getCurl(next_page_url, {}, headers)
    except Exception, e:
        print 'exception while fetching a listing page....', e
        i += 1  # advance anyway, so a persistently failing page cannot loop forever
        continue
    next_page_tree = etree.HTML(next_page_html)
    i += 1
    print(next_page_url)
    # TODO: pinned posts need not be fetched here; they show up again on later pages
    top_page_list = next_page_crawl_url.getPageTitle(next_page_tree, '//*[@id="article_toplist"]')
    if not top_page_list:
        print 'no pinned articles on this page'
    page_list = next_page_crawl_url.getPageTitle(next_page_tree, '//*[@id="article_list"]')
    if not page_list:
        print 'no articles on this page'
    real_page_list_len = len(top_page_list) + len(page_list)
    real_page_list = top_page_list + page_list
    print real_page_list_len
    print real_page_list
    for n in real_page_list:
        try:
            print n
            parse_page = Crawl_helper_parse_page(n)
            try:
                page_title = parse_page.getTitle_soup(n, {}, headers)
                if page_title == "FAIL":
                    print(n + " timed out, moving on to the next article")
                    continue
                print page_title
            except Exception, e:
                print 'exception while parsing the title....', e
                continue
            try:
                page_tags = parse_page.getTag_soup(n, {}, headers)
                print 'tags for this article:'
                print page_tags
                for i_page_tag in page_tags:
                    if old_tags.has_key(i_page_tag):
                        print('tag exists')
                        print(old_tags[i_page_tag])
                    else:
                        print('tag does not exist')
                        print(i_page_tag)
                        # create the missing tag in WordPress and cache its id
                        res = Crawl_helper_tools_url.http_auth_handle_create_tag(username, password, wp_url_tags, i_page_tag, wp_headers)
                        if res != "fail":
                            decode_res_create_tag = json.loads(res)
                            old_tags[i_page_tag] = decode_res_create_tag[u'id']
                            print('old_tags updated to:')
                            print(old_tags)
            except Exception, e:
                print 'exception while handling tags....', e
                continue
            try:
                page_content = parse_page.getContent_soup(n, {}, headers)
            except Exception, e:
                print 'exception while parsing the content....', e
                continue
            # page_cates = next_page_crawl_url.getCates()
            # append an attribution link back to the original CSDN post
            page_content = str(page_content) + '<br><p>This post was created by a Python crawler; the original is my CSDN post: <a target="_blank" href="' + str(n) + '">' + page_title + '</a></p><br>'
            wp_data['status'] = "publish"
            wp_data['title'] = page_title
            wp_data['content'] = page_content
            wp_data['author'] = 1
            wp_data['slug'] = n[-8:]  # last 8 chars of the URL, i.e. the CSDN article id
            wp_data['categories[0]'] = ready_cate_id
            for tag_i in range(len(page_tags)):
                print(page_tags[tag_i])
                wp_data["tags[" + str(tag_i) + "]"] = old_tags[page_tags[tag_i].decode('utf-8')]
            # TODO: push the crawler to GitHub
            # TODO: articles containing <script> render badly in WordPress
            # print('create-post parameters')
            # print(wp_data)
            res = Crawl_helper_tools_url.http_auth(username, password, wp_url, wp_data, wp_headers)
            if res == "fail":
                print("failed to create the post")
                # sys.exit()
        except Exception, e:
            print 'exception while processing an article....', e
            continue
```
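The `Crawl_helper_tools_url.http_auth*` helpers live in `helper/tools.py` in the repository and are not shown in this post. As a rough sketch of the REST calls they presumably wrap, assuming HTTP Basic Auth (e.g. via the Basic-Auth companion plugin) and the `requests` library; the function names here are made up for illustration:

```python
# coding=utf-8
# Hypothetical equivalents of the project's helper calls, for illustration only.
import requests

def create_tag(username, password, wp_url_tags, tag_name):
    # POST /wp-json/wp/v2/tags creates a tag; the JSON reply contains its 'id'
    resp = requests.post(wp_url_tags, auth=(username, password),
                         data={'name': tag_name})
    return resp.json() if resp.status_code == 201 else "fail"

def create_post(username, password, wp_url, wp_data):
    # POST /wp-json/wp/v2/posts publishes a post built from wp_data
    # (status/title/content/categories/tags, as assembled in the main loop)
    resp = requests.post(wp_url, auth=(username, password), data=wp_data)
    return resp.json() if resp.status_code == 201 else "fail"
```

The `wp_data` dictionary assembled in the main loop maps directly onto the form fields of such a POST, which is why the script can build it incrementally and hand it off in one call.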