Crawling all URL links of a website with Python
Copyright notice: this is an original post by the author; please credit the author and include the original link when reposting: https://blog.csdn.net/gyq1998/article/details/80092652
For security practitioners, understanding a website's URL directory structure is the first step. 御剑 (Yujian) is a handy tool for this; download: https://download.csdn.net/download/gyq1998/10374406
Yujian ships with a wordlist and mainly checks whether the URLs in that wordlist exist, so it can miss some important URLs. A few days ago I therefore wrote a Python crawler that collects all of a website's links.
Implementation
The core of the implementation is a loop: take a URL from a to-visit queue, crawl every link on that page, keep only the links that belong to the target site, push the unseen ones back onto the queue, and repeat until the queue is empty.
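Before the full script, here is a minimal sketch of that loop (written for this post rather than taken from the script; `fetch_links` is a placeholder for "download the page and pull out the filtered href values"):

```python
# Minimal sketch of the crawl loop: breadth-first over same-site links.
# fetch_links(url) is assumed to return the filtered links found on that page.
def crawl(start_url, fetch_links):
    visited = []              # urls already crawled
    unvisited = [start_url]   # urls waiting to be crawled
    while unvisited:
        url = unvisited.pop()
        visited.append(url)
        for link in fetch_links(url):
            # only queue links we have not seen before
            if link not in visited and link not in unvisited:
                unvisited.append(link)
    return visited
```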
Here is the full code:
```python
# author: saucer_man
# date: 2018-04-24
# python3.6
import re
import requests


# get and validate the url to crawl
def url_get():
    url = input("please input the url:")
    try:
        kv = {'User-Agent': 'Mozilla/5.0'}
        requests.get(url, headers=kv)
        return url
    except:
        print("your url is incorrect!!")
        return url_get()


'''
Extract the host part of the url,
e.g. https://www.xiaogeng.top/article/page/id=3 -> www.xiaogeng.top
'''
def url_same(url):
    # determine whether the site uses https or http
    urlprotocol = re.findall(r'.*(?=://)', url)[0]
    print('protocol used by the site: ' + urlprotocol)
    if len(re.findall(r'/', url)) > 2:
        if urlprotocol == 'https':
            sameurl = re.findall(r'(?<=https://).*?(?=/)', url)[0]
        else:
            sameurl = re.findall(r'(?<=http://).*?(?=/)', url)[0]
    else:
        url = url + '/'
        if urlprotocol == 'https':
            sameurl = re.findall(r'(?<=https://).*?(?=/)', url)[0]
        else:
            sameurl = re.findall(r'(?<=http://).*?(?=/)', url)[0]
    print('domain: ' + sameurl)
    return sameurl


# crawl all links in the page at url
def spiderpage(url):
    kv = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv)
    r.encoding = r.apparent_encoding
    pagetext = r.text
    pagelinks = re.findall(r'''(?<=href=").*?(?=")|(?<=href=').*?(?=')''', pagetext)
    return pagelinks


# filter the urls in pagelinks
def url_filtrate(pagelinks):
    # note: sameurl is the host extracted in __main__ below
    # drop urls that do not belong to this site
    same_target_url = []
    for l in pagelinks:
        if re.findall(sameurl, l):
            same_target_url.append(l)
    # remove duplicate urls
    unrepect_url = []
    for l in same_target_url:
        if l not in unrepect_url:
            unrepect_url.append(l)
    return unrepect_url


# write a list of urls to a file
def writetofile(list):
    file = open('urls.txt', 'w')
    for url in list:
        file.write(url)
        file.write('\n')
    file.close()


# url queues used by the crawl loop
class linkQuence:
    def __init__(self):
        # urls that have already been visited
        self.visited = []
        # urls waiting to be visited
        self.unvisited = []

    # get the visited url queue
    def getvisitedurl(self):
        return self.visited

    # get the unvisited url queue
    def getunvisitedurl(self):
        return self.unvisited

    # add a url to the visited queue
    def addvisitedurl(self, url):
        return self.visited.append(url)

    # remove a url from the visited queue
    def removevisitedurl(self, url):
        return self.visited.remove(url)

    # take one url from the unvisited queue
    def unvisitedurldequence(self):
        try:
            return self.unvisited.pop()
        except:
            return None

    # add a url to the unvisited queue
    def addunvisitedurl(self, url):
        if url != "" and url not in self.visited and url not in self.unvisited:
            return self.unvisited.insert(0, url)

    # number of visited urls
    def getvisitedurlount(self):
        return len(self.visited)

    # number of unvisited urls
    def getunvistedurlcount(self):
        return len(self.unvisited)

    # is the unvisited queue empty?
    def unvisitedurlsempty(self):
        return len(self.unvisited) == 0


# the actual crawler
class Spider():
    def __init__(self, url):
        self.linkQuence = linkQuence()         # use the linkQuence queues
        self.linkQuence.addunvisitedurl(url)   # seed the unvisited queue with the start url

    def crawler(self):
        while not self.linkQuence.unvisitedurlsempty():  # while the unvisited queue is not empty
            print("found another page")
            visitedurl = self.linkQuence.unvisitedurldequence()  # take one url
            if visitedurl is None or visitedurl == '':
                continue
            initial_links = spiderpage(visitedurl)     # crawl all links in that page
            right_links = url_filtrate(initial_links)  # keep only the qualified links
            self.linkQuence.addvisitedurl(visitedurl)  # move the url to the visited queue
            for link in right_links:                   # push the filtered links onto the unvisited queue
                self.linkQuence.addunvisitedurl(link)
            # print(self.linkQuence.visited)
        print("crawl finished")
        return self.linkQuence.visited


if __name__ == '__main__':
    url = url_get()
    sameurl = url_same(url)
    spider = Spider(url)
    urllist = spider.crawler()
    writetofile(urllist)
```
Results: I mainly tested this against my own site, https://xiaogeng.top.
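Both versions rely on the same lookbehind/lookahead regex to pull `href` values out of the raw HTML, covering both double- and single-quoted attributes. A quick standalone check of what it matches (the sample HTML below is made up for illustration):

```python
import re

# matches whatever sits between href="..." or href='...'
pattern = r'''(?<=href=").*?(?=")|(?<=href=').*?(?=')'''

html = """<a href="https://www.xiaogeng.top/about">about</a> <a href='/article/1'>post</a>"""
print(re.findall(pattern, html))
# ['https://www.xiaogeng.top/about', '/article/1']
```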
Update: a revised version that excludes external links:

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author: saucerman
# project: https://github.com/saucer-man/UrlCrawler
"""
description: full-site url crawling script
"""
import re
import time
import sys
import requests
try:
    import tldextract
except:
    print('module tldextract not found\nyou can try pip install tldextract')
    sys.exit()


def domain_get():
    '''
    read the url of the website to crawl
    '''
    url = input("Please input the url of website:")
    if '//' not in url:
        url = 'http://' + url
    try:
        kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
        requests.head(url, headers=kv)
        return url
    except:
        print("your url is incorrect!!")
        return domain_get()


class spider():
    def __init__(self, domain, key, depth):
        self.domain = domain     # url to start crawling from
        self.depth = depth       # crawl depth
        self.urls_all = set([])  # crawl results
        self.key = key           # host string used to drop external links

    def page_spider(self, url):
        '''
        crawl all links in the page at url
        '''
        try:
            kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
            r = requests.get(url, headers=kv, timeout=2)
            r.encoding = r.apparent_encoding
            pagetext = r.text
            pagelinks = re.findall(r'''(?<=href=").*?(?=")|(?<=href=').*?(?=')''', pagetext)
        except:
            return set([])
        # post-process the crawled links:
        # 1. drop links that belong to other domains
        url_list = set([])
        for url in pagelinks:
            if self.key in url:
                url_list.add(url)
        # 2. drop links that have already been collected
        url_list = set(url_list) - self.urls_all
        self.urls_all.update(url_list)
        return url_list  # returns a set

    def run(self):
        url_list = set([self.domain])  # the first round crawls the original url
        while self.depth >= 1:         # each round crawls every link found in the previous round
            print("rounds remaining: %d" % self.depth)
            url_list_tmp = set([])
            for url in url_list:
                url_list_tmp.update(self.page_spider(url))
            url_list = url_list_tmp
            self.depth = self.depth - 1
        file = open('result.txt', 'w')
        for url in self.urls_all:
            file.write(url)
            file.write('\n')
        file.close()


if __name__ == '__main__':
    start = time.perf_counter()  # start a timer; time.clock() was removed in Python 3.8
    domain = domain_get()
    print('domain:', domain)
    key_tmp = tldextract.extract(domain)
    # used to drop external links: any crawled url that does not contain key is discarded
    # 'https://www.xiaogeng.com.cn/admin?id=6' ==> 'www.xiaogeng.com.cn'
    key = key_tmp.subdomain + '.' + key_tmp.domain + '.' + key_tmp.suffix
    print('key:', key)
    print('start crawling...\n')
    spider = spider(domain=domain, key=key, depth=3)
    spider.run()
    print('results saved to result.txt')
    print('time:', time.perf_counter() - start)
```
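The `key` used to filter out external links comes from `tldextract`, which splits a URL into subdomain, registered domain, and public suffix. A small illustration of what it returns for the example URL from the comment above (note that tldextract may download its public suffix list on first use):

```python
import tldextract

ext = tldextract.extract('https://www.xiaogeng.com.cn/admin?id=6')
print(ext.subdomain, ext.domain, ext.suffix)  # www xiaogeng com.cn

# same concatenation as in the script above
key = ext.subdomain + '.' + ext.domain + '.' + ext.suffix
print(key)  # www.xiaogeng.com.cn
```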