最近在学习 Python,作为练手写了一个网站内容采集脚本。
实现功能如下:
1.根据配置文件进行采集.
2.支持命令行参数 如: python corn.py --config=urls.ini
3.根据规则生成url列表集(只支持数字,能倒序采集).
4.根据规则获取列表页面的特定区域,从而缩小范围,进一步分析出内容页面的url.
5.将内容部分url存储到文件,每行一个url,并且在写入的时候进行判断是否已经存在相同url.
6.Bug太多,我慢慢完善.
使用方法: ubuntu环境 终端运行 python xxx.py --config=xxx.ini
windows环境 修改#!/usr/bin/python 为你的python.exe目录 在命令行运行 python xxx.py --config=xxx.ini
下面给出代码 保存成.py文件:
- #!/usr/bin/python
- #-*-coding:utf-8-*-
- # 简单的蜘蛛采集程序
- # 小项
- # 2008-09-18
- import sys;
- import getopt;
- import re;
- import urllib;
- import ConfigParser;
- import time;
- import MySQLdb as mysql;
# NOTE(review): the database credentials are hard-coded in the script; move
# them to the config file or the environment before using this for real.
DB_SETTINGS = {
    'user': 'root',
    'passwd': '970207',
    'host': 'localhost',
    'db': 'testcorn',
}


def usage():
    """Print the command-line usage to stderr and exit with status 2."""
    sys.stderr.write("Usage: python %s --config=FILE.ini\n" % sys.argv[0])
    sys.exit(2)


def config_paths(args):
    """Return every config path passed via -c/--config, in command-line order.

    Raises getopt.GetoptError when an unknown option is given.
    """
    opts, _ = getopt.getopt(args, 'c:', ['config='])
    return [value for key, value in opts if key in ('-c', '--config')]


def unique(items):
    """Return items without duplicates, keeping first-seen order.

    A plain set() would also deduplicate, but its iteration order is
    arbitrary, which scrambles the output order and any later pairing.
    """
    seen = set()
    ordered = []
    for item in items:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered


def build_list_urls(template, startpage, endpage):
    """Expand the %d template into the list-page URLs.

    The range runs from startpage up to, but not including, endpage. When
    startpage is greater than endpage the pages are generated in descending
    order, so list pages can be collected in reverse.
    """
    step = 1 if startpage <= endpage else -1
    return [template % page for page in range(startpage, endpage, step)]


def fetch(url):
    """Download url and return the response body, always closing the handle."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def extract_links(html, region_pattern, link_pattern, excluded=()):
    """Return the distinct links found inside the selected page regions.

    region_pattern selects the parts of html to search, link_pattern
    captures each link inside them, and any link listed in excluded is
    dropped. Order of first appearance is preserved.
    """
    regions = "".join(re.findall(region_pattern, html))
    links = unique(re.findall(link_pattern, regions))
    return [link for link in links if link not in excluded]


def extract_meta(page, title_pattern, keywords_pattern, description_pattern):
    """Return (title, keywords, description) tuples matched in page.

    Each field is deduplicated in document order before zipping, so the
    n-th title is always paired with the n-th keywords and description.
    If any of the three patterns matches nothing, the result is empty.
    """
    titles = unique(re.findall(title_pattern, page))
    keywords = unique(re.findall(keywords_pattern, page))
    descriptions = unique(re.findall(description_pattern, page))
    return list(zip(titles, keywords, descriptions))


def load_known_urls(path):
    """Return the set of URLs already stored in path (empty if unreadable)."""
    try:
        handle = open(path, 'r')
    except IOError:
        # First run: the list file does not exist yet.
        return set()
    with handle:
        return set(line.rstrip('\r\n') for line in handle)


def store_new_urls(path, links):
    """Append to path every link not stored there yet; return those written."""
    known = load_known_urls(path)
    written = []
    with open(path, 'a') as out:
        for link in links:
            if link in known:
                print(link + "页面重复跳过")
                continue
            out.write(link + '\n')
            known.add(link)
            written.append(link)
    return written


def collect_contents(filelist, starturl, stops, title_pattern,
                     keywords_pattern, description_pattern):
    """Fetch every stored content URL and report the extracted metadata.

    The MySQL connection is opened up front, as the original script did,
    and both cursor and connection are released even when a fetch fails.
    """
    connection = mysql.connect(**DB_SETTINGS)
    try:
        cursor = connection.cursor()
        try:
            with open(filelist, 'r') as listing:
                for line in listing:
                    # Drop the trailing newline so it does not end up in
                    # the requested URL; skip blank lines altogether.
                    path = line.strip()
                    if not path:
                        continue
                    # Pause before each request to avoid bans and timeouts.
                    time.sleep(stops)
                    page = fetch(starturl + path)
                    records = extract_meta(page, title_pattern,
                                           keywords_pattern,
                                           description_pattern)
                    for title, keywords, description in records:
                        # TODO(review): nothing is persisted yet. When the
                        # insert into `counts` is enabled, pass the three
                        # values as bound parameters to cursor.execute()
                        # rather than building the SQL string by hand.
                        print("写入 %s 成功! 停顿 %s 秒进行下一次采集"
                              % (title, stops))
        finally:
            cursor.close()
    finally:
        connection.close()


def run(config_path):
    """Run one full collection pass driven by the given config file."""
    conf = ConfigParser.ConfigParser()
    with open(config_path) as handle:
        conf.readfp(handle)

    # -- [urllibs]: where to crawl and where to store the URL list --
    starturl = conf.get("urllibs", "starturl")
    startpage = int(conf.get("urllibs", "startpage"))
    endpage = int(conf.get("urllibs", "endpage"))
    urltemp = starturl + conf.get("urllibs", "urltemp")
    filelist = conf.get("urllibs", "urllist")
    # Comma-separated links that must never be stored.
    excluded = set(entry.strip()
                   for entry in conf.get("urllibs", "dellist").split(',')
                   if entry.strip())

    # -- [countcfg]: throttling and extraction patterns --
    stops = int(conf.get("countcfg", "Stops"))
    divurl = conf.get("countcfg", "Divurl")
    urlls = conf.get("countcfg", "Urlls")
    title = conf.get("countcfg", "Title")
    keywords = conf.get("countcfg", "Keywords")
    description = conf.get("countcfg", "Description")

    # Stage 1: walk the list pages and record every new content URL.
    for page_url in build_list_urls(urltemp, startpage, endpage):
        links = extract_links(fetch(page_url), divurl, urlls, excluded)
        store_new_urls(filelist, links)
        time.sleep(stops)  # pause between list pages as well
    print("所有url列表获取完成,存入 %s 文件中" % filelist)

    # Stage 2: visit each stored URL and extract its metadata.
    collect_contents(filelist, starturl, stops, title, keywords, description)


if __name__ == "__main__":
    try:
        paths = config_paths(sys.argv[1:])
    except getopt.GetoptError:
        usage()
    if not paths:
        usage()
    for config_path in paths:
        try:
            run(config_path)
        except KeyboardInterrupt:
            print("用户终止")
下面是ini的配置文件 保存成.ini文件:
- [urllibs]
- # Base URL of the target site; the script prepends it to urltemp and to every stored link
- starturl = http://www.510buy.com
- # Page number of the first list page
- startpage = 2
- # Page number where the list pages end (the script uses range(startpage, endpage), so this page itself is not fetched)
- endpage = 3
- # Path template of the list pages; %d is replaced by the page number
- urltemp = /yewu/list_%d.html
- # Path of the file the collected URLs are stored in, one URL per line
- urllist = /home/buysz/桌面/urllist.ini
- # URLs to exclude from the collected list, separated by commas
- dellist = http://www.510buy.com,http://www.510buy.com" target="_blank,/yewu/index.html,http://www.510buy.com/
- [countcfg]
- # Number of seconds to pause between requests
- Stops = 1
- # Regex selecting the region(s) of a list page that links are taken from
- Divurl = <div.*?>(.*?)<\/div>
- # Regex capturing each link URL inside that region
- Urlls = <a href=[\"|\'](.*?)[\"\']>
- # Regex capturing the page title
- Title = <title>(.*?) - .*?</title>
- # Regex capturing the page keywords
- Keywords = name=\"keywords\" content=\"(.*?)\">
- # Regex capturing the page description
- Description = name=\"description\" content=\"(.*?)\">
#1