1. 利用 CasperJS 爬取新浪股市排行数据,生成数据文件
// Scrape the Sina stock-attention rankings with CasperJS and dump each
// ranking table to a text file under data/.
//
// NOTE: CasperJS typically runs on PhantomJS, whose JavaScript engine predates
// most of ES2015, so this script deliberately sticks to `var` and function
// expressions.
var fs = require('fs');

var casper = require('casper').create({
    waitTimeout: 10000,
    verbose: true,
    logLevel: 'debug',
    // The page-side scraping code below uses jQuery ($), so inject it into
    // every page that gets loaded.
    clientScripts: ['../jquery-3.1.1.min.js'],
    pageSettings: {
        loadImages: false,
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0'
    }
});

/**
 * Collects the trimmed text of every <td> in a table, row by row, skipping
 * the first <tr> (the `tr:gt(0)` selector).
 *
 * This function is handed to casper.evaluate(), so it executes inside the
 * loaded page: it cannot see any variable of this script and must receive
 * everything it needs as arguments.
 *
 * @param {string} tableId - id attribute of the table element to read.
 * @returns {string[][]} One array of cell texts per table row.
 */
function extractTableRows(tableId) {
    var rows = [];
    $('#' + tableId).find('tr:gt(0)').each(function () {
        var cells = [];
        $(this).find('td').each(function () {
            cells.push($(this).text().trim());
        });
        rows.push(cells);
    });
    return rows;
}

/**
 * Scrapes one ranking table from the current page and writes it to a file,
 * one table row per line with the cells separated by tabs.
 *
 * The rows are joined explicitly instead of relying on array-to-string
 * coercion, which produced stray leading/trailing commas on every line.
 * NOTE(review): a whitespace-splitting reader will still mis-split a cell
 * whose text itself contains whitespace - confirm against the real page data.
 *
 * @param {string} tableId - id attribute of the table to scrape.
 * @param {string} filename - path of the output file.
 */
function saveRanking(tableId, filename) {
    var rows = casper.evaluate(extractTableRows, tableId);
    if (!rows) {
        // Nothing usable came back from the page (for example the page-side
        // function threw); do not write a bogus file.
        casper.log('no data scraped from #' + tableId + ', skipping ' + filename, 'error');
        return;
    }
    var lines = rows.map(function (cells) {
        return cells.join('\t');
    });
    var content = lines.length > 0 ? lines.join('\n') + '\n' : '';
    fs.write(filename, content, 'w');
}

casper.start();

// Open the Sina stock popularity ranking page.
casper.thenOpen('http://touzi.sina.com.cn/public/bhot');

// Whole-market ranking by 1-day change in user attention.
casper.then(function saveOneDayRanking() {
    saveRanking('allday1', 'data/allday1_sina.txt');
});

// Whole-market ranking by 5-day change in user attention.
casper.then(function saveFiveDayRanking() {
    saveRanking('allday5', 'data/allday5_sina.txt');
});

casper.run();
2. 使用 Python 入库:读取 CasperJS 生成的数据文件,写入 MySQL
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import?MySQLdb
import?os,sys,subprocess
class Spider(object):
    """Load rows from a whitespace-delimited data file into a database table.

    The database methods use a module-level connection object named ``db``
    that is defined outside this class.
    """

    def __init__(self, filename, js_filename, table):
        """Remember the data file, the scraper script name and the target table.

        :param filename: path of the data file to read.
        :param js_filename: name of the scraper script; stored for callers,
            not used by any method of this class.
        :param table: name of the table the rows are inserted into.
        """
        self.filename = filename
        self.js_filename = js_filename
        self.table = table

    def read_file_data(self):
        """Read the data file and split every non-blank line into fields.

        :returns: ``(value_sets, count)`` where ``value_sets`` is a list of
            field lists (UTF-8 encoded) and ``count`` is its length. On any
            error the problem is printed and ``([], 0)`` is returned so that
            callers can always unpack the result.
        """
        value_sets = []
        try:
            with open(self.filename, 'rb') as data:
                for line in data:
                    # NOTE(review): assumes the file is GB2312-encoded -
                    # confirm against whatever writes the data file.
                    value = line.decode("gb2312").encode("utf-8").split()
                    if value:  # a blank line carries no row
                        value_sets.append(value)
        except Exception as e:
            print("%s: %s" % (type(e).__name__, e))
            return [], 0
        return value_sets, len(value_sets)

    def get_insert_sql(self):
        """Build the parameterized INSERT statement for ``self.table``.

        Reads the column names from the cursor description of a one-row
        SELECT and drops the first column (the auto-increment id, per the
        original author's note).

        :returns: the INSERT statement with one ``%s`` placeholder per
            remaining column, or ``None`` if the column list could not be read.
        """
        cursor = None
        try:
            cursor = db.cursor()
            # The table name cannot be bound as a query parameter, so it is
            # interpolated; it must come from trusted configuration.
            cursor.execute("select * from %s limit 1" % self.table)
            field_name_list = [each[0] for each in cursor.description]
            del field_name_list[0]  # drop the auto-increment id column
            column_list = "(" + ",".join(field_name_list) + ")"
            placeholders = "(" + ",".join(["%s"] * len(field_name_list)) + ")"
            return "INSERT INTO %s %s VALUES %s" % (
                self.table, column_list, placeholders)
        except Exception as e:
            print("Error: %s" % e)
            return None
        finally:
            # Only the cursor is released here; the connection stays open
            # because save() still needs it and closes it itself.
            if cursor is not None:
                cursor.close()

    def save(self):
        """Insert every row of the data file into ``self.table``.

        Commits on success and rolls back on failure. The shared ``db``
        connection is closed before returning in every case.
        """
        cursor = None
        try:
            value_sets, count = self.read_file_data()
            if not (value_sets and count):
                print("get data fom file failed")
                return
            insert_sql = self.get_insert_sql()
            if not insert_sql:
                print("get insert_sql failed")
                return
            try:
                cursor = db.cursor()
                cursor.executemany(insert_sql, value_sets)
                db.commit()
                print(u"成功插入数据%d条" % count)
            except Exception as e:
                db.rollback()
                print("%s: %s" % (type(e).__name__, e))
                print(u"插入数据失败,数据回滚")
        finally:
            if cursor is not None:
                cursor.close()
            db.close()