これまでの「wget --mirror」では、うまく取得できないページがあまりに増えてきた為、phantomjs を使うことにしました。
install
配布アーカイブ (tar.bz2) を展開するだけです。ただし、フォント (Noto Sans CJK jp) のインストールは必要です。
### Transcript convention: "$" = normal-user prompt, "#" = root prompt, "###" = comment.

### Install compilers and development libraries as root, then leave the root shell.
$ su -
# sudo yum -y install gcc gcc-c++ make flex bison gperf ruby \
    openssl-devel freetype-devel fontconfig-devel libicu-devel sqlite-devel \
    libpng-devel libjpeg-devel
# exit

### Download and unpack the PhantomJS 2.1.1 archive under ~/local.
$ cd ~/local
$ wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
$ tar -xvf phantomjs-2.1.1-linux-x86_64.tar.bz2
### Link the extracted directory (not the .tar.bz2 archive) so that
### ~/local/phantomjs/bin/phantomjs resolves.
$ ln -s phantomjs-2.1.1-linux-x86_64 phantomjs

### Download the Noto Sans CJK jp font package.
### https://www.google.com/get/noto/
$ wget https://noto-website.storage.googleapis.com/pkgs/NotoSansCJKjp-hinted.zip

### Install the fonts system-wide as root and rebuild the font cache.
### Replace USERNAME with the account that downloaded the zip into ~/local.
$ su -
# mkdir -p /usr/share/fonts/noto
# cd /usr/share/fonts/noto
# cp ~USERNAME/local/NotoSansCJKjp-hinted.zip .
# unzip NotoSansCJKjp-hinted.zip
# chmod 644 *.otf
# fc-cache -fv
sample script
ポイントは、ソース内のコメントに記載しています。
// PhantomJS script: opens a listing page, waits for in-page JavaScript to
// settle, extracts data through the page's own jQuery, and saves two
// screenshots plus the rendered HTML into the current directory.
// NOTE: written in ES5 on purpose (var, no arrow functions) because it runs
// inside the PhantomJS 2.1.1 runtime, not Node.js.

var system = require('system');
var fs = require('fs');

// The request URL may also be passed as the first command-line argument.
var url = system.args[1] || 'http://www.example.com/kodate/chuko/tokyo/list/';
console.log("request url: " + url);

var page = require('webpage').create();
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36';

// Extra HTTP headers for the page's requests.
// NOTE(review): the original comment here said "for basic auth", but no
// Authorization header is set below — confirm what was intended.
page.customHeaders = {
  "User-Agent": page.settings.userAgent,
  "Upgrade-Insecure-Requests": 1,
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
  "Accept-Language": "ja,en-US;q=0.8,en;q=0.6",
  "Cache-Control": "max-age=0"
};
page.viewportSize = { width: 1024, height: 768 };

// for (var atri_key in system.os) {
//   console.log(atri_key +' = '+system.os[atri_key]);
// }

page.open(url, function (status) {
  if (status !== "success") {
    // Exit with a non-zero code and return, so that nothing below is
    // scheduled after a failed load.
    console.log("Failed to load " + url + " (status: " + status + ")");
    phantom.exit(1);
    return;
  }
  console.log("Status: " + status);

  // JavaScript may still be running after the page load event, so wait
  // before inspecting the page.
  setTimeout(function () {
    var exit_code = 0;
    try {
      // Arguments for the in-page code can be handed over as JSON.
      var eval_params = { hoge: "hoge_val" };
      var eval_params_str = JSON.stringify(eval_params);
      var ret = evaluate_list_page(page, eval_params_str);

      if (!ret) {
        // No result came back: jQuery is missing on the page, or the
        // in-page function threw.
        console.log("Could not evaluate the page (is jQuery loaded on it?)");
        exit_code = 1;
      } else {
        console.log(ret['title']);
        console.log(ret['hoge']);
        console.log(ret['last_page']);

        // Full-page capture first, then a capture of one region only.
        page.render('page_all.png');
        if (ret['clip_rect']) {
          page.clipRect = {
            top: ret['clip_rect']['top'],
            left: ret['clip_rect']['left'],
            width: ret['clip_rect']['width'],
            height: ret['clip_rect']['height']
          };
          page.render('page_part.png');
        } else {
          console.log("Clip target not found; skipped page_part.png");
        }

        ret['house_urls'].forEach(function (house_url) {
          console.log(house_url);
        });

        // Save the HTML.
        fs.write('page_all.html', page.content, 'w');
      }
    } catch (e) {
      console.log("Error while processing the page: " + e);
      exit_code = 1;
    }
    // Always terminate; otherwise PhantomJS keeps running forever.
    phantom.exit(exit_code);
  }, 5000);
});

// Most recent sites already ship jQuery, so it is not done here, but
// page.includeJs() can load an external jQuery (or other script) if needed.

/**
 * Runs a jQuery-based extraction inside the page context.
 *
 * @param {Object} page - PhantomJS webpage object with the target page loaded.
 * @param {string} params_json - JSON text of an object; its `hoge` property
 *     is copied into the result.
 * @returns {Object|null} An object with `title`, `hoge`, `last_page`,
 *     `house_urls` (array of href values) and `clip_rect`
 *     ({top, left, width, height}, or null when the clip target element is
 *     absent); null when `$` is not available on the page.
 */
function evaluate_list_page(page, params_json) {
  return page.evaluate(function (params_json) {
    // This function runs inside the web page, not in the PhantomJS script
    // scope, so it can only use its arguments and the page's own globals.
    if (typeof $ !== 'function') {
      return null;
    }

    var eval_params = JSON.parse(params_json);
    var ret = {};
    ret['title'] = $('title').text();
    ret['hoge'] = eval_params['hoge'];

    // Take the last page number from the pager; eq(1) selects the second
    // matching element.
    ret['last_page'] = $('.mod-listPaging li.lastPage').eq(1).text();

    ret['house_urls'] = [];
    $('.mod-mergeBuilding--sale .moduleHead h3').each(function (index, element) {
      ret['house_urls'].push($('a', element).attr('href'));
    });

    // The second matching block is the clip target. offset() is undefined
    // on an empty selection, so check that the element exists first.
    var clip_target = $('.mod-mergeBuilding--sale').eq(1);
    if (clip_target.length === 0) {
      ret['clip_rect'] = null;
    } else {
      var clip_offset = clip_target.offset();
      ret['clip_rect'] = {
        top: clip_offset.top,
        left: clip_offset.left,
        width: clip_target.width(),
        height: clip_target.height()
      };
    }
    return ret;
  }, params_json);
}
↑こう書いて↓こう実行
### Run the script with PhantomJS debug output enabled ("$" is the shell prompt).
### A URL may be appended after the script path; the script reads it from system.args[1].
$ ~/local/phantomjs/bin/phantomjs --debug=true ./foo2js