sys
root@hostname:/home/shimizu/nodejs# cat sys_sample.js var sys = require('sys'); var obj = { name: "shimizu", age: 21, }; sys.print("Print message\n"); sys.log('Log message.'); sys.print(sys.inspect(obj)+"\n"); root@hostname:/home/shimizu/nodejs# nodejs sys_sample.js Print message 12 Jan 23:53:01 - Log message. { name: 'shimizu', age: 21 }
jsdom
JavaScriptによるDOMの解析と操作を、サーバーサイド（Node.js）で実行できるようにするモジュール。
sample1
root@hostname:/home/shimizu/nodejs# cat hello_dom.js var jsdom = require("jsdom").jsdom; var document = jsdom("<html><head></head><body><div id='greeting'>hello world</div></body></html>"); var window = document.parentWindow; console.log(window.document.getElementById("greeting").innerHTML); root@hostname:/home/shimizu/nodejs# nodejs hello_dom.js hello world
sample2
root@hostname:/home/shimizu/nodejs/crawler1# cat html.js var jsdom = require("jsdom").jsdom, document = jsdom("<html><head></head><body>hello world</body></html>"), window = document.createWindow(); console.log(window.document.innerHTML); // output: '<html><head></head><body>hello world</body></html>' console.log(window.innerWidth) // output: 1024 console.log(typeof window.document.getElementsByClassName); // outputs: function root@hostname:/home/shimizu/nodejs/crawler1# nodejs html.js /home/shimizu/nodejs/crawler1/html.js:3 window = document.createWindow(); ^ TypeError: Object [object HTMLDocument] has no method 'createWindow' at Object.<anonymous> (/home/shimizu/nodejs/crawler1/html.js:3:25) at Module._compile (module.js:456:26) at Object.Module._extensions..js (module.js:474:10) at Module.load (module.js:356:32) at Function.Module._load (module.js:312:12) at Function.Module.runMain (module.js:497:10) at startup (node.js:119:16) at node.js:906:3 root@hostname:/home/shimizu/nodejs/crawler1# cat html.js var jsdom = require("jsdom").jsdom, document = jsdom("<html><head></head><body>hello world</body></html>"), window = document.parentWindow; // createWindow()を修正した console.log(window.document.innerHTML); // output: '<html><head></head><body>hello world</body></html>' console.log(window.innerWidth) // output: 1024 console.log(typeof window.document.getElementsByClassName); // outputs: function root@hostname:/home/shimizu/nodejs/crawler1# nodejs html.js undefined 1024 function
crawlerの作成
指定したURLのページからタイトル（title要素のテキスト）を取得し、URLとあわせてDBにインサートする。
root@hostname:/home/shimizu/nodejs/crawler2# npm install request mysql sequelize cheerio root@hostname:/home/shimizu/nodejs/crawler2# cat crawler.js // Initialize instance var sequelize = require('sequelize'); // Connect to database var connection = new sequelize('DBName', 'user', 'password'); // Define models var sites = connection.define('sites', { url: sequelize.TEXT, title: sequelize.STRING }); //Initialize instance var request = require("request"); var cheerio = require("cheerio"); // Define request url var requestUrl = "http://www.google.co.jp"; // Send http request request({url: requestUrl}, function(error, response, body) { // If request succeed if (!error && response.statusCode == 200) { $ = cheerio.load(body); // Create cheerio instance // Get response data var url = response.request.href; var title = $("title").text(); console.log(url); console.log(title); // Create new instance var site = sites.build(); // Set fields site.url = url; site.title = title; // Save to database site.save() .success(function(anotherTask) { console.log('Succeed'); }) .error(function(error) { console.log(error); }); } // If error occured else { console.log("--------------------------------------------------"); if (error && "code" in error) { console.log("Error Code:" + error.code); } if (error && "errno" in error) { console.log("Error No:" + error.errno); } if (error && "syscall" in error) { console.log("Error Syscall:" + error.syscall); } if (response && "statusCode" in response) { console.log("Status Code:" + response.statusCode); } } });
MySQLのテーブルを確認すると、以下のようにレコードが登録されている。
*************************** 1. row *************************** id: 1 url: http://www.google.co.jp/ title: Google createdAt: 2015-01-12 16:04:14 updatedAt: 2015-01-12 16:04:14 3 rows in set (0.00 sec)
参考URL
node.jsで遊ぶ: 標準モジュール編
http://blog.summerwind.jp/archives/1436
node.jsを使ってjQueryチックにWebサイトをクローリングする方法
http://liginc.co.jp/programmer/archives/4848
Node.jsのバージョンごとの公式ドキュメント
http://nodejs.org/docs/
http://sakuratan.biz/archives/3393
https://github.com/tmpvar/jsdom
http://blog.summerwind.jp/archives/1436