Node.jsいろいろやってみた

  • 投稿者:
  • 投稿カテゴリー:node.js

sys

root@hostname:/home/shimizu/nodejs# cat sys_sample.js
// Demonstrates three helpers of the `sys` module: print, log and inspect.
var sys = require('sys');

// Sample object that is rendered as text with sys.inspect below.
var person = { name: "shimizu", age: 21 };

// print writes the string as given; the newline is supplied explicitly.
sys.print("Print message\n");

// log writes the message on its own line.
sys.log('Log message.');

// inspect converts the object to a string, which is then printed.
var rendered = sys.inspect(person);
sys.print(rendered + "\n");

root@hostname:/home/shimizu/nodejs# nodejs sys_sample.js
Print message
12 Jan 23:53:01 - Log message.
{ name: 'shimizu', age: 21 }

jsdom

JavaScriptによるDOM解析と操作をサーバーサイド(nodejs)で実施できる

sample1

root@hostname:/home/shimizu/nodejs# cat hello_dom.js
// Parse an HTML string with jsdom and read one element's contents via the DOM API.
var jsdom = require("jsdom").jsdom;

var markup = "<html><head></head><body><div id='greeting'>hello world</div></body></html>";
var document = jsdom(markup);

// The window object is reached through the document it belongs to.
var window = document.parentWindow;

var greeting = window.document.getElementById("greeting");
console.log(greeting.innerHTML);
root@hostname:/home/shimizu/nodejs# nodejs hello_dom.js
hello world

sample2

root@hostname:/home/shimizu/nodejs/crawler1# cat html.js
var jsdom    = require("jsdom").jsdom,
    document = jsdom("<html><head></head><body>hello world</body></html>"),
    window   = document.createWindow();

// NOTE: in the run shown below, the createWindow() call above throws a
// TypeError, so none of the following statements execute in that run.
// The "output" comments describe what the script was expected to print.
console.log(window.document.innerHTML);
// output: '<html><head></head><body>hello world</body></html>'

console.log(window.innerWidth)
// output: 1024

console.log(typeof window.document.getElementsByClassName);
// outputs: function
root@hostname:/home/shimizu/nodejs/crawler1# nodejs html.js

/home/shimizu/nodejs/crawler1/html.js:3
    window   = document.createWindow();
                        ^
TypeError: Object [object HTMLDocument] has no method 'createWindow'
    at Object.<anonymous> (/home/shimizu/nodejs/crawler1/html.js:3:25)
    at Module._compile (module.js:456:26)
    at Object.Module._extensions..js (module.js:474:10)
    at Module.load (module.js:356:32)
    at Function.Module._load (module.js:312:12)
    at Function.Module.runMain (module.js:497:10)
    at startup (node.js:119:16)
    at node.js:906:3

root@hostname:/home/shimizu/nodejs/crawler1# cat html.js
// Build a document from an HTML string, obtain its window, and probe a few properties.
var jsdom = require("jsdom").jsdom;
var document = jsdom("<html><head></head><body>hello world</body></html>");

// Replaces the earlier createWindow() call, which this jsdom version does not provide.
var window = document.parentWindow;

var doc = window.document;

// Prints undefined in the run below: the document object has no innerHTML value here.
console.log(doc.innerHTML);

// Prints 1024 in the run below.
console.log(window.innerWidth);

// Prints "function" in the run below.
console.log(typeof doc.getElementsByClassName);
root@hostname:/home/shimizu/nodejs/crawler1# nodejs html.js
undefined
1024
function

crawlerの作成

タイトルを取得しDBにインサートする

root@hostname:/home/shimizu/nodejs/crawler2# npm install request mysql sequelize cheerio
root@hostname:/home/shimizu/nodejs/crawler2# cat crawler.js
// Crawl a single page, extract its <title>, and store the URL and title in the database.

// Load the ORM
var sequelize = require('sequelize');

// Connect to database
var connection = new sequelize('DBName', 'user', 'password');

// Define the model: one record per crawled page
var sites = connection.define('sites', {
    url: sequelize.TEXT,
    title: sequelize.STRING
});

// HTTP client and HTML parser
var request = require("request");
var cheerio = require("cheerio");

// Define request url
var requestUrl = "http://www.google.co.jp";

/**
 * Print whichever diagnostic fields are available for a failed request.
 *
 * @param {?Object} error - error passed to the request callback, if any
 * @param {?Object} response - response passed to the request callback, if any
 */
function logRequestFailure(error, response) {
    console.log("--------------------------------------------------");
    if (error) {
        if ("code" in error) {
            console.log("Error Code:" + error.code);
        }
        if ("errno" in error) {
            console.log("Error No:" + error.errno);
        }
        if ("syscall" in error) {
            console.log("Error Syscall:" + error.syscall);
        }
    }
    if (response && "statusCode" in response) {
        console.log("Status Code:" +  response.statusCode);
    }
}

// Send http request
request({url: requestUrl}, function(error, response, body) {

    // Anything other than an error-free 200 response is reported and skipped
    if (error || response.statusCode !== 200) {
        logRequestFailure(error, response);
        return;
    }

    // Declared locally so that `$` does not leak into the global scope
    var $ = cheerio.load(body);

    // Get response data
    var url = response.request.href;
    var title = $("title").text();

    console.log(url);
    console.log(title);

    // Create a new, not yet persisted record with its fields set
    var site = sites.build({url: url, title: title});

    // Save to database
    site.save()
        .success(function() {
            console.log('Succeed');
        })
        .error(function(saveError) {
            console.log(saveError);
        });
});

MySQL内を確認すると、以下のようにレコードが登録されている。

*************************** 1. row ***************************
       id: 1
      url: http://www.google.co.jp/
    title: Google
createdAt: 2015-01-12 16:04:14
updatedAt: 2015-01-12 16:04:14
3 rows in set (0.00 sec)

参考URL

node.jsで遊ぶ: 標準モジュール編
http://blog.summerwind.jp/archives/1436
node.jsを使ってjQueryチックにWebサイトをクローリングする方法
http://liginc.co.jp/programmer/archives/4848
バージョンごとのdocs
http://nodejs.org/docs/

http://sakuratan.biz/archives/3393
https://github.com/tmpvar/jsdom
http://blog.summerwind.jp/archives/1436