Rate limited website scraping with node.js and async

By creator1988 on Wednesday 25 April 2012 19:30 - Comments (9)
Category: Frontend, Views: 27.092

So yesterday a job description at my previous employer popped up in my facebook stream which reminded me of the programming exercise that we included in the interview process just before I left the company. In short it comes down to:
  • Funda has an API that lets you do queries, the response is paged, max. 25 objects at a time
  • The API is rate limited at about 100 req./minute
  • Request all pages for a given query
  • Count the times a realtor ID is in the result
  • Aggregate and sum the realtor ID's and create a top 10 list of realtors with the most objects
Scraping this is pretty easy, but the rate limiting got me thinking. A great library for doing queue work like this (create a large list of URLs to scrape, then do it 4 at the same time or something) is async by caolan, but it lacks real rate limiting. Room for improvement!

Expanding async
The async library already has a pretty convenient way to create dynamically sized queues with concurrency, in the form of:

JavaScript:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// create a queue that does 4 items at the same time
// that for every item in the queue outputs the value times 2
// (worker signature is (task, callback): call next() when the item is done)
var q = async.queue(function (item, next) {
    // add a random timeout so we can see the queueing
    setTimeout(function () {
        console.log(item * 2);
        next();
    }, Math.random() * 1000 | 0); // |0 truncates to an integer number of ms
}, 4);

// drain fires once the queue becomes empty and all workers have finished
q.drain = function () { console.log("done"); };

// pushing an array enqueues each element as a separate task
q.push([ 1,2,3,4,5,6,7,8,9,10 ]);

// gives something like (order can be different)
// but higher numbers are pushed later than lower numbers
// 8, 6, 12, 4, 2, 10, 18, 16, 14, 20, done

To add rate limiting to queues I created a mixin that adds some methods to async that will create a form of an event loop structure that'll fire every X ms. Where X is of course the max. speed that we can query the target website. The usage is still the same, but the queue variable now has a chainable method 'rateLimit' added. Executing the same code like before but rate limited to 1 request per second will give a sorted response, because even though we have a concurrency of four, the max. time processing an item is 1 second. The previous record will therefore always be processed.

JavaScript:
1
2
3
4
5
6
7
// change
// }, 4);
// into
}, 4).rateLimit(1000);

// gives
// 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, done


Transforming it in real world code
The response that we get from funda has a 'Paging' parameter that contains the next URL that we can call. If it's empty, we've reached the end of our set. In pseudo code:

pseudo:
1
2
3
4
5
6
func processItem (url)
    resp = request(url)
    if resp.Paging.VolgendeUrl
        processItem resp.Paging.VolgendeUrl
    else
        "done"

In javascript with async, this will look like:

JavaScript:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
var async = require("async");
var request = require("request");

// queue worker: fetch one results page, enqueue the next page (if any), then signal done
var q = async.queue(function (url, next) {
    request.get(url, function (err, res, body) {
        // add error checking here (see err, and res.statusCode)

        // parse the body to JSON
        body = JSON.parse(body);

        // VolgendeUrl ("next URL") is empty on the last page of the result set
        if (body && body.Paging.VolgendeUrl) {
            q.push(body.Paging.VolgendeUrl);
        }

        // do stuff like counting realtors

        next();
    });
}, 1).rateLimit(60000 / 60); // 60 per minute, just to be safe

// initial page
q.push("/zaandam/tuin/");

// the queue runs dry once a page without VolgendeUrl has been processed
q.drain = function () {
    console.log("done");
};


Counting realtor IDs
Because the purpose of the assignment is to count the realtor IDs we'll add a simple object map where we gather all the data:

JavaScript:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
// the key will be the realtor ID and the value the no of times we encountered this realtor
var map = {};

// =================
// when a request comes in:
// first grab the realtor IDs
var realtorIds = body.Objects.map(function (obj) {
    return obj.MakelaarId;
});
// then move it to the map
realtorIds.forEach(function (rid) {
    // check for existing one, if not initialize it with '1'
    map[rid] = (map[rid] || 0) + 1;
});

// =================
// on drain:
// make a sortable object with {id: [Number], cnt: [Number]}
var sortable = Object.keys(map).map(function (k) {
    return {
        id: k,
        cnt: map[k]
    };
});
// now sort it on cnt HI-LO
var sorted = sortable.sort(function (o, p) {
    return o.cnt > p.cnt ? -1 : (o.cnt === p.cnt ? 0 : 1);
});
// output the top 10 (or fewer, if we saw fewer realtors)
for (var ix = 0; ix < Math.min(sorted.length, 10); ix++) {
    console.log(ix + 1 + '.', sorted[ix].id, 'has', sorted[ix].cnt, 'objects');
}


Hooking it together
There are a few small things left to do: first, we need to incorporate the base URL; then we need to normalize the URLs we receive from 'VolgendeUrl' and maybe do some sanitizing. The final script will look something like this:

JavaScript:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
var async = require("async");
var request = require("request");

// realtor ID -> number of objects listed by that realtor
var makelaarMap = {};

// worker: fetch one page for search path 'zo', count realtors, enqueue the next page
var q = async.queue(function (zo, next) {
    console.log("process", zo);
    var url = "http://partnerapi.funda.nl/feeds/Aanbod.svc/json/a001e6c3ee6e4853ab18fe44cc1494de/?type=koop&pagesize=25&zo=" + zo;
    request.get(url, function (err, res, body) {
        // add error checking (see err, and res.statusCode)
        // VolgendeUrl is empty on the last page; strip the '/~/koop' style prefix
        // so it matches the 'zo' format we feed into the API URL
        if ((body = body && JSON.parse(body)) && body.Paging.VolgendeUrl) {
            q.push(body.Paging.VolgendeUrl.replace(/^\/\~\/\w+/, ""));
        }
        // tally every realtor ID on this page
        body.Objects.map(function (o) { return o.MakelaarId; }).forEach(function (mid) {
            makelaarMap[mid] = (makelaarMap[mid] || 0) + 1;
        });
        next();
    });
}, 1).rateLimit(60000 / 60); // 60 per minute

// initial page
q.push("/zaandam/tuin/");

// on drain: sort realtors by object count (high to low) and print the top 10
q.drain = function () {
    var sorted = Object.keys(makelaarMap).map(function (k) {
        return {
            id: k,
            cnt: makelaarMap[k]
        };
    }).sort(function (o, p) {
        return o.cnt > p.cnt ? -1 : (o.cnt === p.cnt ? 0 : 1);
    });
    // guard against fewer than 10 realtors in the result set
    for (var ix = 0; ix < Math.min(sorted.length, 10); ix++) {
        console.log(ix + 1 + '.', sorted[ix].id, 'has', sorted[ix].cnt, 'objects');
    }
};


Running it
To run it: execute the following commands on your local system or on Cloud9 IDE:

bash:
1
2
3
4
$ git clone https://github.com/janjongboom/async node_modules/async
$ npm install request
# paste the code in server.js
$ node server.js

Volgende: Building Wordpress sites in the cloud 07-'12 Building Wordpress sites in the cloud
Volgende: Inheritance in javascript 04-'12 Inheritance in javascript

Comments


By Tweakers user Xantios, Wednesday 25 April 2012 20:04

waarom wel het woord makelaar gebruiken in je code ( om er eens 1 te noemen)
en de rest van je post in het engels?

Sowieso, Funda is toch een Nederlandse toko ?

By Tweakers user kipusoep, Thursday 26 April 2012 10:20

Xantios wrote on Wednesday 25 April 2012 @ 20:04:
waarom wel het woord makelaar gebruiken in je code ( om er eens 1 te noemen)
en de rest van je post in het engels?

Sowieso, Funda is toch een Nederlandse toko ?
Viel mij ook op inderdaad. Wat mij betreft: code (en dus o.a. ook variabelen) altijd in het engels.

By Tweakers user creator1988, Thursday 26 April 2012 11:36

kipusoep wrote on Thursday 26 April 2012 @ 10:20:
[...]

Viel mij ook op inderdaad. Wat mij betreft: code (en dus o.a. ook variabelen) altijd in het engels.
Mwah, het punt is dat de API van funda in het Nederlands is. En het object dus 'MakelaarId' heet, een 'makelaarMap' is daar dan nog wel te verdedigen.

By Teun, Thursday 26 April 2012 13:52

De reden dat we dat bij funda doen is dat we vinden dat je je code liefst in het Engels moet doen, maar echte domein termen die in je organisatie op een heel specifieke manier worden gebruikt beter niet kunt gaan vertalen. Je krijgt dan code met namen zoals GetMakelaarContracts() en CalculateWoonoppervlakte(). Dat is inderdaad lelijk, maar voorkomt verwarring. In onze organisatie betekenen Makelaar en Woonoppervlakte iets heel specifiekt en dat raak je kwijt in GetEstateAgentContracts() en CalculateTotalLivingSurface(). Vind ik.

By Tweakers user BeRtjh, Thursday 26 April 2012 14:31

This reminds me of my own Funda crawler, which seems a lot simpler.
I used Mechanize (http://mechanize.rubyforge.org/) and Ruby on Rails and created a rake-file (http://guides.rubyonrails.org/command_line.html#rake) for this:

desc "Get houses from Funda"
task :funda => :environment do
require 'mechanize'
agent = Mechanize.new
i = 1;
agent.get("http://www.funda.nl/koop/heel-nederland/p#{i}/")
begin
agent.page.search(".nvm").each do |node|
street = node.search(".item").map(&:text).map(&:strip).first
info = node.search(".specs").map(&:text).map(&:strip).first
price = node.search(".nvm-extern").map(&:text).map(&:strip).first
broker = node.search(".rel a").map(&:text).map(&:strip).first
House.create! do |house|
house.street = street
house.broker = broker
house.price = price
house.info = info
end
end
i = i.next
next_page = agent.page.link_with(:href => "/koop/heel-nederland/p#{i}/")
end while (next_page.click unless next_page.nil?)
end

Don't know if this still works, tho.

[Comment edited on Thursday 26 April 2012 14:33]


By Tweakers user creator1988, Thursday 26 April 2012 16:00

BeRtjh wrote on Thursday 26 April 2012 @ 14:31:
This reminds me of my own Funda crawler, which seems a lot simpler.
I used Mechanize (http://mechanize.rubyforge.org/) and Ruby on Rails and created a rake-file (http://guides.rubyonrails.org/command_line.html#rake) for this:
Yeah, it's probably not the easiest way to do this, but the point of adding rate limiting made it interesting for me personally.
Teun wrote on Thursday 26 April 2012 @ 13:52:
De reden dat we dat bij funda doen is dat we vinden dat je je code liefst in het Engels moet doen...
Hi Teun :D

By Albert-Jan, Monday 03 September 2012 12:06

Ik kwam de opdracht ook tegen op internet en vond het wel een mooie showcase voor een REST client die ik ontwikkel. De source kun je vinden op github: https://github.com/albertjan/houses en de voor de REST client hier: https://github.com/albertjan/DynamicRestClient

By Daniel de Witte, Thursday 13 December 2012 13:44

How do you access the Funda webservice exactly? is this still possible without a paid subscription?

I searched myself a while ago because i wanted to automatic keep track of some objects i am interested in.

By Tweakers user creator1988, Thursday 13 December 2012 14:08

Daniel de Witte wrote on Thursday 13 December 2012 @ 13:44:
How do you access the Funda webservice exactly? is this still possible without a paid subscription?

I searched myself a while ago because i wanted to automatic keep track of some objects i am interested in.
There is an API key available but it's only intended to be used for the pre-job interview programming example, other parts of the API aren't public (at least this was a year ago when I left funda).

Comments are closed