jeudi 26 février 2015

Having trouble building custom web scraper

I'm trying to build a custom web scraper using request, cheerio and async on node.js. I can't figure out why I get some undefined values inside the companies argument causing the waterfall to crash. I know it's a little bit messy code but it's mostly readable. I think my problem might came from the callback calls.



async.waterfall([
function(callback){
var base_url = 'http://ift.tt/1DUu9Fg';
_.times(3, function(n){
var url = base_url+(n+1);
request(url, function (error, response, html) {
if (error)
callback(url);
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
var links_companies = $('#sheet_content_inside > ul > li > div h2 > a'),
//links_companies = $('a', li_companies),
companies = [];

//console.log(colors.blue(url), links_companies.length);
links_companies.each(function(i, a) {
companies.push({name: $(this).attr('title'), url_from: url, next_url: $(this).attr('href')});
});
callback(null, companies);
}
});
});
},
function(companies, callback){
async.map(companies, function(c, cb){
var url = c.next_url;
if(!_.isUndefined(url))
request(url, function (error, response, html) {
console.log(url, c.name);
if (error)
cb(url);
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html),
profile_link = $('#head_main_content > div > div:nth-child(4) > h6 > a'),
cons = (typeof profile_link == 'undefined')? "WHAT?" : profile_link.attr('href');
//console.log("url founded: "+cons);
// if(typeof profile_link == 'undefined')
// cb(c);
c.origin_url = c.url_from;
c.next_url = profile_link.attr('href');
c.url_from = url;
c.profile_url = c.next_url;
//console.log(c);
// if (!_.isUndefined(c.next_url))
cb(null, c);
// else
// cb(c);
//return _.extend({}, c, {profile_url: profile_link.attr('href'), origin_url: c.url_from});
}
});
}, function(err, _companies){
callback(null, _companies);
});
},
function(companies, callback){
async.map(companies, function(c, cb){
var url = c.next_url;
console.log(colors.green(url), colors.red(c.name));
// if (_.isUndefined(url))
// return cb(c);
if(!_.isUndefined(url))
request(url, function (error, response, html) {
//console.log(url, c.name);
if (error)
cb(url);
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html),
left_zone = $('#sheet_content_inside > div.margin_top_20 > div.left'),
right_zone = $('#sheet_content_inside > div.margin_top_20 > div.right.width_195');
//console.log(left_zone.html(), right_zone.html());
var name = $('span[itemprop="name"]', left_zone).text(),
s_address = $('span[itemprop="streetAddress"]', left_zone).text(),
p_code = $('span[itemprop="postalCode"]', left_zone).text(),
city = $('span[itemprop="addressLocality"]', left_zone).text(),
country = $('span[itemprop="addressCountry"]', left_zone).text();

console.log(name, s_address, p_code, city, country);
cb(null, c);
}
});
}, function(err, _companies){
callback(null, _companies);
});
}
], function(err, companies){
console.log(colors.warn("end"), companies.length);
console.log(err);
console.log(companies);
});

Aucun commentaire:

Enregistrer un commentaire