1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
'use strict';
const fs = require('fs');
const Crawler = require('crawler');
const path = require('path');
/**
 * Recursively rebuild a value so that the keys of every object appear in
 * sorted order (default `Array.prototype.sort` ordering).
 *
 * Arrays are handled in place: they are optionally sorted with `arraySorter`,
 * then each element is replaced by its sorted version. Objects are never
 * mutated; a new object holding the same enumerable keys is returned instead.
 * Primitives and `null` are returned untouched.
 *
 * @see https://stackoverflow.com/a/48112249/5155484
 * @param {*} obj The value to sort
 * @param {Function} [arraySorter] Optional comparator applied to every array
 * @return {*} The sorted value (the same array instance for arrays)
 */
const sortObject = function(obj, arraySorter) {
  // `typeof null` is 'object': without the explicit check a null value would
  // fall through to the object branch and come back as `{}`.
  if (obj === null || typeof obj !== 'object') {
    return obj;
  }
  if (Array.isArray(obj)) {
    if (arraySorter) {
      obj.sort(arraySorter);
    }
    for (let i = 0; i < obj.length; i++) {
      obj[i] = sortObject(obj[i], arraySorter);
    }
    return obj;
  }
  const keys = [];
  for (const key in obj) {
    keys.push(key);
  }
  keys.sort();
  const sorted = {};
  // Iterate the key values directly; `for...in` over an array would also visit
  // any enumerable property added to Array.prototype.
  for (const key of keys) {
    sorted[key] = sortObject(obj[key], arraySorter);
  }
  return sorted;
};
/**
 * Write `data` to `filename` as 2-space indented JSON with sorted keys and a
 * trailing newline.
 *
 * @param {string} filename The destination file
 * @param {*} data The value to serialize
 * @param {Function|null} cbSuccess Called without arguments after a successful
 *                                  write; on failure the error is logged and
 *                                  the callback is not invoked
 */
const writeJSON = function(filename, data, cbSuccess = null) {
  const payload = JSON.stringify(sortObject(data), null, 2) + '\n';
  fs.writeFile(filename, payload, writeError => {
    if (writeError) {
      return console.log(writeError);
    }
    if (cbSuccess !== null) {
      cbSuccess();
    }
  });
};
/**
 * Read a UTF-8 file and parse its contents as JSON.
 *
 * Read errors and JSON syntax errors are both logged, and in either case the
 * success callback is not invoked.
 *
 * @param {string} filename The file to read
 * @param {Function} callbackSuccess Called as callbackSuccess(parsedData, filename)
 */
const readJSON = function(filename, callbackSuccess) {
  fs.readFile(filename, 'utf8', function(err, data) {
    if (err) {
      return console.log(err);
    }
    let parsed;
    // A malformed file would otherwise throw from inside this async callback,
    // where no caller can catch it, and take the whole process down.
    try {
      parsed = JSON.parse(data);
    } catch (parseError) {
      return console.log(parseError);
    }
    // Kept outside the try block so errors thrown by the callback itself are
    // not mistaken for parse errors.
    callbackSuccess(parsed, filename);
  });
};
/**
 * List the entries of a directory.
 *
 * @param {string} dirname The directory to read
 * @param {Function} callbackSuccess Called as callbackSuccess(files, dirname);
 *                                   on failure the error is logged and the
 *                                   callback is not invoked
 */
const listDirectory = function(dirname, callbackSuccess) {
  const onListed = function(readError, entries) {
    if (!readError) {
      callbackSuccess(entries, dirname);
      return;
    }
    console.log(readError);
  };
  fs.readdir(dirname, onListed);
};
/**
 * Store the data extracted from one page as
 * `<this directory>/../data/<filePrefix><name>.json`.
 *
 * @param {string} filePrefix Prefix prepended to the file name
 * @param {string} name The page name, also used in the file name
 * @param {string} url The page URL
 * @param {*} data The data extracted from the page
 * @param {Function} onWriteSuccess Forwarded to writeJSON as its success callback
 */
const writePage = function(filePrefix, name, url, data, onWriteSuccess) {
  const pageKB = { url, name, data };
  const target = path.join(__dirname, '../', 'data', filePrefix + name + '.json');
  writeJSON(target, pageKB, onWriteSuccess);
};
/**
 * Crawl every page of `pages` one connection at a time, hand each response's
 * `$` handle to `parsePage`, and write the data it yields to one JSON file per
 * page.
 *
 * @param {Array<{url: string, name: string}>} pages The pages to crawl
 * @param {string} filePrefix Prefix prepended to every output file name
 * @param {Function} parsePage Called as parsePage($, callback); it must call
 *                             callback(data) with the data to store
 * @return {Promise<void>} Resolves once every page has either been written or
 *                         failed to crawl (crawl errors are logged). Resolves
 *                         immediately when `pages` is empty.
 */
const processDataExtraction = function(pages, filePrefix, parsePage) {
  return new Promise(resolve => {
    // With nothing queued the crawler callback never runs, so the promise
    // would stay pending forever; settle it up front instead.
    if (pages.length === 0) {
      resolve();
      return;
    }
    let nbrPagesProcessed = 0;
    const markPageProcessed = () => {
      nbrPagesProcessed++;
      if (nbrPagesProcessed === pages.length) {
        resolve();
      }
    };
    const crawler = new Crawler({
      maxConnections: 1,
      // This will be called for each crawled page
      callback: function(error, res, done) {
        if (error) {
          console.log(error);
          // A failed page still counts as processed, otherwise a single crawl
          // error would keep the promise from ever resolving.
          markPageProcessed();
        } else {
          console.log('URL : ' + res.options.url);
          parsePage(res.$, anchors => {
            writePage(filePrefix, res.options.name, res.options.url, anchors, markPageProcessed);
          });
        }
        done();
      },
    });
    crawler.queue(
      pages.map(page => {
        return { uri: page.url, name: page.name, url: page.url };
      })
    );
  });
};
module.exports = {
processDataExtraction: processDataExtraction,
listDirectory: listDirectory,
readJSON: readJSON,
writeJSON: writeJSON,
};
|