158 lines
4.6 KiB
JavaScript
158 lines
4.6 KiB
JavaScript
import 'assets://js/lib/uri.min.js'
|
|
import cheerio from 'assets://js/lib/cheerio.min.js';
|
|
import 'assets://js/lib/crypto-js.js'
|
|
|
|
/**
 * Alphabet that random strings are drawn from. The last ten characters are
 * the digits; everything before them is a letter.
 * NOTE(review): the lowercase run contains 'a' twice and neither case has
 * 'i'/'I'. Kept byte-for-byte because generated values depend on it —
 * confirm whether that is intentional.
 */
const charStr = 'abacdefghjklmnopqrstuvwxyzABCDEFGHJKLMNOPQRSTUVWXYZ0123456789';

/**
 * Pick a random index into `charStr` between `min` and `max` (inclusive).
 *
 * For position 0 the result never points at a digit (the last ten entries of
 * `charStr`), so a generated string cannot start with a number.
 *
 * @param {number} min - Lowest index that may be returned.
 * @param {number} max - Highest index that may be returned.
 * @param {number} i - Position of the character being generated.
 * @returns {number} An index into `charStr`.
 * @throws {RangeError} If `i` is 0 and `[min, max]` only covers digit indexes.
 */
export function randIndex(min, max, i) {
    const numStart = charStr.length - 10;
    let upper = max;
    // Loose equality is kept on purpose so callers passing '0' behave as before.
    if (i == 0 && max >= numStart) {
        if (min >= numStart) {
            // Previously this case retried recursively until the call stack
            // overflowed; fail fast with a descriptive error instead.
            throw new RangeError(
                `randIndex: range [${min}, ${max}] has no non-digit index for position 0`
            );
        }
        // Re-drawing until a non-digit comes up is equivalent to sampling
        // uniformly from the non-digit part of the range, without recursion.
        upper = numStart - 1;
    }
    return Math.floor(Math.random() * (upper - min + 1) + min);
}
|
|
|
|
/**
 * Build a random string out of characters from `charStr`.
 *
 * Each character's index comes from `randIndex`, which is told the position
 * being filled.
 *
 * @param {number} [len] - Number of characters; any falsy value means 15.
 * @returns {string} The generated string.
 */
export function randomStr(len) {
    const lastIndex = charStr.length - 1;
    const count = len || 15;
    let out = '';
    let pos = 0;
    while (pos < count) {
        out += charStr[randIndex(0, lastIndex, pos)];
        pos += 1;
    }
    return out;
}
|
|
|
|
/**
 * Combine `url` with `base` into a single URL string.
 *
 * - `http://` / `https://` URLs are used as given.
 * - URLs starting with `://` or `//` borrow the protocol of `base`.
 * - Anything else is appended to `protocol://host[:port]/` of `base`.
 *
 * If the resulting URL has no path (or no query) it inherits the one from
 * `base`. Both inputs are trimmed and passed through `rstrip('/')` first.
 *
 * NOTE(review): `Uri` and `String.prototype.rstrip` are not defined in this
 * file; presumably they come from the imported uri library / the host
 * environment — confirm.
 *
 * @param {string} base - Base URL; falsy values are treated as ''.
 * @param {string} url - URL to resolve; falsy values are treated as ''.
 * @returns {string} The combined URL as produced by `Uri#toString`.
 */
export function urljoin(base, url) {
    const baseText = base || '';
    const rawTarget = url || '';
    const baseU = new Uri(baseText.trim().rstrip('/'));
    const target = rawTarget.trim().rstrip('/');

    const buildTarget = () => {
        if (['http://', 'https://'].some((scheme) => target.startsWith(scheme))) {
            return new Uri(target);
        }
        if (target.startsWith('://')) {
            return new Uri(baseU.protocol() + target);
        }
        if (target.startsWith('//')) {
            return new Uri(baseU.protocol() + ':' + target);
        }
        // Everything else is appended directly under the base origin.
        return new Uri(
            `${baseU.protocol()}://${baseU.host()}${baseU.port() ? ':' + baseU.port() : ''}/${target}`
        );
    };
    const u = buildTarget();

    const hasOwnPath = Boolean(u.path()) && u.path().trim().length !== 0;
    if (!hasOwnPath && baseU.path()) {
        u.path(baseU.path());
    }
    if (!u.query() && baseU.query()) {
        u.query(baseU.query());
    }
    return u.toString();
}
|
|
|
|
// Matches rules/options that end in an attribute name whose value is a URL
// (url, src, href, data-original, data-src); used to decide when a result
// should be turned into an absolute link.
const DOM_CHECK_ATTR = /(url|src|href|data-original|data-src)$/;

// Selector fragments that already carry an index filter or an id. pdfh appends
// ":eq(0)" only to segments that do NOT match this.
// Deliberately not global: `.test()` on a /g regex advances `lastIndex`, so a
// hit on one segment would make the next segment (or the next call) start
// scanning mid-string and miss its own ":eq(...)".
const SELECT_REGEX = /:eq|:lt|:gt|#/;

// Same idea for pdfa, but an id ("#") does not count as an index filter there.
const SELECT_REGEX_A = /:eq|:lt|:gt/;
|
|
|
|
/**
 * Extract a single value (text, inner HTML, attribute or serialized markup)
 * from a document or from an element handle.
 *
 * Rule format: selector segments joined by "&&"; the last segment is the
 * option — "Text", "Html", or an attribute name. Every selector segment that
 * has no ":eq"/":lt"/":gt"/"#" gets ":eq(0)" appended, then the segments are
 * joined with spaces into one descendant selector. Without "&&" the whole
 * rule is used as a selector and the matched markup is serialized.
 *
 * @param {string|{rr: Function, ele: *}} html - Markup to load with cheerio,
 *   or an element handle as produced by pdfa (`rr` is the cheerio root
 *   function, `ele` the element).
 * @param {string} parse - The rule; blank rules yield ''.
 * @param {string} [base_url] - When given and the option names a URL
 *   attribute, the value is made absolute against this base.
 * @returns {string} The extracted value; for attribute options this is
 *   whatever `.attr()` returns, which may be undefined when the attribute is
 *   missing.
 */
export function pdfh(html, parse, base_url) {
    if (!parse || !parse.trim()) {
        return '';
    }
    const eleFind = typeof html === 'object';
    let option;

    // For element handles a leading "body&&" is dropped; if only the option
    // remains, the element itself is the target ('*=*' is the marker for that).
    if (eleFind && parse.startsWith('body&&')) {
        parse = parse.slice(6);
        if (!parse.includes('&&')) {
            option = parse.trim();
            parse = '*=*';
        }
    }

    if (parse.includes('&&')) {
        const segments = parse.split('&&');
        option = segments.pop();
        // `search` ignores the regex's global flag and lastIndex, so one
        // segment's match can never hide the next segment's ":eq(...)".
        parse = segments
            .map((seg) => (seg.search(SELECT_REGEX) === -1 ? `${seg}:eq(0)` : seg))
            .join(' ');
    }

    const $ = eleFind ? html.rr : cheerio.load(html);
    let ret;
    if (!eleFind) {
        ret = $(parse);
    } else if (parse === '*=*' || $(html.ele).is(parse)) {
        ret = html.ele;
    } else {
        ret = $(html.ele).find(parse);
    }

    if (!option) {
        return $(ret).toString();
    }

    let result;
    if (option === 'Text') {
        result = $(ret).text();
    } else if (option === 'Html') {
        result = $(ret).html();
    } else {
        result = $(ret).attr(option);
    }

    if (result && base_url && DOM_CHECK_ATTR.test(option)) {
        if (/http/.test(result)) {
            // Drop anything in front of the first "http" (e.g. a css url( wrapper).
            result = result.slice(result.indexOf('http'));
        } else {
            result = urljoin(base_url, result);
        }
    }
    return result;
}
|
|
|
|
/**
 * Collect every element matching a rule as a list of element handles.
 *
 * Rule format: selector segments joined by "&&". Every segment except the
 * last gets ":eq(0)" appended unless it already has ":eq"/":lt"/":gt"; the
 * segments are then joined with spaces into one descendant selector.
 *
 * @param {string|{rr: Function, ele: *}} html - Markup to load with cheerio,
 *   or an element handle previously returned by this function.
 * @param {string} parse - The rule; blank rules yield [].
 * @returns {Array<{rr: Function, ele: *}>} One handle per matched element,
 *   where `rr` is the cheerio root function and `ele` the element.
 */
export function pdfa(html, parse) {
    if (!parse || !parse.trim()) {
        return [];
    }
    const eleFind = typeof html === 'object';

    if (parse.includes('&&')) {
        const segments = parse.split('&&');
        const lastPos = segments.length - 1;
        // `search` ignores the regex's global flag and lastIndex, so neither an
        // earlier segment nor an earlier call can hide this segment's filter.
        parse = segments
            .map((seg, pos) =>
                pos < lastPos && seg.search(SELECT_REGEX_A) === -1 ? `${seg}:eq(0)` : seg
            )
            .join(' ');
    }

    const $ = eleFind ? html.rr : cheerio.load(html);
    let ret;
    if (!eleFind) {
        ret = $(parse);
    } else if ($(html.ele).is(parse)) {
        // The handle's own element matches: it is the (only) result.
        ret = html.ele;
    } else {
        ret = $(html.ele).find(parse);
    }

    const result = [];
    if (ret) {
        // Wrap before iterating: in the branch above `ret` is a bare element
        // with no `.each` method of its own.
        $(ret).each((idx, ele) => {
            result.push({ rr: $, ele });
        });
    }
    return result;
}
|
|
|
|
/**
 * Bundle of the parsing helpers: `pdfh` (single value), `pdfa` (list of
 * element handles) and `pd` (single value, resolved to an absolute URL when
 * the rule ends in a URL attribute).
 */
const defaultParser = {
    pdfh,
    pdfa,
    /**
     * Like pdfh, but when the rule ends in a URL attribute (see
     * DOM_CHECK_ATTR) the value is made absolute.
     *
     * @param {string|object} html - Markup or element handle, as for pdfh.
     * @param {string} parse - The rule, as for pdfh.
     * @param {string} [uri] - Base URL for relative links; when omitted the
     *   global MY_URL is used if it is defined.
     * @returns {string} The extracted value; empty/undefined results are
     *   returned unchanged instead of being turned into a URL.
     */
    pd(html, parse, uri) {
        // Works both as a method and as a detached function reference: fall
        // back to the module's pdfh when there is no usable receiver.
        const extract = this && typeof this.pdfh === 'function' ? this.pdfh : pdfh;
        const ret = extract.call(this, html, parse);
        if (!ret || !DOM_CHECK_ATTR.test(parse)) {
            return ret;
        }
        if (/http/.test(ret)) {
            // Drop anything in front of the first "http".
            return ret.slice(ret.indexOf('http'));
        }
        const base = uri || (typeof MY_URL !== 'undefined' ? MY_URL : '');
        return urljoin(base, ret);
    },
};
|
|
|
|
// Publish the helpers on the global object so scripts that are not modules
// can call them by bare name. `joinUrl` is an alias of `urljoin`.
globalThis.randIndex = randIndex;
globalThis.randomStr = randomStr;
globalThis.urljoin = urljoin;
globalThis.joinUrl = urljoin;
globalThis.defaultParser = defaultParser;
globalThis.pdfa = defaultParser.pdfa;
globalThis.pdfh = defaultParser.pdfh;
// Forward through defaultParser so `pd` keeps its receiver even when invoked
// as a bare global; a detached method reference would run with `this`
// undefined in module code.
globalThis.pd = (html, parse, uri) => defaultParser.pd(html, parse, uri);