89 lines
3.4 KiB
JavaScript
89 lines
3.4 KiB
JavaScript
|
const DOM_CHECK_ATTR = /(url|src|href|-original|-src|-play|-url)$/;
|
||
|
const SELECT_REGEX = /:eq|:lt|:gt|#/g;
|
||
|
const SELECT_REGEX_A = /:eq|:lt|:gt/g;
|
||
|
const parseTags = {
|
||
|
jq:{
|
||
|
pdfh(html, parse, base_url) {
|
||
|
if (!parse || !parse.trim()) {
|
||
|
return ''
|
||
|
}
|
||
|
parse = parse.trim();
|
||
|
let option = null;
|
||
|
if (parse.startsWith('body&&')) {
|
||
|
parse = parse.substr(6);
|
||
|
}
|
||
|
print('pdfh parse前:'+parse);
|
||
|
if (parse.indexOf('&&') > -1) {
|
||
|
let sp = parse.split('&&');
|
||
|
option = sp[sp.length - 1];
|
||
|
sp.splice(sp.length - 1);
|
||
|
sp.forEach((it,idex)=>{
|
||
|
if(/:eq\((.*?)\)/.test(it)){
|
||
|
let pos = parseInt(it.match(/:eq\((.*?)\)/)[1]);
|
||
|
if(pos >= 0 ){ // jsoup的eq 正整数从1开始
|
||
|
it = it.replace(/:eq\((.*?)\)/,`:eq(${pos+1})`);
|
||
|
sp[idex] = it;
|
||
|
}
|
||
|
}else if (!SELECT_REGEX.test(it) && it!=='body') {
|
||
|
sp[idex] = it+':eq(1)'; // jsoup的eq从1开始
|
||
|
}
|
||
|
});
|
||
|
parse = sp.join(' ');
|
||
|
}
|
||
|
if(parse === 'Text'){
|
||
|
parse = 'body';
|
||
|
option = 'Text';
|
||
|
}else if(parse === 'Html'){
|
||
|
parse = 'body';
|
||
|
option = 'Html';
|
||
|
}
|
||
|
print('pdfh parse后:'+parse+',option:'+option);
|
||
|
let result = defaultParser.pdfh(html,parse + " " + option);
|
||
|
print(result);
|
||
|
if(option&&/style/.test(option.toLowerCase())&&/url\(/.test(result)){
|
||
|
try {
|
||
|
result = result.match(/url\((.*?)\)/)[1];
|
||
|
}catch (e) {}
|
||
|
}
|
||
|
if (result && base_url && option && DOM_CHECK_ATTR.test(option)) {
|
||
|
if (/http/.test(result)) {
|
||
|
result = result.substr(result.indexOf('http'));
|
||
|
} else {
|
||
|
result = urljoin(base_url, result)
|
||
|
}
|
||
|
}
|
||
|
return result;
|
||
|
},
|
||
|
pdfa(html, parse) {
|
||
|
if (!parse || !parse.trim()) {
|
||
|
print('!parse');
|
||
|
return [];
|
||
|
}
|
||
|
parse = parse.trim();
|
||
|
print('pdfa parse前:'+parse);
|
||
|
if (parse.indexOf('&&') > -1) {
|
||
|
let sp = parse.split('&&');
|
||
|
sp.forEach((it,idex)=>{
|
||
|
if(/:eq\((.*?)\)/.test(it) && idex < sp.length - 1){
|
||
|
let pos = parseInt(it.match(/:eq\((.*?)\)/)[1]);
|
||
|
if(pos >= 0 ){ // jsoup的eq 正整数从1开始
|
||
|
it = it.replace(/:eq\((.*?)\)/,`:eq(${pos+1})`);
|
||
|
sp[idex] = it;
|
||
|
}
|
||
|
}else if (!SELECT_REGEX_A.test(it) && idex < sp.length - 1 && it!=='body') {
|
||
|
sp[idex] = it+':eq(1)'; // jsoup的eq从1开始
|
||
|
}
|
||
|
});
|
||
|
parse = sp.join(' ');
|
||
|
}
|
||
|
print('pdfa parse后:'+parse);
|
||
|
let result = defaultParser.pdfa(html,parse);
|
||
|
// print(result);
|
||
|
print(result.length);
|
||
|
return result;
|
||
|
},
|
||
|
pd(html,parse,uri){
|
||
|
return parseTags.jq.pdfh(html, parse, MY_URL);
|
||
|
},
|
||
|
},
|
||
|
};
|