N3RD/JN/dr_py/utils/HtmlParser.java

299 lines
11 KiB
Java
Raw Normal View History

2024-01-08 22:50:47 +08:00
package com.github.tvbox.osc.util.js;
import android.text.TextUtils;
import com.quickjs.android.JSUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class HtmlParser {
private static String pdfh_html = "";
private static String pdfa_html = "";
private static final Pattern p = Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL);
private static final Pattern NOADD_INDEX = Pattern.compile(":eq|:lt|:gt|:first|:last|^body$|^#"); // 不自动加eq下标索引
private static final Pattern URLJOIN_ATTR = Pattern.compile("(url|src|href|-original|-src|-play|-url|style)$", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); // 需要自动urljoin的属性
private static final Pattern SPECIAL_URL = Pattern.compile("^(ftp|magnet|thunder|ws):", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); // 过滤特殊链接,不走urlJoin
private static Document pdfh_doc = null;
private static Document pdfa_doc = null;
public static String joinUrl(String parent, String child) {
if (JSUtils.isEmpty(parent)) {
return child;
}
URL url;
String q = parent;
try {
url = new URL(new URL(parent), child);
q = url.toExternalForm();
} catch (MalformedURLException e) {
e.printStackTrace();
}
// if (q.contains("#")) {
// q = q.replaceAll("^(.+?)#.*?$", "$1");
// }
return q;
}
public static class Painfo {
public String nparse_rule;
public int nparse_index;
public List<String> excludes;
}
private static Painfo getParseInfo(String nparse) {
/*
根据传入的单规则获取 parse规则索引位置,排除列表 -- 可以用于剔除元素,支持多个按标签剔除按id剔除等操作
:param nparse:
:return:*/
Painfo painfo = new Painfo();
//List<String> excludes = new ArrayList<>(); //定义排除列表默认值为空
//int nparse_index; //定义位置索引默认值为0
painfo.nparse_rule = nparse; //定义规则默认值为本身
if (nparse.contains(":eq")) {
painfo.nparse_rule = nparse.split(":")[0];
String nparse_pos = nparse.split(":")[1];
if (painfo.nparse_rule.contains("--")) {
String[] rules = painfo.nparse_rule.split("--");
painfo.excludes = new ArrayList<>(Arrays.asList(rules));
painfo.excludes.remove(0);
painfo.nparse_rule = rules[0];
} else if (nparse_pos.contains("--")) {
String[] rules = nparse_pos.split("--");
painfo.excludes = new ArrayList<>(Arrays.asList(rules));
painfo.excludes.remove(0);
nparse_pos = rules[0];
}
try {
painfo.nparse_index = Integer.parseInt(nparse_pos.replace("eq(", "").replace(")", ""));
} catch (Exception e1) {
painfo.nparse_index = 0;
}
} else {
if (nparse.contains("--")) {
String[] rules = painfo.nparse_rule.split("--");
painfo.excludes = new ArrayList<>(Arrays.asList(rules));
painfo.excludes.remove(0);
painfo.nparse_rule = rules[0];
}
}
return painfo;
}
public static boolean isIndex(String str) {
if (JSUtils.isEmpty(str)) {
return false;
}
for (String str2 : new String[]{":eq", ":lt", ":gt", ":first", ":last", "body", "#"}) {
if (str.contains(str2)) {
if (str2.equals("body") || str2.equals("#")) {
return str.startsWith(str2);
}
return true;
}
}
return false;
}
public static boolean isUrl(String str) {
if (JSUtils.isEmpty(str)) {
return false;
}
for (String str2 : new String[]{"url", "src", "href", "-original", "-play"}) {
if (str.contains(str2)) {
return true;
}
}
return false;
}
private static String parseHikerToJq(String parse, boolean first) {
/*
海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0)
:param parse:
:param first:
:return:
*/
// 不自动加eq下标索引
if (parse.contains("&&")) {
String[] parses = parse.split("&&"); //带&&的重新拼接
List<String> new_parses = new ArrayList<>(); //构造新的解析表达式列表
for (int i = 0; i < parses.length; i++) {
String[] pss = parses[i].split(" ");
String ps = pss[pss.length - 1]; //如果分割&&后带空格就取最后一个元素
Matcher m = NOADD_INDEX.matcher(ps);
//if (!isIndex(ps)) {
if (!m.find()) {
if (!first && i >= parses.length - 1) { //不传first且遇到最后一个,不用补eq(0)
new_parses.add(parses[i]);
} else {
new_parses.add(parses[i] + ":eq(0)");
}
} else {
new_parses.add(parses[i]);
}
}
parse = TextUtils.join(" ", new_parses);
} else {
String[] pss = parse.split(" ");
String ps = pss[pss.length - 1]; //如果分割&&后带空格就取最后一个元素
Matcher m = NOADD_INDEX.matcher(ps);
//if (!isIndex(ps) && first) {
if (!m.find() && first) {
parse = parse + ":eq(0)";
}
}
return parse;
}
public static String parseDomForUrl(String html, String rule, String add_url) {
if (!pdfh_html.equals(html)) {
pdfh_html = html;
pdfh_doc = Jsoup.parse(html);
}
Document doc = pdfh_doc;
if (rule.equals("body&&Text") || rule.equals("Text")) {
return doc.text();
} else if (rule.equals("body&&Html") || rule.equals("Html")) {
return doc.html();
}
String option = "";
if (rule.contains("&&")) {
String[] rs = rule.split("&&");
option = rs[rs.length - 1];
List<String> excludes = new ArrayList<>(Arrays.asList(rs));
excludes.remove(rs.length - 1);
rule = TextUtils.join("&&", excludes);
}
rule = parseHikerToJq(rule, true);
String[] parses = rule.split(" ");
Elements ret = new Elements();
for (String nparse : parses) {
ret = parseOneRule(doc, nparse, ret);
if (ret.isEmpty()) {
return "";
}
}
String result;
if (JSUtils.isNotEmpty(option)) {
if (option.equals("Text")) {
result = ret.text();
} else if (option.equals("Html")) {
result = ret.html();
} else {
result = ret.attr(option);
if (option.toLowerCase().contains("style") && result.contains("url(")) {
Matcher m = p.matcher(result);
if (m.find()) {
result = m.group(1);
}
// 2023/07/28新增 style取内部链接自动去除首尾单双引号
result = result.replaceAll("^['|\"](.*)['|\"]$", "$1");
}
if (JSUtils.isNotEmpty(result) && JSUtils.isNotEmpty(add_url)) {
// 需要自动urljoin的属性
Matcher m = URLJOIN_ATTR.matcher(option);
Matcher n = SPECIAL_URL.matcher(result);
//if (isUrl(option)) {
if (m.find() && !n.find()){
if (result.contains("http")) {
result = result.substring(result.indexOf("http"));
} else {
result = joinUrl(add_url, result);
}
}
}
}
} else {
result = ret.outerHtml();
}
return result;
}
public static List<String> parseDomForArray(String html, String rule) {
if (!pdfa_html.equals(html)) {
pdfa_html = html;
pdfa_doc = Jsoup.parse(html);
}
Document doc = pdfa_doc;
rule = parseHikerToJq(rule, false);
String[] parses = rule.split(" ");
Elements ret = new Elements();
for (String pars : parses) {
ret = parseOneRule(doc, pars, ret);
if (ret.isEmpty()) {
return new ArrayList<>();
}
}
List<String> eleHtml = new ArrayList<>();
for (int i = 0; i < ret.size(); i++) {
Element element1 = ret.get(i);
eleHtml.add(element1.outerHtml());
}
return eleHtml;
}
private static Elements parseOneRule(Document doc, String nparse, Elements ret) {
Painfo painfo = getParseInfo(nparse);
if (ret.isEmpty()) {
ret = doc.select(painfo.nparse_rule);
} else {
ret = ret.select(painfo.nparse_rule);
}
if (nparse.contains(":eq")) {
if(painfo.nparse_index < 0){
ret = ret.eq(ret.size() + painfo.nparse_index);
} else {
ret = ret.eq(painfo.nparse_index);
}
}
if (painfo.excludes != null && !ret.isEmpty()) {
ret = ret.clone(); //克隆一个, 免得直接remove会影响doc的缓存
for (int i = 0; i < painfo.excludes.size(); i++) {
ret.select(painfo.excludes.get(i)).remove();
}
}
return ret;
}
public static List<String> parseDomForList(String html, String p1, String list_text, String list_url, String add_url) {
if (!pdfa_html.equals(html)) {
pdfa_html = html;
pdfa_doc = Jsoup.parse(html);
}
Document doc = pdfa_doc;
p1 = parseHikerToJq(p1, false);
String[] parses = p1.split(" ");
Elements ret = new Elements();
for (String pars : parses) {
ret = parseOneRule(doc, pars, ret);
if (ret.isEmpty()) {
return new ArrayList<>();
}
}
List<String> new_vod_list = new ArrayList<>();
for(int i = 0; i < ret.size(); i++){
String it = ret.get(i).outerHtml();
new_vod_list.add(parseDomForUrl(it, list_text, "").trim() + '$' + parseDomForUrl(it, list_url, add_url));
}
return new_vod_list;
}
}