修复drpy:磁力链接等特殊链接(ftp/magnet/thunder/ws)不再走urlJoin;style属性取url()内链接时自动去除首尾单双引号
This commit is contained in:
parent
dbc8e560a3
commit
aff38eb7fe
@ -1 +1 @@
|
||||
3.9.47beta15
|
||||
3.9.47beta15
|
||||
14
libs/drpy.js
14
libs/drpy.js
@ -55,7 +55,7 @@ function pre(){
|
||||
}
|
||||
|
||||
let rule = {};
|
||||
const VERSION = 'drpy1 3.9.47beta1 20230711';
|
||||
const VERSION = 'drpy1 3.9.47beta15 20230728';
|
||||
/** 已知问题记录
|
||||
* 1.影魔的jinja2引擎不支持 {{fl}}对象直接渲染 (有能力解决的话尽量解决下,支持对象直接渲染字符串转义,如果加了|safe就不转义)[影魔牛逼,最新的文件发现这问题已经解决了]
|
||||
* Array.prototype.append = Array.prototype.push; 这种js执行后有毛病,for in 循环列表会把属性给打印出来 (这个大毛病需要重点排除一下)
|
||||
@ -111,6 +111,8 @@ var _pdfa;
|
||||
var _pd;
|
||||
// const DOM_CHECK_ATTR = ['url', 'src', 'href', 'data-original', 'data-src'];
|
||||
const DOM_CHECK_ATTR = /(url|src|href|-original|-src|-play|-url|style)$/;
|
||||
// 过滤特殊链接,不走urlJoin
|
||||
const SPECIAL_URL = /^(ftp|magnet|thunder|ws):/;
|
||||
const SELECT_REGEX = /:eq|:lt|:gt|#/g;
|
||||
const SELECT_REGEX_A = /:eq|:lt|:gt/g;
|
||||
|
||||
@ -537,7 +539,7 @@ const defaultParser = {
|
||||
if(typeof(uri)==='undefined'||!uri){
|
||||
uri = '';
|
||||
}
|
||||
if(DOM_CHECK_ATTR.test(parse)){
|
||||
if(DOM_CHECK_ATTR.test(parse) && !SPECIAL_URL.test(ret)){
|
||||
if(/http/.test(ret)){
|
||||
ret = ret.substr(ret.indexOf('http'));
|
||||
}else{
|
||||
@ -569,6 +571,8 @@ function pdfh2(html,parse){
|
||||
if(/style/.test(option.toLowerCase())&&/url\(/.test(result)){
|
||||
try {
|
||||
result = result.match(/url\((.*?)\)/)[1];
|
||||
// 2023/07/28新增 style取内部链接自动去除首尾单双引号
|
||||
result = result.replace(/^['|"](.*)['|"]$/, "$1");
|
||||
}catch (e) {}
|
||||
}
|
||||
return result
|
||||
@ -604,7 +608,7 @@ function pd2(html,parse,uri){
|
||||
if(typeof(uri)==='undefined'||!uri){
|
||||
uri = '';
|
||||
}
|
||||
if(DOM_CHECK_ATTR.test(parse)){
|
||||
if(DOM_CHECK_ATTR.test(parse) && !SPECIAL_URL.test(ret)){
|
||||
if(/http/.test(ret)){
|
||||
ret = ret.substr(ret.indexOf('http'));
|
||||
}else{
|
||||
@ -727,10 +731,12 @@ const parseTags = {
|
||||
if(/style/.test(option.toLowerCase())&&/url\(/.test(result)){
|
||||
try {
|
||||
result = result.match(/url\((.*?)\)/)[1];
|
||||
// 2023/07/28新增 style取内部链接自动去除首尾单双引号
|
||||
result = result.replace(/^['|"](.*)['|"]$/, "$1");
|
||||
}catch (e) {}
|
||||
}
|
||||
}
|
||||
if (result && base_url && DOM_CHECK_ATTR.test(option)) {
|
||||
if (result && base_url && DOM_CHECK_ATTR.test(option) && !SPECIAL_URL.test(result)) {
|
||||
if (/http/.test(result)) {
|
||||
result = result.substr(result.indexOf('http'));
|
||||
} else {
|
||||
|
||||
2
libs/drpy.min.js
vendored
2
libs/drpy.min.js
vendored
File diff suppressed because one or more lines are too long
@ -41,7 +41,7 @@ function pre(){
|
||||
|
||||
let rule = {};
|
||||
let vercode = typeof(pdfl) ==='function'?'drpy2.1':'drpy2';
|
||||
const VERSION = vercode+' 3.9.47beta1 20230711';
|
||||
const VERSION = vercode+' 3.9.47beta15 20230728';
|
||||
/** 已知问题记录
|
||||
* 1.影魔的jinja2引擎不支持 {{fl}}对象直接渲染 (有能力解决的话尽量解决下,支持对象直接渲染字符串转义,如果加了|safe就不转义)[影魔牛逼,最新的文件发现这问题已经解决了]
|
||||
* Array.prototype.append = Array.prototype.push; 这种js执行后有毛病,for in 循环列表会把属性给打印出来 (这个大毛病需要重点排除一下)
|
||||
@ -97,7 +97,9 @@ var _pdfh;
|
||||
var _pdfa;
|
||||
var _pd;
|
||||
// const DOM_CHECK_ATTR = ['url', 'src', 'href', 'data-original', 'data-src'];
|
||||
const DOM_CHECK_ATTR = /(url|src|href|-original|-src|-play|-url)$/;
|
||||
const DOM_CHECK_ATTR = /(url|src|href|-original|-src|-play|-url|style)$/;
|
||||
// 过滤特殊链接,不走urlJoin
|
||||
const SPECIAL_URL = /^(ftp|magnet|thunder|ws):/;
|
||||
const NOADD_INDEX = /:eq|:lt|:gt|:first|:last|^body$|^#/; // 不自动加eq下标索引
|
||||
const URLJOIN_ATTR = /(url|src|href|-original|-src|-play|-url|style)$/; // 需要自动urljoin的属性
|
||||
const SELECT_REGEX = /:eq|:lt|:gt|#/g;
|
||||
@ -544,6 +546,8 @@ function pdfh2(html,parse){
|
||||
if(/style/.test(option.toLowerCase())&&/url\(/.test(result)){
|
||||
try {
|
||||
result = result.match(/url\((.*?)\)/)[1];
|
||||
// 2023/07/28新增 style取内部链接自动去除首尾单双引号
|
||||
result = result.replace(/^['|"](.*)['|"]$/, "$1");
|
||||
}catch (e) {}
|
||||
}
|
||||
return result
|
||||
@ -579,7 +583,7 @@ function pd2(html,parse,uri){
|
||||
if(typeof(uri)==='undefined'||!uri){
|
||||
uri = '';
|
||||
}
|
||||
if(DOM_CHECK_ATTR.test(parse)){
|
||||
if(DOM_CHECK_ATTR.test(parse) && !SPECIAL_URL.test(ret)){
|
||||
if(/http/.test(ret)){
|
||||
ret = ret.substr(ret.indexOf('http'));
|
||||
}else{
|
||||
|
||||
2
libs/drpy2.min.js
vendored
2
libs/drpy2.min.js
vendored
File diff suppressed because one or more lines are too long
@ -176,6 +176,7 @@ namespace Peach.DataAccess
|
||||
private static readonly Regex p = new ("url\\((.*?)\\)", RegexOptions.Multiline | RegexOptions.Singleline);
|
||||
private static readonly Regex NOAdd_INDEX = new (":eq|:lt|:gt|:first|:last|^body$|^#");
|
||||
private static readonly Regex URLJOIN_ATTR = new ("(url|src|href|-original|-src|-play|-url|style)$", RegexOptions.Multiline | RegexOptions.IgnoreCase);
|
||||
private static readonly Regex SPECIAL_URL = new ("^(ftp|magnet|thunder|ws):", RegexOptions.Multiline | RegexOptions.IgnoreCase);
|
||||
private static String pdfh_html = "";
|
||||
private static String pdfa_html = "";
|
||||
private static Document? pdfh_doc = null;
|
||||
@ -317,13 +318,15 @@ namespace Peach.DataAccess
|
||||
Match m = p.Match(result);
|
||||
if (m.Success)
|
||||
result = m.Groups[1]?.Value;
|
||||
result = Regex.Replace(result, "^['|\"](.*)['|\"]$", "$1");
|
||||
}
|
||||
if (!string.IsNullOrWhiteSpace(result) && !string.IsNullOrWhiteSpace(Add_url))// (JSUtils.isNotEmpty(result) && JSUtils.isNotEmpty(Add_url))
|
||||
{
|
||||
// 需要自动urljoin的属性
|
||||
Match m = URLJOIN_ATTR.Match(option);
|
||||
Match n = SPECIAL_URL.Match(result);
|
||||
//if (isUrl(option)) {
|
||||
if (m.Success)
|
||||
if (m.Success && !n.Success)
|
||||
{
|
||||
if (result.Contains("http"))
|
||||
result = result[result.IndexOf("http")..];
|
||||
|
||||
@ -23,6 +23,7 @@ public class HtmlParser {
|
||||
private static final Pattern p = Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL);
|
||||
private static final Pattern NOADD_INDEX = Pattern.compile(":eq|:lt|:gt|:first|:last|^body$|^#"); // 不自动加eq下标索引
|
||||
private static final Pattern URLJOIN_ATTR = Pattern.compile("(url|src|href|-original|-src|-play|-url|style)$", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); // 需要自动urljoin的属性
|
||||
private static final Pattern SPECIAL_URL = Pattern.compile("^(ftp|magnet|thunder|ws):", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); // 过滤特殊链接,不走urlJoin
|
||||
private static Document pdfh_doc = null;
|
||||
private static Document pdfa_doc = null;
|
||||
|
||||
@ -199,12 +200,15 @@ public class HtmlParser {
|
||||
if (m.find()) {
|
||||
result = m.group(1);
|
||||
}
|
||||
// 2023/07/28新增 style取内部链接自动去除首尾单双引号
|
||||
result = result.replaceAll("^['|\"](.*)['|\"]$", "$1");
|
||||
}
|
||||
if (JSUtils.isNotEmpty(result) && JSUtils.isNotEmpty(add_url)) {
|
||||
// 需要自动urljoin的属性
|
||||
Matcher m = URLJOIN_ATTR.matcher(option);
|
||||
Matcher n = SPECIAL_URL.matcher(result);
|
||||
//if (isUrl(option)) {
|
||||
if (m.find()) {
|
||||
if (m.find() && !n.find() {
|
||||
if (result.contains("http")) {
|
||||
result = result.substring(result.indexOf("http"));
|
||||
} else {
|
||||
|
||||
@ -14,6 +14,7 @@ from jsonpath import jsonpath
|
||||
PARSE_CACHE = True # 解析缓存
|
||||
NOADD_INDEX = ':eq|:lt|:gt|:first|:last|^body$|^#' # 不自动加eq下标索引
|
||||
URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url|style)$' # 需要自动urljoin的属性
|
||||
SPECIAL_URL = '^(ftp|magnet|thunder|ws):' # 过滤特殊链接,不走urlJoin
|
||||
|
||||
|
||||
class jsoup:
|
||||
@ -193,11 +194,13 @@ class jsoup:
|
||||
if self.contains(option.lower(), 'style') and self.contains(ret, 'url('):
|
||||
try:
|
||||
ret = re.search('url\((.*?)\)', ret, re.M | re.S).groups()[0]
|
||||
# 2023/07/28新增 style取内部链接自动去除首尾单双引号
|
||||
ret = re.sub(r"^['\"]|['\"]$", '', ret)
|
||||
except:
|
||||
pass
|
||||
if ret and base_url:
|
||||
# need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
|
||||
need_add = self.test(URLJOIN_ATTR, option)
|
||||
need_add = self.test(URLJOIN_ATTR, option) and not self.test(SPECIAL_URL, ret)
|
||||
if need_add:
|
||||
if 'http' in ret:
|
||||
ret = ret[ret.find('http'):]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user