Files
SiteProxy/proxy/rewriter.go
2025-12-15 01:47:36 +08:00

307 lines
8.1 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//proxy/rewriter.go
package proxy
import (
"bytes"
"golang.org/x/net/html"
"net/url"
"strings"
)
// ContentRewriter rewrites URLs embedded in HTML and CSS so that they point
// at the proxy endpoint ("/proxy?url=...") instead of the original location.
type ContentRewriter struct {
// baseURL is the parsed URL handed to NewContentRewriter; relative
// references are resolved against it before being proxied.
baseURL *url.URL
}
// NewContentRewriter builds a ContentRewriter whose relative references are
// resolved against baseURL.
//
// It returns the url.Parse error unchanged when baseURL cannot be parsed.
func NewContentRewriter(baseURL string) (*ContentRewriter, error) {
	parsed, parseErr := url.Parse(baseURL)
	if parseErr != nil {
		return nil, parseErr
	}

	rw := new(ContentRewriter)
	rw.baseURL = parsed
	return rw, nil
}
// RewriteHTML rewrites every URL found in an HTML document.
//
// The document is parsed into a node tree, rewritten in place and rendered
// back to bytes. If either parsing or rendering fails, the method falls back
// to the plain string replacement done by simpleRewriteHTML on the original
// body. The returned error is always nil.
func (r *ContentRewriter) RewriteHTML(body []byte) ([]byte, error) {
	root, parseErr := html.Parse(bytes.NewReader(body))
	if parseErr != nil {
		// Parsing failed: degrade to simple string replacement.
		return r.simpleRewriteHTML(body), nil
	}

	r.rewriteNode(root)

	rendered := new(bytes.Buffer)
	if renderErr := html.Render(rendered, root); renderErr != nil {
		// Rendering failed: same fallback, applied to the untouched input.
		return r.simpleRewriteHTML(body), nil
	}
	return rendered.Bytes(), nil
}
// rewriteNode recursively rewrites the subtree rooted at n in place.
//
// For element nodes it:
//   - proxies the href, src, action and data attributes via rewriteURL;
//   - rewrites every candidate of a srcset attribute;
//   - rewrites URLs inside style attributes and <style> element text;
//   - sets the href of a <base> element to the rewriter's base URL;
//   - detaches <script> elements whose src is flagged by isTrackingScript.
func (r *ContentRewriter) rewriteNode(n *html.Node) {
	if n.Type == html.ElementNode {
		for i, attr := range n.Attr {
			switch attr.Key {
			case "href", "src", "action", "data":
				n.Attr[i].Val = r.rewriteURL(attr.Val)
			case "srcset":
				n.Attr[i].Val = r.rewriteSrcset(attr.Val)
			case "style":
				n.Attr[i].Val = r.rewriteInlineCSS(attr.Val)
			}
		}

		switch n.Data {
		case "base":
			// Overrides whatever the attribute loop above stored in href.
			// NOTE(review): a <base href> pointing at the upstream site makes
			// browsers resolve the root-relative "/proxy?url=..." links
			// against that site rather than the proxy host - confirm this is
			// intended.
			for i, attr := range n.Attr {
				if attr.Key == "href" {
					n.Attr[i].Val = r.baseURL.String()
				}
			}
		case "style":
			if text := n.FirstChild; text != nil && text.Type == html.TextNode {
				text.Data = r.rewriteInlineCSS(text.Data)
			}
		case "script":
			// The src inspected here has already been passed through
			// rewriteURL by the attribute loop above.
			for _, attr := range n.Attr {
				if attr.Key == "src" && n.Parent != nil && r.isTrackingScript(attr.Val) {
					n.Parent.RemoveChild(n)
					return
				}
			}
		}
	}

	// Capture the next sibling before recursing: when the recursive call
	// removes c from the tree, RemoveChild clears c.NextSibling, and reading
	// it afterwards would end the walk and leave later siblings unrewritten.
	for c := n.FirstChild; c != nil; {
		next := c.NextSibling
		r.rewriteNode(c)
		c = next
	}
}
// rewriteURL converts a single URL reference into its proxied form,
// "/proxy?url=<query-escaped absolute URL>".
//
// Surrounding whitespace is trimmed first. The trimmed input is returned
// unchanged when it is empty, is a pure fragment ("#..."), cannot be parsed,
// or uses a scheme other than http/https (javascript:, data:, mailto:, tel:,
// blob:, about:, ...). Scheme-less references are resolved against the
// rewriter's base URL before being proxied.
func (r *ContentRewriter) rewriteURL(urlStr string) string {
	urlStr = strings.TrimSpace(urlStr)
	if urlStr == "" || strings.HasPrefix(urlStr, "#") {
		return urlStr
	}

	u, err := url.Parse(urlStr)
	if err != nil {
		return urlStr
	}

	// url.Parse lower-cases the scheme, so this check is case-insensitive
	// ("JavaScript:..." is skipped just like "javascript:...").
	switch u.Scheme {
	case "", "http", "https":
		// Fetchable through the proxy endpoint.
	default:
		return urlStr
	}

	// Relative (scheme-less) references become absolute against the base URL.
	if !u.IsAbs() {
		u = r.baseURL.ResolveReference(u)
	}

	return "/proxy?url=" + url.QueryEscape(u.String())
}
// rewriteSrcset rewrites the URL of every image candidate in a srcset
// attribute value and returns the candidates joined with ", ".
//
// Each candidate is read as a run of non-whitespace characters (the URL)
// followed by an optional descriptor that extends to the next comma. Commas
// inside the URL itself (data: URIs, "?x=1,2") therefore stay part of the
// URL; only commas that trail the URL or follow the descriptor separate
// candidates. Whitespace inside descriptors is collapsed to single spaces.
// An empty or blank input yields "".
func (r *ContentRewriter) rewriteSrcset(srcset string) string {
	const space = " \t\n\r\f"

	var candidates []string
	rest := srcset
	for {
		// Drop separators left over from the previous candidate.
		rest = strings.TrimLeft(rest, ","+space)
		if rest == "" {
			break
		}

		// The URL runs up to the first whitespace character.
		var rawURL, descriptor string
		if end := strings.IndexAny(rest, space); end == -1 {
			rawURL, rest = rest, ""
		} else {
			rawURL, rest = rest[:end], rest[end:]
		}

		if trimmed := strings.TrimRight(rawURL, ","); trimmed != rawURL {
			// A trailing comma ends the candidate; it has no descriptor.
			rawURL = trimmed
		} else if comma := strings.IndexByte(rest, ','); comma == -1 {
			descriptor, rest = rest, ""
		} else {
			descriptor, rest = rest[:comma], rest[comma+1:]
		}

		fields := append([]string{r.rewriteURL(rawURL)}, strings.Fields(descriptor)...)
		candidates = append(candidates, strings.Join(fields, " "))
	}
	return strings.Join(candidates, ", ")
}
// RewriteCSS rewrites the URLs in a CSS document and returns the result as a
// new byte slice. All of the work is delegated to rewriteInlineCSS.
func (r *ContentRewriter) RewriteCSS(body []byte) []byte {
	rewritten := r.rewriteInlineCSS(string(body))
	return []byte(rewritten)
}
// rewriteInlineCSS rewrites the URLs referenced by a piece of CSS (a
// stylesheet, a <style> body or a style attribute value).
//
// @import statements are handled first by rewriteImports. The text is then
// scanned once for url(...) tokens; the argument may be double-quoted,
// single-quoted or bare, with optional whitespace inside the parentheses.
// Only the URL itself is replaced, so quotes and spacing are preserved.
//
// Every URL is rewritten at most once: the scan never revisits text it has
// already emitted, and a value that already points at the proxy endpoint
// (for example one produced by rewriteImports) is left alone. If a url(
// token is never closed, the remainder of the input is returned untouched.
func (r *ContentRewriter) rewriteInlineCSS(css string) string {
	const (
		proxyPrefix = "/proxy?url="
		space       = " \t\n\r\f"
	)

	css = r.rewriteImports(css)

	var out strings.Builder
	out.Grow(len(css))

	rest := css
	for {
		open := strings.Index(rest, "url(")
		if open == -1 {
			break
		}

		// Skip whitespace between "(" and the URL or its opening quote.
		argStart := open + len("url(")
		arg := rest[argStart:]
		urlStart := argStart + len(arg) - len(strings.TrimLeft(arg, space))

		var urlEnd int
		if urlStart < len(rest) && (rest[urlStart] == '"' || rest[urlStart] == '\'') {
			// Quoted form: the URL runs to the matching quote, so a ")"
			// inside the quotes does not end it.
			quote := rest[urlStart]
			urlStart++
			n := strings.IndexByte(rest[urlStart:], quote)
			if n == -1 {
				break
			}
			urlEnd = urlStart + n
		} else {
			// Bare form: the URL runs to ")", minus trailing whitespace.
			n := strings.IndexByte(rest[urlStart:], ')')
			if n == -1 {
				break
			}
			urlEnd = urlStart + len(strings.TrimRight(rest[urlStart:urlStart+n], space))
		}

		target := rest[urlStart:urlEnd]
		out.WriteString(rest[:urlStart])
		if strings.HasPrefix(target, proxyPrefix) {
			// Already proxied; wrapping it again would break the link.
			out.WriteString(target)
		} else {
			out.WriteString(r.rewriteURL(target))
		}
		rest = rest[urlEnd:]
	}
	out.WriteString(rest)
	return out.String()
}
// rewriteImports rewrites the target of CSS @import statements.
//
// Four literal spellings are recognised, in this order: @import "...",
// @import '...', @import url("...") and @import url('...'). For each one the
// text between the marker and the next matching quote is passed through
// rewriteURL. If a marker has no closing quote, scanning for that spelling
// stops and the remaining text is kept as is.
func (r *ContentRewriter) rewriteImports(css string) string {
	markers := [...]string{
		`@import "`,
		`@import '`,
		`@import url("`,
		`@import url('`,
	}

	result := css
	for _, marker := range markers {
		// Every marker ends with the quote character that also closes the URL.
		closer := marker[len(marker)-1:]

		var rebuilt strings.Builder
		tail := result
		for {
			at := strings.Index(tail, marker)
			if at < 0 {
				break
			}
			urlFrom := at + len(marker)
			span := strings.Index(tail[urlFrom:], closer)
			if span < 0 {
				break
			}
			rebuilt.WriteString(tail[:urlFrom])
			rebuilt.WriteString(r.rewriteURL(tail[urlFrom : urlFrom+span]))
			// Resume at the closing quote so rewritten text is never rescanned.
			tail = tail[urlFrom+span:]
		}
		rebuilt.WriteString(tail)
		result = rebuilt.String()
	}
	return result
}
// simpleRewriteHTML is the string-based fallback used when the HTML cannot be
// parsed or rendered.
//
// It looks for href, src and action attributes (double- or single-quoted)
// whose value starts with the base URL's origin (scheme://host) and replaces
// the whole attribute value with "/proxy?url=<query-escaped URL>". Escaping
// the complete value, not just the origin prefix, keeps the path and query
// string of the original link inside the single "url" parameter. "&amp;" in
// the value is decoded to "&" before escaping. Relative URLs and URLs on
// other origins are left untouched, as is an attribute whose closing quote is
// missing.
func (r *ContentRewriter) simpleRewriteHTML(body []byte) []byte {
	origin := r.baseURL.Scheme + "://" + r.baseURL.Host
	content := string(body)

	for _, attr := range []string{"href", "src", "action"} {
		for _, quote := range []string{`"`, `'`} {
			marker := attr + "=" + quote + origin
			if !strings.Contains(content, marker) {
				continue
			}

			var out strings.Builder
			rest := content
			for {
				at := strings.Index(rest, marker)
				if at == -1 {
					break
				}
				// The value starts where the origin starts and runs to the
				// closing quote.
				valStart := at + len(marker) - len(origin)
				valLen := strings.Index(rest[valStart:], quote)
				if valLen == -1 {
					break
				}
				target := strings.ReplaceAll(rest[valStart:valStart+valLen], "&amp;", "&")

				out.WriteString(rest[:valStart])
				out.WriteString("/proxy?url=")
				out.WriteString(url.QueryEscape(target))
				rest = rest[valStart+valLen:]
			}
			out.WriteString(rest)
			content = out.String()
		}
	}
	return []byte(content)
}
// isTrackingScript reports whether a script src looks like a known tracker.
//
// The match is a case-insensitive substring test against a fixed list of
// host names and file names, so any src that merely contains one of the
// entries (for example "mega.js", which contains "ga.js") is also reported.
func (r *ContentRewriter) isTrackingScript(src string) bool {
	markers := [...]string{
		"google-analytics.com",
		"googletagmanager.com",
		"facebook.net",
		"doubleclick.net",
		"analytics.js",
		"ga.js",
		"gtag.js",
	}

	haystack := strings.ToLower(src)
	for i := range markers {
		if strings.Contains(haystack, markers[i]) {
			return true
		}
	}
	return false
}