Files
SiteProxy/proxy/rewriter-base.go
2025-12-15 21:09:23 +08:00

351 lines
10 KiB
Go

// proxy/rewriter-base.go
package proxy
import (
"bytes"
"golang.org/x/net/html"
"net/url"
"strings"
)
// ContentRewriter rewrites URLs found in HTML and CSS so that they point at
// the proxy path prefix "/p/<token>" instead of the upstream site.
type ContentRewriter struct {
// baseURL is the parsed upstream URL; relative references are resolved
// against it and its host decides whether a URL counts as same-site.
baseURL *url.URL
// token is the path segment placed after "/p/" in every rewritten URL.
token string
}
// NewContentRewriter builds a ContentRewriter for the given upstream base URL
// and proxy token. It returns the url.Parse error unchanged when baseURL
// cannot be parsed.
func NewContentRewriter(baseURL, token string) (*ContentRewriter, error) {
	parsed, parseErr := url.Parse(baseURL)
	if parseErr != nil {
		return nil, parseErr
	}

	rw := new(ContentRewriter)
	rw.baseURL = parsed
	rw.token = token
	return rw, nil
}
// RewriteHTML parses body as an HTML document, rewrites it via rewriteNode and
// returns the re-rendered markup. If parsing or rendering fails it falls back
// to the string-based simpleRewriteHTML. The returned error is always nil.
func (r *ContentRewriter) RewriteHTML(body []byte) ([]byte, error) {
	root, parseErr := html.Parse(bytes.NewReader(body))
	if parseErr == nil {
		r.rewriteNode(root)

		out := new(bytes.Buffer)
		if renderErr := html.Render(out, root); renderErr == nil {
			return out.Bytes(), nil
		}
	}

	// Parsing or rendering failed: use the plain string replacement path.
	return r.simpleRewriteHTML(body), nil
}
// rewriteNode walks the subtree rooted at n and rewrites it in place:
//
//   - a request-interception script is inserted as the first child of <head>;
//   - href/src/action/data, srcset and style attributes are rewritten;
//   - <form> elements get a proxied action when theirs is missing or empty;
//   - <base href> is set to the upstream base URL;
//   - the text of <style> elements is rewritten as CSS;
//   - <script src> elements recognised by isTrackingScript are removed.
func (r *ContentRewriter) rewriteNode(n *html.Node) {
	if n.Type == html.ElementNode {
		// Inject the request-interception script at the very start of <head>.
		if n.Data == "head" {
			// The token and base URL end up inside JavaScript string literals
			// in a raw-text <script> element, so escape everything that could
			// terminate the literal or the element.
			jsString := strings.NewReplacer(
				`\`, `\\`,
				`"`, `\"`,
				"<", `\x3c`,
				">", `\x3e`,
				"\n", `\n`,
				"\r", `\r`,
				"\u2028", `\u2028`,
				"\u2029", `\u2029`,
			)
			script := &html.Node{
				Type: html.ElementNode,
				Data: "script",
			}
			script.AppendChild(&html.Node{
				Type: html.TextNode,
				Data: `(function(){var t="/p/` + jsString.Replace(r.token) + `";var b="` + jsString.Replace(r.baseURL.String()) + `";function r(u){if(!u||typeof u!=="string")return u;if(u.startsWith(t))return u;if(u.startsWith("http://")||u.startsWith("https://")||u.startsWith("data:")||u.startsWith("blob:")||u.startsWith("javascript:"))return u;if(u.startsWith("/")){return t+u}try{var a=new URL(u,b);if(a.origin===new URL(b).origin){return t+a.pathname+a.search+a.hash}}catch(e){}return u}var o=XMLHttpRequest.prototype.open;XMLHttpRequest.prototype.open=function(m,u){arguments[1]=r(u);return o.apply(this,arguments)};var f=window.fetch;window.fetch=function(u,opt){return f.call(this,r(u),opt)};var oa=Element.prototype.setAttribute;Element.prototype.setAttribute=function(n,v){if((n==="href"||n==="src")&&typeof v==="string"){v=r(v)}return oa.call(this,n,v)};var oi=Image;window.Image=function(){var i=new oi();var d=Object.getOwnPropertyDescriptor(HTMLImageElement.prototype,"src");if(d&&d.set){var os=d.set;Object.defineProperty(i,"src",{set:function(v){os.call(this,r(v))},get:d.get})}return i};document.addEventListener("submit",function(e){var a=e.target.action;if(a){e.target.action=r(a)}},true)})();`,
			})
			// InsertBefore appends when the reference child is nil, so an
			// empty <head> receives the script as well.
			n.InsertBefore(script, n.FirstChild)
		}

		for i := range n.Attr {
			attr := &n.Attr[i]
			switch attr.Key {
			case "href", "src", "action", "data":
				attr.Val = r.rewriteURL(attr.Val)
			case "srcset":
				attr.Val = r.rewriteSrcset(attr.Val)
			case "style":
				attr.Val = r.rewriteInlineCSS(attr.Val)
			}
		}

		switch n.Data {
		case "form":
			// A form without an action (or with an empty one) submits to the
			// current document, so point it at the proxied base URL.
			hasAction := false
			for i := range n.Attr {
				if n.Attr[i].Key != "action" {
					continue
				}
				hasAction = true
				if n.Attr[i].Val == "" {
					n.Attr[i].Val = r.rewriteURL(r.baseURL.String())
				}
				break
			}
			if !hasAction {
				n.Attr = append(n.Attr, html.Attribute{
					Key: "action",
					Val: r.rewriteURL(r.baseURL.String()),
				})
			}

		case "base":
			// NOTE(review): this overwrites the href (already rewritten by the
			// attribute loop above) with the upstream URL, so the browser will
			// resolve "/p/<token>/..." links against the upstream origin.
			// Confirm this is intended.
			for i := range n.Attr {
				if n.Attr[i].Key == "href" {
					n.Attr[i].Val = r.baseURL.String()
				}
			}

		case "style":
			if c := n.FirstChild; c != nil && c.Type == html.TextNode {
				c.Data = r.rewriteInlineCSS(c.Data)
			}

		case "script":
			for _, attr := range n.Attr {
				if attr.Key == "src" && r.isTrackingScript(attr.Val) && n.Parent != nil {
					n.Parent.RemoveChild(n)
					return
				}
			}
		}
	}

	// Remember the next sibling before recursing: RemoveChild clears the
	// removed node's NextSibling, which would otherwise end the walk early and
	// leave every following sibling untouched.
	for c := n.FirstChild; c != nil; {
		next := c.NextSibling
		r.rewriteNode(c)
		c = next
	}
}
// rewriteURL maps a URL found in upstream content to its proxied form.
//
// Surrounding whitespace is trimmed first. Empty strings, pure fragments
// ("#..."), URLs that already carry the "/p/<token>" prefix and URLs with a
// scheme other than http/https (javascript:, data:, mailto:, tel:, blob:,
// about:, ...) are returned without further changes. Same-host URLs become
// "/p/<token>" + path[?query][#fragment]; URLs on other hosts become
// "/p/<token>/" + the absolute URL. A URL that cannot be parsed is returned
// as-is.
func (r *ContentRewriter) rewriteURL(urlStr string) string {
	urlStr = strings.TrimSpace(urlStr)
	if urlStr == "" || strings.HasPrefix(urlStr, "#") {
		return urlStr
	}

	// Schemes are case-insensitive; checking them here also spares url.Parse
	// from javascript:/data: payloads it may reject.
	for _, scheme := range []string{"javascript:", "data:", "mailto:", "tel:", "blob:"} {
		if len(urlStr) >= len(scheme) && strings.EqualFold(urlStr[:len(scheme)], scheme) {
			return urlStr
		}
	}

	prefix := "/p/" + r.token

	// Avoid adding the token twice. Require a boundary after the token so that
	// "/p/<token>extra/..." is not mistaken for an already proxied URL.
	if strings.HasPrefix(urlStr, prefix) {
		if rest := urlStr[len(prefix):]; rest == "" || strings.ContainsRune("/?#", rune(rest[0])) {
			return urlStr
		}
	}

	if strings.HasPrefix(urlStr, "//") {
		// Protocol-relative: inherit the upstream scheme.
		urlStr = r.baseURL.Scheme + ":" + urlStr
	} else if strings.HasPrefix(urlStr, "/") {
		// Root-relative path on the upstream host.
		return prefix + urlStr
	}

	u, err := url.Parse(urlStr)
	if err != nil {
		return urlStr
	}

	if u.IsAbs() {
		// url.Parse lower-cases the scheme. Only HTTP(S) resources can be
		// fetched through the proxy; leave everything else to the browser.
		if u.Scheme != "http" && u.Scheme != "https" {
			return urlStr
		}
	} else {
		u = r.baseURL.ResolveReference(u)
	}

	if !strings.EqualFold(u.Host, r.baseURL.Host) {
		return prefix + "/" + u.String()
	}

	// Use the escaped forms so that encoded "?", "#" or "/" inside the path or
	// fragment are not decoded into URL delimiters.
	target := u.EscapedPath()
	if u.RawQuery != "" {
		target += "?" + u.RawQuery
	}
	if frag := u.EscapedFragment(); frag != "" {
		target += "#" + frag
	}
	return prefix + target
}
// rewriteSrcset rewrites the URL of every image candidate in a srcset
// attribute value and returns the candidates joined by ", ".
//
// Candidates are tokenised the way browsers do it: a URL runs up to the next
// whitespace, so commas inside a URL (for example in data: URIs) do not split
// it; a comma only separates candidates when it ends the URL token or follows
// the descriptors. Parenthesised descriptors are not given special treatment.
func (r *ContentRewriter) rewriteSrcset(srcset string) string {
	const space = " \t\n\f\r"

	var rewritten []string
	rest := srcset
	for {
		// Skip separators left over from the previous candidate.
		rest = strings.TrimLeft(rest, space+",")
		if rest == "" {
			break
		}

		// The URL is everything up to the next whitespace character.
		var candidate, descriptors string
		if end := strings.IndexAny(rest, space); end == -1 {
			candidate, rest = rest, ""
		} else {
			candidate, rest = rest[:end], rest[end:]
		}

		if trimmed := strings.TrimRight(candidate, ","); trimmed != candidate {
			// A trailing comma terminates the candidate; it has no descriptors.
			candidate = trimmed
		} else if comma := strings.IndexByte(rest, ','); comma == -1 {
			descriptors, rest = rest, ""
		} else {
			descriptors, rest = rest[:comma], rest[comma+1:]
		}

		fields := append([]string{r.rewriteURL(candidate)}, strings.Fields(descriptors)...)
		rewritten = append(rewritten, strings.Join(fields, " "))
	}
	return strings.Join(rewritten, ", ")
}
// RewriteCSS applies rewriteInlineCSS to a whole stylesheet and returns the
// result as a new byte slice.
func (r *ContentRewriter) RewriteCSS(body []byte) []byte {
	return []byte(r.rewriteInlineCSS(string(body)))
}
// rewriteInlineCSS rewrites every url(...) reference in css and then hands the
// result to rewriteImports for string-form @import rules.
//
// Each url( occurrence is handled exactly once, whether its argument is
// double-quoted, single-quoted or bare; the original quoting is kept. Doing
// this in a single pass matters: running a bare "url(" pass after the quoted
// passes would pick up the quoted form again, quotes included, and corrupt it.
// Scanning stops at the first url( that has no terminator.
func (r *ContentRewriter) rewriteInlineCSS(css string) string {
	const (
		open  = "url("
		space = " \t\n\f\r"
	)

	var out strings.Builder
	out.Grow(len(css))

	rest := css
	for {
		idx := strings.Index(rest, open)
		if idx == -1 {
			break
		}
		argStart := idx + len(open)

		// Skip optional whitespace between "(" and the argument.
		arg := strings.TrimLeft(rest[argStart:], space)
		lead := len(rest) - argStart - len(arg)

		// A quoted argument ends at the matching quote, a bare one at ")".
		quote, terminator := "", ")"
		if arg != "" && (arg[0] == '"' || arg[0] == '\'') {
			quote = arg[:1]
			terminator = quote
			arg = arg[1:]
		}

		end := strings.Index(arg, terminator)
		if end == -1 {
			break
		}

		out.WriteString(rest[:argStart+lead])
		out.WriteString(quote)
		out.WriteString(r.rewriteURL(arg[:end]))

		// Continue from the terminator; it is copied with the next chunk.
		rest = arg[end:]
	}
	out.WriteString(rest)

	return r.rewriteImports(out.String())
}
// rewriteImports rewrites the target of quoted @import rules. It makes one
// pass per supported opening marker; within a pass, scanning stops at the
// first marker whose closing quote is missing.
func (r *ContentRewriter) rewriteImports(css string) string {
	markers := []string{`@import "`, `@import '`, `@import url("`, `@import url('`}

	current := css
	for _, marker := range markers {
		// The closing quote matches the quote style used by the marker.
		closing := `'`
		if strings.Contains(marker, `"`) {
			closing = `"`
		}

		var out strings.Builder
		remaining := current
		for {
			at := strings.Index(remaining, marker)
			if at < 0 {
				break
			}
			from := at + len(marker)
			length := strings.Index(remaining[from:], closing)
			if length < 0 {
				break
			}

			out.WriteString(remaining[:from])
			out.WriteString(r.rewriteURL(remaining[from : from+length]))
			// Resume at the closing quote, right after the rewritten URL.
			remaining = remaining[from+length:]
		}
		out.WriteString(remaining)
		current = out.String()
	}
	return current
}
// simpleRewriteHTML is the parser-free fallback: it prefixes href, src and
// action attribute values (double- or single-quoted) with "/p/<token>" using
// plain string replacement.
//
// All substitutions run in a single strings.Replacer pass. Replaced text is
// never scanned again, so a value that was just rewritten to "/p/<token>/..."
// cannot be matched by the root-relative rule and prefixed a second time.
// For overlapping patterns the pair listed first wins, hence the ordering
// from most to least specific below.
func (r *ContentRewriter) simpleRewriteHTML(body []byte) []byte {
	origin := r.baseURL.Scheme + "://" + r.baseURL.Host
	prefix := "/p/" + r.token

	attrs := []string{"href", "src", "action"}
	quotes := []string{`"`, `'`}

	pairs := make([]string, 0, len(attrs)*len(quotes)*10)
	for _, attr := range attrs {
		for _, quote := range quotes {
			lead := attr + "=" + quote
			pairs = append(pairs,
				// Already proxied: keep as-is.
				lead+prefix, lead+prefix,
				// Absolute URL on the upstream host.
				lead+origin, lead+prefix,
				// Protocol-relative URL on the upstream host.
				lead+"//"+r.baseURL.Host, lead+prefix,
				// Protocol-relative URL on another host.
				lead+"//", lead+prefix+"/"+r.baseURL.Scheme+"://",
				// Root-relative path.
				lead+"/", lead+prefix+"/",
			)
		}
	}

	return []byte(strings.NewReplacer(pairs...).Replace(string(body)))
}
// isTrackingScript reports whether src looks like a known analytics/tracking
// script. The comparison is case-insensitive and works on both raw upstream
// URLs and already proxied ones ("/p/<token>/https://host/...").
//
// A tracking domain must appear as a whole host name or as a parent domain
// (preceded by start of string, "/", "." or "@" and followed by end of
// string, "/", ":", "?" or "#"). A tracking file name must be the complete
// last path segment, so "mega.js" is not mistaken for "ga.js".
func (r *ContentRewriter) isTrackingScript(src string) bool {
	trackingDomains := []string{
		"google-analytics.com",
		"googletagmanager.com",
		"facebook.net",
		"doubleclick.net",
	}
	trackingFiles := []string{
		"analytics.js",
		"ga.js",
		"gtag.js",
	}

	srcLower := strings.ToLower(strings.TrimSpace(src))

	for _, domain := range trackingDomains {
		for from := 0; ; {
			i := strings.Index(srcLower[from:], domain)
			if i == -1 {
				break
			}
			start := from + i
			end := start + len(domain)

			startsLabel := start == 0 || strings.IndexByte("/.@", srcLower[start-1]) != -1
			endsHost := end == len(srcLower) || strings.IndexByte("/:?#", srcLower[end]) != -1
			if startsLabel && endsHost {
				return true
			}
			from = start + 1
		}
	}

	// Compare the last path segment, ignoring any query string or fragment.
	pathPart := srcLower
	if i := strings.IndexAny(pathPart, "?#"); i != -1 {
		pathPart = pathPart[:i]
	}
	fileName := pathPart[strings.LastIndexByte(pathPart, '/')+1:]
	for _, name := range trackingFiles {
		if fileName == name {
			return true
		}
	}
	return false
}