feat: include a default robots.txt to reduce the impact of crawlers (#7387)

- Add a strict default robots.txt that is served when the instance administrators do not provide one.
- Remove the code for the legacy public asset path; the deprecation error has already been logged for a few releases (the legacy path has existed since v1.21).
- Resolves forgejo/forgejo#923
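
To illustrate the resolution order this introduces, here is a minimal standalone sketch, assuming an illustrative customPath and a trimmed-down default body; it is not Forgejo's actual handler (that code appears in the second diff hunk below):

// Minimal sketch of the fallback behaviour described above, not Forgejo's
// actual code. The path "/var/lib/forgejo/custom" and the names
// robotsTxtHandler/defaultRobotsTxt are illustrative stand-ins.
package main

import (
	"net/http"
	"os"
	"path/filepath"
)

var defaultRobotsTxt = []byte("User-agent: *\nDisallow: /api/\n")

func robotsTxtHandler(customPath string) http.HandlerFunc {
	return func(w http.ResponseWriter, req *http.Request) {
		w.Header().Set("Content-Type", "text/plain")
		// An administrator-provided file takes precedence over the default.
		custom := filepath.Join(customPath, "public", "robots.txt")
		if _, err := os.Stat(custom); err == nil {
			http.ServeFile(w, req, custom)
			return
		}
		// No custom file: serve the built-in default instead of a 404.
		_, _ = w.Write(defaultRobotsTxt)
	}
}

func main() {
	http.Handle("/robots.txt", robotsTxtHandler("/var/lib/forgejo/custom"))
	_ = http.ListenAndServe(":8080", nil)
}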

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/7387
Reviewed-by: Earl Warren <earl-warren@noreply.codeberg.org>
Reviewed-by: 0ko <0ko@noreply.codeberg.org>
Co-authored-by: Gusted <postmaster@gusted.xyz>
Co-committed-by: Gusted <postmaster@gusted.xyz>
Authored and committed by Gusted on 2025-03-30 11:28:19 +00:00
parent 51caba694a
commit bb4e1f426f
2 changed files with 88 additions and 12 deletions

@@ -198,9 +198,6 @@ func serveInstalled(ctx *cli.Context) error {
 	for fn := range publicFilesSet.Seq() {
 		log.Error("Found legacy public asset %q in CustomPath. Please move it to %s/public/assets/%s", fn, setting.CustomPath, fn)
 	}
-	if _, err := os.Stat(filepath.Join(setting.CustomPath, "robots.txt")); err == nil {
-		log.Error(`Found legacy public asset "robots.txt" in CustomPath. Please move it to %s/public/robots.txt`, setting.CustomPath)
-	}
 	routers.InitWebInstalled(graceful.GetManager().HammerContext())

@@ -33,17 +33,96 @@ func DummyOK(w http.ResponseWriter, req *http.Request) {
 	w.WriteHeader(http.StatusOK)
 }
 
-func RobotsTxt(w http.ResponseWriter, req *http.Request) {
-	robotsTxt := util.FilePathJoinAbs(setting.CustomPath, "public/robots.txt")
-	if ok, _ := util.IsExist(robotsTxt); !ok {
-		robotsTxt = util.FilePathJoinAbs(setting.CustomPath, "robots.txt") // the legacy "robots.txt"
-	}
-	httpcache.SetCacheControlInHeader(w.Header(), setting.StaticCacheTime)
-	http.ServeFile(w, req, robotsTxt)
-}
-
 func StaticRedirect(target string) func(w http.ResponseWriter, req *http.Request) {
 	return func(w http.ResponseWriter, req *http.Request) {
 		http.Redirect(w, req, path.Join(setting.StaticURLPrefix, target), http.StatusMovedPermanently)
 	}
 }
+
+var defaultRobotsTxt = []byte(`# The default Forgejo robots.txt
+# For more information: https://forgejo.org/docs/latest/admin/search-engines-indexation/
+
+User-agent: *
+
+Disallow: /api/
+Disallow: /avatars/
+Disallow: /user/
+Disallow: /swagger.*.json
+Disallow: /explore/*?*
+Disallow: /repo/create
+Disallow: /repo/migrate
+Disallow: /org/create
+Disallow: /*/*/fork
+Disallow: /*/*/watchers
+Disallow: /*/*/stargazers
+Disallow: /*/*/forks
+Disallow: /*/*/src/
+Disallow: /*/*/blame/
+Disallow: /*/*/commit/
+Disallow: /*/*/commits/
+Disallow: /*/*/raw/
+Disallow: /*/*/media/
+Disallow: /*/*/tags
+Disallow: /*/*/graph
+Disallow: /*/*/branches
+Disallow: /*/*/compare
+Disallow: /*/*/lastcommit/
+Disallow: /*/*/rss/branch/
+Disallow: /*/*/atom/branch/
+Disallow: /*/*/activity
+Disallow: /*/*/activity_author_data
+Disallow: /*/*/actions
+Disallow: /*/*/projects
+Disallow: /*/*/labels
+Disallow: /*/*/milestones
+Disallow: /*/*/find/
+Disallow: /*/*/tree-list/
+Disallow: /*/*/search/
+Disallow: /*/-/code
+Disallow: /*/*/issues/new
+Disallow: /*/*/pulls/*/files
+Disallow: /*/*/pulls/*/commits
+Disallow: /attachments/
+Disallow: /*/*/attachments/
+Disallow: /*/*/issues/*/attachments/
+Disallow: /*/*/pulls/*/attachments/
+Disallow: /*/*/releases/attachments
+Disallow: /*/*/releases/download
+Disallow: /*/*/archive/
+Disallow: /*.bundle$
+Disallow: /*.patch$
+Disallow: /*.diff$
+Disallow: /*.atom$
+Disallow: /*.rss$
+Disallow: /*lang=*
+Disallow: /*redirect_to=*
+Disallow: /*tab=*
+Disallow: /*q=*
+Disallow: /*sort=*
+Disallow: /*repo-search-archived=*
+`)
+
+func RobotsTxt(w http.ResponseWriter, req *http.Request) {
+	httpcache.SetCacheControlInHeader(w.Header(), setting.StaticCacheTime)
+	w.Header().Set("Content-Type", "text/plain")
+
+	robotsTxt := util.FilePathJoinAbs(setting.CustomPath, "public/robots.txt")
+	if ok, _ := util.IsExist(robotsTxt); ok {
+		http.ServeFile(w, req, robotsTxt)
+		return
+	}
+
+	_, err := w.Write(defaultRobotsTxt)
+	if err != nil {
+		log.Error("failed to write robots.txt: %v", err)
+	}
+}
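
A hedged sketch of how the default-serving branch above could be verified with net/http/httptest; the robotsTxt function below is a self-contained stand-in for the new RobotsTxt handler (the cache-control call and the custom-file branch, which depend on Forgejo internals, are omitted):

// Standalone test sketch, assuming a reduced stand-in handler; this is
// not the test shipped with the commit.
package misc

import (
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"
)

// Stand-in for the defaultRobotsTxt variable added in this commit.
var defaultRobotsTxt = []byte("# The default Forgejo robots.txt\nUser-agent: *\nDisallow: /api/\n")

// robotsTxt mirrors only the fallback branch of the new handler.
func robotsTxt(w http.ResponseWriter, req *http.Request) {
	w.Header().Set("Content-Type", "text/plain")
	_, _ = w.Write(defaultRobotsTxt)
}

func TestRobotsTxtDefault(t *testing.T) {
	req := httptest.NewRequest(http.MethodGet, "/robots.txt", nil)
	rec := httptest.NewRecorder()
	robotsTxt(rec, req)

	resp := rec.Result()
	if resp.StatusCode != http.StatusOK {
		t.Fatalf("want 200, got %d", resp.StatusCode)
	}
	if ct := resp.Header.Get("Content-Type"); ct != "text/plain" {
		t.Fatalf("want text/plain, got %q", ct)
	}
	if body := rec.Body.String(); !strings.Contains(body, "Disallow: /api/") {
		t.Fatalf("default robots.txt missing expected rule: %q", body)
	}
}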