exclude search results by regex (#120)
a better alternative to using `--no-idx` for this purpose, since this also excludes recent uploads (not just files found during fs-indexing), and it doesn't prevent deduplication. Also speeds up searches by a tiny amount, because the sanity checks are now built into the exclude-filter while parsing the config, instead of being performed during each search query.
This commit is contained in:
		
							parent
							
								
									2f83c6c7d1
								
							
						
					
					
						commit
						697a4fa8a4
					
				| @ -1097,11 +1097,12 @@ using the GUI  (winXP or later): | ||||
|   * on winXP only, click the `Sign up for online storage` hyperlink instead and put the URL there | ||||
|   * providing your password as the username is recommended; the password field can be anything or empty | ||||
| 
 | ||||
| known client bugs: | ||||
| the webdav client that's built into windows has the following list of bugs; you can avoid all of these by connecting with rclone instead: | ||||
| * win7+ doesn't actually send the password to the server when reauthenticating after a reboot unless you first try to log in with an incorrect password and then switch to the correct password | ||||
|   * or just type your password into the username field instead to get around it entirely | ||||
| * connecting to a folder which allows anonymous read will make writing impossible, as windows has decided it doesn't need to log in | ||||
|   * workaround: connect twice; first to a folder which requires auth, then to the folder you actually want, and leave both of those mounted | ||||
|   * or set the server-option `--dav-auth` to force password-auth for all webdav clients | ||||
| * win7+ may open a new tcp connection for every file and sometimes forgets to close them, eventually needing a reboot | ||||
|   * maybe NIC-related (??), happens with win10-ltsc on e1000e but not virtio | ||||
| * windows cannot access folders which contain filenames with invalid unicode or forbidden characters (`<>:"/\|?*`), or names ending with `.` | ||||
| @ -1268,7 +1269,7 @@ note: | ||||
| 
 | ||||
| ### exclude-patterns | ||||
| 
 | ||||
| to save some time,  you can provide a regex pattern for filepaths to only index by filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash \.iso$` or the volflag `:c,nohash=\.iso$`, this has the following consequences: | ||||
| to save some time, you can provide a regex pattern for filepaths to only index by filename/path/size/last-modified (and not the hash of the file contents) by setting `--no-hash '\.iso$'` or the volflag `:c,nohash=\.iso$`; this has the following consequences: | ||||
| * initial indexing is way faster, especially when the volume is on a network disk | ||||
| * makes it impossible to [file-search](#file-search) | ||||
| * if someone uploads the same file contents, the upload will not be detected as a dupe, so it will not get symlinked or rejected | ||||
| @ -1279,6 +1280,8 @@ similarly, you can fully ignore files/folders using `--no-idx [...]` and `:c,noi | ||||
| 
 | ||||
| if you set `--no-hash [...]` globally, you can enable hashing for specific volumes using flag `:c,nohash=` | ||||
| 
 | ||||
| to exclude certain filepaths from search-results, use `--srch-excl` or volflag `srch_excl` instead of `--no-idx`, for example `--srch-excl 'password|logs/[0-9]'` | ||||
| 
 | ||||
| ### filesystem guards | ||||
| 
 | ||||
| avoid traversing into other filesystems using `--xdev` / volflag `:c,xdev`, skipping any symlinks or bind-mounts to another HDD for example | ||||
|  | ||||
| @ -1401,6 +1401,7 @@ def add_db_general(ap, hcores): | ||||
|     ap2.add_argument("--db-act", metavar="SEC", type=float, default=10.0, help="defer any scheduled volume reindexing until \033[33mSEC\033[0m seconds after last db write (uploads, renames, ...)") | ||||
|     ap2.add_argument("--srch-time", metavar="SEC", type=int, default=45, help="search deadline -- terminate searches running for more than \033[33mSEC\033[0m seconds") | ||||
|     ap2.add_argument("--srch-hits", metavar="N", type=int, default=7999, help="max search results to allow clients to fetch; 125 results will be shown initially") | ||||
|     ap2.add_argument("--srch-excl", metavar="PTN", type=u, default="", help="regex: exclude files from search results if the file-URL matches \033[33mPTN\033[0m (case-sensitive). Example: [\033[32mpassword|logs/[0-9]\033[0m] any URL containing 'password' or 'logs/DIGIT' (volflag=srch_excl)") | ||||
|     ap2.add_argument("--dotsrch", action="store_true", help="show dotfiles in search results (volflags: dotsrch | nodotsrch)") | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
| @ -1880,6 +1880,7 @@ class AuthSrv(object): | ||||
|                 ["no_hash", "nohash"], | ||||
|                 ["no_idx", "noidx"], | ||||
|                 ["og_ua", "og_ua"], | ||||
|                 ["srch_excl", "srch_excl"], | ||||
|             ]: | ||||
|                 if vf in vol.flags: | ||||
|                     ptn = re.compile(vol.flags.pop(vf)) | ||||
| @ -2086,6 +2087,22 @@ class AuthSrv(object): | ||||
|                 self.log(t.format(mtp), 1) | ||||
|                 errors = True | ||||
| 
 | ||||
|         for vol in vfs.all_vols.values(): | ||||
|             re1: Optional[re.Pattern] = vol.flags.get("srch_excl") | ||||
|             excl = [re1.pattern] if re1 else [] | ||||
| 
 | ||||
|             vpaths = [] | ||||
|             vtop = vol.vpath | ||||
|             for vp2 in vfs.all_vols.keys(): | ||||
|                 if vp2.startswith((vtop + "/").lstrip("/")) and vtop != vp2: | ||||
|                     vpaths.append(re.escape(vp2[len(vtop) :].lstrip("/"))) | ||||
|             if vpaths: | ||||
|                 excl.append("^(%s)/" % ("|".join(vpaths),)) | ||||
| 
 | ||||
|             vol.flags["srch_re_dots"] = re.compile("|".join(excl or ["^$"])) | ||||
|             excl.extend([r"^\.", r"/\."]) | ||||
|             vol.flags["srch_re_nodot"] = re.compile("|".join(excl)) | ||||
| 
 | ||||
|         have_daw = False | ||||
|         for vol in vfs.all_nodes.values(): | ||||
|             daw = vol.flags.get("daw") or self.args.daw | ||||
|  | ||||
| @ -191,6 +191,7 @@ flagcats = { | ||||
|         "xvol": "do not follow symlinks leaving the volume root", | ||||
|         "dotsrch": "show dotfiles in search results", | ||||
|         "nodotsrch": "hide dotfiles in search results (default)", | ||||
|         "srch_excl": "exclude search results with URL matching this regex", | ||||
|     }, | ||||
|     'database, audio tags\n"mte", "mth", "mtp", "mtm" all work the same as -mte, -mth, ...': { | ||||
|         "mtp=.bpm=f,audio-bpm.py": 'uses the "audio-bpm.py" program to\ngenerate ".bpm" tags from uploads (f = overwrite tags)', | ||||
|  | ||||
| @ -793,7 +793,7 @@ class SvcHub(object): | ||||
|         al.exp_md = odfusion(exp, al.exp_md.replace(" ", ",")) | ||||
|         al.exp_lg = odfusion(exp, al.exp_lg.replace(" ", ",")) | ||||
| 
 | ||||
|         for k in ["no_hash", "no_idx", "og_ua"]: | ||||
|         for k in ["no_hash", "no_idx", "og_ua", "srch_excl"]: | ||||
|             ptn = getattr(self.args, k) | ||||
|             if ptn: | ||||
|                 setattr(self.args, k, re.compile(ptn)) | ||||
|  | ||||
| @ -324,7 +324,8 @@ class U2idx(object): | ||||
|         sort: bool, | ||||
|         lim: int, | ||||
|     ) -> tuple[list[dict[str, Any]], list[str], bool]: | ||||
|         if self.args.srch_dbg: | ||||
|         dbg = self.args.srch_dbg | ||||
|         if dbg: | ||||
|             t = "searching across all %s volumes in which the user has 'r' (full read access):\n  %s" | ||||
|             zs = "\n  ".join(["/%s = %s" % (x.vpath, x.realpath) for x in vols]) | ||||
|             self.log(t % (len(vols), zs), 5) | ||||
| @ -367,14 +368,14 @@ class U2idx(object): | ||||
|             if not cur: | ||||
|                 continue | ||||
| 
 | ||||
|             excl = [] | ||||
|             for vp2 in self.asrv.vfs.all_vols.keys(): | ||||
|                 if vp2.startswith((vtop + "/").lstrip("/")) and vtop != vp2: | ||||
|                     excl.append(vp2[len(vtop) :].lstrip("/")) | ||||
|             dots = flags.get("dotsrch") and uname in vol.axs.udot | ||||
|             zs = "srch_re_dots" if dots else "srch_re_nodot" | ||||
|             rex: re.Pattern = flags.get(zs)  # type: ignore | ||||
| 
 | ||||
|             if self.args.srch_dbg: | ||||
|                 t = "searching in volume /%s (%s), excludelist %s" | ||||
|                 self.log(t % (vtop, ptop, excl), 5) | ||||
|             if dbg: | ||||
|                 t = "searching in volume /%s (%s), excluding %s" | ||||
|                 self.log(t % (vtop, ptop, rex.pattern), 5) | ||||
|                 rex_cfg: Optional[re.Pattern] = flags.get("srch_excl") | ||||
| 
 | ||||
|             self.active_cur = cur | ||||
| 
 | ||||
| @ -387,7 +388,6 @@ class U2idx(object): | ||||
| 
 | ||||
|             sret = [] | ||||
|             fk = flags.get("fk") | ||||
|             dots = flags.get("dotsrch") and uname in vol.axs.udot | ||||
|             fk_alg = 2 if "fka" in flags else 1 | ||||
|             c = cur.execute(uq, tuple(vuv)) | ||||
|             for hit in c: | ||||
| @ -396,20 +396,23 @@ class U2idx(object): | ||||
|                 if rd.startswith("//") or fn.startswith("//"): | ||||
|                     rd, fn = s3dec(rd, fn) | ||||
| 
 | ||||
|                 if rd in excl or any([x for x in excl if rd.startswith(x + "/")]): | ||||
|                     if self.args.srch_dbg: | ||||
|                         zs = vjoin(vjoin(vtop, rd), fn) | ||||
|                         t = "database inconsistency in volume '/%s'; ignoring: %s" | ||||
|                         self.log(t % (vtop, zs), 1) | ||||
|                 vp = vjoin(vjoin(vtop, rd), fn) | ||||
| 
 | ||||
|                 if vp in seen_rps: | ||||
|                     continue | ||||
| 
 | ||||
|                 rp = quotep("/".join([x for x in [vtop, rd, fn] if x])) | ||||
|                 if not dots and "/." in ("/" + rp): | ||||
|                     continue | ||||
| 
 | ||||
|                 if rp in seen_rps: | ||||
|                 if rex.search(vp): | ||||
|                     if dbg: | ||||
|                         if rex_cfg and rex_cfg.search(vp):  # type: ignore | ||||
|                             self.log("filtered by srch_excl: %s" % (vp,), 6) | ||||
|                         elif not dots and "/." in ("/" + vp): | ||||
|                             pass | ||||
|                         else: | ||||
|                             t = "database inconsistency in volume '/%s'; ignoring: %s" | ||||
|                             self.log(t % (vtop, vp), 1) | ||||
|                     continue | ||||
| 
 | ||||
|                 rp = quotep(vp) | ||||
|                 if not fk: | ||||
|                     suf = "" | ||||
|                 else: | ||||
| @ -431,7 +434,7 @@ class U2idx(object): | ||||
|                 if lim < 0: | ||||
|                     break | ||||
| 
 | ||||
|                 if self.args.srch_dbg: | ||||
|                 if dbg: | ||||
|                     t = "in volume '/%s': hit: %s" | ||||
|                     self.log(t % (vtop, rp), 5) | ||||
| 
 | ||||
| @ -461,7 +464,7 @@ class U2idx(object): | ||||
|             ret.extend(sret) | ||||
|             # print("[{}] {}".format(ptop, sret)) | ||||
| 
 | ||||
|             if self.args.srch_dbg: | ||||
|             if dbg: | ||||
|                 t = "in volume '/%s': got %d hits, %d total so far" | ||||
|                 self.log(t % (vtop, len(sret), len(ret)), 5) | ||||
| 
 | ||||
|  | ||||
| @ -1078,7 +1078,8 @@ class Up2k(object): | ||||
|         ft = "\033[0;32m{}{:.0}" | ||||
|         ff = "\033[0;35m{}{:.0}" | ||||
|         fv = "\033[0;36m{}:\033[90m{}" | ||||
|         fx = set(("html_head", "rm_re_t", "rm_re_r", "mv_re_t", "mv_re_r")) | ||||
|         zs = "html_head mv_re_r mv_re_t rm_re_r rm_re_t srch_re_dots srch_re_nodot" | ||||
|         fx = set(zs.split()) | ||||
|         fd = vf_bmap() | ||||
|         fd.update(vf_cmap()) | ||||
|         fd.update(vf_vmap()) | ||||
| @ -1241,9 +1242,9 @@ class Up2k(object): | ||||
| 
 | ||||
|         # also consider volflags which affect indexing | ||||
|         for vp in vps: | ||||
|             vf = self.vfs.all_vols[vp].flags.items() | ||||
|             vf = {k: v for k, v in vf if k in VF_AFFECTS_INDEXING} | ||||
|             seed.append(str(vf)) | ||||
|             vf = self.vfs.all_vols[vp].flags | ||||
|             vf = {k: v for k, v in vf.items() if k in VF_AFFECTS_INDEXING} | ||||
|             seed.append(str(sorted(vf.items()))) | ||||
| 
 | ||||
|         zb = hashlib.sha1("\n".join(seed).encode("utf-8", "replace")).digest() | ||||
|         vcfg = ub64enc(zb[:18]).decode("ascii") | ||||
|  | ||||
| @ -122,7 +122,7 @@ class Cfg(Namespace): | ||||
|     def __init__(self, a=None, v=None, c=None, **ka0): | ||||
|         ka = {} | ||||
| 
 | ||||
|         ex = "chpw daw dav_auth dav_inf dav_mac dav_rt e2d e2ds e2dsa e2t e2ts e2tsr e2v e2vu e2vp early_ban ed emp exp force_js getmod grid gsel hardlink ih ihead magic hardlink_only nid nih no_acode no_athumb no_clone no_cp no_dav no_db_ip no_del no_dirsz no_dupe no_lifetime no_logues no_mv no_pipe no_poll no_readme no_robots no_sb_md no_sb_lg no_scandir no_tarcmp no_thumb no_vthumb no_zip nrand nsort nw og og_no_head og_s_title ohead q rand re_dirsz rss smb srch_dbg stats uqe vague_403 vc ver write_uplog xdev xlink xvol zs" | ||||
|         ex = "chpw daw dav_auth dav_inf dav_mac dav_rt e2d e2ds e2dsa e2t e2ts e2tsr e2v e2vu e2vp early_ban ed emp exp force_js getmod grid gsel hardlink ih ihead magic hardlink_only nid nih no_acode no_athumb no_clone no_cp no_dav no_db_ip no_del no_dirsz no_dupe no_lifetime no_logues no_mv no_pipe no_poll no_readme no_robots no_sb_md no_sb_lg no_scandir no_tarcmp no_thumb no_vthumb no_zip nrand nsort nw og og_no_head og_s_title ohead q rand re_dirsz rss smb srch_dbg srch_excl stats uqe vague_403 vc ver write_uplog xdev xlink xvol zs" | ||||
|         ka.update(**{k: False for k in ex.split()}) | ||||
| 
 | ||||
|         ex = "dedup dotpart dotsrch hook_v no_dhash no_fastboot no_fpool no_htp no_rescan no_sendfile no_ses no_snap no_up_list no_voldump re_dhash plain_ip" | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 ed
						ed