verify on-disk contents before dedup

previously, the assumption was made that the database and filesystem would not desync, and that an upload could safely be substituted with a symlink to an existing copy on-disk (assuming said copy still existed on-disk at all). this is fine if copyparty is the only software that makes changes to the filesystem, but in hindsight that is a shitty assumption to make.

add `--safe-dedup` which takes a "safety level"; by default (50) it will no longer blindly expect that the filesystem has not been altered through other means: the file contents will now be hashed and compared to the database before a dupe is linked.

deduplication can be much slower as a result, but definitely worth it, as this avoids some potentially very unpleasant surprises. the previous behavior can be restored with `--safe-dedup 1`.
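The gist of the new check, as a minimal standalone sketch (simplified for illustration; the helper names here are made up and this is not copyparty's actual dedup code): before substituting an upload with a link to a previously-indexed copy, re-hash the on-disk copy and compare it against the digest the database remembers, and only link if they still match.

```python
import hashlib
import os


def file_digest(path: str, bufsz: int = 512 * 1024) -> str:
    # stream the file so large uploads never need to fit in memory
    h = hashlib.sha512()
    with open(path, "rb") as f:
        while True:
            buf = f.read(bufsz)
            if not buf:
                break
            h.update(buf)
    return h.hexdigest()


def try_dedup(upload: str, indexed: str, db_digest: str) -> bool:
    # replace `upload` with a symlink to `indexed`, but only if the
    # indexed copy still exists and still matches what the db says
    try:
        if file_digest(indexed) != db_digest:
            return False  # fs was altered behind the server's back
    except OSError:
        return False  # the indexed copy is gone entirely
    os.unlink(upload)
    os.symlink(indexed, upload)
    return True
```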
This commit is contained in:

parent 08848be784
commit 6e671c5245
@@ -1978,7 +1978,7 @@ safety profiles:
   * `--hardlink` creates hardlinks instead of symlinks when deduplicating uploads, which is less maintenance
     * however note if you edit one file it will also affect the other copies
   * `--vague-403` returns a "404 not found" instead of "401 unauthorized" which is a common enterprise meme
-  * `--nih` removes the server hostname from directory listings
+  * `-nih` removes the server hostname from directory listings
 
 * option `-sss` is a shortcut for the above plus:
   * `--no-dav` disables webdav support

@@ -992,6 +992,7 @@ def add_upload(ap):
     ap2.add_argument("--reg-cap", metavar="N", type=int, default=38400, help="max number of uploads to keep in memory when running without \033[33m-e2d\033[0m; roughly 1 MiB RAM per 600")
     ap2.add_argument("--no-fpool", action="store_true", help="disable file-handle pooling -- instead, repeatedly close and reopen files during upload (bad idea to enable this on windows and/or cow filesystems)")
     ap2.add_argument("--use-fpool", action="store_true", help="force file-handle pooling, even when it might be dangerous (multiprocessing, filesystems lacking sparse-files support, ...)")
+    ap2.add_argument("--safe-dedup", metavar="N", type=int, default=50, help="how careful to be when deduplicating files; [\033[32m1\033[0m] = just verify the filesize, [\033[32m50\033[0m] = verify file contents have not been altered (volflag=safededup)")
     ap2.add_argument("--hardlink", action="store_true", help="prefer hardlinks instead of symlinks when possible (within same filesystem) (volflag=hardlink)")
     ap2.add_argument("--never-symlink", action="store_true", help="do not fallback to symlinks when a hardlink cannot be made (volflag=neversymlink)")
     ap2.add_argument("--no-dedup", action="store_true", help="disable symlink/hardlink creation; copy file contents instead (volflag=copydupes)")

@@ -21,6 +21,7 @@ def vf_bmap() -> dict[str, str]:
         "no_thumb": "dthumb",
         "no_vthumb": "dvthumb",
         "no_athumb": "dathumb",
+        "safe_dedup": "safededup",
     }
     for k in (
         "dotsrch",
@@ -132,6 +133,7 @@ flagcats = {
         "nodupe": "rejects existing files (instead of symlinking them)",
         "hardlink": "does dedup with hardlinks instead of symlinks",
         "neversymlink": "disables symlink fallback; full copy instead",
+        "safededup": "verify on-disk data before using it for dedup",
         "copydupes": "disables dedup, always saves full copies of dupes",
         "sparse": "force use of sparse files, mainly for s3-backed storage",
         "daw": "enable full WebDAV write support (dangerous);\nPUT-operations will now \033[1;31mOVERWRITE\033[0;35m existing files",
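Both new entries tie into copyparty's volflag plumbing: `vf_bmap()` maps the global `--safe-dedup` option to a per-volume `safededup` flag, and `flagcats` supplies its help text. The effective value is resolved in up2k.py further down; restating that one-liner as a standalone function (not new logic, just the lookup from the next file):

```python
def effective_safe_dedup(vol_flags: dict) -> int:
    # per-volume flag wins; otherwise the global default of 50
    return vol_flags.get("safededup") or 50
```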

@@ -1462,7 +1462,7 @@ class Up2k(object):
                     self.log("file: {}".format(abspath))
 
                 try:
-                    hashes = self._hashlist_from_file(
+                    hashes, _ = self._hashlist_from_file(
                         abspath, "a{}, ".format(self.pp.n)
                     )
                 except Exception as ex:
@@ -1711,7 +1711,7 @@ class Up2k(object):
                         self.log("file: {}".format(abspath))
 
                     try:
-                        hashes = self._hashlist_from_file(abspath, pf)
+                        hashes, _ = self._hashlist_from_file(abspath, pf)
                     except Exception as ex:
                         self.log("hash: {} @ [{}]".format(repr(ex), abspath))
                         continue
@@ -2670,6 +2670,9 @@ class Up2k(object):
             rand = vfs.flags.get("rand") or cj.get("rand")
             lost: list[tuple["sqlite3.Cursor", str, str]] = []
 
+            safe_dedup = vfs.flags.get("safededup") or 50
+            data_ok = safe_dedup < 10 or n4g
+
             vols = [(ptop, jcur)] if jcur else []
             if vfs.flags.get("xlink"):
                 vols += [(k, v) for k, v in self.cur.items() if k != ptop]
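`data_ok` is the blanket exemption: any safety level below 10 (or the `n4g` degraded-stat mode) declares the on-disk data trustworthy up front, so the default of 50 forces the per-file verification further down, while `--safe-dedup 1` restores the old blind-trust path. Paraphrasing the gate:

```python
# paraphrasing `safe_dedup < 10` above; 50 is the default from add_upload()
for lvl in (1, 9, 10, 50):
    print(lvl, "trust-index" if lvl < 10 else "verify-contents")
# 1 trust-index
# 9 trust-index
# 10 verify-contents
# 50 verify-contents
```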
@@ -2677,7 +2680,7 @@ class Up2k(object):
                 # force upload time rather than last-modified
                 cj["lmod"] = int(time.time())
 
-            alts: list[tuple[int, int, dict[str, Any]]] = []
+            alts: list[tuple[int, int, dict[str, Any], "sqlite3.Cursor", str, str]] = []
             for ptop, cur in vols:
                 allv = self.asrv.vfs.all_vols
                 cvfs = next((v for v in allv.values() if v.realpath == ptop), vfs)
@@ -2707,13 +2710,12 @@ class Up2k(object):
                                 wark, st.st_size, dsize, st.st_mtime, dtime, dp_abs
                             )
                             self.log(t)
-                            raise Exception("desync")
+                            raise Exception()
                     except Exception as ex:
                         if n4g:
                             st = os.stat_result((0, -1, -1, 0, 0, 0, 0, 0, 0, 0))
                         else:
-                            if str(ex) != "desync":
-                                lost.append((cur, dp_dir, dp_fn))
+                            lost.append((cur, dp_dir, dp_fn))
                             continue
 
                     j = {
@@ -2736,18 +2738,42 @@ class Up2k(object):
                         if k in cj:
                             j[k] = cj[k]
 
+                    # offset of 1st diff in vpaths
+                    zig = (
+                        n + 1
+                        for n, (c1, c2) in enumerate(
+                            zip(dp_dir + "\r", cj["prel"] + "\n")
+                        )
+                        if c1 != c2
+                    )
                     score = (
-                        (3 if st.st_dev == dev else 0)
-                        + (2 if dp_dir == cj["prel"] else 0)
+                        (6969 if st.st_dev == dev else 0)
+                        + (3210 if dp_dir == cj["prel"] else next(zig))
                         + (1 if dp_fn == cj["name"] else 0)
                     )
-                    alts.append((score, -len(alts), j))
+                    alts.append((score, -len(alts), j, cur, dp_dir, dp_fn))
 
-            if alts:
-                best = sorted(alts, reverse=True)[0]
-                job = best[2]
-            else:
-                job = None
+            job = None
+            inc_ap = djoin(cj["ptop"], cj["prel"], cj["name"])
+            for dupe in sorted(alts, reverse=True):
+                rj = dupe[2]
+                orig_ap = djoin(rj["ptop"], rj["prel"], rj["name"])
+                if data_ok or inc_ap == orig_ap:
+                    data_ok = True
+                    job = rj
+                    break
+                else:
+                    self.log("asserting contents of %s" % (orig_ap,))
+                    dhashes, st = self._hashlist_from_file(orig_ap)
+                    dwark = up2k_wark_from_hashlist(self.salt, st.st_size, dhashes)
+                    if wark != dwark:
+                        t = "will not dedup (fs index desync): fs=%s, db=%s, file: %s"
+                        self.log(t % (dwark, wark, orig_ap))
+                        lost.append(dupe[3:])
+                        continue
+                    data_ok = True
+                    job = rj
+                    break
 
             if job and wark in reg:
                 # self.log("pop " + wark + "  " + job["name"] + " handle_json db", 4)
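The `zig` generator deserves a note: it lazily yields the 1-based offset of the first character where the candidate's directory differs from the upload's, so among candidates on the same device, the dupe whose path shares the longest prefix with the incoming upload scores highest. The `\r` / `\n` sentinels guarantee a mismatch is always found, even when one path is a prefix of the other (the equal-paths case is already covered by the `3210` branch). A standalone illustration:

```python
def first_diff(a: str, b: str) -> int:
    # same construction as `zig`: 1-based offset of the first difference;
    # the "\r" / "\n" sentinels ensure the generator never runs dry
    zig = (n + 1 for n, (c1, c2) in enumerate(zip(a + "\r", b + "\n")) if c1 != c2)
    return next(zig)


print(first_diff("music/2024", "music/2024/flac"))  # 11 -> long shared prefix, high score
print(first_diff("music/2024", "pics/phone"))       # 1  -> unrelated paths, low score
```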
@@ -2756,7 +2782,7 @@ class Up2k(object):
             if lost:
                 c2 = None
                 for cur, dp_dir, dp_fn in lost:
-                    t = "forgetting deleted file: /{}"
+                    t = "forgetting desynced db entry: /{}"
                     self.log(t.format(vjoin(vjoin(vfs.vpath, dp_dir), dp_fn)))
                     self.db_rm(cur, dp_dir, dp_fn, cj["size"])
                     if c2 and c2 != cur:
@@ -2791,7 +2817,13 @@ class Up2k(object):
                             del reg[wark]
                         break
 
-                if st and not self.args.nw and not n4g and st.st_size != rj["size"]:
+                inc_ap = djoin(cj["ptop"], cj["prel"], cj["name"])
+                orig_ap = djoin(rj["ptop"], rj["prel"], rj["name"])
+
+                if self.args.nw or n4g or not st:
+                    pass
+
+                elif st.st_size != rj["size"]:
                     t = "will not dedup (fs index desync): {}, size fs={} db={}, mtime fs={} db={}, file: {}"
                     t = t.format(
                         wark, st.st_size, rj["size"], st.st_mtime, rj["lmod"], path
@@ -2799,6 +2831,15 @@ class Up2k(object):
                     self.log(t)
                     del reg[wark]
 
+                elif inc_ap != orig_ap and not data_ok:
+                    self.log("asserting contents of %s" % (orig_ap,))
+                    dhashes, _ = self._hashlist_from_file(orig_ap)
+                    dwark = up2k_wark_from_hashlist(self.salt, st.st_size, dhashes)
+                    if wark != dwark:
+                        t = "will not dedup (fs index desync): fs=%s, idx=%s, file: %s"
+                        self.log(t % (dwark, wark, orig_ap))
+                        del reg[wark]
+
             if job or wark in reg:
                 job = job or reg[wark]
                 if (
@@ -4246,8 +4287,11 @@ class Up2k(object):
 
         return wark
 
-    def _hashlist_from_file(self, path: str, prefix: str = "") -> list[str]:
-        fsz = bos.path.getsize(path)
+    def _hashlist_from_file(
+        self, path: str, prefix: str = ""
+    ) -> tuple[list[str], os.stat_result]:
+        st = bos.stat(path)
+        fsz = st.st_size
         csz = up2k_chunksize(fsz)
         ret = []
         suffix = " MB, {}".format(path)
@@ -4260,7 +4304,7 @@ class Up2k(object):
             while fsz > 0:
                 # same as `hash_at` except for `imutex` / bufsz
                 if self.stop:
-                    return []
+                    return [], st
 
                 if self.pp:
                     mb = fsz // (1024 * 1024)
@@ -4281,7 +4325,7 @@ class Up2k(object):
                 digest = base64.urlsafe_b64encode(digest)
                 ret.append(digest.decode("utf-8"))
 
-        return ret
+        return ret, st
 
     def _new_upload(self, job: dict[str, Any], vfs: VFS, depth: int) -> dict[str, str]:
         pdir = djoin(job["ptop"], job["prel"])
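For reference, the loop that `_hashlist_from_file` wraps: the file is read in `up2k_chunksize(fsz)`-sized pieces and each piece's digest lands in `ret`, which callers (together with the file size, now taken from the same `bos.stat` call) feed into `up2k_wark_from_hashlist`. A boiled-down standalone version, assuming a fixed chunk size and a 33-byte digest truncation (the truncation length is an assumption, not visible in this diff):

```python
import base64
import hashlib
import os


def hashlist_from_file(path: str, csz: int = 16 * 1024 * 1024):
    # simplified: fixed 16 MiB chunks instead of up2k_chunksize(fsz)
    st = os.stat(path)
    fsz = st.st_size
    ret = []
    with open(path, "rb") as f:
        while fsz > 0:
            rem = min(csz, fsz)
            fsz -= rem
            h = hashlib.sha512()
            while rem > 0:
                buf = f.read(min(rem, 512 * 1024))
                if not buf:
                    raise OSError("file shrank while hashing")
                h.update(buf)
                rem -= len(buf)
            # assumption: digest truncated to 33 bytes before b64-encoding
            digest = base64.urlsafe_b64encode(h.digest()[:33])
            ret.append(digest.decode("utf-8"))
    return ret, st  # same (hashes, stat) contract as the patched method
```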
@@ -4582,7 +4626,7 @@ class Up2k(object):
                 self.salt, inf.st_size, int(inf.st_mtime), rd, fn
             )
         else:
-            hashes = self._hashlist_from_file(abspath)
+            hashes, _ = self._hashlist_from_file(abspath)
             if not hashes:
                 return False

@@ -126,7 +126,7 @@ class Cfg(Namespace):
         ex = "ah_cli ah_gen css_browser hist js_browser js_other mime mimes no_forget no_hash no_idx nonsus_urls og_tpl og_ua"
         ka.update(**{k: None for k in ex.split()})
 
-        ex = "hash_mt srch_time u2abort u2j u2sz"
+        ex = "hash_mt safe_dedup srch_time u2abort u2j u2sz"
         ka.update(**{k: 1 for k in ex.split()})
 
         ex = "au_vol mtab_age reg_cap s_thead s_tbody th_convt"