copyparty/copyparty/up2k.py

# coding: utf-8
from __future__ import print_function, unicode_literals

import re
import os
import time
import math
import json
import gzip
import stat
import shutil
import base64
import hashlib
import threading
import traceback
from copy import deepcopy

from .__init__ import WINDOWS
from .util import (
    Pebkac,
    Queue,
    ProgressPrinter,
    fsdec,
    fsenc,
    sanitize_fn,
    ren_open,
    atomic_move,
    s3enc,
    s3dec,
    statdir,
)
from .mtag import MTag
from .authsrv import AuthSrv

try:
    HAVE_SQLITE3 = True
    import sqlite3
except:
    HAVE_SQLITE3 = False


class Up2k(object):
    """
    TODO:
      * documentation
      * registry persistence
        * ~/.config flatfiles for active jobs
    """

    def __init__(self, broker):
        self.broker = broker
        self.args = broker.args
        self.log_func = broker.log

        # config
        self.salt = broker.args.salt

        # state
        self.mutex = threading.Lock()
        self.hashq = Queue()
        self.tagq = Queue()
        self.registry = {}
        self.entags = {}
        self.flags = {}
        self.cur = {}
        self.mtag = None
        self.n_mtag_tags_added = -1

        self.mem_cur = None
        self.sqlite_ver = None
        self.no_expr_idx = False
        if HAVE_SQLITE3:
            # mojibake detector
            self.mem_cur = self._orz(":memory:")
            self.mem_cur.execute(r"create table a (b text)")
            self.sqlite_ver = tuple([int(x) for x in sqlite3.sqlite_version.split(".")])
            if self.sqlite_ver < (3, 9):
                self.no_expr_idx = True

        if WINDOWS:
            # usually fails to set lastmod too quickly
            self.lastmod_q = Queue()
            thr = threading.Thread(target=self._lastmodder)
            thr.daemon = True
            thr.start()

        # static
        self.r_hash = re.compile("^[0-9a-zA-Z_-]{43}$")

        if not HAVE_SQLITE3:
            self.log("could not initialize sqlite3, will use in-memory registry only")

        # this is kinda jank
        auth = AuthSrv(self.args, self.log_func, False)
        have_e2d = self.init_indexes(auth)

        if have_e2d:
            thr = threading.Thread(target=self._snapshot)
            thr.daemon = True
            thr.start()

            thr = threading.Thread(target=self._tagger)
            thr.daemon = True
            thr.start()

            thr = threading.Thread(target=self._hasher)
            thr.daemon = True
            thr.start()

    def log(self, msg, c=0):
        self.log_func("up2k", msg + "\033[K", c)

    def _vis_job_progress(self, job):
        perc = 100 - (len(job["need"]) * 100.0 / len(job["hash"]))
        path = os.path.join(job["ptop"], job["prel"], job["name"])
        return "{:5.1f}% {}".format(perc, path)

    def _vis_reg_progress(self, reg):
        ret = []
        for _, job in reg.items():
            ret.append(self._vis_job_progress(job))

        return ret

    def _expr_idx_filter(self, flags):
        if not self.no_expr_idx:
            return False, flags

        ret = {k: v for k, v in flags.items() if not k.startswith("e2t")}
        if ret.keys() == flags.keys():
            return False, flags

        return True, ret

    def init_indexes(self, auth):
        self.pp = ProgressPrinter()
        vols = auth.vfs.all_vols.values()
        t0 = time.time()
        have_e2d = False

        if self.no_expr_idx:
            modified = False
            for vol in vols:
                m, f = self._expr_idx_filter(vol.flags)
                if m:
                    vol.flags = f
                    modified = True

            if modified:
                msg = "disabling -e2t because your sqlite belongs in a museum"
                self.log(msg, c=3)

        live_vols = []
        for vol in vols:
            try:
                os.listdir(vol.realpath)
                live_vols.append(vol)
            except:
                self.log("cannot access " + vol.realpath, c=1)

        vols = live_vols

        need_mtag = False
        for vol in vols:
            if "e2t" in vol.flags:
                need_mtag = True

        if need_mtag:
            self.mtag = MTag(self.log_func, self.args)
            if not self.mtag.usable:
                self.mtag = None

        # e2ds(a) volumes first,
        # also covers tags where e2ts is set
        for vol in vols:
            en = {}
            if "mte" in vol.flags:
                en = {k: True for k in vol.flags["mte"].split(",")}

            self.entags[vol.realpath] = en

            if "e2d" in vol.flags:
                have_e2d = True

            if "e2ds" in vol.flags:
                r = self._build_file_index(vol, vols)
                if not r:
                    needed_mutagen = True

        # open the rest + do any e2ts(a)
        needed_mutagen = False
        for vol in vols:
            r = self.register_vpath(vol.realpath, vol.flags)
            if not r or "e2ts" not in vol.flags:
                continue

            cur, db_path, sz0 = r
            n_add, n_rm, success = self._build_tags_index(vol.realpath)
            if not success:
                needed_mutagen = True

            if n_add or n_rm:
                self.vac(cur, db_path, n_add, n_rm, sz0)

        self.pp.end = True
        msg = "{} volumes in {:.2f} sec"
        self.log(msg.format(len(vols), time.time() - t0))

        if needed_mutagen:
            msg = "could not read tags because no backends are available (mutagen or ffprobe)"
            self.log(msg, c=1)

        return have_e2d

    def register_vpath(self, ptop, flags):
        with self.mutex:
            if ptop in self.registry:
                return None

            _, flags = self._expr_idx_filter(flags)

            reg = {}
            path = os.path.join(ptop, ".hist", "up2k.snap")
            if "e2d" in flags and os.path.exists(path):
                with gzip.GzipFile(path, "rb") as f:
                    j = f.read().decode("utf-8")

                reg = json.loads(j)
                for _, job in reg.items():
                    job["poke"] = time.time()

                m = "loaded snap {} |{}|".format(path, len(reg.keys()))
                m = [m] + self._vis_reg_progress(reg)
                self.log("\n".join(m))

            self.flags[ptop] = flags
            self.registry[ptop] = reg
            if not HAVE_SQLITE3 or "e2d" not in flags or "d2d" in flags:
                return None

            try:
                os.mkdir(os.path.join(ptop, ".hist"))
            except:
                pass

            db_path = os.path.join(ptop, ".hist", "up2k.db")
            if ptop in self.cur:
                return None

            try:
                sz0 = 0
                if os.path.exists(db_path):
                    sz0 = os.path.getsize(db_path) // 1024

                cur = self._open_db(db_path)
                self.cur[ptop] = cur
                return [cur, db_path, sz0]
            except:
                msg = "cannot use database at [{}]:\n{}"
                self.log(msg.format(ptop, traceback.format_exc()))

            return None

    def _build_file_index(self, vol, all_vols):
        do_vac = False
        top = vol.realpath
        reg = self.register_vpath(top, vol.flags)
        if not reg:
            return

        _, db_path, sz0 = reg
        dbw = [reg[0], 0, time.time()]
        self.pp.n = next(dbw[0].execute("select count(w) from up"))[0]

        # can be symlink so don't `and d.startswith(top)``
        excl = set([d.realpath for d in all_vols if d != vol])
        n_add = self._build_dir(dbw, top, excl, top)
        n_rm = self._drop_lost(dbw[0], top)
        if dbw[1]:
            self.log("commit {} new files".format(dbw[1]))
            dbw[0].connection.commit()

        n_add, n_rm, success = self._build_tags_index(vol.realpath)

        dbw[0].connection.commit()
        if n_add or n_rm or do_vac:
            self.vac(dbw[0], db_path, n_add, n_rm, sz0)

        return success

    def vac(self, cur, db_path, n_add, n_rm, sz0):
        sz1 = os.path.getsize(db_path) // 1024
        cur.execute("vacuum")
        sz2 = os.path.getsize(db_path) // 1024
        msg = "{} new, {} del, {} kB vacced, {} kB gain, {} kB now".format(
            n_add, n_rm, sz1 - sz2, sz2 - sz0, sz2
        )
        self.log(msg)

    def _build_dir(self, dbw, top, excl, cdir):
        self.pp.msg = "a{} {}".format(self.pp.n, cdir)
        histdir = os.path.join(top, ".hist")
        ret = 0
        for iname, inf in statdir(self.log, not self.args.no_scandir, False, cdir):
            abspath = os.path.join(cdir, iname)
            lmod = int(inf.st_mtime)
            if stat.S_ISDIR(inf.st_mode):
                if abspath in excl or abspath == histdir:
                    continue
                # self.log(" dir: {}".format(abspath))
                ret += self._build_dir(dbw, top, excl, abspath)
            else:
                # self.log("file: {}".format(abspath))
                rp = abspath[len(top) :].replace("\\", "/").strip("/")
                rd, fn = rp.rsplit("/", 1) if "/" in rp else ["", rp]
                sql = "select * from up where rd = ? and fn = ?"
                try:
                    c = dbw[0].execute(sql, (rd, fn))
                except:
                    c = dbw[0].execute(sql, s3enc(self.mem_cur, rd, fn))

                in_db = list(c.fetchall())
                if in_db:
                    self.pp.n -= 1
                    _, dts, dsz, _, _ = in_db[0]
                    if len(in_db) > 1:
                        m = "WARN: multiple entries: [{}] => [{}] |{}|\n{}"
                        rep_db = "\n".join([repr(x) for x in in_db])
                        self.log(m.format(top, rp, len(in_db), rep_db))
                        dts = -1

                    if dts == lmod and dsz == inf.st_size:
                        continue

                    m = "reindex [{}] => [{}] ({}/{}) ({}/{})".format(
                        top, rp, dts, lmod, dsz, inf.st_size
                    )
                    self.log(m)
                    self.db_rm(dbw[0], rd, fn)
                    ret += 1
                    dbw[1] += 1
                    in_db = None

                self.pp.msg = "a{} {}".format(self.pp.n, abspath)
                if inf.st_size > 1024 * 1024:
                    self.log("file: {}".format(abspath))

                try:
                    hashes = self._hashlist_from_file(abspath)
                except Exception as ex:
                    self.log("hash: {} @ [{}]".format(repr(ex), abspath))
                    continue

                wark = up2k_wark_from_hashlist(self.salt, inf.st_size, hashes)
                self.db_add(dbw[0], wark, rd, fn, lmod, inf.st_size)
                dbw[1] += 1
                ret += 1
                td = time.time() - dbw[2]
                if dbw[1] >= 4096 or td >= 60:
                    self.log("commit {} new files".format(dbw[1]))
                    dbw[0].connection.commit()
                    dbw[1] = 0
                    dbw[2] = time.time()
        return ret

    def _drop_lost(self, cur, top):
        rm = []
        nchecked = 0
        nfiles = next(cur.execute("select count(w) from up"))[0]
        c = cur.execute("select * from up")
        for dwark, dts, dsz, drd, dfn in c:
            nchecked += 1
            if drd.startswith("//") or dfn.startswith("//"):
                drd, dfn = s3dec(drd, dfn)

            abspath = os.path.join(top, drd, dfn)
            # almost zero overhead dw
            self.pp.msg = "b{} {}".format(nfiles - nchecked, abspath)
            try:
                if not os.path.exists(fsenc(abspath)):
                    rm.append([drd, dfn])
            except Exception as ex:
                self.log("stat-rm: {} @ [{}]".format(repr(ex), abspath))

        if rm:
            self.log("forgetting {} deleted files".format(len(rm)))
            for rd, fn in rm:
                # self.log("{} / {}".format(rd, fn))
                self.db_rm(cur, rd, fn)

        return len(rm)

    def _build_tags_index(self, ptop):
        entags = self.entags[ptop]
        flags = self.flags[ptop]
        cur = self.cur[ptop]
        n_add = 0
        n_rm = 0
        n_buf = 0
        last_write = time.time()

        if "e2tsr" in flags:
            n_rm = cur.execute("select count(w) from mt").fetchone()[0]
            if n_rm:
                self.log("discarding {} media tags for a full rescan".format(n_rm))
                cur.execute("delete from mt")
            else:
                self.log("volume has e2tsr but there are no media tags to discard")

        # integrity: drop tags for tracks that were deleted
        if "e2t" in flags:
            drops = []
            c2 = cur.connection.cursor()
            up_q = "select w from up where substr(w,1,16) = ?"
            for (w,) in cur.execute("select w from mt"):
                if not c2.execute(up_q, (w,)).fetchone():
                    drops.append(w[:16])
            c2.close()

            if drops:
                msg = "discarding media tags for {} deleted files"
                self.log(msg.format(len(drops)))
                n_rm += len(drops)
                for w in drops:
                    cur.execute("delete from mt where w = ?", (w,))

        # bail if a volume flag disables indexing
        if "d2t" in flags or "d2d" in flags:
            return n_add, n_rm, True

        # add tags for new files
        if "e2ts" in flags:
            if not self.mtag:
                return n_add, n_rm, False

            mpool = False
            if self.mtag.prefer_mt and not self.args.no_mtag_mt:
                # mp.pool.ThreadPool and concurrent.futures.ThreadPoolExecutor
                # both do crazy runahead so lets reinvent another wheel
                nw = os.cpu_count() if hasattr(os, "cpu_count") else 4
                if self.n_mtag_tags_added == -1:
                    self.log("using {}x {}".format(nw, self.mtag.backend))
                    self.n_mtag_tags_added = 0

                mpool = Queue(nw)
                for _ in range(nw):
                    thr = threading.Thread(target=self._tag_thr, args=(mpool,))
                    thr.daemon = True
                    thr.start()

            c2 = cur.connection.cursor()
            c3 = cur.connection.cursor()
            n_left = cur.execute("select count(w) from up").fetchone()[0]
            for w, rd, fn in cur.execute("select w, rd, fn from up"):
                n_left -= 1
                q = "select w from mt where w = ?"
                if c2.execute(q, (w[:16],)).fetchone():
                    continue

                if rd.startswith("//") or fn.startswith("//"):
                    rd, fn = s3dec(rd, fn)

                abspath = os.path.join(ptop, rd, fn)
                self.pp.msg = "c{} {}".format(n_left, abspath)
                args = c3, entags, w, abspath
                if not mpool:
                    n_tags = self._tag_file(*args)
                else:
                    mpool.put(args)
                    with self.mutex:
                        n_tags = self.n_mtag_tags_added
                        self.n_mtag_tags_added = 0

                n_add += n_tags
                n_buf += n_tags

                td = time.time() - last_write
                if n_buf >= 4096 or td >= 60:
                    self.log("commit {} new tags".format(n_buf))
                    cur.connection.commit()
                    last_write = time.time()
                    n_buf = 0

            if mpool:
                for _ in range(mpool.maxsize):
                    mpool.put(None)

                mpool.join()

            c3.close()
            c2.close()

        return n_add, n_rm, True

    def _tag_thr(self, q):
        while True:
            task = q.get()
            if not task:
                q.task_done()
                return

            try:
                write_cur, entags, wark, abspath = task
                tags = self.mtag.get(abspath)
                with self.mutex:
                    n = self._tag_file(write_cur, entags, wark, abspath, tags)
                    self.n_mtag_tags_added += n
            except:
                ex = traceback.format_exc()
                msg = "{} failed to read tags from {}:\n{}"
                self.log(msg.format(self.mtag.backend, abspath, ex), c=3)

            q.task_done()

    def _tag_file(self, write_cur, entags, wark, abspath, tags=None):
        tags = tags or self.mtag.get(abspath)
        tags = {k: v for k, v in tags.items() if k in entags}
        if not tags:
            # indicate scanned without tags
            tags = {"x": 0}

        ret = 0
        for k, v in tags.items():
            q = "insert into mt values (?,?,?)"
            write_cur.execute(q, (wark[:16], k, v))
            ret += 1

        return ret

    def _orz(self, db_path):
        return sqlite3.connect(db_path, check_same_thread=False).cursor()

    def _open_db(self, db_path):
        existed = os.path.exists(db_path)
        cur = self._orz(db_path)
        ver = self._read_ver(cur)
        if not existed and ver is None:
            return self._create_db(db_path, cur)

        orig_ver = ver
        if not ver or ver < 3:
            bak = "{}.bak.{:x}.v{}".format(db_path, int(time.time()), ver)
            db = cur.connection
            cur.close()
            db.close()
            msg = "creating new DB (old is bad); backup: {}"
            if ver:
                msg = "creating backup before upgrade: {}"

            self.log(msg.format(bak))
            shutil.copy2(db_path, bak)
            cur = self._orz(db_path)

        if ver == 1:
            cur = self._upgrade_v1(cur, db_path)
            if cur:
                ver = 2

        if ver == 2:
            cur = self._create_v3(cur)
            ver = self._read_ver(cur) if cur else None

        if ver == 3:
            if orig_ver != ver:
                cur.connection.commit()
                cur.execute("vacuum")
                cur.connection.commit()

            try:
                nfiles = next(cur.execute("select count(w) from up"))[0]
                self.log("OK: {} |{}|".format(db_path, nfiles))
                return cur
            except Exception as ex:
                self.log("WARN: could not list files, DB corrupt?\n  " + repr(ex))

        if cur:
            db = cur.connection
            cur.close()
            db.close()

        return self._create_db(db_path, None)

    def _create_db(self, db_path, cur):
        if not cur:
            cur = self._orz(db_path)

        self._create_v2(cur)
        self._create_v3(cur)
        cur.connection.commit()
        self.log("created DB at {}".format(db_path))
        return cur

    def _read_ver(self, cur):
        for tab in ["ki", "kv"]:
            try:
                c = cur.execute(r"select v from {} where k = 'sver'".format(tab))
            except:
                continue

            rows = c.fetchall()
            if rows:
                return int(rows[0][0])

    def _create_v2(self, cur):
        for cmd in [
            r"create table up (w text, mt int, sz int, rd text, fn text)",
            r"create index up_rd on up(rd)",
            r"create index up_fn on up(fn)",
        ]:
            cur.execute(cmd)
        return cur

    def _create_v3(self, cur):
        """
        collision in 2^(n/2) files where n = bits (6 bits/ch)
          10*6/2 = 2^30 =       1'073'741'824, 24.1mb idx
          12*6/2 = 2^36 =      68'719'476'736, 24.8mb idx
          16*6/2 = 2^48 = 281'474'976'710'656, 26.1mb idx
        """
        for c, ks in [["drop table k", "isv"], ["drop index up_", "w"]]:
            for k in ks:
                try:
                    cur.execute(c + k)
                except:
                    pass

        idx = r"create index up_w on up(substr(w,1,16))"
        if self.no_expr_idx:
            idx = r"create index up_w on up(w)"

        for cmd in [
            idx,
            r"create table mt (w text, k text, v int)",
            r"create index mt_w on mt(w)",
            r"create index mt_k on mt(k)",
            r"create index mt_v on mt(v)",
            r"create table kv (k text, v int)",
            r"insert into kv values ('sver', 3)",
        ]:
            cur.execute(cmd)
        return cur

    def _upgrade_v1(self, odb, db_path):
        npath = db_path + ".next"
        if os.path.exists(npath):
            os.unlink(npath)

        ndb = self._orz(npath)
        self._create_v2(ndb)

        c = odb.execute("select * from up")
        for wark, ts, sz, rp in c:
            rd, fn = rp.rsplit("/", 1) if "/" in rp else ["", rp]
            v = (wark, ts, sz, rd, fn)
            ndb.execute("insert into up values (?,?,?,?,?)", v)

        ndb.connection.commit()
        ndb.connection.close()
        odb.connection.close()
        atomic_move(npath, db_path)
        return self._orz(db_path)

    def handle_json(self, cj):
        if not self.register_vpath(cj["ptop"], cj["vcfg"]):
            if cj["ptop"] not in self.registry:
                raise Pebkac(410, "location unavailable")

        cj["name"] = sanitize_fn(cj["name"])
        cj["poke"] = time.time()
        wark = self._get_wark(cj)
        now = time.time()
        job = None
        with self.mutex:
            cur = self.cur.get(cj["ptop"], None)
            reg = self.registry[cj["ptop"]]
            if cur:
                if self.no_expr_idx:
                    q = r"select * from up where w = ?"
                    argv = (wark,)
                else:
                    q = r"select * from up where substr(w,1,16) = ? and w = ?"
                    argv = (wark[:16], wark)

                cur = cur.execute(q, argv)
                for _, dtime, dsize, dp_dir, dp_fn in cur:
                    if dp_dir.startswith("//") or dp_fn.startswith("//"):
                        dp_dir, dp_fn = s3dec(dp_dir, dp_fn)

                    dp_abs = os.path.join(cj["ptop"], dp_dir, dp_fn).replace("\\", "/")
                    # relying on path.exists to return false on broken symlinks
                    if os.path.exists(fsenc(dp_abs)):
                        job = {
                            "name": dp_fn,
                            "prel": dp_dir,
                            "vtop": cj["vtop"],
                            "ptop": cj["ptop"],
                            "size": dsize,
                            "lmod": dtime,
                            "hash": [],
                            "need": [],
                        }
                        break

                if job and wark in reg:
                    del reg[wark]

            if job or wark in reg:
                job = job or reg[wark]
                if job["prel"] == cj["prel"] and job["name"] == cj["name"]:
                    # ensure the files haven't been deleted manually
                    names = [job[x] for x in ["name", "tnam"] if x in job]
                    for fn in names:
                        path = os.path.join(job["ptop"], job["prel"], fn)
                        try:
                            if os.path.getsize(path) > 0:
                                # upload completed or both present
                                break
                        except:
                            # missing; restart
                            job = None
                            break
                else:
                    # file contents match, but not the path
                    src = os.path.join(job["ptop"], job["prel"], job["name"])
                    dst = os.path.join(cj["ptop"], cj["prel"], cj["name"])
                    vsrc = os.path.join(job["vtop"], job["prel"], job["name"])
                    vsrc = vsrc.replace("\\", "/")  # just for prints anyways
                    if job["need"]:
                        self.log("unfinished:\n  {0}\n  {1}".format(src, dst))
                        err = "partial upload exists at a different location; please resume uploading here instead:\n"
                        err += "/" + vsrc + " "
                        raise Pebkac(400, err)
                    elif "nodupe" in self.flags[job["ptop"]]:
                        self.log("dupe-reject:\n  {0}\n  {1}".format(src, dst))
                        err = "upload rejected, file already exists:\n/" + vsrc + " "
                        raise Pebkac(400, err)
                    else:
                        # symlink to the client-provided name,
                        # returning the previous upload info
                        job = deepcopy(job)
                        for k in ["ptop", "vtop", "prel"]:
                            job[k] = cj[k]

                        pdir = os.path.join(cj["ptop"], cj["prel"])
                        job["name"] = self._untaken(pdir, cj["name"], now, cj["addr"])
                        dst = os.path.join(job["ptop"], job["prel"], job["name"])
                        os.unlink(fsenc(dst))  # TODO ed pls
                        self._symlink(src, dst)

            if not job:
                job = {
                    "wark": wark,
                    "t0": now,
                    "hash": deepcopy(cj["hash"]),
                    "need": [],
                }
                # client-provided, sanitized by _get_wark: name, size, lmod
                for k in [
                    "addr",
                    "vtop",
                    "ptop",
                    "prel",
                    "name",
                    "size",
                    "lmod",
                    "poke",
                ]:
                    job[k] = cj[k]

                # one chunk may occur multiple times in a file;
                # filter to unique values for the list of missing chunks
                # (preserve order to reduce disk thrashing)
                lut = {}
                for k in cj["hash"]:
                    if k not in lut:
                        job["need"].append(k)
                        lut[k] = 1

                self._new_upload(job)

            return {
                "name": job["name"],
                "size": job["size"],
                "lmod": job["lmod"],
                "hash": job["need"],
                "wark": wark,
            }

    def _untaken(self, fdir, fname, ts, ip):
        # TODO broker which avoid this race and
        # provides a new filename if taken (same as bup)
        suffix = ".{:.6f}-{}".format(ts, ip)
        with ren_open(fname, "wb", fdir=fdir, suffix=suffix) as f:
            return f["orz"][1]

    def _symlink(self, src, dst):
        # TODO store this in linktab so we never delete src if there are links to it
        self.log("linking dupe:\n  {0}\n  {1}".format(src, dst))
        try:
            lsrc = src
            ldst = dst
            fs1 = os.stat(fsenc(os.path.split(src)[0])).st_dev
            fs2 = os.stat(fsenc(os.path.split(dst)[0])).st_dev
            if fs1 == 0:
                # py2 on winxp or other unsupported combination
                raise OSError()
            elif fs1 == fs2:
                # same fs; make symlink as relative as possible
                v = []
                for p in [src, dst]:
                    if WINDOWS:
                        p = p.replace("\\", "/")
                    v.append(p.split("/"))

                nsrc, ndst = v
                nc = 0
                for a, b in zip(nsrc, ndst):
                    if a != b:
                        break
                    nc += 1
                if nc > 1:
                    lsrc = nsrc[nc:]
                    hops = len(ndst[nc:]) - 1
                    lsrc = "../" * hops + "/".join(lsrc)
            os.symlink(fsenc(lsrc), fsenc(ldst))
        except (AttributeError, OSError) as ex:
            self.log("cannot symlink; creating copy: " + repr(ex))
            shutil.copy2(fsenc(src), fsenc(dst))

    def handle_chunk(self, ptop, wark, chash):
        with self.mutex:
            job = self.registry[ptop].get(wark, None)
            if not job:
                raise Pebkac(400, "unknown wark")

            if chash not in job["need"]:
                raise Pebkac(400, "already got that but thanks??")

            nchunk = [n for n, v in enumerate(job["hash"]) if v == chash]
            if not nchunk:
                raise Pebkac(400, "unknown chunk")

        job["poke"] = time.time()

        chunksize = up2k_chunksize(job["size"])
        ofs = [chunksize * x for x in nchunk]

        path = os.path.join(job["ptop"], job["prel"], job["tnam"])

        return [chunksize, ofs, path, job["lmod"]]

    def confirm_chunk(self, ptop, wark, chash):
        with self.mutex:
            try:
                job = self.registry[ptop][wark]
                pdir = os.path.join(job["ptop"], job["prel"])
                src = os.path.join(pdir, job["tnam"])
                dst = os.path.join(pdir, job["name"])
            except Exception as ex:
                return "confirm_chunk, wark, " + repr(ex)

            try:
                job["need"].remove(chash)
            except Exception as ex:
                return "confirm_chunk, chash, " + repr(ex)

            ret = len(job["need"])
            if ret > 0:
                return ret, src

            atomic_move(src, dst)

            if WINDOWS:
                self.lastmod_q.put([dst, (int(time.time()), int(job["lmod"]))])

            # legit api sware 2 me mum
            if self.idx_wark(
                job["ptop"],
                job["wark"],
                job["prel"],
                job["name"],
                job["lmod"],
                job["size"],
            ):
                del self.registry[ptop][wark]
                # in-memory registry is reserved for unfinished uploads

        return ret, dst

    def idx_wark(self, ptop, wark, rd, fn, lmod, sz):
        cur = self.cur.get(ptop, None)
        if not cur:
            return False

        self.db_rm(cur, rd, fn)
        self.db_add(cur, wark, rd, fn, int(lmod), sz)
        cur.connection.commit()

        if "e2t" in self.flags[ptop]:
            self.tagq.put([ptop, wark, rd, fn])

        return True

    def db_rm(self, db, rd, fn):
        sql = "delete from up where rd = ? and fn = ?"
        try:
            db.execute(sql, (rd, fn))
        except:
            db.execute(sql, s3enc(self.mem_cur, rd, fn))

    def db_add(self, db, wark, rd, fn, ts, sz):
        sql = "insert into up values (?,?,?,?,?)"
        v = (wark, int(ts), sz, rd, fn)
        try:
            db.execute(sql, v)
        except:
            rd, fn = s3enc(self.mem_cur, rd, fn)
            v = (wark, ts, sz, rd, fn)
            db.execute(sql, v)

    def _get_wark(self, cj):
        if len(cj["name"]) > 1024 or len(cj["hash"]) > 512 * 1024:  # 16TiB
            raise Pebkac(400, "name or numchunks not according to spec")

        for k in cj["hash"]:
            if not self.r_hash.match(k):
                raise Pebkac(
                    400, "at least one hash is not according to spec: {}".format(k)
                )

        # try to use client-provided timestamp, don't care if it fails somehow
        try:
            cj["lmod"] = int(cj["lmod"])
        except:
            cj["lmod"] = int(time.time())

        wark = up2k_wark_from_hashlist(self.salt, cj["size"], cj["hash"])
        return wark

    def _hashlist_from_file(self, path):
        fsz = os.path.getsize(path)
        csz = up2k_chunksize(fsz)
        ret = []
        with open(path, "rb", 512 * 1024) as f:
            while fsz > 0:
                self.pp.msg = "{} MB, {}".format(int(fsz / 1024 / 1024), path)
                hashobj = hashlib.sha512()
                rem = min(csz, fsz)
                fsz -= rem
                while rem > 0:
                    buf = f.read(min(rem, 64 * 1024))
                    if not buf:
                        raise Exception("EOF at " + str(f.tell()))

                    hashobj.update(buf)
                    rem -= len(buf)

                digest = hashobj.digest()[:32]
                digest = base64.urlsafe_b64encode(digest)
                ret.append(digest.decode("utf-8").rstrip("="))

        return ret

    def _new_upload(self, job):
        self.registry[job["ptop"]][job["wark"]] = job
        pdir = os.path.join(job["ptop"], job["prel"])
        job["name"] = self._untaken(pdir, job["name"], job["t0"], job["addr"])
        # if len(job["name"].split(".")) > 8:
        #    raise Exception("aaa")

        tnam = job["name"] + ".PARTIAL"
        suffix = ".{:.6f}-{}".format(job["t0"], job["addr"])
        with ren_open(tnam, "wb", fdir=pdir, suffix=suffix) as f:
            f, job["tnam"] = f["orz"]
            f.seek(job["size"] - 1)
            f.write(b"e")

    def _lastmodder(self):
        while True:
            ready = []
            while not self.lastmod_q.empty():
                ready.append(self.lastmod_q.get())

            # self.log("lmod: got {}".format(len(ready)))
            time.sleep(5)
            for path, times in ready:
                self.log("lmod: setting times {} on {}".format(times, path))
                try:
                    os.utime(fsenc(path), times)
                except:
                    self.log("lmod: failed to utime ({}, {})".format(path, times))

    def _snapshot(self):
        persist_interval = 30  # persist unfinished uploads index every 30 sec
        discard_interval = 21600  # drop unfinished uploads after 6 hours inactivity
        prev = {}
        while True:
            time.sleep(persist_interval)
            with self.mutex:
                for k, reg in self.registry.items():
                    self._snap_reg(prev, k, reg, discard_interval)

    def _snap_reg(self, prev, k, reg, discard_interval):
        now = time.time()
        rm = [x for x in reg.values() if now - x["poke"] > discard_interval]
        if rm:
            m = "dropping {} abandoned uploads in {}".format(len(rm), k)
            vis = [self._vis_job_progress(x) for x in rm]
            self.log("\n".join([m] + vis))
            for job in rm:
                del reg[job["wark"]]
                try:
                    # remove the filename reservation
                    path = os.path.join(job["ptop"], job["prel"], job["name"])
                    if os.path.getsize(path) == 0:
                        os.unlink(path)

                    if len(job["hash"]) == len(job["need"]):
                        # PARTIAL is empty, delete that too
                        path = os.path.join(job["ptop"], job["prel"], job["tnam"])
                        os.unlink(path)
                except:
                    pass

        path = os.path.join(k, ".hist", "up2k.snap")
        if not reg:
            if k not in prev or prev[k] is not None:
                prev[k] = None
                if os.path.exists(path):
                    os.unlink(path)
            return

        newest = max(x["poke"] for _, x in reg.items()) if reg else 0
        etag = [len(reg), newest]
        if etag == prev.get(k, None):
            return

        try:
            os.mkdir(os.path.join(k, ".hist"))
        except:
            pass

        path2 = "{}.{}".format(path, os.getpid())
        j = json.dumps(reg, indent=2, sort_keys=True).encode("utf-8")
        with gzip.GzipFile(path2, "wb") as f:
            f.write(j)

        atomic_move(path2, path)

        self.log("snap: {} |{}|".format(path, len(reg.keys())))
        prev[k] = etag

    def _tagger(self):
        while True:
            ptop, wark, rd, fn = self.tagq.get()
            abspath = os.path.join(ptop, rd, fn)
            self.log("tagging " + abspath)
            with self.mutex:
                cur = self.cur[ptop]
                if not cur:
                    self.log("no cursor to write tags with??", c=1)
                    continue

                entags = self.entags[ptop]
                if not entags:
                    self.log("no entags okay.jpg", c=3)
                    continue

                if "e2t" in self.flags[ptop]:
                    self._tag_file(cur, entags, wark, abspath)

                cur.connection.commit()

    def _hasher(self):
        while True:
            ptop, rd, fn = self.hashq.get()
            if "e2d" not in self.flags[ptop]:
                continue

            abspath = os.path.join(ptop, rd, fn)
            self.log("hashing " + abspath)
            inf = os.stat(fsenc(abspath))
            hashes = self._hashlist_from_file(abspath)
            wark = up2k_wark_from_hashlist(self.salt, inf.st_size, hashes)
            with self.mutex:
                self.idx_wark(ptop, wark, rd, fn, inf.st_mtime, inf.st_size)

    def hash_file(self, ptop, flags, rd, fn):
        self.register_vpath(ptop, flags)
        self.hashq.put([ptop, rd, fn])


def up2k_chunksize(filesize):
    chunksize = 1024 * 1024
    stepsize = 512 * 1024
    while True:
        for mul in [1, 2]:
            nchunks = math.ceil(filesize * 1.0 / chunksize)
            if nchunks <= 256 or chunksize >= 32 * 1024 * 1024:
                return chunksize

            chunksize += stepsize
            stepsize *= mul


def up2k_wark_from_hashlist(salt, filesize, hashes):
    """ server-reproducible file identifier, independent of name or location """
    ident = [salt, str(filesize)]
    ident.extend(hashes)
    ident = "\n".join(ident)

    hasher = hashlib.sha512()
    hasher.update(ident.encode("utf-8"))
    digest = hasher.digest()[:32]

    wark = base64.urlsafe_b64encode(digest)
    return wark.decode("utf-8").rstrip("=")