model.zig

const std = @import("std");
const main = @import("main.zig");
const ui = @import("ui.zig");
usingnamespace @import("util.zig");

// While an arena allocator is optimimal for almost all scenarios in which ncdu
// is used, it doesn't allow for re-using deleted nodes after doing a delete or
// refresh operation, so a long-running ncdu session with regular refreshes
// will leak memory, but I'd say that's worth the efficiency gains.
// TODO: Can still implement a simple bucketed free list on top of this arena
// allocator to reuse nodes, if necessary.
var allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);

pub const EType = packed enum(u2) { dir, link, file };

// Memory layout:
//      Dir + name (+ alignment + Ext)
//  or: Link + name (+ alignment + Ext)
//  or: File + name (+ alignment + Ext)
//
// Entry is always the first part of Dir, Link and File, so a pointer cast to
// *Entry is always safe and an *Entry can be casted to the full type.
// (TODO: What are the aliassing rules for Zig? There is a 'noalias' keyword,
// but does that mean all unmarked pointers are allowed to alias?)
// (TODO: The 'alignment' in the layout above is a lie, none of these structs
// or fields have any sort of alignment. This is great for saving memory but
// perhaps not very great for code size or performance. Might want to
// experiment with setting some alignment and measure the impact)
// (TODO: Putting Ext before the Entry pointer may be a little faster; removes
// the need to iterate over the name)
pub const Entry = packed struct {
    etype: EType,
    isext: bool,
    blocks: u61, // 512-byte blocks
    size: u64,
    next: ?*Entry,

    const Self = @This();

    pub fn dir(self: *Self) ?*Dir {
        return if (self.etype == .dir) @ptrCast(*Dir, self) else null;
    }

    pub fn link(self: *Self) ?*Link {
        return if (self.etype == .link) @ptrCast(*Link, self) else null;
    }

    pub fn file(self: *Self) ?*File {
        return if (self.etype == .file) @ptrCast(*File, self) else null;
    }

    // Whether this entry should be displayed as a "directory".
    // Some dirs are actually represented in this data model as a File for efficiency.
    pub fn isDirectory(self: *Self) bool {
        return if (self.file()) |f| f.other_fs or f.kernfs else self.etype == .dir;
    }

    fn nameOffset(etype: EType) usize {
        return switch (etype) {
            .dir => @byteOffsetOf(Dir, "name"),
            .link => @byteOffsetOf(Link, "name"),
            .file => @byteOffsetOf(File, "name"),
        };
    }

    pub fn name(self: *const Self) [:0]const u8 {
        const ptr = @intToPtr([*:0]u8, @ptrToInt(self) + nameOffset(self.etype));
        return ptr[0..std.mem.lenZ(ptr) :0];
    }

    pub fn ext(self: *Self) ?*Ext {
        if (!self.isext) return null;
        const n = self.name();
        return @intToPtr(*Ext, std.mem.alignForward(@ptrToInt(self) + nameOffset(self.etype) + n.len + 1, @alignOf(Ext)));
    }

    pub fn create(etype: EType, isext: bool, ename: []const u8) *Entry {
        const base_size = nameOffset(etype) + ename.len + 1;
        const size = (if (isext) std.mem.alignForward(base_size, @alignOf(Ext))+@sizeOf(Ext) else base_size);
        var ptr = blk: {
            while (true) {
                if (allocator.allocator.allocWithOptions(u8, size, @alignOf(Entry), null)) |p|
                    break :blk p
                else |_| {}
                ui.oom();
            }
        };
        std.mem.set(u8, ptr, 0); // kind of ugly, but does the trick
        var e = @ptrCast(*Entry, ptr);
        e.etype = etype;
        e.isext = isext;
        var name_ptr = @intToPtr([*]u8, @ptrToInt(e) + nameOffset(etype));
        std.mem.copy(u8, name_ptr[0..ename.len], ename);
        return e;
    }

    // Set the 'err' flag on Dirs and Files, propagating 'suberr' to parents.
    pub fn set_err(self: *Self, parents: *const Parents) void {
        if (self.dir()) |d| d.err = true
        else if (self.file()) |f| f.err = true
        else unreachable;
        var it = parents.iter();
        if (&parents.top().entry == self) _ = it.next();
        while (it.next()) |p| {
            if (p.suberr) break;
            p.suberr = true;
        }
    }

    fn addStats(self: *Entry, parents: *const Parents) void {
        const dev = parents.top().dev;
        // Set if this is the first time we've found this hardlink in the bottom-most directory of the given dev.
        // Means we should count it for other-dev parent dirs, too.
        var new_hl = false;

        var it = parents.iter();
        while(it.next()) |p| {
            var add_total = false;

            if (self.ext()) |e|
                if (p.entry.ext()) |pe|
                    if (e.mtime > pe.mtime) { pe.mtime = e.mtime; };
            p.items = saturateAdd(p.items, 1);

            // Hardlink in a subdirectory with a different device, only count it the first time.
            if (self.etype == .link and dev != p.dev) {
                add_total = new_hl;

            } else if (self.link()) |l| {
                const n = devices.HardlinkNode{ .ino = l.ino, .dir = p };
                var d = devices.list.items[dev].hardlinks.getOrPut(n) catch unreachable;
                new_hl = !d.found_existing;
                // First time we encounter this file in this dir, count it.
                if (!d.found_existing) {
                    d.value_ptr.* = 1;
                    add_total = true;
                    p.shared_size = saturateAdd(p.shared_size, self.size);
                    p.shared_blocks = saturateAdd(p.shared_blocks, self.blocks);
                } else {
                    d.value_ptr.* += 1;
                    // Encountered this file in this dir the same number of times as its link count, meaning it's not shared with other dirs.
                    if(d.value_ptr.* == l.nlink) {
                        p.shared_size = saturateSub(p.shared_size, self.size);
                        p.shared_blocks = saturateSub(p.shared_blocks, self.blocks);
                    }
                }

            } else
                add_total = true;
            if(add_total) {
                p.entry.size = saturateAdd(p.entry.size, self.size);
                p.entry.blocks = saturateAdd(p.entry.blocks, self.blocks);
            }
        }
    }

    // Insert this entry into the tree at the given directory, updating parent sizes and item counts.
    pub fn insert(self: *Entry, parents: *const Parents) void {
        self.next = parents.top().sub;
        parents.top().sub = self;
        if (self.dir()) |d| std.debug.assert(d.sub == null);

        // Links with nlink == 0 are counted after we're done scanning.
        if (if (self.link()) |l| l.nlink == 0 else false)
            link_count.add(parents.top().dev, self.link().?.ino)
        else
            self.addStats(parents);
    }
};

const DevId = u30; // Can be reduced to make room for more flags in Dir.

pub const Dir = packed struct {
    entry: Entry,

    sub: ?*Entry,

    // entry.{blocks,size}: Total size of all unique files + dirs. Non-shared hardlinks are counted only once.
    //   (i.e. the space you'll need if you created a filesystem with only this dir)
    // shared_*: Unique hardlinks that still have references outside of this directory.
    //   (i.e. the space you won't reclaim by deleting this dir)
    // (space reclaimed by deleting a dir =~ entry. - shared_)
    shared_blocks: u64,
    shared_size: u64,
    items: u32,

    // Indexes into the global 'devices.list' array
    dev: DevId,

    err: bool,
    suberr: bool,

    // Only used to find the @byteOffsetOff, the name is written at this point as a 0-terminated string.
    // (Old C habits die hard)
    name: u8,
};

// File that's been hardlinked (i.e. nlink > 1)
pub const Link = packed struct {
    entry: Entry,
    // dev is inherited from the parent Dir
    ino: u64,
    // Special value '0' means: "This link hasn't been counted in the parent
    // sizes yet because we only know that it's a hard link but not how many
    // links it has". These are added to the tree structure first and are
    // counted after the scan is complete (see link_count below).
    nlink: u32,
    name: u8,
};

// Anything that's not an (indexed) directory or hardlink. Excluded directories are also "Files".
pub const File = packed struct {
    entry: Entry,

    err: bool,
    excluded: bool,
    other_fs: bool,
    kernfs: bool,
    notreg: bool,
    _pad: u3,

    name: u8,
};

pub const Ext = packed struct {
    mtime: u64 = 0,
    uid: u32 = 0,
    gid: u32 = 0,
    mode: u16 = 0,
};

comptime {
    std.debug.assert(@bitOffsetOf(Dir, "name") % 8 == 0);
    std.debug.assert(@bitOffsetOf(Link, "name") % 8 == 0);
    std.debug.assert(@bitOffsetOf(File, "name") % 8 == 0);
}


// Hardlink handling:
//
//   Global lookup table of dev -> (ino,*Dir) -> num_files
//
//   num_files is how many times the file has been found in the particular dir.
//   num_links is the file's st_nlink count.
//
//   Adding a hardlink: O(parents)
//
//     for dir in file.parents:
//       add to dir.total_* if it's not yet in the lookup table
//       add to num_files in the lookup table
//       add to dir.shared_* where num_files == 1
//
//   Removing a hardlink: O(parents)
//
//     for dir in file.parents:
//       subtract from num_files in the lookup table
//       subtract from dir.total_* if num_files == 0
//       subtract from dir.shared_* if num_files == num_links-1
//       remove from lookup table if num_files == 0
//
//   Re-calculating full hardlink stats (only possible when also storing sizes):
//
//     reset total_* and shared_* for all dirs
//     for (file,dir) in lookup_table:
//       dir.total_* += file
//       if file.num_links != dir.num_files:
//         dir.shared_* += file
//
// Problem: num_links is not available in ncdu JSON dumps, will have to assume
//   that there are no shared hardlinks outside of the given dump.
//
// Problem: This data structure does not provide a way to easily list all paths
//   with the same dev,ino. ncdu provides this list in the info window. Doesn't
//   seem too commonly used, can still be provided by a slow full scan of the
//   tree.
//
// Problem: A file's st_nlink count may have changed during a scan and hence be
//   inconsistent with other entries for the same file. Not ~too~ common so a
//   few glitches are fine, but I haven't worked out the impact of this yet.


pub const devices = struct {
    var list: std.ArrayList(Device) = std.ArrayList(Device).init(main.allocator);
    // dev -> id
    var lookup: std.AutoHashMap(u64, DevId) = std.AutoHashMap(u64, DevId).init(main.allocator);

    // 20 bytes per hardlink/Dir entry, 16 for the key + 4 for the value.
    //
    // Potential problem: HashMap uses a 32bit item counter, which may be exceeded in extreme scenarios.
    // (ncdu 1.x doesn't support more than 31bit-counted files, but this table is hardlink_count*parent_dirs and may grow a bit)
    const HardlinkNode = struct { ino: u64, dir: *Dir };
    const Hardlinks = std.AutoHashMap(HardlinkNode, u32);

    // Device entry, this is used for two reasons:
    // 1. st_dev ids are 64-bit, but in a typical filesystem there's only a few
    //    unique ids, hence we can save RAM by only storing smaller DevId's in Dir
    //    entries and using that as an index to a lookup table.
    // 2. Keeping track of hardlink counts for each dir and inode, as described above.
    //
    // (Device entries are never deallocated)
    const Device = struct {
        dev: u64,
        hardlinks: Hardlinks = Hardlinks.init(main.allocator),
    };

    pub fn getId(dev: u64) DevId {
        var d = lookup.getOrPut(dev) catch unreachable;
        if (!d.found_existing) {
            d.value_ptr.* = @intCast(DevId, list.items.len);
            list.append(.{ .dev = dev }) catch unreachable;
        }
        return d.value_ptr.*;
    }

    pub fn getDev(id: DevId) u64 {
        return list.items[id].dev;
    }
};

// Special hash table for counting hard links with nlink=0.
pub const link_count = struct {
    var nodes = std.HashMap(Node, void, HashContext, 80).init(main.allocator);

    // Single node for both key (dev,ino) and value (count), in order to prevent padding between hash table node entries.
    const Node = struct {
        ino: u64,
        dev: u32, // DevId, but 32-bits for easier hashing
        count: u32,
    };

    const HashContext = struct {
        pub fn hash(self: @This(), v: Node) u64 {
            var h = std.hash.Wyhash.init(0);
            h.update(std.mem.asBytes(&v.dev));
            h.update(std.mem.asBytes(&v.ino));
            return h.final();
        }

        pub fn eql(self: @This(), a: Node, b: Node) bool {
            return a.ino == b.ino and a.dev == b.dev;
        }
    };

    pub fn add(dev: DevId, ino: u64) void {
        const n = Node{ .dev = dev, .ino = ino, .count = 1 };
        var d = nodes.getOrPut(n) catch unreachable;
        if (d.found_existing) d.key_ptr.*.count += 1;
    }

    var final_dir: Parents = undefined;

    fn final_rec() void {
        var it = final_dir.top().sub;
        while (it) |e| : (it = e.next) {
            if (e.dir()) |d| {
                final_dir.push(d);
                final_rec();
                final_dir.pop();
                continue;
            }
            const l = e.link() orelse continue;
            if (l.nlink > 0) continue;
            const s = Node{ .dev = final_dir.top().dev, .ino = l.ino, .count = 0 };
            if (nodes.getEntry(s)) |n| {
                l.nlink = n.key_ptr.*.count;
                e.addStats(&final_dir);
            }
        }
    }

    // Called when all files have been added, will traverse the directory to
    // find all links, update their nlink count and parent sizes.
    pub fn final() void {
        if (nodes.count() == 0) return;
        final_dir = Parents{};
        final_rec();
        nodes.clearAndFree();
        final_dir.deinit();
    }
};

pub var root: *Dir = undefined;

// Stack of parent directories, convenient helper when constructing and traversing the tree.
// The 'root' node is always implicitely at the bottom of the stack.
pub const Parents = struct {
    stack: std.ArrayList(*Dir) = std.ArrayList(*Dir).init(main.allocator),

    const Self = @This();

    pub fn push(self: *Self, dir: *Dir) void {
        return self.stack.append(dir) catch unreachable;
    }

    // Attempting to remove the root node is considered a bug.
    pub fn pop(self: *Self) void {
        _ = self.stack.pop();
    }

    pub fn top(self: *const Self) *Dir {
        return if (self.stack.items.len == 0) root else self.stack.items[self.stack.items.len-1];
    }

    pub const Iterator = struct {
        lst: *const Self,
        index: usize = 0, // 0 = top of the stack, counts upwards to go down

        pub fn next(it: *Iterator) ?*Dir {
            const len = it.lst.stack.items.len;
            if (it.index > len) return null;
            it.index += 1;
            return if (it.index > len) root else it.lst.stack.items[len-it.index];
        }
    };

    // Iterate from top to bottom of the stack.
    pub fn iter(self: *const Self) Iterator {
        return .{ .lst = self };
    }

    // Append the path to the given arraylist. The list is assumed to use main.allocator, so it can't fail.
    pub fn path(self: *const Self, out: *std.ArrayList(u8)) void {
        const r = root.entry.name();
        out.appendSlice(r) catch unreachable;
        var i: usize = 0;
        while (i < self.stack.items.len) {
            if (i != 0 or r[r.len-1] != '/') out.append('/') catch unreachable;
            out.appendSlice(self.stack.items[i].entry.name()) catch unreachable;
            i += 1;
        }
    }

    pub fn deinit(self: *Self) void {
        self.stack.deinit();
    }
};


test "entry" {
    var e = Entry.create(.file, false, "hello") catch unreachable;
    std.debug.assert(e.etype == .file);
    std.debug.assert(!e.isext);
    std.testing.expectEqualStrings(e.name(), "hello");
}