kernel: add a patch for union mounts in 2.6.31
[openwrt/openwrt.git] / target / linux / generic-2.6 / patches-2.6.31 / 230-union_mounts.patch
1 --- /dev/null
2 +++ b/Documentation/filesystems/union-mounts.txt
3 @@ -0,0 +1,187 @@
4 +VFS based Union Mounts
5 +----------------------
6 +
7 + 1. What are "Union Mounts"
8 + 2. The Union Stack
9 + 3. Whiteouts, Opaque Directories, and Fallthrus
10 + 4. Copy-up
11 + 5. Directory Reading
12 + 6. Known Problems
13 + 7. References
14 +
15 +-------------------------------------------------------------------------------
16 +
17 +1. What are "Union Mounts"
18 +==========================
19 +
20 +Please note: this is NOT about UnionFS and it is NOT a derived work!
21 +
22 +Traditionally the mount operation is opaque: the content of the mount point,
23 +the directory on which the file system is mounted, is hidden by the content
24 +of the mounted file system's root directory until the file system is
25 +unmounted again. Unlike this traditional UNIX mount mechanism, which hides
26 +the contents of the mount point, a union mount presents a view as if both
27 +file systems were merged together. Although only the topmost layer of the
28 +mount stack can be altered, it appears as if transparent file system mounts
29 +allow any file to be created, modified or deleted.
30 +
31 +Most people know the concepts and features of union mounts from other
32 +operating systems like Sun's Translucent Filesystem, Plan9 or BSD. For an
33 +in-depth review of union mounts and other unioning file systems, see:
34 +
35 +http://lwn.net/Articles/324291/
36 +http://lwn.net/Articles/325369/
37 +http://lwn.net/Articles/327738/
38 +
39 +Here are the key features of this implementation:
40 +- completely VFS based
41 +- does not change the namespace stacking
42 +- directory listings have duplicate entries removed in the kernel
43 +- writable unions: only the topmost file system layer may be writable
44 +- writable unions: new whiteout filetype handled inside the kernel
45 +
46 +-------------------------------------------------------------------------------
47 +
48 +2. The Union Stack
49 +==================
50 +
51 +The mounted file systems are organized in the "file system hierarchy" (a tree
52 +of vfsmount structures), which keeps track of the stacking of file systems
53 +upon each other. The per-directory view of the file system hierarchy is called
54 +the "mount stack" and reflects the order of the file systems mounted on a
55 +specific directory.
56 +
57 +Union mounts present a single unified view of the contents of two or more file
58 +systems as if they were merged together. Since the file system hierarchy does
59 +not record which file system objects are part of a unified view, a new
60 +structure is needed. The file system objects that are part of a unified view
61 +are ordered in a so-called "union stack". Only directories can be part of a
62 +unified view.
63 +
64 +The link between two layers of the union stack is maintained using the
65 +union_mount structure (#include <linux/union.h>):
66 +
67 +struct union_mount {
68 + atomic_t u_count; /* reference count */
69 + struct mutex u_mutex;
70 + struct list_head u_unions; /* list head for d_unions */
71 + struct hlist_node u_hash; /* list head for searching */
72 + struct hlist_node u_rhash; /* list head for reverse searching */
73 +
74 + struct path u_this; /* this is me */
75 + struct path u_next; /* this is what I overlay */
76 +};
77 +
78 +The union_mount structure holds a reference (dget, mntget) to the next lower
79 +layer of the union stack. Since a dentry can be part of multiple unions
80 +(e.g. with bind mounts), its union_mount structures are linked together via
81 +the d_unions field of the dentry structure.
82 +
83 +All union_mount structures are cached in two hash tables, one for lookups of
84 +the next lower layer of the union stack and one for reverse lookups of the
85 +next upper layer of the union stack. The reverse lookup is necessary to
86 +resolve CWD-relative path lookups. The hash value is calculated from the
87 +(dentry, vfsmount) pair. The u_this field keys the hash table used for
88 +forward lookups and the u_next field keys the table used for reverse lookups.
89 +
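+For illustration only, a forward lookup can be pictured as a walk over the
+u_hash chain of the bucket selected by the upper layer's (dentry, vfsmount)
+pair. The sketch below is not part of this patch; the function name and the
+union_hash_bucket() helper are assumptions:
+
+static struct union_mount *union_lookup_sketch(struct dentry *dentry,
+                                               struct vfsmount *mnt)
+{
+	struct hlist_head *head = union_hash_bucket(dentry, mnt); /* assumed */
+	struct union_mount *um;
+	struct hlist_node *node;
+
+	hlist_for_each_entry(um, node, head, u_hash) {
+		if (um->u_this.dentry == dentry && um->u_this.mnt == mnt)
+			return um;	/* um->u_next is the next lower layer */
+	}
+	return NULL;
+}
+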
90 +During every new mount (or mount propagation), a new union_mount structure is
91 +allocated. A reference to the mountpoint's vfsmount and dentry is taken and
92 +stored in the u_next field. In almost the same manner, a union_mount
93 +structure is created during the first lookup of a directory within a union
94 +mount point. In this case the lookup proceeds to all lower layers of the
95 +union. Therefore the complete union stack is constructed during lookups.
96 +
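+The following is a condensed sketch of how the lower layers are discovered
+and linked during a cached lookup. It is a simplification of
+__cache_lookup_build_union() from the fs/namei.c changes in this patch, with
+d_revalidate handling omitted:
+
+static void build_union_sketch(struct nameidata *nd, struct qstr *name,
+			       struct path *topmost)
+{
+	struct path last = *topmost;
+	struct dentry *dentry;
+
+	while (follow_union_down(&nd->path)) {
+		dentry = d_hash_and_lookup(nd->path.dentry, name);
+		if (!dentry)
+			break;
+		if (d_is_whiteout(dentry)) {
+			dput(dentry);	/* a whiteout ends the stack */
+			break;
+		}
+		if (!dentry->d_inode) {
+			dput(dentry);	/* plain negative: keep descending */
+			continue;
+		}
+		if (!S_ISDIR(dentry->d_inode->i_mode)) {
+			dput(dentry);	/* only dirs can join the stack */
+			break;
+		}
+		/* remember the newly found layer below the previous one */
+		append_to_union(last.mnt, last.dentry, nd->path.mnt, dentry);
+		if (last.dentry != topmost->dentry)
+			path_put(&last);
+		last.dentry = dentry;
+		last.mnt = mntget(nd->path.mnt);
+	}
+	if (last.dentry != topmost->dentry)
+		path_put(&last);
+}
+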
97 +The union_mount structures of a dentry are destroyed when the dentry itself is
98 +destroyed. The dentry cache therefore indirectly drives the union_mount
99 +cache, just as it does for inodes. Please note that lower-layer
100 +union_mount structures are kept in memory until the topmost dentry is
101 +destroyed.
102 +
103 +-------------------------------------------------------------------------------
104 +
105 +3. Whiteouts, Opaque Directories, and Fallthrus
106 +===============================================
107 +
108 +The whiteout filetype isn't new. It has existed for quite some time, but
109 +Linux's VFS hasn't used it until now. With the availability of union mount
110 +code inside the VFS, the whiteout filetype becomes important for supporting
111 +writable union mounts. For read-only union mounts, support for whiteouts or
112 +copy-on-open is not necessary.
113 +
114 +The whiteout filetype serves the same function as a negative dentry: it
115 +describes a filename that isn't there. The creation of whiteouts needs
116 +low-level filesystem support. At the time of writing, whiteout support is
117 +available for tmpfs, ext2 and ext3. The VFS is extended to make the
118 +whiteout handling transparent to all its users. Whiteouts are not
119 +visible to user space.
120 +
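+As a concrete example, the ext2 support added later in this patch records a
+whiteout as an ordinary directory entry that carries a name but no inode.
+Condensed from ext2_whiteout_entry() below (an excerpt, not a standalone
+function):
+
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	de->inode = 0;                  /* no backing inode */
+	de->file_type = EXT2_FT_WHT;    /* mark the name as a whiteout */
+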
121 +What happens when we create a directory that was previously whited-out? We
122 +don't want the directory entries from underlying filesystems to suddenly appear
123 +in the newly created directory. So we mark the directory opaque (the file
124 +system must support storage of the opaque flag).
125 +
126 +Fallthrus are directory entries that override the opaque flag on a directory
127 +for that specific directory entry name (the lookup "falls through" to the next
128 +layer of the union mount). Fallthrus are mainly useful for implementing
129 +readdir().
130 +
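+Taken together, the per-layer decision made by the union-aware lookup code in
+fs/namei.c (see __cache_lookup_topmost() below) boils down to the following
+illustrative sketch; the function itself is not part of this patch:
+
+static int lookup_stops_here(struct dentry *parent, struct dentry *found)
+{
+	if (found->d_inode || d_is_whiteout(found))
+		return 1;	/* real entry or whiteout: do not go lower */
+	if (IS_OPAQUE(parent->d_inode) && !d_is_fallthru(found))
+		return 1;	/* opaque parent hides the lower layers */
+	return 0;		/* plain negative or fallthru: keep going */
+}
+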
131 +-------------------------------------------------------------------------------
132 +
133 +4. Copy-up
134 +==========
135 +
136 +Any write to an object on any layer other than the topmost triggers a copy-up
137 +of the object to the topmost file system. For regular files, the copy-up
138 +happens when the file is opened in writable mode.
139 +
140 +Directories are copied up on open, regardless of intent to write, to simplify
141 +copy-up of any object located below them in the namespace. Otherwise we would
142 +have to walk the entire pathname to create intermediate directories whenever
143 +we do a copy-up. This is the same approach as BSD union mounts and uses a
144 +negligible amount of disk space. Note that the actual directory entries
145 +themselves are not copied up from the lower levels until (a) the directory is
146 +written to, or (b) the first readdir() of the directory (more on that later).
147 +
148 +Rename across different levels of the union is implemented as a copy-up
149 +operation for regular files. Rename of directories simply returns EXDEV, the
150 +same as if we tried to rename across different mounts. Most applications have
151 +to handle this case anyway. Some applications do not expect EXDEV on
152 +rename operations within the same directory, but these applications will also
153 +be broken with bind mounts.
154 +
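+From user space a cross-layer directory rename therefore looks exactly like a
+cross-mount rename. A minimal illustration (the union mount point and the
+directory name are assumptions):
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+int main(void)
+{
+	/* assume "lowerdir" exists only on a read-only lower layer */
+	if (rename("/mnt/union/lowerdir", "/mnt/union/newdir") == -1 &&
+	    errno == EXDEV)
+		fprintf(stderr, "rename: %s - fall back to copy + unlink\n",
+			strerror(errno));
+	return 0;
+}
+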
155 +-------------------------------------------------------------------------------
156 +
157 +5. Directory Reading
158 +====================
159 +
160 +readdir() is somewhat difficult to implement in a unioning file system. We must
161 +eliminate duplicates, apply whiteouts, and resume readdir() where we left
162 +off, given a single f_pos value. Our solution is to copy up all the directory
163 +entries to the topmost directory the first time readdir() is called on a
164 +directory. During this copy-up, we skip duplicates and entries covered by
165 +whiteouts, and then create a fallthru entry for each remaining visible dentry.
166 +Then we mark the whole directory opaque. From then on, we just use the topmost
167 +file system's normal readdir() operation.
168 +
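+From user space the merged directory reads like any other directory. A
+minimal illustration (the union mount point is an assumption):
+
+#include <dirent.h>
+#include <stdio.h>
+
+int main(void)
+{
+	DIR *d = opendir("/mnt/union");		/* assumed union mount point */
+	struct dirent *de;
+
+	if (!d)
+		return 1;
+	/* each visible name appears once; whiteouts are never returned */
+	while ((de = readdir(d)) != NULL)
+		printf("%s\n", de->d_name);
+	closedir(d);
+	return 0;
+}
+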
169 +-------------------------------------------------------------------------------
170 +
171 +6. Known Problems
172 +=================
173 +
174 +- copyup() for other filetypes than reg and dir (e.g. for chown() on devices)
175 +- symlinks are untested
176 +
177 +-------------------------------------------------------------------------------
178 +
179 +7. References
180 +=============
181 +
182 +[1] http://marc.info/?l=linux-fsdevel&m=96035682927821&w=2
183 +[2] http://marc.info/?l=linux-fsdevel&m=117681527820133&w=2
184 +[3] http://marc.info/?l=linux-fsdevel&m=117913503200362&w=2
185 +[4] http://marc.info/?l=linux-fsdevel&m=118231827024394&w=2
186 +
187 +Authors:
188 +Jan Blunck <jblunck@suse.de>
189 +Bharata B Rao <bharata@linux.vnet.ibm.com>
190 +Valerie Aurora <vaurora@redhat.com>
191 --- a/fs/autofs4/autofs_i.h
192 +++ b/fs/autofs4/autofs_i.h
193 @@ -130,6 +130,7 @@
194 int reghost_enabled;
195 int needs_reghost;
196 struct super_block *sb;
197 + struct vfsmount *mnt;
198 struct mutex wq_mutex;
199 spinlock_t fs_lock;
200 struct autofs_wait_queue *queues; /* Wait queue pointer */
201 --- a/fs/autofs4/init.c
202 +++ b/fs/autofs4/init.c
203 @@ -17,7 +17,16 @@
204 static int autofs_get_sb(struct file_system_type *fs_type,
205 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
206 {
207 - return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
208 + struct autofs_sb_info *sbi;
209 + int ret;
210 +
211 + ret = get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
212 + if (ret)
213 + return ret;
214 +
215 + sbi = autofs4_sbi(mnt->mnt_sb);
216 + sbi->mnt = mnt;
217 + return 0;
218 }
219
220 static struct file_system_type autofs_fs_type = {
221 --- a/fs/autofs4/root.c
222 +++ b/fs/autofs4/root.c
223 @@ -179,6 +179,12 @@
224 DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
225 dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
226 nd->flags);
227 +
228 + dput(nd->path.dentry);
229 + mntput(nd->path.mnt);
230 + nd->path.mnt = mntget(sbi->mnt);
231 + nd->path.dentry = dget(dentry);
232 +
233 /*
234 * For an expire of a covered direct or offset mount we need
235 * to break out of follow_down() at the autofs mount trigger
236 --- a/fs/compat.c
237 +++ b/fs/compat.c
238 @@ -847,6 +847,9 @@
239 struct compat_old_linux_dirent __user *dirent;
240 compat_ulong_t d_ino;
241
242 + if (d_type == DT_WHT)
243 + return 0;
244 +
245 if (buf->result)
246 return -EINVAL;
247 d_ino = ino;
248 @@ -918,6 +921,9 @@
249 compat_ulong_t d_ino;
250 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t));
251
252 + if (d_type == DT_WHT)
253 + return 0;
254 +
255 buf->error = -EINVAL; /* only used if we fail.. */
256 if (reclen > buf->count)
257 return -EINVAL;
258 @@ -1007,6 +1013,9 @@
259 int reclen = ALIGN(jj + namlen + 1, sizeof(u64));
260 u64 off;
261
262 + if (d_type == DT_WHT)
263 + return 0;
264 +
265 buf->error = -EINVAL; /* only used if we fail.. */
266 if (reclen > buf->count)
267 return -EINVAL;
268 --- a/fs/dcache.c
269 +++ b/fs/dcache.c
270 @@ -18,6 +18,7 @@
271 #include <linux/string.h>
272 #include <linux/mm.h>
273 #include <linux/fs.h>
274 +#include <linux/union.h>
275 #include <linux/fsnotify.h>
276 #include <linux/slab.h>
277 #include <linux/init.h>
278 @@ -157,14 +158,19 @@
279 }
280
281 /**
282 - * d_kill - kill dentry and return parent
283 + * __d_kill - kill dentry and return parent
284 * @dentry: dentry to kill
285 + * @list: kill list
286 + * @greedy: return parent instead of putting it on the kill list
287 *
288 * The dentry must already be unhashed and removed from the LRU.
289 *
290 - * If this is the root of the dentry tree, return NULL.
291 + * If this is the root of the dentry tree, return NULL. If greedy is zero, we
292 + * put the parent of this dentry on the kill list instead. The callers must
293 + * make sure that __d_kill_final() is called on all dentries on the kill list.
294 */
295 -static struct dentry *d_kill(struct dentry *dentry)
296 +static struct dentry *__d_kill(struct dentry *dentry, struct list_head *list,
297 + int greedy)
298 __releases(dentry->d_lock)
299 __releases(dcache_lock)
300 {
301 @@ -172,13 +178,78 @@
302
303 list_del(&dentry->d_u.d_child);
304 dentry_stat.nr_dentry--; /* For d_free, below */
305 - /*drops the locks, at that point nobody can reach this dentry */
306 +
307 + /*
308 + * If we are not greedy we just put this on a list for later processing
309 + * (follow up to parent, releasing of inode and freeing dentry memory).
310 + */
311 + if (!greedy) {
312 + list_del_init(&dentry->d_alias);
313 + /* at this point nobody can reach this dentry */
314 + list_add(&dentry->d_lru, list);
315 + spin_unlock(&dentry->d_lock);
316 + spin_unlock(&dcache_lock);
317 + __shrink_d_unions(dentry, list);
318 + return NULL;
319 + }
320 +
321 + /* drops the locks, at that point nobody can reach this dentry */
322 dentry_iput(dentry);
323 + /* If the dentry was in any unions, delete them */
324 + __shrink_d_unions(dentry, list);
325 + if (IS_ROOT(dentry))
326 + parent = NULL;
327 + else
328 + parent = dentry->d_parent;
329 + d_free(dentry);
330 + return parent;
331 +}
332 +
333 +void __dput(struct dentry *, struct list_head *, int);
334 +
335 +static void __d_kill_final(struct dentry *dentry, struct list_head *list)
336 +{
337 + struct dentry *parent;
338 + struct inode *inode = dentry->d_inode;
339 +
340 + if (inode) {
341 + dentry->d_inode = NULL;
342 + if (!inode->i_nlink)
343 + fsnotify_inoderemove(inode);
344 + if (dentry->d_op && dentry->d_op->d_iput)
345 + dentry->d_op->d_iput(dentry, inode);
346 + else
347 + iput(inode);
348 + }
349 +
350 if (IS_ROOT(dentry))
351 parent = NULL;
352 else
353 parent = dentry->d_parent;
354 d_free(dentry);
355 + __dput(parent, list, 1);
356 +}
357 +
358 +/**
359 + * d_kill - kill dentry and return parent
360 + * @dentry: dentry to kill
361 + *
362 + * The dentry must already be unhashed and removed from the LRU.
363 + *
364 + * If this is the root of the dentry tree, return NULL.
365 + */
366 +static struct dentry *d_kill(struct dentry *dentry)
367 +{
368 + LIST_HEAD(mortuary);
369 + struct dentry *parent;
370 +
371 + parent = __d_kill(dentry, &mortuary, 1);
372 + while (!list_empty(&mortuary)) {
373 + dentry = list_entry(mortuary.next, struct dentry, d_lru);
374 + list_del(&dentry->d_lru);
375 + __d_kill_final(dentry, &mortuary);
376 + }
377 +
378 return parent;
379 }
380
381 @@ -199,19 +270,24 @@
382 * Real recursion would eat up our stack space.
383 */
384
385 -/*
386 - * dput - release a dentry
387 - * @dentry: dentry to release
388 +/**
389 + * __dput - release a dentry
390 + * @dentry: dentry to release
391 + * @list: kill list argument for __d_kill()
392 + * @greedy: greedy argument for __d_kill()
393 *
394 * Release a dentry. This will drop the usage count and if appropriate
395 * call the dentry unlink method as well as removing it from the queues and
396 * releasing its resources. If the parent dentries were scheduled for release
397 - * they too may now get deleted.
398 + * they too may now get deleted if @greedy is not zero. Otherwise parent is
399 + * added to the kill list. The callers must make sure that __d_kill_final() is
400 + * called on all dentries on the kill list.
401 + *
402 + * You probably want to use dput() instead.
403 *
404 * no dcache lock, please.
405 */
406 -
407 -void dput(struct dentry *dentry)
408 +void __dput(struct dentry *dentry, struct list_head *list, int greedy)
409 {
410 if (!dentry)
411 return;
412 @@ -252,12 +328,35 @@
413 kill_it:
414 /* if dentry was on the d_lru list delete it from there */
415 dentry_lru_del(dentry);
416 - dentry = d_kill(dentry);
417 + dentry = __d_kill(dentry, list, greedy);
418 if (dentry)
419 goto repeat;
420 }
421
422 /**
423 + * dput - release a dentry
424 + * @dentry: dentry to release
425 + *
426 + * Release a dentry. This will drop the usage count and if appropriate
427 + * call the dentry unlink method as well as removing it from the queues and
428 + * releasing its resources. If the parent dentries were scheduled for release
429 + * they too may now get deleted.
430 + *
431 + * no dcache lock, please.
432 + */
433 +void dput(struct dentry *dentry)
434 +{
435 + LIST_HEAD(mortuary);
436 +
437 + __dput(dentry, &mortuary, 1);
438 + while (!list_empty(&mortuary)) {
439 + dentry = list_entry(mortuary.next, struct dentry, d_lru);
440 + list_del(&dentry->d_lru);
441 + __d_kill_final(dentry, &mortuary);
442 + }
443 +}
444 +
445 +/**
446 * d_invalidate - invalidate a dentry
447 * @dentry: dentry to invalidate
448 *
449 @@ -689,6 +788,7 @@
450 iput(inode);
451 }
452
453 + shrink_d_unions(dentry);
454 d_free(dentry);
455
456 /* finished when we fall off the top of the tree,
457 @@ -951,6 +1051,10 @@
458 INIT_LIST_HEAD(&dentry->d_lru);
459 INIT_LIST_HEAD(&dentry->d_subdirs);
460 INIT_LIST_HEAD(&dentry->d_alias);
461 +#ifdef CONFIG_UNION_MOUNT
462 + INIT_LIST_HEAD(&dentry->d_unions);
463 + dentry->d_unionized = 0;
464 +#endif
465
466 if (parent) {
467 dentry->d_parent = dget(parent);
468 @@ -981,8 +1085,10 @@
469 /* the caller must hold dcache_lock */
470 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
471 {
472 - if (inode)
473 + if (inode) {
474 + dentry->d_flags &= ~(DCACHE_WHITEOUT|DCACHE_FALLTHRU);
475 list_add(&dentry->d_alias, &inode->i_dentry);
476 + }
477 dentry->d_inode = inode;
478 fsnotify_d_instantiate(dentry, inode);
479 }
480 @@ -1513,7 +1619,9 @@
481 spin_lock(&dentry->d_lock);
482 isdir = S_ISDIR(dentry->d_inode->i_mode);
483 if (atomic_read(&dentry->d_count) == 1) {
484 + __d_drop_unions(dentry);
485 dentry_iput(dentry);
486 + shrink_d_unions(dentry);
487 fsnotify_nameremove(dentry, isdir);
488 return;
489 }
490 @@ -1524,14 +1632,14 @@
491 spin_unlock(&dentry->d_lock);
492 spin_unlock(&dcache_lock);
493
494 + shrink_d_unions(dentry);
495 fsnotify_nameremove(dentry, isdir);
496 }
497
498 static void __d_rehash(struct dentry * entry, struct hlist_head *list)
499 {
500 -
501 - entry->d_flags &= ~DCACHE_UNHASHED;
502 - hlist_add_head_rcu(&entry->d_hash, list);
503 + entry->d_flags &= ~DCACHE_UNHASHED;
504 + hlist_add_head_rcu(&entry->d_hash, list);
505 }
506
507 static void _d_rehash(struct dentry * entry)
508 @@ -1550,6 +1658,7 @@
509 {
510 spin_lock(&dcache_lock);
511 spin_lock(&entry->d_lock);
512 + BUG_ON(!d_unhashed(entry));
513 _d_rehash(entry);
514 spin_unlock(&entry->d_lock);
515 spin_unlock(&dcache_lock);
516 @@ -2182,7 +2291,9 @@
517 struct list_head *tmp = next;
518 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
519 next = tmp->next;
520 - if (d_unhashed(dentry)||!dentry->d_inode)
521 + if (d_unhashed(dentry)||(!dentry->d_inode &&
522 + !d_is_whiteout(dentry) &&
523 + !d_is_fallthru(dentry)))
524 continue;
525 if (!list_empty(&dentry->d_subdirs)) {
526 this_parent = dentry;
527 --- a/fs/ext2/dir.c
528 +++ b/fs/ext2/dir.c
529 @@ -219,7 +219,8 @@
530 {
531 if (len != de->name_len)
532 return 0;
533 - if (!de->inode)
534 + if (!de->inode && ((de->file_type != EXT2_FT_WHT) &&
535 + (de->file_type != EXT2_FT_FALLTHRU)))
536 return 0;
537 return !memcmp(name, de->name, len);
538 }
539 @@ -255,6 +256,8 @@
540 [EXT2_FT_FIFO] = DT_FIFO,
541 [EXT2_FT_SOCK] = DT_SOCK,
542 [EXT2_FT_SYMLINK] = DT_LNK,
543 + [EXT2_FT_WHT] = DT_WHT,
544 + [EXT2_FT_FALLTHRU] = DT_UNKNOWN,
545 };
546
547 #define S_SHIFT 12
548 @@ -341,6 +344,18 @@
549 ext2_put_page(page);
550 return 0;
551 }
552 + } else if (de->file_type == EXT2_FT_FALLTHRU) {
553 + int over;
554 + unsigned char d_type = DT_UNKNOWN;
555 +
556 + offset = (char *)de - kaddr;
557 + over = filldir(dirent, de->name, de->name_len,
558 + (n<<PAGE_CACHE_SHIFT) | offset,
559 + 123, d_type);
560 + if (over) {
561 + ext2_put_page(page);
562 + return 0;
563 + }
564 }
565 filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
566 }
567 @@ -448,6 +463,30 @@
568 return res;
569 }
570
571 +/* Special version for filetype based whiteout support */
572 +ino_t ext2_inode_by_dentry(struct inode *dir, struct dentry *dentry)
573 +{
574 + ino_t res = 0;
575 + struct ext2_dir_entry_2 *de;
576 + struct page *page;
577 +
578 + de = ext2_find_entry (dir, &dentry->d_name, &page);
579 + if (de) {
580 + res = le32_to_cpu(de->inode);
581 + if (!res && de->file_type == EXT2_FT_WHT) {
582 + spin_lock(&dentry->d_lock);
583 + dentry->d_flags |= DCACHE_WHITEOUT;
584 + spin_unlock(&dentry->d_lock);
585 + } else if(!res && de->file_type == EXT2_FT_FALLTHRU) {
586 + spin_lock(&dentry->d_lock);
587 + dentry->d_flags |= DCACHE_FALLTHRU;
588 + spin_unlock(&dentry->d_lock);
589 + }
590 + ext2_put_page(page);
591 + }
592 + return res;
593 +}
594 +
595 /* Releases the page */
596 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
597 struct page *page, struct inode *inode, int update_times)
598 @@ -472,9 +511,10 @@
599 }
600
601 /*
602 - * Parent is locked.
603 + * Find or append a given dentry to the parent directory
604 */
605 -int ext2_add_link (struct dentry *dentry, struct inode *inode)
606 +static ext2_dirent * ext2_append_entry(struct dentry * dentry,
607 + struct page ** page)
608 {
609 struct inode *dir = dentry->d_parent->d_inode;
610 const char *name = dentry->d_name.name;
611 @@ -482,13 +522,10 @@
612 unsigned chunk_size = ext2_chunk_size(dir);
613 unsigned reclen = EXT2_DIR_REC_LEN(namelen);
614 unsigned short rec_len, name_len;
615 - struct page *page = NULL;
616 - ext2_dirent * de;
617 + ext2_dirent * de = NULL;
618 unsigned long npages = dir_pages(dir);
619 unsigned long n;
620 char *kaddr;
621 - loff_t pos;
622 - int err;
623
624 /*
625 * We take care of directory expansion in the same loop.
626 @@ -498,55 +535,97 @@
627 for (n = 0; n <= npages; n++) {
628 char *dir_end;
629
630 - page = ext2_get_page(dir, n, 0);
631 - err = PTR_ERR(page);
632 - if (IS_ERR(page))
633 + *page = ext2_get_page(dir, n, 0);
634 + de = ERR_PTR(PTR_ERR(*page));
635 + if (IS_ERR(*page))
636 goto out;
637 - lock_page(page);
638 - kaddr = page_address(page);
639 + lock_page(*page);
640 + kaddr = page_address(*page);
641 dir_end = kaddr + ext2_last_byte(dir, n);
642 de = (ext2_dirent *)kaddr;
643 kaddr += PAGE_CACHE_SIZE - reclen;
644 while ((char *)de <= kaddr) {
645 if ((char *)de == dir_end) {
646 /* We hit i_size */
647 - name_len = 0;
648 - rec_len = chunk_size;
649 + de->name_len = 0;
650 de->rec_len = ext2_rec_len_to_disk(chunk_size);
651 de->inode = 0;
652 + de->file_type = 0;
653 goto got_it;
654 }
655 if (de->rec_len == 0) {
656 ext2_error(dir->i_sb, __func__,
657 "zero-length directory entry");
658 - err = -EIO;
659 + de = ERR_PTR(-EIO);
660 goto out_unlock;
661 }
662 - err = -EEXIST;
663 if (ext2_match (namelen, name, de))
664 - goto out_unlock;
665 + goto got_it;
666 name_len = EXT2_DIR_REC_LEN(de->name_len);
667 rec_len = ext2_rec_len_from_disk(de->rec_len);
668 - if (!de->inode && rec_len >= reclen)
669 + if (!de->inode && (de->file_type != EXT2_FT_WHT) &&
670 + (de->file_type != EXT2_FT_FALLTHRU) &&
671 + (rec_len >= reclen))
672 goto got_it;
673 if (rec_len >= name_len + reclen)
674 goto got_it;
675 de = (ext2_dirent *) ((char *) de + rec_len);
676 }
677 - unlock_page(page);
678 - ext2_put_page(page);
679 + unlock_page(*page);
680 + ext2_put_page(*page);
681 }
682 +
683 BUG();
684 - return -EINVAL;
685
686 got_it:
687 + return de;
688 + /* OFFSET_CACHE */
689 +out_unlock:
690 + unlock_page(*page);
691 + ext2_put_page(*page);
692 +out:
693 + return de;
694 +}
695 +
696 +/*
697 + * Parent is locked.
698 + */
699 +int ext2_add_link (struct dentry *dentry, struct inode *inode)
700 +{
701 + struct inode *dir = dentry->d_parent->d_inode;
702 + const char *name = dentry->d_name.name;
703 + int namelen = dentry->d_name.len;
704 + unsigned short rec_len, name_len;
705 + ext2_dirent * de;
706 + struct page *page;
707 + loff_t pos;
708 + int err;
709 +
710 + de = ext2_append_entry(dentry, &page);
711 + if (IS_ERR(de))
712 + return PTR_ERR(de);
713 +
714 + err = -EEXIST;
715 + if (ext2_match (namelen, name, de)) {
716 + if ((de->file_type == EXT2_FT_WHT) ||
717 + (de->file_type == EXT2_FT_FALLTHRU))
718 + goto got_it;
719 + goto out_unlock;
720 + }
721 +
722 +got_it:
723 + name_len = EXT2_DIR_REC_LEN(de->name_len);
724 + rec_len = ext2_rec_len_from_disk(de->rec_len);
725 +
726 pos = page_offset(page) +
727 (char*)de - (char*)page_address(page);
728 err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
729 &page, NULL);
730 if (err)
731 goto out_unlock;
732 - if (de->inode) {
733 + if (de->inode || (((de->file_type == EXT2_FT_WHT) ||
734 + (de->file_type == EXT2_FT_FALLTHRU)) &&
735 + !ext2_match (namelen, name, de))) {
736 ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
737 de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
738 de->rec_len = ext2_rec_len_to_disk(name_len);
739 @@ -563,7 +642,60 @@
740 /* OFFSET_CACHE */
741 out_put:
742 ext2_put_page(page);
743 -out:
744 + return err;
745 +out_unlock:
746 + unlock_page(page);
747 + goto out_put;
748 +}
749 +
750 +/*
751 + * Create a fallthru entry.
752 + */
753 +int ext2_fallthru_entry (struct inode *dir, struct dentry *dentry)
754 +{
755 + const char *name = dentry->d_name.name;
756 + int namelen = dentry->d_name.len;
757 + unsigned short rec_len, name_len;
758 + ext2_dirent * de;
759 + struct page *page;
760 + loff_t pos;
761 + int err;
762 +
763 + de = ext2_append_entry(dentry, &page);
764 + if (IS_ERR(de))
765 + return PTR_ERR(de);
766 +
767 + err = -EEXIST;
768 + if (ext2_match (namelen, name, de))
769 + goto out_unlock;
770 +
771 + name_len = EXT2_DIR_REC_LEN(de->name_len);
772 + rec_len = ext2_rec_len_from_disk(de->rec_len);
773 +
774 + pos = page_offset(page) +
775 + (char*)de - (char*)page_address(page);
776 + err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
777 + &page, NULL);
778 + if (err)
779 + goto out_unlock;
780 + if (de->inode || (de->file_type == EXT2_FT_WHT) ||
781 + (de->file_type == EXT2_FT_FALLTHRU)) {
782 + ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
783 + de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
784 + de->rec_len = ext2_rec_len_to_disk(name_len);
785 + de = de1;
786 + }
787 + de->name_len = namelen;
788 + memcpy(de->name, name, namelen);
789 + de->inode = 0;
790 + de->file_type = EXT2_FT_FALLTHRU;
791 + err = ext2_commit_chunk(page, pos, rec_len);
792 + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
793 + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
794 + mark_inode_dirty(dir);
795 + /* OFFSET_CACHE */
796 +out_put:
797 + ext2_put_page(page);
798 return err;
799 out_unlock:
800 unlock_page(page);
801 @@ -616,6 +748,70 @@
802 return err;
803 }
804
805 +int ext2_whiteout_entry (struct inode * dir, struct dentry * dentry,
806 + struct ext2_dir_entry_2 * de, struct page * page)
807 +{
808 + const char *name = dentry->d_name.name;
809 + int namelen = dentry->d_name.len;
810 + unsigned short rec_len, name_len;
811 + loff_t pos;
812 + int err;
813 +
814 + if (!de) {
815 + de = ext2_append_entry(dentry, &page);
816 + BUG_ON(!de);
817 + }
818 +
819 + err = -EEXIST;
820 + if (ext2_match (namelen, name, de) &&
821 + (de->file_type == EXT2_FT_WHT)) {
822 + ext2_error(dir->i_sb, __func__,
823 + "entry is already a whiteout in directory #%lu",
824 + dir->i_ino);
825 + goto out_unlock;
826 + }
827 +
828 + name_len = EXT2_DIR_REC_LEN(de->name_len);
829 + rec_len = ext2_rec_len_from_disk(de->rec_len);
830 +
831 + pos = page_offset(page) +
832 + (char*)de - (char*)page_address(page);
833 + err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
834 + &page, NULL);
835 + if (err)
836 + goto out_unlock;
837 + /*
838 + * We whiteout an existing entry. Do what ext2_delete_entry() would do,
839 + * except that we don't need to merge with the previous entry since
840 + * we are going to reuse it.
841 + */
842 + if (ext2_match (namelen, name, de))
843 + de->inode = 0;
844 + if (de->inode || (((de->file_type == EXT2_FT_WHT) ||
845 + (de->file_type == EXT2_FT_FALLTHRU)) &&
846 + !ext2_match (namelen, name, de))) {
847 + ext2_dirent *de1 = (ext2_dirent *) ((char *) de + name_len);
848 + de1->rec_len = ext2_rec_len_to_disk(rec_len - name_len);
849 + de->rec_len = ext2_rec_len_to_disk(name_len);
850 + de = de1;
851 + }
852 + de->name_len = namelen;
853 + memcpy(de->name, name, namelen);
854 + de->inode = 0;
855 + de->file_type = EXT2_FT_WHT;
856 + err = ext2_commit_chunk(page, pos, rec_len);
857 + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
858 + EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
859 + mark_inode_dirty(dir);
860 + /* OFFSET_CACHE */
861 +out_put:
862 + ext2_put_page(page);
863 + return err;
864 +out_unlock:
865 + unlock_page(page);
866 + goto out_put;
867 +}
868 +
869 /*
870 * Set the first fragment of directory.
871 */
872 --- a/fs/ext2/ext2.h
873 +++ b/fs/ext2/ext2.h
874 @@ -102,9 +102,13 @@
875 /* dir.c */
876 extern int ext2_add_link (struct dentry *, struct inode *);
877 extern ino_t ext2_inode_by_name(struct inode *, struct qstr *);
878 +extern ino_t ext2_inode_by_dentry(struct inode *, struct dentry *);
879 extern int ext2_make_empty(struct inode *, struct inode *);
880 extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **);
881 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
882 +extern int ext2_whiteout_entry (struct inode *, struct dentry *,
883 + struct ext2_dir_entry_2 *, struct page *);
884 +extern int ext2_fallthru_entry (struct inode *, struct dentry *);
885 extern int ext2_empty_dir (struct inode *);
886 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
887 extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
888 --- a/fs/ext2/inode.c
889 +++ b/fs/ext2/inode.c
890 @@ -1176,7 +1176,8 @@
891 {
892 unsigned int flags = EXT2_I(inode)->i_flags;
893
894 - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
895 + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
896 + S_OPAQUE);
897 if (flags & EXT2_SYNC_FL)
898 inode->i_flags |= S_SYNC;
899 if (flags & EXT2_APPEND_FL)
900 @@ -1187,6 +1188,8 @@
901 inode->i_flags |= S_NOATIME;
902 if (flags & EXT2_DIRSYNC_FL)
903 inode->i_flags |= S_DIRSYNC;
904 + if (flags & EXT2_OPAQUE_FL)
905 + inode->i_flags |= S_OPAQUE;
906 }
907
908 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
909 @@ -1194,8 +1197,8 @@
910 {
911 unsigned int flags = ei->vfs_inode.i_flags;
912
913 - ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|
914 - EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL);
915 + ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|EXT2_IMMUTABLE_FL|
916 + EXT2_NOATIME_FL|EXT2_DIRSYNC_FL|EXT2_OPAQUE_FL);
917 if (flags & S_SYNC)
918 ei->i_flags |= EXT2_SYNC_FL;
919 if (flags & S_APPEND)
920 @@ -1206,6 +1209,8 @@
921 ei->i_flags |= EXT2_NOATIME_FL;
922 if (flags & S_DIRSYNC)
923 ei->i_flags |= EXT2_DIRSYNC_FL;
924 + if (flags & S_OPAQUE)
925 + ei->i_flags |= EXT2_OPAQUE_FL;
926 }
927
928 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
929 --- a/fs/ext2/namei.c
930 +++ b/fs/ext2/namei.c
931 @@ -54,15 +54,16 @@
932 * Methods themselves.
933 */
934
935 -static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
936 +static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry,
937 + struct nameidata *nd)
938 {
939 struct inode * inode;
940 ino_t ino;
941 -
942 +
943 if (dentry->d_name.len > EXT2_NAME_LEN)
944 return ERR_PTR(-ENAMETOOLONG);
945
946 - ino = ext2_inode_by_name(dir, &dentry->d_name);
947 + ino = ext2_inode_by_dentry(dir, dentry);
948 inode = NULL;
949 if (ino) {
950 inode = ext2_iget(dir->i_sb, ino);
951 @@ -230,6 +231,10 @@
952 else
953 inode->i_mapping->a_ops = &ext2_aops;
954
955 + /* if we call mkdir on a whiteout create an opaque directory */
956 + if (dentry->d_flags & DCACHE_WHITEOUT)
957 + inode->i_flags |= S_OPAQUE;
958 +
959 inode_inc_link_count(inode);
960
961 err = ext2_make_empty(inode, dir);
962 @@ -293,6 +298,78 @@
963 return err;
964 }
965
966 +/*
967 + * Create a whiteout for the dentry
968 + */
969 +static int ext2_whiteout(struct inode *dir, struct dentry *dentry,
970 + struct dentry *new_dentry)
971 +{
972 + struct inode * inode = dentry->d_inode;
973 + struct ext2_dir_entry_2 * de = NULL;
974 + struct page * page;
975 + int err = -ENOTEMPTY;
976 +
977 + if (!EXT2_HAS_INCOMPAT_FEATURE(dir->i_sb,
978 + EXT2_FEATURE_INCOMPAT_FILETYPE)) {
979 + ext2_error (dir->i_sb, "ext2_whiteout",
980 + "can't set whiteout filetype");
981 + err = -EPERM;
982 + goto out;
983 + }
984 +
985 + if (inode) {
986 + if (S_ISDIR(inode->i_mode) && !ext2_empty_dir(inode))
987 + goto out;
988 +
989 + err = -ENOENT;
990 + de = ext2_find_entry (dir, &dentry->d_name, &page);
991 + if (!de)
992 + goto out;
993 + lock_page(page);
994 + }
995 +
996 + err = ext2_whiteout_entry (dir, dentry, de, page);
997 + if (err)
998 + goto out;
999 +
1000 + spin_lock(&new_dentry->d_lock);
1001 + new_dentry->d_flags &= ~DCACHE_FALLTHRU;
1002 + new_dentry->d_flags |= DCACHE_WHITEOUT;
1003 + spin_unlock(&new_dentry->d_lock);
1004 + d_add(new_dentry, NULL);
1005 +
1006 + if (inode) {
1007 + inode->i_ctime = dir->i_ctime;
1008 + inode_dec_link_count(inode);
1009 + if (S_ISDIR(inode->i_mode)) {
1010 + inode->i_size = 0;
1011 + inode_dec_link_count(inode);
1012 + inode_dec_link_count(dir);
1013 + }
1014 + }
1015 + err = 0;
1016 +out:
1017 + return err;
1018 +}
1019 +
1020 +/*
1021 + * Create a fallthru entry.
1022 + */
1023 +static int ext2_fallthru (struct inode *dir, struct dentry *dentry)
1024 +{
1025 + int err;
1026 +
1027 + err = ext2_fallthru_entry(dir, dentry);
1028 + if (err)
1029 + return err;
1030 +
1031 + d_instantiate(dentry, NULL);
1032 + spin_lock(&dentry->d_lock);
1033 + dentry->d_flags |= DCACHE_FALLTHRU;
1034 + spin_unlock(&dentry->d_lock);
1035 + return 0;
1036 +}
1037 +
1038 static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
1039 struct inode * new_dir, struct dentry * new_dentry )
1040 {
1041 @@ -392,6 +469,8 @@
1042 .mkdir = ext2_mkdir,
1043 .rmdir = ext2_rmdir,
1044 .mknod = ext2_mknod,
1045 + .whiteout = ext2_whiteout,
1046 + .fallthru = ext2_fallthru,
1047 .rename = ext2_rename,
1048 #ifdef CONFIG_EXT2_FS_XATTR
1049 .setxattr = generic_setxattr,
1050 --- a/fs/ext2/super.c
1051 +++ b/fs/ext2/super.c
1052 @@ -1062,6 +1062,13 @@
1053 if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
1054 ext2_warning(sb, __func__,
1055 "mounting ext3 filesystem as ext2");
1056 +
1057 + /*
1058 + * Whiteouts (and fallthrus) require explicit whiteout support.
1059 + */
1060 + if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_WHITEOUT))
1061 + sb->s_flags |= MS_WHITEOUT;
1062 +
1063 ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1064 return 0;
1065
1066 --- a/fs/Kconfig
1067 +++ b/fs/Kconfig
1068 @@ -58,6 +58,14 @@
1069
1070 source "fs/quota/Kconfig"
1071
1072 +config UNION_MOUNT
1073 + bool "Union mount support (EXPERIMENTAL)"
1074 + depends on EXPERIMENTAL
1075 + ---help---
1076 + If you say Y here, you will be able to mount file systems as
1077 + union mount stacks. This is a VFS based implementation and
1078 + should work with all file systems. If unsure, say N.
1079 +
1080 source "fs/autofs/Kconfig"
1081 source "fs/autofs4/Kconfig"
1082 source "fs/fuse/Kconfig"
1083 --- a/fs/libfs.c
1084 +++ b/fs/libfs.c
1085 @@ -133,6 +133,7 @@
1086 struct dentry *cursor = filp->private_data;
1087 struct list_head *p, *q = &cursor->d_u.d_child;
1088 ino_t ino;
1089 + int d_type;
1090 int i = filp->f_pos;
1091
1092 switch (i) {
1093 @@ -158,14 +159,25 @@
1094 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
1095 struct dentry *next;
1096 next = list_entry(p, struct dentry, d_u.d_child);
1097 - if (d_unhashed(next) || !next->d_inode)
1098 + if (d_unhashed(next) || (!next->d_inode && !d_is_fallthru(next)))
1099 continue;
1100
1101 + if (d_is_fallthru(next)) {
1102 + /* XXX Make up things we can
1103 + * only get out of the inode.
1104 + * Should probably really do a
1105 + * lookup instead. */
1106 + ino = 100; /* XXX Made up number of no significance */
1107 + d_type = DT_UNKNOWN;
1108 + } else {
1109 + ino = next->d_inode->i_ino;
1110 + d_type = dt_type(next->d_inode);
1111 + }
1112 +
1113 spin_unlock(&dcache_lock);
1114 if (filldir(dirent, next->d_name.name,
1115 next->d_name.len, filp->f_pos,
1116 - next->d_inode->i_ino,
1117 - dt_type(next->d_inode)) < 0)
1118 + ino, d_type) < 0)
1119 return 0;
1120 spin_lock(&dcache_lock);
1121 /* next is still alive */
1122 --- a/fs/Makefile
1123 +++ b/fs/Makefile
1124 @@ -52,6 +52,7 @@
1125 obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
1126
1127 obj-y += quota/
1128 +obj-$(CONFIG_UNION_MOUNT) += union.o
1129
1130 obj-$(CONFIG_PROC_FS) += proc/
1131 obj-y += partitions/
1132 --- a/fs/namei.c
1133 +++ b/fs/namei.c
1134 @@ -33,6 +33,7 @@
1135 #include <linux/fcntl.h>
1136 #include <linux/device_cgroup.h>
1137 #include <linux/fs_struct.h>
1138 +#include <linux/union.h>
1139 #include <asm/uaccess.h>
1140
1141 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
1142 @@ -229,16 +230,17 @@
1143 }
1144
1145 /**
1146 - * inode_permission - check for access rights to a given inode
1147 + * __inode_permission - check for access rights to a given inode
1148 * @inode: inode to check permission on
1149 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
1150 + * @rofs: check for read-only fs
1151 *
1152 * Used to check for read/write/execute permissions on an inode.
1153 * We use "fsuid" for this, letting us set arbitrary permissions
1154 * for filesystem access without changing the "normal" uids which
1155 * are used for other things.
1156 */
1157 -int inode_permission(struct inode *inode, int mask)
1158 +int __inode_permission(struct inode *inode, int mask, int rofs)
1159 {
1160 int retval;
1161
1162 @@ -248,7 +250,7 @@
1163 /*
1164 * Nobody gets write access to a read-only fs.
1165 */
1166 - if (IS_RDONLY(inode) &&
1167 + if ((rofs & IS_RDONLY(inode)) &&
1168 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
1169 return -EROFS;
1170
1171 @@ -276,6 +278,18 @@
1172 }
1173
1174 /**
1175 + * inode_permission - check for access rights to a given inode
1176 + * @inode: inode to check permission on
1177 + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
1178 + *
1179 + * This version pays attention to the MS_RDONLY flag on the fs.
1180 + */
1181 +int inode_permission(struct inode *inode, int mask)
1182 +{
1183 + return __inode_permission(inode, mask, 1);
1184 +}
1185 +
1186 +/**
1187 * file_permission - check for additional access rights to a given file
1188 * @file: file to check access rights for
1189 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
1190 @@ -404,15 +418,10 @@
1191 * Internal lookup() using the new generic dcache.
1192 * SMP-safe
1193 */
1194 -static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
1195 +static struct dentry *cache_lookup(struct dentry *parent, struct qstr *name,
1196 + struct nameidata *nd)
1197 {
1198 - struct dentry * dentry = __d_lookup(parent, name);
1199 -
1200 - /* lockess __d_lookup may fail due to concurrent d_move()
1201 - * in some unrelated directory, so try with d_lookup
1202 - */
1203 - if (!dentry)
1204 - dentry = d_lookup(parent, name);
1205 + struct dentry *dentry = d_lookup(parent, name);
1206
1207 if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1208 dentry = do_revalidate(dentry, nd);
1209 @@ -421,6 +430,208 @@
1210 }
1211
1212 /*
1213 + * Theory of operation for opaque, whiteout, and fallthru:
1214 + *
1215 + * whiteout: Unconditionally stop lookup here - ENOENT
1216 + *
1217 + * opaque: Don't lookup in directories lower in the union stack
1218 + *
1219 + * fallthru: While looking up an entry, ignore the opaque flag for the
1220 + * current directory only.
1221 + *
1222 + * A union stack is a linked list of directory dentries which appear
1223 + * in the same place in the namespace. When constructing the union
1224 + * stack, we include directories below opaque directories so that we
1225 + * can properly handle fallthrus. All non-fallthru lookups have to
1226 + * check for the opaque flag on the parent directory and obey it.
1227 + *
1228 + * In general, the code pattern is to look up the topmost entry
1229 + * first (either the first visible non-negative dentry or a negative
1230 + * dentry in the topmost layer of the union), then build the union
1231 + * stack for the newly looked-up entry (if it is a directory).
1232 + */
1233 +
1234 +/**
1235 + * __cache_lookup_topmost - lookup the topmost (non-)negative dentry
1236 + *
1237 + * @nd - parent's nameidata
1238 + * @name - pathname part to lookup
1239 + * @path - found dentry for pathname part
1240 + *
1241 + * This is used for union mount lookups from dcache. The first non-negative
1242 + * dentry is searched on all layers of the union stack. Otherwise the topmost
1243 + * negative dentry is returned.
1244 + */
1245 +static int __cache_lookup_topmost(struct nameidata *nd, struct qstr *name,
1246 + struct path *path)
1247 +{
1248 + struct dentry *dentry;
1249 +
1250 + dentry = d_lookup(nd->path.dentry, name);
1251 + if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1252 + dentry = do_revalidate(dentry, nd);
1253 +
1254 + /*
1255 + * Remember the topmost negative dentry in case we don't find anything
1256 + */
1257 + path->dentry = dentry;
1258 + path->mnt = dentry ? nd->path.mnt : NULL;
1259 +
1260 + if (!dentry || (dentry->d_inode || d_is_whiteout(dentry)))
1261 + return !dentry;
1262 +
1263 + /* Keep going through opaque directories if we found a fallthru */
1264 + if (IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(dentry))
1265 + return !dentry;
1266 +
1267 + /* look for the first non-negative or whiteout dentry */
1268 +
1269 + while (follow_union_down(&nd->path)) {
1270 + dentry = d_hash_and_lookup(nd->path.dentry, name);
1271 +
1272 + /*
1273 + * If parts of the union stack are not in the dcache we need
1274 + * to do a real lookup
1275 + */
1276 + if (!dentry)
1277 + goto out_dput;
1278 +
1279 + /*
1280 + * If parts of the union don't survive the revalidation we
1281 + * need to do a real lookup
1282 + */
1283 + if (dentry->d_op && dentry->d_op->d_revalidate) {
1284 + dentry = do_revalidate(dentry, nd);
1285 + if (!dentry)
1286 + goto out_dput;
1287 + }
1288 +
1289 + if (dentry->d_inode || d_is_whiteout(dentry))
1290 + goto out_dput;
1291 +
1292 + /* Stop the lookup on opaque parent and non-fallthru child */
1293 + if (IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(dentry))
1294 + goto out_dput;
1295 +
1296 + dput(dentry);
1297 + }
1298 +
1299 + return !dentry;
1300 +
1301 +out_dput:
1302 + dput(path->dentry);
1303 + path->dentry = dentry;
1304 + path->mnt = dentry ? mntget(nd->path.mnt) : NULL;
1305 + return !dentry;
1306 +}
1307 +
1308 +/**
1309 + * __cache_lookup_build_union - build the union stack for this part,
1310 + * cached version
1311 + *
1312 + * This is called after you have the topmost dentry in @path.
1313 + */
1314 +static int __cache_lookup_build_union(struct nameidata *nd, struct qstr *name,
1315 + struct path *path)
1316 +{
1317 + struct path last = *path;
1318 + struct dentry *dentry;
1319 +
1320 + while (follow_union_down(&nd->path)) {
1321 + dentry = d_hash_and_lookup(nd->path.dentry, name);
1322 + if (!dentry)
1323 + return 1;
1324 +
1325 + if (dentry->d_op && dentry->d_op->d_revalidate) {
1326 + dentry = do_revalidate(dentry, nd);
1327 + if (!dentry)
1328 + return 1;
1329 + }
1330 +
1331 + if (d_is_whiteout(dentry)) {
1332 + dput(dentry);
1333 + break;
1334 + }
1335 +
1336 + if (!dentry->d_inode) {
1337 + dput(dentry);
1338 + continue;
1339 + }
1340 +
1341 + /* only directories can be part of a union stack */
1342 + if (!S_ISDIR(dentry->d_inode->i_mode)) {
1343 + dput(dentry);
1344 + break;
1345 + }
1346 +
1347 + /* Add the newly discovered dir to the union stack */
1348 + append_to_union(last.mnt, last.dentry, nd->path.mnt, dentry);
1349 +
1350 + if (last.dentry != path->dentry)
1351 + path_put(&last);
1352 + last.dentry = dentry;
1353 + last.mnt = mntget(nd->path.mnt);
1354 + }
1355 +
1356 + if (last.dentry != path->dentry)
1357 + path_put(&last);
1358 +
1359 + return 0;
1360 +}
1361 +
1362 +/**
1363 + * cache_lookup_union - lookup a single pathname part from dcache
1364 + *
1365 + * This is a union mount capable version of what d_lookup() & revalidate()
1366 + * would do. This function returns a valid (union) dentry on success.
1367 + *
1368 + * Remember: On failure it means that parts of the union aren't cached. You
1369 + * should call real_lookup() afterwards to find the proper (union) dentry.
1370 + */
1371 +static int cache_lookup_union(struct nameidata *nd, struct qstr *name,
1372 + struct path *path)
1373 +{
1374 + int res ;
1375 +
1376 + if (!IS_MNT_UNION(nd->path.mnt)) {
1377 + path->dentry = cache_lookup(nd->path.dentry, name, nd);
1378 + path->mnt = path->dentry ? nd->path.mnt : NULL;
1379 + res = path->dentry ? 0 : 1;
1380 + } else {
1381 + struct path safe = {
1382 + .dentry = nd->path.dentry,
1383 + .mnt = nd->path.mnt
1384 + };
1385 +
1386 + path_get(&safe);
1387 + res = __cache_lookup_topmost(nd, name, path);
1388 + if (res)
1389 + goto out;
1390 +
1391 + /* only directories can be part of a union stack */
1392 + if (!path->dentry->d_inode ||
1393 + !S_ISDIR(path->dentry->d_inode->i_mode))
1394 + goto out;
1395 +
1396 + /* Build the union stack for this part */
1397 + res = __cache_lookup_build_union(nd, name, path);
1398 + if (res) {
1399 + dput(path->dentry);
1400 + if (path->mnt != safe.mnt)
1401 + mntput(path->mnt);
1402 + goto out;
1403 + }
1404 +
1405 +out:
1406 + path_put(&nd->path);
1407 + nd->path.dentry = safe.dentry;
1408 + nd->path.mnt = safe.mnt;
1409 + }
1410 +
1411 + return res;
1412 +}
1413 +
1414 +/*
1415 * Short-cut version of permission(), for calling by
1416 * path_walk(), when dcache lock is held. Combines parts
1417 * of permission() and generic_permission(), and tests ONLY for
1418 @@ -467,10 +678,11 @@
1419 * make sure that nobody added the entry to the dcache in the meantime..
1420 * SMP-safe
1421 */
1422 -static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
1423 +static int real_lookup(struct nameidata *nd, struct qstr *name,
1424 + struct path *path)
1425 {
1426 - struct dentry * result;
1427 - struct inode *dir = parent->d_inode;
1428 + struct inode *dir = nd->path.dentry->d_inode;
1429 + int res = 0;
1430
1431 mutex_lock(&dir->i_mutex);
1432 /*
1433 @@ -487,27 +699,36 @@
1434 *
1435 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
1436 */
1437 - result = d_lookup(parent, name);
1438 - if (!result) {
1439 + path->dentry = d_lookup(nd->path.dentry, name);
1440 + path->mnt = nd->path.mnt;
1441 + if (!path->dentry) {
1442 struct dentry *dentry;
1443
1444 /* Don't create child dentry for a dead directory. */
1445 - result = ERR_PTR(-ENOENT);
1446 - if (IS_DEADDIR(dir))
1447 + if (IS_DEADDIR(dir)) {
1448 + res = -ENOENT;
1449 goto out_unlock;
1450 + }
1451
1452 - dentry = d_alloc(parent, name);
1453 - result = ERR_PTR(-ENOMEM);
1454 + dentry = d_alloc(nd->path.dentry, name);
1455 if (dentry) {
1456 - result = dir->i_op->lookup(dir, dentry, nd);
1457 - if (result)
1458 + path->dentry = dir->i_op->lookup(dir, dentry, nd);
1459 + if (path->dentry) {
1460 dput(dentry);
1461 - else
1462 - result = dentry;
1463 + if (IS_ERR(path->dentry)) {
1464 + res = PTR_ERR(path->dentry);
1465 + path->dentry = NULL;
1466 + path->mnt = NULL;
1467 + }
1468 + } else
1469 + path->dentry = dentry;
1470 + } else {
1471 + res = -ENOMEM;
1472 + path->mnt = NULL;
1473 }
1474 out_unlock:
1475 mutex_unlock(&dir->i_mutex);
1476 - return result;
1477 + return res;
1478 }
1479
1480 /*
1481 @@ -515,12 +736,170 @@
1482 * we waited on the semaphore. Need to revalidate.
1483 */
1484 mutex_unlock(&dir->i_mutex);
1485 - if (result->d_op && result->d_op->d_revalidate) {
1486 - result = do_revalidate(result, nd);
1487 - if (!result)
1488 - result = ERR_PTR(-ENOENT);
1489 + if (path->dentry->d_op && path->dentry->d_op->d_revalidate) {
1490 + path->dentry = do_revalidate(path->dentry, nd);
1491 + if (!path->dentry) {
1492 + res = -ENOENT;
1493 + path->mnt = NULL;
1494 + }
1495 + if (IS_ERR(path->dentry)) {
1496 + res = PTR_ERR(path->dentry);
1497 + path->dentry = NULL;
1498 + path->mnt = NULL;
1499 + }
1500 }
1501 - return result;
1502 +
1503 + return res;
1504 +}
1505 +
1506 +/**
1507 + * __real_lookup_topmost - lookup topmost dentry, non-cached version
1508 + *
1509 + * If we reach a dentry with restricted access, we just stop the lookup
1510 + * because we shouldn't see through that dentry. Same thing for dentry
1511 + * type mismatch and whiteouts.
1512 + *
1513 + * FIXME:
1514 + * - handle union stacks in use
1515 + * - handle union stacks mounted upon union stacks
1516 + * - avoid unnecessary allocations of union locks
1517 + */
1518 +static int __real_lookup_topmost(struct nameidata *nd, struct qstr *name,
1519 + struct path *path)
1520 +{
1521 + struct path next;
1522 + int err;
1523 +
1524 + err = real_lookup(nd, name, path);
1525 + if (err)
1526 + return err;
1527 +
1528 + if (path->dentry->d_inode || d_is_whiteout(path->dentry))
1529 + return 0;
1530 +
1531 + if (IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(path->dentry))
1532 + return 0;
1533 +
1534 + while (follow_union_down(&nd->path)) {
1535 + name->hash = full_name_hash(name->name, name->len);
1536 + if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
1537 + err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
1538 + name);
1539 + if (err < 0)
1540 + goto out;
1541 + }
1542 +
1543 + err = real_lookup(nd, name, &next);
1544 + if (err)
1545 + goto out;
1546 +
1547 + if (next.dentry->d_inode || d_is_whiteout(next.dentry)) {
1548 + dput(path->dentry);
1549 + mntget(next.mnt);
1550 + *path = next;
1551 + goto out;
1552 + }
1553 +
1554 + if (IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(next.dentry))
1555 + goto out;
1556 +
1557 + dput(next.dentry);
1558 + }
1559 +out:
1560 + if (err)
1561 + dput(path->dentry);
1562 + return err;
1563 +}
1564 +
1565 +/**
1566 + * __real_lookup_build_union: build the union stack for this pathname
1567 + * part, non-cached version
1568 + *
1569 + * Called when not all parts of the union stack are in cache
1570 + */
1571 +
1572 +static int __real_lookup_build_union(struct nameidata *nd, struct qstr *name,
1573 + struct path *path)
1574 +{
1575 + struct path last = *path;
1576 + struct path next;
1577 + int err = 0;
1578 +
1579 + while (follow_union_down(&nd->path)) {
1580 + /* We need to recompute the hash for lower layer lookups */
1581 + name->hash = full_name_hash(name->name, name->len);
1582 + if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
1583 + err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
1584 + name);
1585 + if (err < 0)
1586 + goto out;
1587 + }
1588 +
1589 + err = real_lookup(nd, name, &next);
1590 + if (err)
1591 + goto out;
1592 +
1593 + if (d_is_whiteout(next.dentry)) {
1594 + dput(next.dentry);
1595 + break;
1596 + }
1597 +
1598 + if (!next.dentry->d_inode) {
1599 + dput(next.dentry);
1600 + continue;
1601 + }
1602 +
1603 + /* only directories can be part of a union stack */
1604 + if (!S_ISDIR(next.dentry->d_inode->i_mode)) {
1605 + dput(next.dentry);
1606 + break;
1607 + }
1608 +
1609 + /* now we know we found something "real" */
1610 + append_to_union(last.mnt, last.dentry, next.mnt, next.dentry);
1611 +
1612 + if (last.dentry != path->dentry)
1613 + path_put(&last);
1614 + last.dentry = next.dentry;
1615 + last.mnt = mntget(next.mnt);
1616 + }
1617 +
1618 + if (last.dentry != path->dentry)
1619 + path_put(&last);
1620 +out:
1621 + return err;
1622 +}
1623 +
1624 +static int real_lookup_union(struct nameidata *nd, struct qstr *name,
1625 + struct path *path)
1626 +{
1627 + struct path safe = { .dentry = nd->path.dentry, .mnt = nd->path.mnt };
1628 + int res ;
1629 +
1630 + path_get(&safe);
1631 + res = __real_lookup_topmost(nd, name, path);
1632 + if (res)
1633 + goto out;
1634 +
1635 + /* only directories can be part of a union stack */
1636 + if (!path->dentry->d_inode ||
1637 + !S_ISDIR(path->dentry->d_inode->i_mode))
1638 + goto out;
1639 +
1640 + /* Build the union stack for this part */
1641 + res = __real_lookup_build_union(nd, name, path);
1642 + if (res) {
1643 + dput(path->dentry);
1644 + if (path->mnt != safe.mnt)
1645 + mntput(path->mnt);
1646 + goto out;
1647 + }
1648 +
1649 +out:
1650 + path_put(&nd->path);
1651 + nd->path.dentry = safe.dentry;
1652 + nd->path.mnt = safe.mnt;
1653 + return res;
1654 }
1655
1656 /*
1657 @@ -623,11 +1002,8 @@
1658 touch_atime(path->mnt, dentry);
1659 nd_set_link(nd, NULL);
1660
1661 - if (path->mnt != nd->path.mnt) {
1662 - path_to_nameidata(path, nd);
1663 - dget(dentry);
1664 - }
1665 - mntget(path->mnt);
1666 + if (path->mnt == nd->path.mnt)
1667 + mntget(nd->path.mnt);
1668 cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
1669 error = PTR_ERR(cookie);
1670 if (!IS_ERR(cookie)) {
1671 @@ -715,7 +1091,7 @@
1672 return res;
1673 }
1674
1675 -static void follow_mount(struct path *path)
1676 +void follow_mount(struct path *path)
1677 {
1678 while (d_mountpoint(path->dentry)) {
1679 struct vfsmount *mounted = lookup_mnt(path);
1680 @@ -780,6 +1156,7 @@
1681 nd->path.mnt = parent;
1682 }
1683 follow_mount(&nd->path);
1684 + follow_union_mount(&nd->path);
1685 }
1686
1687 /*
1688 @@ -790,35 +1167,55 @@
1689 static int do_lookup(struct nameidata *nd, struct qstr *name,
1690 struct path *path)
1691 {
1692 - struct vfsmount *mnt = nd->path.mnt;
1693 - struct dentry *dentry = __d_lookup(nd->path.dentry, name);
1694 + int err;
1695 +
1696 + if (IS_MNT_UNION(nd->path.mnt))
1697 + goto need_union_lookup;
1698
1699 - if (!dentry)
1700 + path->dentry = __d_lookup(nd->path.dentry, name);
1701 + path->mnt = nd->path.mnt;
1702 + if (!path->dentry)
1703 goto need_lookup;
1704 - if (dentry->d_op && dentry->d_op->d_revalidate)
1705 + if (path->dentry->d_op && path->dentry->d_op->d_revalidate)
1706 goto need_revalidate;
1707 +
1708 done:
1709 - path->mnt = mnt;
1710 - path->dentry = dentry;
1711 - __follow_mount(path);
1712 + if (nd->path.mnt != path->mnt) {
1713 + nd->um_flags |= LAST_LOWLEVEL;
1714 + follow_mount(path);
1715 + } else
1716 + __follow_mount(path);
1717 + follow_union_mount(path);
1718 return 0;
1719
1720 need_lookup:
1721 - dentry = real_lookup(nd->path.dentry, name, nd);
1722 - if (IS_ERR(dentry))
1723 + err = real_lookup(nd, name, path);
1724 + if (err)
1725 + goto fail;
1726 + goto done;
1727 +
1728 +need_union_lookup:
1729 + err = cache_lookup_union(nd, name, path);
1730 + if (!err && path->dentry)
1731 + goto done;
1732 +
1733 + err = real_lookup_union(nd, name, path);
1734 + if (err)
1735 goto fail;
1736 goto done;
1737
1738 need_revalidate:
1739 - dentry = do_revalidate(dentry, nd);
1740 - if (!dentry)
1741 + path->dentry = do_revalidate(path->dentry, nd);
1742 + if (!path->dentry)
1743 goto need_lookup;
1744 - if (IS_ERR(dentry))
1745 + if (IS_ERR(path->dentry)) {
1746 + err = PTR_ERR(path->dentry);
1747 goto fail;
1748 + }
1749 goto done;
1750
1751 fail:
1752 - return PTR_ERR(dentry);
1753 + return err;
1754 }
1755
1756 /*
1757 @@ -845,6 +1242,8 @@
1758 if (nd->depth)
1759 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
1760
1761 + follow_union_mount(&nd->path);
1762 +
1763 /* At this point we know we have a real path component. */
1764 for(;;) {
1765 unsigned long hash;
1766 @@ -913,6 +1312,44 @@
1767 if (err)
1768 break;
1769
1770 + /*
1771 + * We want to create this element on the top level
1772 + * file system in two cases:
1773 + *
1774 + * - We are specifically told to - LOOKUP_TOPMOST.
1775 + * - This is a directory, and it does not yet exist on
1776 + * the top level. Various tricks only work if
1777 + * directories always exist on the top level.
1778 + *
1779 + * In either case, only create this element on the top
1780 + * level if the last element is located on the lower
1781 + * level. If the last element is located on the top
1782 + * level, then every single element in the path
1783 + * already exists on the top level.
1784 + *
1785 + * Note that we can assume that the parent is on the
1786 + * top level since we always create the directory on
1787 + * the top level.
1788 + */
1789 +
1790 + if ((nd->um_flags & LAST_LOWLEVEL) &&
1791 + ((next.dentry->d_inode &&
1792 + S_ISDIR(next.dentry->d_inode->i_mode) &&
1793 + (nd->path.mnt != next.mnt)) ||
1794 + (nd->flags & LOOKUP_TOPMOST))) {
1795 + struct dentry *dentry;
1796 +
1797 + dentry = union_create_topmost(nd, &this, &next);
1798 + if (IS_ERR(dentry)) {
1799 + err = PTR_ERR(dentry);
1800 + goto out_dput;
1801 + }
1802 + path_put_conditional(&next, nd);
1803 + next.mnt = nd->path.mnt;
1804 + next.dentry = dentry;
1805 + nd->um_flags &= ~LAST_LOWLEVEL;
1806 + }
1807 +
1808 err = -ENOENT;
1809 inode = next.dentry->d_inode;
1810 if (!inode)
1811 @@ -962,6 +1399,25 @@
1812 err = do_lookup(nd, &this, &next);
1813 if (err)
1814 break;
1815 +
1816 + if ((nd->um_flags & LAST_LOWLEVEL) &&
1817 + ((next.dentry->d_inode &&
1818 + S_ISDIR(next.dentry->d_inode->i_mode) &&
1819 + (nd->path.mnt != next.mnt)) ||
1820 + (nd->flags & LOOKUP_TOPMOST))) {
1821 + struct dentry *dentry;
1822 +
1823 + dentry = union_create_topmost(nd, &this, &next);
1824 + if (IS_ERR(dentry)) {
1825 + err = PTR_ERR(dentry);
1826 + goto out_dput;
1827 + }
1828 + path_put_conditional(&next, nd);
1829 + next.mnt = nd->path.mnt;
1830 + next.dentry = dentry;
1831 + nd->um_flags &= ~LAST_LOWLEVEL;
1832 + }
1833 +
1834 inode = next.dentry->d_inode;
1835 if ((lookup_flags & LOOKUP_FOLLOW)
1836 && inode && inode->i_op->follow_link) {
1837 @@ -1029,6 +1485,7 @@
1838
1839 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1840 nd->flags = flags;
1841 + nd->um_flags = 0;
1842 nd->depth = 0;
1843 nd->root.mnt = NULL;
1844
1845 @@ -1172,61 +1629,437 @@
1846 }
1847
1848 static struct dentry *__lookup_hash(struct qstr *name,
1849 - struct dentry *base, struct nameidata *nd)
1850 + struct dentry *base, struct nameidata *nd)
1851 +{
1852 + struct dentry *dentry;
1853 + struct inode *inode;
1854 + int err;
1855 +
1856 + inode = base->d_inode;
1857 +
1858 + /*
1859 + * See if the low-level filesystem might want
1860 + * to use its own hash..
1861 + */
1862 + if (base->d_op && base->d_op->d_hash) {
1863 + err = base->d_op->d_hash(base, name);
1864 + dentry = ERR_PTR(err);
1865 + if (err < 0)
1866 + goto out;
1867 + }
1868 +
1869 + dentry = cache_lookup(base, name, nd);
1870 + if (!dentry) {
1871 + struct dentry *new;
1872 +
1873 + /* Don't create child dentry for a dead directory. */
1874 + dentry = ERR_PTR(-ENOENT);
1875 + if (IS_DEADDIR(inode))
1876 + goto out;
1877 +
1878 + new = d_alloc(base, name);
1879 + dentry = ERR_PTR(-ENOMEM);
1880 + if (!new)
1881 + goto out;
1882 + dentry = inode->i_op->lookup(inode, new, nd);
1883 + if (!dentry)
1884 + dentry = new;
1885 + else
1886 + dput(new);
1887 + }
1888 +out:
1889 + return dentry;
1890 +}
1891 +
1892 +/*
1893 + * Restricted form of lookup. Doesn't follow links, single-component only,
1894 + * needs parent already locked. Doesn't follow mounts.
1895 + * SMP-safe.
1896 + */
1897 +static int lookup_hash(struct nameidata *nd, struct qstr *name,
1898 + struct path *path)
1899 +{
1900 + int err;
1901 +
1902 + err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC);
1903 + if (err)
1904 + return err;
1905 + path->mnt = nd->path.mnt;
1906 + path->dentry = __lookup_hash(name, nd->path.dentry, nd);
1907 + if (IS_ERR(path->dentry)) {
1908 + err = PTR_ERR(path->dentry);
1909 + path->dentry = NULL;
1910 + path->mnt = NULL;
1911 + }
1912 + return err;
1913 +}
1914 +
1915 +static int __hash_lookup_topmost(struct nameidata *nd, struct qstr *name,
1916 + struct path *path)
1917 +{
1918 + struct path next;
1919 + int err;
1920 +
1921 + err = lookup_hash(nd, name, path);
1922 + if (err)
1923 + return err;
1924 +
1925 + if (path->dentry->d_inode || d_is_whiteout(path->dentry))
1926 + return 0;
1927 +
1928 + if (IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(path->dentry))
1929 + return 0;
1930 +
1931 + while (follow_union_down(&nd->path)) {
1932 + name->hash = full_name_hash(name->name, name->len);
1933 + if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
1934 + err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
1935 + name);
1936 + if (err < 0)
1937 + goto out;
1938 + }
1939 +
1940 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
1941 + err = lookup_hash(nd, name, &next);
1942 + mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
1943 + if (err)
1944 + goto out;
1945 +
1946 + if (next.dentry->d_inode || d_is_whiteout(next.dentry)) {
1947 + dput(path->dentry);
1948 + mntget(next.mnt);
1949 + *path = next;
1950 + goto out;
1951 + }
1952 +
1953 + if (IS_OPAQUE(nd->path.dentry->d_inode) && !d_is_fallthru(next.dentry))
1954 + goto out;
1955 +
1956 + dput(next.dentry);
1957 + }
1958 +out:
1959 + if (err)
1960 + dput(path->dentry);
1961 + return err;
1962 +}
1963 +
1964 +static int __hash_lookup_build_union(struct nameidata *nd, struct qstr *name,
1965 + struct path *path)
1966 +{
1967 + struct path last = *path;
1968 + struct path next;
1969 + int err = 0;
1970 +
1971 + while (follow_union_down(&nd->path)) {
1972 + /* We need to recompute the hash for lower layer lookups */
1973 + name->hash = full_name_hash(name->name, name->len);
1974 + if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
1975 + err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
1976 + name);
1977 + if (err < 0)
1978 + goto out;
1979 + }
1980 +
1981 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
1982 + err = lookup_hash(nd, name, &next);
1983 + mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
1984 + if (err)
1985 + goto out;
1986 +
1987 + if (d_is_whiteout(next.dentry)) {
1988 + dput(next.dentry);
1989 + break;
1990 + }
1991 +
1992 + if (!next.dentry->d_inode) {
1993 + dput(next.dentry);
1994 + continue;
1995 + }
1996 +
1997 + /* only directories can be part of a union stack */
1998 + if (!S_ISDIR(next.dentry->d_inode->i_mode)) {
1999 + dput(next.dentry);
2000 + break;
2001 + }
2002 +
2003 + /* now we know we found something "real" */
2004 + append_to_union(last.mnt, last.dentry, next.mnt, next.dentry);
2005 +
2006 + if (last.dentry != path->dentry)
2007 + path_put(&last);
2008 + last.dentry = next.dentry;
2009 + last.mnt = mntget(next.mnt);
2010 + }
2011 +
2012 + if (last.dentry != path->dentry)
2013 + path_put(&last);
2014 +out:
2015 + return err;
2016 +}
2017 +
2018 +int hash_lookup_union(struct nameidata *nd, struct qstr *name,
2019 + struct path *path)
2020 +{
2021 + struct path safe = { .dentry = nd->path.dentry, .mnt = nd->path.mnt };
2022 + int res;
2023 +
2024 + path_get(&safe);
2025 + res = __hash_lookup_topmost(nd, name, path);
2026 + if (res)
2027 + goto out;
2028 +
2029 + /* only directories can be part of a union stack */
2030 + if (!path->dentry->d_inode ||
2031 + !S_ISDIR(path->dentry->d_inode->i_mode))
2032 + goto out;
2033 +
2034 + /* Build the union stack for this part */
2035 + res = __hash_lookup_build_union(nd, name, path);
2036 + if (res) {
2037 + dput(path->dentry);
2038 + if (path->mnt != safe.mnt)
2039 + mntput(path->mnt);
2040 + goto out;
2041 + }
2042 +
2043 +out:
2044 + path_put(&nd->path);
2045 + nd->path.dentry = safe.dentry;
2046 + nd->path.mnt = safe.mnt;
2047 + return res;
2048 +}
2049 +
2050 +/**
2051 + * do_union_hash_lookup() - walk down the union stack and lookup_hash()
2052 + * @nd: nameidata of parent to lookup from
2053 + * @name: pathname component to lookup
2054 + * @path: path to store result of lookup in
2055 + *
2056 + * Walk down the union stack and search for the single pathname component
2057 + * @name. It is assumed that the caller already did a lookup_hash() in the
2058 + * topmost parent which gave a negative lookup result. Therefore this calls
2059 + * lookup_hash() in every lower layer (!) of the union stack. If a directory
2060 + * is found, the union stack for it is assembled as well.
2061 + *
2062 + * Note:
2063 + * The caller needs to take care of holding a valid reference to the topmost
2064 + * parent.
2065 + * On error, or when nothing is found, @path is left untouched.
2066 + */
2067 +static int do_union_hash_lookup(struct nameidata *nd, struct qstr *name,
2068 + struct path *path)
2069 +{
2070 + struct path next;
2071 + int err = 0;
2072 +
2073 + while (follow_union_down(&nd->path)) {
2074 + /* rehash because of d_op->d_hash() by the previous layer */
2075 + name->hash = full_name_hash(name->name, name->len);
2076 +
2077 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
2078 + err = lookup_hash(nd, name, &next);
2079 + mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
2080 +
2081 + if (err)
2082 + break;
2083 +
2084 + if (next.dentry->d_inode) {
2085 + mntget(next.mnt);
2086 + if (!S_ISDIR(next.dentry->d_inode->i_mode)) {
2087 + *path = next;
2088 + break;
2089 + }
2090 + err = __hash_lookup_build_union(nd, name, &next);
2091 + if (err)
2092 + path_put(&next);
2093 + else
2094 + *path = next;
2095 + break;
2096 + }
2097 +
2098 + path_put_conditional(&next, nd);
2099 +
2100 + if ((IS_OPAQUE(nd->path.dentry->d_inode) &&
2101 + !d_is_fallthru(next.dentry)) ||
2102 + d_is_whiteout(next.dentry))
2103 + break;
2104 + }
2105 +
2106 + return err;
2107 +}
2108 +
2109 +/**
2110 + * _hash_lookup_union() - lookup single pathname component
2111 + * @nd: nameidata of parent to lookup from
2112 + * @name: pathname component to lookup
2113 + * @path: path to store result of lookup in
2114 + *
2115 + * Returns with the topmost parent locked and the target dentry found in the
2116 + * union, or the topmost negative target dentry otherwise.
2117 + *
2118 + * Note:
2119 + * Returns topmost parent locked even on error.
2120 + */
2121 +static int _hash_lookup_union(struct nameidata *nd, struct qstr *name,
2122 + struct path *path)
2123 +{
2124 + struct path parent = nd->path;
2125 + struct path topmost;
2126 + int err;
2127 +
2128 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
2129 + err = lookup_hash(nd, name, path);
2130 + if (err)
2131 + return err;
2132 +
2133 + /* if we found something and it isn't a directory, we are done */
2134 + if (path->dentry->d_inode && !S_ISDIR(path->dentry->d_inode->i_mode))
2135 + return 0;
2136 +
2137 + /* stop lookup if the parent directory is marked opaque */
2138 + if ((IS_OPAQUE(nd->path.dentry->d_inode) &&
2139 + !d_is_fallthru(path->dentry)) ||
2140 + d_is_whiteout(path->dentry))
2141 + return 0;
2142 +
2143 + if (!strcmp(path->mnt->mnt_sb->s_type->name, "proc") ||
2144 + !strcmp(path->mnt->mnt_sb->s_type->name, "sysfs"))
2145 + return 0;
2146 +
2147 + mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
2148 +
2149 + /*
2150 + * save a reference to the topmost parent for walking the union stack
2151 + */
2152 + path_get(&parent);
2153 + topmost = *path;
2154 +
2155 + if (path->dentry->d_inode && S_ISDIR(path->dentry->d_inode->i_mode)) {
2156 + err = __hash_lookup_build_union(nd, name, path);
2157 + if (err)
2158 + goto err_lock_parent;
2159 + goto out_lock_and_revalidate_parent;
2160 + }
2161 +
2162 + err = do_union_hash_lookup(nd, name, path);
2163 + if (err)
2164 + goto err_lock_parent;
2165 +
2166 +out_lock_and_revalidate_parent:
2167 + /* seems that we haven't found anything, so return the topmost */
2168 + path_to_nameidata(&parent, nd);
2169 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
2170 +
2171 + if (topmost.dentry == path->dentry) {
2172 + spin_lock(&path->dentry->d_lock);
2173 + if (nd->path.dentry != path->dentry->d_parent) {
2174 + spin_unlock(&path->dentry->d_lock);
2175 + dput(path->dentry);
2176 + name->hash = full_name_hash(name->name, name->len);
2177 + err = lookup_hash(nd, name, path);
2178 + if (err)
2179 + return err;
2180 + /* FIXME: What if we find a directory here ... */
2181 + return err;
2182 + }
2183 + spin_unlock(&path->dentry->d_lock);
2184 + } else
2185 + dput(topmost.dentry);
2186 +
2187 + return 0;
2188 +
2189 +err_lock_parent:
2190 + path_to_nameidata(&parent, nd);
2191 + path_put_conditional(path, nd);
2192 + mutex_lock(&nd->path.dentry->d_inode->i_mutex);
2193 + return err;
2194 +}
2195 +
2196 +/**
2197 + * lookup_rename_source() - lookup the source used by rename
2198 + *
2199 + * This is a special version of _hash_lookup_union() which becomes necessary
2200 + * for finding the source of a rename on union mounts.
2201 + *
2202 + * See comment for _hash_lookup_union() above.
2203 + */
2204 +static int lookup_rename_source(struct nameidata *oldnd,
2205 + struct nameidata *newnd,
2206 + struct dentry **trap, struct qstr *name,
2207 + struct path *old)
2208 {
2209 - struct dentry *dentry;
2210 - struct inode *inode;
2211 + struct path parent = oldnd->path;
2212 + struct path topmost;
2213 int err;
2214
2215 - inode = base->d_inode;
2216 + err = lookup_hash(oldnd, name, old);
2217 + if (err)
2218 + return err;
2219 +
2220 + /* if we found something and it isn't a directory, we are done */
2221 + if (old->dentry->d_inode && !S_ISDIR(old->dentry->d_inode->i_mode))
2222 + return 0;
2223 +
2224 + /* stop lookup if the parent directory is marked opaque */
2225 + if ((IS_OPAQUE(oldnd->path.dentry->d_inode) &&
2226 + !d_is_fallthru(old->dentry)) ||
2227 + d_is_whiteout(old->dentry))
2228 + return 0;
2229 +
2230 + if (!strcmp(old->mnt->mnt_sb->s_type->name, "proc") ||
2231 + !strcmp(old->mnt->mnt_sb->s_type->name, "sysfs"))
2232 + return 0;
2233 +
2234 + unlock_rename(oldnd->path.dentry, newnd->path.dentry);
2235
2236 /*
2237 - * See if the low-level filesystem might want
2238 - * to use its own hash..
2239 + * save a reference to the topmost parent for walking the union stack
2240 */
2241 - if (base->d_op && base->d_op->d_hash) {
2242 - err = base->d_op->d_hash(base, name);
2243 - dentry = ERR_PTR(err);
2244 - if (err < 0)
2245 - goto out;
2246 + path_get(&parent);
2247 + topmost = *old;
2248 +
2249 + if (old->dentry->d_inode && S_ISDIR(old->dentry->d_inode->i_mode)) {
2250 + err = __hash_lookup_build_union(oldnd, name, old);
2251 + if (err)
2252 + goto err_lock;
2253 + goto out_lock_and_revalidate_parent;
2254 }
2255
2256 - dentry = cached_lookup(base, name, nd);
2257 - if (!dentry) {
2258 - struct dentry *new;
2259 + err = do_union_hash_lookup(oldnd, name, old);
2260 + if (err)
2261 + goto err_lock;
2262
2263 - /* Don't create child dentry for a dead directory. */
2264 - dentry = ERR_PTR(-ENOENT);
2265 - if (IS_DEADDIR(inode))
2266 - goto out;
2267 +out_lock_and_revalidate_parent:
2268 + path_to_nameidata(&parent, oldnd);
2269 + *trap = lock_rename(oldnd->path.dentry, newnd->path.dentry);
2270
2271 - new = d_alloc(base, name);
2272 - dentry = ERR_PTR(-ENOMEM);
2273 - if (!new)
2274 - goto out;
2275 - dentry = inode->i_op->lookup(inode, new, nd);
2276 - if (!dentry)
2277 - dentry = new;
2278 - else
2279 - dput(new);
2280 - }
2281 -out:
2282 - return dentry;
2283 -}
2284 + /*
2285 + * If we return the topmost dentry we have to make sure that it has not
2286 + * been moved away while we gave up the topmost parents i_mutex lock.
2287 + */
2288 + if (topmost.dentry == old->dentry) {
2289 + spin_lock(&old->dentry->d_lock);
2290 + if (oldnd->path.dentry != old->dentry->d_parent) {
2291 + spin_unlock(&old->dentry->d_lock);
2292 + dput(old->dentry);
2293 + name->hash = full_name_hash(name->name, name->len);
2294 + err = lookup_hash(oldnd, name, old);
2295 + if (err)
2296 + return err;
2297 + /* FIXME: What if we find a directory here ... */
2298 + return err;
2299 + }
2300 + spin_unlock(&old->dentry->d_lock);
2301 + } else
2302 + dput(topmost.dentry);
2303
2304 -/*
2305 - * Restricted form of lookup. Doesn't follow links, single-component only,
2306 - * needs parent already locked. Doesn't follow mounts.
2307 - * SMP-safe.
2308 - */
2309 -static struct dentry *lookup_hash(struct nameidata *nd)
2310 -{
2311 - int err;
2312 + return 0;
2313
2314 - err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC);
2315 - if (err)
2316 - return ERR_PTR(err);
2317 - return __lookup_hash(&nd->last, nd->path.dentry, nd);
2318 +err_lock:
2319 + path_to_nameidata(&parent, oldnd);
2320 + path_put_conditional(old, oldnd);
2321 + *trap = lock_rename(oldnd->path.dentry, newnd->path.dentry);
2322 + return err;
2323 }
2324
2325 static int __lookup_one_len(const char *name, struct qstr *this,
2326 @@ -1502,8 +2335,9 @@
2327 return error;
2328 }
2329
2330 -int may_open(struct path *path, int acc_mode, int flag)
2331 +int may_open(struct nameidata *nd, int acc_mode, int flag)
2332 {
2333 + struct path *path = &nd->path;
2334 struct dentry *dentry = path->dentry;
2335 struct inode *inode = dentry->d_inode;
2336 int error;
2337 @@ -1529,7 +2363,7 @@
2338 break;
2339 }
2340
2341 - error = inode_permission(inode, acc_mode);
2342 + error = union_permission(path, acc_mode);
2343 if (error)
2344 return error;
2345
2346 @@ -1575,6 +2409,9 @@
2347 if (!error)
2348 error = security_path_truncate(path, 0,
2349 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
2350 + /* XXX don't copy up file data */
2351 + if (is_unionized(path->dentry, path->mnt))
2352 + error = union_copyup(nd, flag /* XXX not used */);
2353 if (!error) {
2354 vfs_dq_init(inode);
2355
2356 @@ -1621,7 +2458,7 @@
2357 if (error)
2358 return error;
2359 /* Don't check for write permission, don't truncate */
2360 - return may_open(&nd->path, 0, flag & ~O_TRUNC);
2361 + return may_open(nd, 0, flag & ~O_TRUNC);
2362 }
2363
2364 /*
2365 @@ -1736,12 +2573,10 @@
2366 if (flag & O_EXCL)
2367 nd.flags |= LOOKUP_EXCL;
2368 mutex_lock(&dir->d_inode->i_mutex);
2369 - path.dentry = lookup_hash(&nd);
2370 - path.mnt = nd.path.mnt;
2371 + error = hash_lookup_union(&nd, &nd.last, &path);
2372
2373 do_last:
2374 - error = PTR_ERR(path.dentry);
2375 - if (IS_ERR(path.dentry)) {
2376 + if (error) {
2377 mutex_unlock(&dir->d_inode->i_mutex);
2378 goto exit;
2379 }
2380 @@ -1801,10 +2636,23 @@
2381 if (path.dentry->d_inode->i_op->follow_link)
2382 goto do_link;
2383
2384 - path_to_nameidata(&path, &nd);
2385 error = -EISDIR;
2386 if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
2387 - goto exit;
2388 + goto exit_dput;
2389 +
2390 + /*
2391 + * If this file is on a lower layer of the union stack, copy it to the
2392 + * topmost layer before opening it
2393 + */
2394 + if (path.dentry->d_inode &&
2395 + (path.dentry->d_parent != dir) &&
2396 + S_ISREG(path.dentry->d_inode->i_mode)) {
2397 + error = __union_copyup(&path, &nd, &path);
2398 + if (error)
2399 + goto exit_dput;
2400 + }
2401 +
2402 + path_to_nameidata(&path, &nd);
2403 ok:
2404 /*
2405 * Consider:
2406 @@ -1822,12 +2670,18 @@
2407 if (error)
2408 goto exit;
2409 }
2410 - error = may_open(&nd.path, acc_mode, flag);
2411 + error = may_open(&nd, acc_mode, flag);
2412 if (error) {
2413 if (will_write)
2414 mnt_drop_write(nd.path.mnt);
2415 goto exit;
2416 }
2417 + /* Okay, all permission checks passed, now copy up */
2418 + if (!(flag & O_CREAT) && (flag & FMODE_WRITE)) {
2419 + error = union_copyup(&nd, flag /* XXX not used */);
2420 + if (error)
2421 + goto exit;
2422 + }
2423 filp = nameidata_to_filp(&nd, open_flag);
2424 if (IS_ERR(filp))
2425 ima_counts_put(&nd.path,
2426 @@ -1902,8 +2756,7 @@
2427 }
2428 dir = nd.path.dentry;
2429 mutex_lock(&dir->d_inode->i_mutex);
2430 - path.dentry = lookup_hash(&nd);
2431 - path.mnt = nd.path.mnt;
2432 + error = hash_lookup_union(&nd, &nd.last, &path);
2433 __putname(nd.last.name);
2434 goto do_last;
2435 }
2436 @@ -1937,7 +2790,8 @@
2437 */
2438 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
2439 {
2440 - struct dentry *dentry = ERR_PTR(-EEXIST);
2441 + struct path path = { .dentry = ERR_PTR(-EEXIST) } ;
2442 + int err;
2443
2444 mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2445 /*
2446 @@ -1953,11 +2807,13 @@
2447 /*
2448 * Do the final lookup.
2449 */
2450 - dentry = lookup_hash(nd);
2451 - if (IS_ERR(dentry))
2452 + err = hash_lookup_union(nd, &nd->last, &path);
2453 + if (err) {
2454 + path.dentry = ERR_PTR(err);
2455 goto fail;
2456 + }
2457
2458 - if (dentry->d_inode)
2459 + if (path.dentry->d_inode)
2460 goto eexist;
2461 /*
2462 * Special case - lookup gave negative, but... we had foo/bar/
2463 @@ -1966,15 +2822,17 @@
2464 * been asking for (non-existent) directory. -ENOENT for you.
2465 */
2466 if (unlikely(!is_dir && nd->last.name[nd->last.len])) {
2467 - dput(dentry);
2468 - dentry = ERR_PTR(-ENOENT);
2469 + path_put_conditional(&path, nd);
2470 + path.dentry = ERR_PTR(-ENOENT);
2471 }
2472 - return dentry;
2473 + if (nd->path.mnt != path.mnt)
2474 + mntput(path.mnt);
2475 + return path.dentry;
2476 eexist:
2477 - dput(dentry);
2478 - dentry = ERR_PTR(-EEXIST);
2479 + path_put_conditional(&path, nd);
2480 + path.dentry = ERR_PTR(-EEXIST);
2481 fail:
2482 - return dentry;
2483 + return path.dentry;
2484 }
2485 EXPORT_SYMBOL_GPL(lookup_create);
2486
2487 @@ -2086,6 +2944,7 @@
2488 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2489 {
2490 int error = may_create(dir, dentry);
2491 + int opaque = 0;
2492
2493 if (error)
2494 return error;
2495 @@ -2099,9 +2958,18 @@
2496 return error;
2497
2498 vfs_dq_init(dir);
2499 +
2500 + if (d_is_whiteout(dentry))
2501 + opaque = 1;
2502 +
2503 error = dir->i_op->mkdir(dir, dentry, mode);
2504 - if (!error)
2505 + if (!error) {
2506 fsnotify_mkdir(dir, dentry);
2507 + if (opaque) {
2508 + dentry->d_inode->i_flags |= S_OPAQUE;
2509 + mark_inode_dirty(dentry->d_inode);
2510 + }
2511 + }
2512 return error;
2513 }
2514
2515 @@ -2147,6 +3015,212 @@
2516 return sys_mkdirat(AT_FDCWD, pathname, mode);
2517 }
2518
2519 +
2520 +/* Checks on the victim for whiteout */
2521 +static inline int may_whiteout(struct inode *dir, struct dentry *victim,
2522 + int isdir)
2523 +{
2524 + int err;
2525 +
2526 + /* from may_create() */
2527 + if (IS_DEADDIR(dir))
2528 + return -ENOENT;
2529 + err = inode_permission(dir, MAY_WRITE | MAY_EXEC);
2530 + if (err)
2531 + return err;
2532 +
2533 + /* from may_delete() */
2534 + if (IS_APPEND(dir))
2535 + return -EPERM;
2536 + if (!victim->d_inode)
2537 + return 0;
2538 + if (check_sticky(dir, victim->d_inode) ||
2539 + IS_APPEND(victim->d_inode) ||
2540 + IS_IMMUTABLE(victim->d_inode))
2541 + return -EPERM;
2542 + if (isdir) {
2543 + if (!S_ISDIR(victim->d_inode->i_mode))
2544 + return -ENOTDIR;
2545 + if (IS_ROOT(victim))
2546 + return -EBUSY;
2547 + } else if (S_ISDIR(victim->d_inode->i_mode))
2548 + return -EISDIR;
2549 + if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2550 + return -EBUSY;
2551 + return 0;
2552 +}
2553 +
2554 +/**
2555 + * vfs_whiteout: creates a white-out for the given directory entry
2556 + * @dir: parent inode
2557 + * @dentry: directory entry to white-out
2558 + *
2559 + * Simply white-out the given directory entry. This is usually used in the
2560 + * sense of unlink, so the given dentry may still be in use and contain a
2561 + * live inode; the filesystem has to do what unlink or rmdir would do in
2562 + * that case. Since the dentry might still be in use, we have to provide a
2563 + * fresh unhashed dentry that the whiteout inode can be filled into. In
2564 + * that case the given dentry is dropped and the fresh dentry containing
2565 + * the whiteout is rehashed instead. If the given dentry is unused, the
2566 + * whiteout inode is instantiated into it directly.
2567 + *
2568 + * After this returns with success, don't make any assumptions about the inode.
2569 + * Just dput() the dentry.
2570 + */
2571 +static int vfs_whiteout(struct inode *dir, struct dentry *dentry, int isdir)
2572 +{
2573 + int err;
2574 + struct inode *old_inode = dentry->d_inode;
2575 + struct dentry *parent, *whiteout;
2576 +
2577 + err = may_whiteout(dir, dentry, isdir);
2578 + if (err)
2579 + return err;
2580 +
2581 + BUG_ON(dentry->d_parent->d_inode != dir);
2582 +
2583 + if (!dir->i_op || !dir->i_op->whiteout)
2584 + return -EOPNOTSUPP;
2585 +
2586 + if (old_inode) {
2587 + vfs_dq_init(dir);
2588 +
2589 + mutex_lock(&old_inode->i_mutex);
2590 + if (isdir)
2591 + dentry_unhash(dentry);
2592 + if (d_mountpoint(dentry))
2593 + err = -EBUSY;
2594 + else {
2595 + if (isdir)
2596 + err = security_inode_rmdir(dir, dentry);
2597 + else
2598 + err = security_inode_unlink(dir, dentry);
2599 + }
2600 + }
2601 +
2602 + parent = dget_parent(dentry);
2603 + whiteout = d_alloc_name(parent, dentry->d_name.name);
2604 +
2605 + if (!err)
2606 + err = dir->i_op->whiteout(dir, dentry, whiteout);
2607 +
2608 + if (old_inode) {
2609 + mutex_unlock(&old_inode->i_mutex);
2610 + if (!err) {
2611 + fsnotify_link_count(old_inode);
2612 + d_delete(dentry);
2613 + }
2614 + if (isdir)
2615 + dput(dentry);
2616 + }
2617 +
2618 + dput(whiteout);
2619 + dput(parent);
2620 + return err;
2621 +}
2622 +
2623 +int path_whiteout(struct path *dir_path, struct dentry *dentry, int isdir)
2624 +{
2625 + int error = mnt_want_write(dir_path->mnt);
2626 +
2627 + if (!error) {
2628 + error = vfs_whiteout(dir_path->dentry->d_inode, dentry, isdir);
2629 + mnt_drop_write(dir_path->mnt);
2630 + }
2631 +
2632 + return error;
2633 +}
2634 +EXPORT_SYMBOL(path_whiteout);
2635 +
2636 +/*
2637 + * This is abusing readdir to check if a union directory is logically empty.
2638 + * Al Viro barfed when he saw this, but Val said: "Well, at this point I'm
2639 + * aiming for working, pretty can come later"
2640 + */
2641 +static int filldir_is_empty(void *__buf, const char *name, int namlen,
2642 + loff_t offset, u64 ino, unsigned int d_type)
2643 +{
2644 + int *is_empty = (int *)__buf;
2645 +
2646 + switch (namlen) {
2647 + case 2:
2648 + if (name[1] != '.')
2649 + break;
2650 + case 1:
2651 + if (name[0] != '.')
2652 + break;
2653 + return 0;
2654 + }
2655 +
2656 + if (d_type == DT_WHT)
2657 + return 0;
2658 +
2659 + (*is_empty) = 0;
2660 + return 0;
2661 +}
2662 +
2663 +static int directory_is_empty(struct dentry *dentry, struct vfsmount *mnt)
2664 +{
2665 + struct file *file;
2666 + int err;
2667 + int is_empty = 1;
2668 +
2669 + BUG_ON(!S_ISDIR(dentry->d_inode->i_mode));
2670 +
2671 + /* references for the file pointer */
2672 + dget(dentry);
2673 + mntget(mnt);
2674 +
2675 + file = dentry_open(dentry, mnt, O_RDONLY, current_cred());
2676 + if (IS_ERR(file))
2677 + return 0;
2678 +
2679 + err = vfs_readdir(file, filldir_is_empty, &is_empty);
2680 +
2681 + fput(file);
2682 + return is_empty;
2683 +}
2684 +
2685 +static int do_whiteout(struct nameidata *nd, struct path *path, int isdir)
2686 +{
2687 + struct path safe = { .dentry = dget(nd->path.dentry),
2688 + .mnt = mntget(nd->path.mnt) };
2689 + struct dentry *dentry = path->dentry;
2690 + int err;
2691 +
2692 + err = may_whiteout(nd->path.dentry->d_inode, dentry, isdir);
2693 + if (err)
2694 + goto out;
2695 +
2696 + err = -ENOENT;
2697 + if (!dentry->d_inode)
2698 + goto out;
2699 +
2700 + err = -ENOTEMPTY;
2701 + if (isdir && !directory_is_empty(path->dentry, path->mnt))
2702 + goto out;
2703 +
2704 + if (nd->path.dentry != dentry->d_parent) {
2705 + dentry = __lookup_hash(&path->dentry->d_name, nd->path.dentry,
2706 + nd);
2707 + err = PTR_ERR(dentry);
2708 + if (IS_ERR(dentry))
2709 + goto out;
2710 +
2711 + dput(path->dentry);
2712 + if (path->mnt != safe.mnt)
2713 + mntput(path->mnt);
2714 + path->mnt = nd->path.mnt;
2715 + path->dentry = dentry;
2716 + }
2717 +
2718 + err = vfs_whiteout(nd->path.dentry->d_inode, dentry, isdir);
2719 +
2720 +out:
2721 + path_put(&safe);
2722 + return err;
2723 +}
2724 +
2725 /*
2726 * We try to drop the dentry early: we should have
2727 * a usage count of 2 if we're the only user of this
2728 @@ -2211,7 +3285,7 @@
2729 {
2730 int error = 0;
2731 char * name;
2732 - struct dentry *dentry;
2733 + struct path path;
2734 struct nameidata nd;
2735
2736 error = user_path_parent(dfd, pathname, &nd, &name);
2737 @@ -2233,21 +3307,24 @@
2738 nd.flags &= ~LOOKUP_PARENT;
2739
2740 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2741 - dentry = lookup_hash(&nd);
2742 - error = PTR_ERR(dentry);
2743 - if (IS_ERR(dentry))
2744 + error = hash_lookup_union(&nd, &nd.last, &path);
2745 + if (error)
2746 goto exit2;
2747 + if (is_unionized(nd.path.dentry, nd.path.mnt)) {
2748 + error = do_whiteout(&nd, &path, 1);
2749 + goto exit3;
2750 + }
2751 error = mnt_want_write(nd.path.mnt);
2752 if (error)
2753 goto exit3;
2754 - error = security_path_rmdir(&nd.path, dentry);
2755 + error = security_path_rmdir(&nd.path, path.dentry);
2756 if (error)
2757 goto exit4;
2758 - error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2759 + error = vfs_rmdir(nd.path.dentry->d_inode, path.dentry);
2760 exit4:
2761 mnt_drop_write(nd.path.mnt);
2762 exit3:
2763 - dput(dentry);
2764 + path_put_conditional(&path, &nd);
2765 exit2:
2766 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2767 exit1:
2768 @@ -2302,7 +3379,7 @@
2769 {
2770 int error;
2771 char *name;
2772 - struct dentry *dentry;
2773 + struct path path;
2774 struct nameidata nd;
2775 struct inode *inode = NULL;
2776
2777 @@ -2317,26 +3394,29 @@
2778 nd.flags &= ~LOOKUP_PARENT;
2779
2780 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2781 - dentry = lookup_hash(&nd);
2782 - error = PTR_ERR(dentry);
2783 - if (!IS_ERR(dentry)) {
2784 + error = hash_lookup_union(&nd, &nd.last, &path);
2785 + if (!error) {
2786 /* Why not before? Because we want correct error value */
2787 if (nd.last.name[nd.last.len])
2788 goto slashes;
2789 - inode = dentry->d_inode;
2790 + inode = path.dentry->d_inode;
2791 if (inode)
2792 atomic_inc(&inode->i_count);
2793 + if (is_unionized(nd.path.dentry, nd.path.mnt)) {
2794 + error = do_whiteout(&nd, &path, 0);
2795 + goto exit2;
2796 + }
2797 error = mnt_want_write(nd.path.mnt);
2798 if (error)
2799 goto exit2;
2800 - error = security_path_unlink(&nd.path, dentry);
2801 + error = security_path_unlink(&nd.path, path.dentry);
2802 if (error)
2803 goto exit3;
2804 - error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2805 + error = vfs_unlink(nd.path.dentry->d_inode, path.dentry);
2806 exit3:
2807 mnt_drop_write(nd.path.mnt);
2808 exit2:
2809 - dput(dentry);
2810 + path_put_conditional(&path, &nd);
2811 }
2812 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2813 if (inode)
2814 @@ -2347,8 +3427,8 @@
2815 return error;
2816
2817 slashes:
2818 - error = !dentry->d_inode ? -ENOENT :
2819 - S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2820 + error = !path.dentry->d_inode ? -ENOENT :
2821 + S_ISDIR(path.dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2822 goto exit2;
2823 }
2824
2825 @@ -2684,11 +3764,96 @@
2826 return error;
2827 }
2828
2829 +static int vfs_rename_union(struct nameidata *oldnd, struct path *old,
2830 + struct nameidata *newnd, struct path *new)
2831 +{
2832 + struct inode *old_dir = oldnd->path.dentry->d_inode;
2833 + struct inode *new_dir = newnd->path.dentry->d_inode;
2834 + struct qstr old_name;
2835 + char *name;
2836 + struct dentry *dentry;
2837 + int error;
2838 +
2839 + if (old->dentry->d_inode == new->dentry->d_inode)
2840 + return 0;
2841 + error = may_whiteout(old_dir, old->dentry, 0);
2842 + if (error)
2843 + return error;
2844 + if (!old_dir->i_op || !old_dir->i_op->whiteout)
2845 + return -EPERM;
2846 +
2847 + if (!new->dentry->d_inode)
2848 + error = may_create(new_dir, new->dentry);
2849 + else
2850 + error = may_delete(new_dir, new->dentry, 0);
2851 + if (error)
2852 + return error;
2853 +
2854 + vfs_dq_init(old_dir);
2855 + vfs_dq_init(new_dir);
2856 +
2857 + error = -EBUSY;
2858 + if (d_mountpoint(old->dentry) || d_mountpoint(new->dentry))
2859 + return error;
2860 +
2861 + error = -ENOMEM;
2862 + name = kmalloc(old->dentry->d_name.len + 1, GFP_KERNEL);
2863 + if (!name)
2864 + return error;
2865 + strncpy(name, old->dentry->d_name.name, old->dentry->d_name.len);
2866 + name[old->dentry->d_name.len] = 0;
2867 + old_name.len = old->dentry->d_name.len;
2868 + old_name.hash = old->dentry->d_name.hash;
2869 + old_name.name = name;
2870 +
2871 + /* possibly delete the existing new file */
2872 + if ((newnd->path.dentry == new->dentry->d_parent) &&
2873 + new->dentry->d_inode) {
2874 + /* FIXME: inode may be truncated while we hold a lock */
2875 + error = vfs_unlink(new_dir, new->dentry);
2876 + if (error)
2877 + goto freename;
2878 +
2879 + dentry = __lookup_hash(&new->dentry->d_name, newnd->path.dentry, newnd);
2880 + error = PTR_ERR(dentry);
2881 + if (IS_ERR(dentry))
2882 + goto freename;
2883 +
2884 + dput(new->dentry);
2885 + new->dentry = dentry;
2886 + }
2887 +
2888 + /* copyup to the new file */
2889 + error = __union_copyup(old, newnd, new);
2890 + if (error)
2891 + goto freename;
2892 +
2893 + /* whiteout the old file */
2894 + dentry = __lookup_hash(&old_name, oldnd->path.dentry, oldnd);
2895 + error = PTR_ERR(dentry);
2896 + if (IS_ERR(dentry))
2897 + goto freename;
2898 + error = vfs_whiteout(old_dir, dentry, 0);
2899 + dput(dentry);
2900 +
2901 + /* FIXME: This is actually unlink() && create() ... */
2902 +/*
2903 + if (!error) {
2904 + const char *new_name = old_dentry->d_name.name;
2905 + fsnotify_move(old_dir, new_dir, old_name.name, new_name, 0,
2906 + new_dentry->d_inode, old_dentry->d_inode);
2907 + }
2908 +*/
2909 +freename:
2910 + kfree(old_name.name);
2911 + return error;
2912 +}
2913 +
2914 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
2915 int, newdfd, const char __user *, newname)
2916 {
2917 struct dentry *old_dir, *new_dir;
2918 - struct dentry *old_dentry, *new_dentry;
2919 + struct path old, new;
2920 struct dentry *trap;
2921 struct nameidata oldnd, newnd;
2922 char *from;
2923 @@ -2722,16 +3887,28 @@
2924
2925 trap = lock_rename(new_dir, old_dir);
2926
2927 - old_dentry = lookup_hash(&oldnd);
2928 - error = PTR_ERR(old_dentry);
2929 - if (IS_ERR(old_dentry))
2930 + /*
2931 + * For union mounts we need to call a giant lookup_rename_source()
2932 + * instead.
2933 + * First lock_rename() and look on the topmost fs as you would in a
2934 + * normal rename; if you find something that is not a directory, go
2935 + * ahead, look up the target and do a normal rename.
2936 + * If you find a negative dentry, unlock_rename() and continue as
2937 + * _hash_lookup_union() would, without locking the topmost parent
2938 + * at the end. After that, lock_rename() the source parent and the
2939 + * target parent and do a copyup with an additional whiteout creation
2940 + * at the end.
2941 + */
2942 +// error = hash_lookup_union(&oldnd, &oldnd.last, &old);
2943 + error = lookup_rename_source(&oldnd, &newnd, &trap, &oldnd.last, &old);
2944 + if (error)
2945 goto exit3;
2946 /* source must exist */
2947 error = -ENOENT;
2948 - if (!old_dentry->d_inode)
2949 + if (!old.dentry->d_inode)
2950 goto exit4;
2951 /* unless the source is a directory trailing slashes give -ENOTDIR */
2952 - if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
2953 + if (!S_ISDIR(old.dentry->d_inode->i_mode)) {
2954 error = -ENOTDIR;
2955 if (oldnd.last.name[oldnd.last.len])
2956 goto exit4;
2957 @@ -2740,32 +3917,44 @@
2958 }
2959 /* source should not be ancestor of target */
2960 error = -EINVAL;
2961 - if (old_dentry == trap)
2962 + if (old.dentry == trap)
2963 goto exit4;
2964 - new_dentry = lookup_hash(&newnd);
2965 - error = PTR_ERR(new_dentry);
2966 - if (IS_ERR(new_dentry))
2967 + /* target is always on topmost fs, even with unions */
2968 + error = lookup_hash(&newnd, &newnd.last, &new);
2969 + if (error)
2970 goto exit4;
2971 /* target should not be an ancestor of source */
2972 error = -ENOTEMPTY;
2973 - if (new_dentry == trap)
2974 + if (new.dentry == trap)
2975 + goto exit5;
2976 + /* renaming of directories on unions is handled in user space */
2977 + error = -EXDEV;
2978 + if (is_unionized(oldnd.path.dentry, oldnd.path.mnt) &&
2979 + S_ISDIR(old.dentry->d_inode->i_mode))
2980 goto exit5;
2981 +// if (is_unionized(newnd.path.dentry, newnd.path.mnt))
2982 +// goto exit5;
2983
2984 error = mnt_want_write(oldnd.path.mnt);
2985 if (error)
2986 goto exit5;
2987 - error = security_path_rename(&oldnd.path, old_dentry,
2988 - &newnd.path, new_dentry);
2989 + error = security_path_rename(&oldnd.path, old.dentry,
2990 + &newnd.path, new.dentry);
2991 if (error)
2992 goto exit6;
2993 - error = vfs_rename(old_dir->d_inode, old_dentry,
2994 - new_dir->d_inode, new_dentry);
2995 + if (is_unionized(oldnd.path.dentry, oldnd.path.mnt) &&
2996 + (old.dentry->d_parent != oldnd.path.dentry)) {
2997 + error = vfs_rename_union(&oldnd, &old, &newnd, &new);
2998 + goto exit6;
2999 + }
3000 + error = vfs_rename(old_dir->d_inode, old.dentry,
3001 + new_dir->d_inode, new.dentry);
3002 exit6:
3003 mnt_drop_write(oldnd.path.mnt);
3004 exit5:
3005 - dput(new_dentry);
3006 + path_put_conditional(&new, &newnd);
3007 exit4:
3008 - dput(old_dentry);
3009 + path_put_conditional(&old, &oldnd);
3010 exit3:
3011 unlock_rename(new_dir, old_dir);
3012 exit2:
3013 --- a/fs/namespace.c
3014 +++ b/fs/namespace.c
3015 @@ -29,6 +29,7 @@
3016 #include <linux/log2.h>
3017 #include <linux/idr.h>
3018 #include <linux/fs_struct.h>
3019 +#include <linux/union.h>
3020 #include <asm/uaccess.h>
3021 #include <asm/unistd.h>
3022 #include "pnode.h"
3023 @@ -150,6 +151,9 @@
3024 INIT_LIST_HEAD(&mnt->mnt_share);
3025 INIT_LIST_HEAD(&mnt->mnt_slave_list);
3026 INIT_LIST_HEAD(&mnt->mnt_slave);
3027 +#ifdef CONFIG_UNION_MOUNT
3028 + INIT_LIST_HEAD(&mnt->mnt_unions);
3029 +#endif
3030 #ifdef CONFIG_SMP
3031 mnt->mnt_writers = alloc_percpu(int);
3032 if (!mnt->mnt_writers)
3033 @@ -469,6 +473,7 @@
3034
3035 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
3036 {
3037 + detach_mnt_union(mnt);
3038 old_path->dentry = mnt->mnt_mountpoint;
3039 old_path->mnt = mnt->mnt_parent;
3040 mnt->mnt_parent = mnt;
3041 @@ -492,6 +497,7 @@
3042 list_add_tail(&mnt->mnt_hash, mount_hashtable +
3043 hash(path->mnt, path->dentry));
3044 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
3045 + attach_mnt_union(mnt, path->mnt, path->dentry);
3046 }
3047
3048 /*
3049 @@ -514,6 +520,7 @@
3050 list_add_tail(&mnt->mnt_hash, mount_hashtable +
3051 hash(parent, mnt->mnt_mountpoint));
3052 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
3053 + attach_mnt_union(mnt, mnt->mnt_parent, mnt->mnt_mountpoint);
3054 touch_mnt_namespace(n);
3055 }
3056
3057 @@ -770,6 +777,7 @@
3058 { MNT_NODIRATIME, ",nodiratime" },
3059 { MNT_RELATIME, ",relatime" },
3060 { MNT_STRICTATIME, ",strictatime" },
3061 + { MNT_UNION, ",union" },
3062 { 0, NULL }
3063 };
3064 const struct proc_fs_info *fs_infop;
3065 @@ -984,6 +992,7 @@
3066 struct dentry *dentry;
3067 struct vfsmount *m;
3068 spin_lock(&vfsmount_lock);
3069 + detach_mnt_union(mnt);
3070 dentry = mnt->mnt_mountpoint;
3071 m = mnt->mnt_parent;
3072 mnt->mnt_mountpoint = mnt->mnt_root;
3073 @@ -1102,6 +1111,11 @@
3074 spin_unlock(&vfsmount_lock);
3075 if (retval)
3076 security_sb_umount_busy(mnt);
3077 + /* If this was a union mount, we are no longer a read-only
3078 + * user on the underlying mount */
3079 + if (mnt->mnt_flags & MNT_UNION)
3080 + mnt->mnt_parent->mnt_sb->s_readonly_users--;
3081 +
3082 up_write(&namespace_sem);
3083 release_mounts(&umount_list);
3084 return retval;
3085 @@ -1426,6 +1440,10 @@
3086 if (path->dentry != path->mnt->mnt_root)
3087 return -EINVAL;
3088
3089 + /* Don't change the type of union mounts */
3090 + if (IS_MNT_UNION(path->mnt))
3091 + return -EINVAL;
3092 +
3093 down_write(&namespace_sem);
3094 if (type == MS_SHARED) {
3095 err = invent_group_ids(mnt, recurse);
3096 @@ -1444,10 +1462,65 @@
3097 }
3098
3099 /*
3100 + * Mount-time check of upper and lower layer file systems to see if we
3101 + * can union mount one on the other.
3102 + *
3103 + * Union mounts must follow these rules:
3104 + *
3105 + * - The lower layer must be read-only. This avoids lots of nasty
3106 + * unsolvable races where file system structures disappear suddenly.
3107 + * XXX - Checking the vfsmnt for read-only is a temporary hack; the
3108 + * file system could be mounted read-write elsewhere. We need to
3109 + * enforce read-only at the superblock level (patches coming).
3110 + *
3111 + * - The upper layer must be writable. This isn't an absolute
3112 + * requirement; right now we need it to make readdir() work since we
3113 + * copy up directory entries to the top level. A possible
3114 + * workaround is to mount a tmpfs file system transparently over the
3115 + * top.
3116 + *
3117 + * - The upper layer must support whiteouts and fallthrus (if it is
3118 + * writable).
3119 + *
3120 + * - The lower layer must not also be a union mount. This is just to
3121 + * make life simpler for now, there is no inherent limitation on the
3122 + * number of layers.
3123 + *
3124 + * XXX - Check other mount flags for incompatibilities - I'm sure
3125 + * there are some.
3126 + */
3127 +
3128 +static int
3129 +check_union_mnt(struct path *mntpnt, struct vfsmount *top_mnt, int mnt_flags)
3130 +{
3131 + struct vfsmount *lower_mnt = mntpnt->mnt;
3132 +
3133 + /* Is this even a union mount? */
3134 + if (!(mnt_flags & MNT_UNION))
3135 + return 0;
3136 +
3137 + /* Lower layer must be read-only and not a union mount */
3138 + if (!(lower_mnt->mnt_sb->s_flags & MS_RDONLY) ||
3139 + (lower_mnt->mnt_flags & MNT_UNION))
3140 + return -EBUSY;
3141 +
3142 + /* Upper layer must be writable */
3143 + if (mnt_flags & MNT_READONLY)
3144 + return -EROFS;
3145 +
3146 + /* Upper layer must support whiteouts and fallthrus */
3147 + if (!(top_mnt->mnt_sb->s_flags & MS_WHITEOUT))
3148 + return -EINVAL;
3149 +
3150 + /* All good! */
3151 + return 0;
3152 +}
3153 +
3154 +/*
3155 * do loopback mount.
3156 */
3157 -static int do_loopback(struct path *path, char *old_name,
3158 - int recurse)
3159 +static int do_loopback(struct path *path, char *old_name, int recurse,
3160 + int mnt_flags)
3161 {
3162 struct path old_path;
3163 struct vfsmount *mnt = NULL;
3164 @@ -1477,6 +1550,13 @@
3165 if (!mnt)
3166 goto out;
3167
3168 + err = check_union_mnt(&old_path, mnt, mnt_flags);
3169 + if (err)
3170 + goto out;
3171 +
3172 + if (mnt_flags & MNT_UNION)
3173 + mnt->mnt_flags |= MNT_UNION;
3174 +
3175 err = graft_tree(mnt, path);
3176 if (err) {
3177 LIST_HEAD(umount_list);
3178 @@ -1486,6 +1566,10 @@
3179 release_mounts(&umount_list);
3180 }
3181
3182 + /* If this is a union mount, add ourselves to the readonly users */
3183 + if (mnt_flags & MNT_UNION)
3184 + mnt->mnt_parent->mnt_sb->s_readonly_users++;
3185 +
3186 out:
3187 up_write(&namespace_sem);
3188 path_put(&old_path);
3189 @@ -1570,6 +1654,13 @@
3190 if (err)
3191 return err;
3192
3193 + /* moving to or from a union mount is not supported */
3194 + err = -EINVAL;
3195 + if (IS_MNT_UNION(path->mnt))
3196 + goto exit;
3197 + if (IS_MNT_UNION(old_path.mnt))
3198 + goto exit;
3199 +
3200 down_write(&namespace_sem);
3201 while (d_mountpoint(path->dentry) &&
3202 follow_down(path))
3203 @@ -1627,6 +1718,7 @@
3204 up_write(&namespace_sem);
3205 if (!err)
3206 path_put(&parent_path);
3207 +exit:
3208 path_put(&old_path);
3209 return err;
3210 }
3211 @@ -1684,10 +1776,18 @@
3212 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
3213 goto unlock;
3214
3215 + err = check_union_mnt(path, newmnt, mnt_flags);
3216 + if (err)
3217 + goto unlock;
3218 +
3219 newmnt->mnt_flags = mnt_flags;
3220 if ((err = graft_tree(newmnt, path)))
3221 goto unlock;
3222
3223 + /* If this is a union mount, add ourselves to the readonly users */
3224 + if (mnt_flags & MNT_UNION)
3225 + newmnt->mnt_parent->mnt_sb->s_readonly_users++;
3226 +
3227 if (fslist) /* add to the specified expiration list */
3228 list_add_tail(&newmnt->mnt_expire, fslist);
3229
3230 @@ -1925,10 +2025,12 @@
3231 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
3232 if (flags & MS_RDONLY)
3233 mnt_flags |= MNT_READONLY;
3234 + if (flags & MS_UNION)
3235 + mnt_flags |= MNT_UNION;
3236
3237 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
3238 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
3239 - MS_STRICTATIME);
3240 + MS_STRICTATIME | MS_UNION);
3241
3242 /* ... and get the mountpoint */
3243 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
3244 @@ -1944,7 +2046,8 @@
3245 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
3246 data_page);
3247 else if (flags & MS_BIND)
3248 - retval = do_loopback(&path, dev_name, flags & MS_REC);
3249 + retval = do_loopback(&path, dev_name, flags & MS_REC,
3250 + mnt_flags);
3251 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
3252 retval = do_change_type(&path, flags);
3253 else if (flags & MS_MOVE)
3254 @@ -2179,6 +2282,8 @@
3255 if (d_unlinked(old.dentry))
3256 goto out2;
3257 error = -EBUSY;
3258 + follow_union_down(&new);
3259 + follow_union_down(&root);
3260 if (new.mnt == root.mnt ||
3261 old.mnt == root.mnt)
3262 goto out2; /* loop, on the same file system */
3263 --- a/fs/nfsctl.c
3264 +++ b/fs/nfsctl.c
3265 @@ -38,10 +38,10 @@
3266 return ERR_PTR(error);
3267
3268 if (flags == O_RDWR)
3269 - error = may_open(&nd.path, MAY_READ|MAY_WRITE,
3270 - FMODE_READ|FMODE_WRITE);
3271 + error = may_open(&nd, MAY_READ|MAY_WRITE,
3272 + FMODE_READ|FMODE_WRITE);
3273 else
3274 - error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
3275 + error = may_open(&nd, MAY_WRITE, FMODE_WRITE);
3276
3277 if (!error)
3278 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
3279 --- a/fs/nfsd/nfs3xdr.c
3280 +++ b/fs/nfsd/nfs3xdr.c
3281 @@ -884,6 +884,11 @@
3282 int elen; /* estimated entry length in words */
3283 int num_entry_words = 0; /* actual number of words */
3284
3285 + if (d_type == DT_WHT) {
3286 + cd->common.err = nfs_ok;
3287 + return 0;
3288 + }
3289 +
3290 if (cd->offset) {
3291 u64 offset64 = offset;
3292
3293 --- a/fs/nfsd/nfs4xdr.c
3294 +++ b/fs/nfsd/nfs4xdr.c
3295 @@ -2263,7 +2263,7 @@
3296 __be32 nfserr = nfserr_toosmall;
3297
3298 /* In nfsv4, "." and ".." never make it onto the wire.. */
3299 - if (name && isdotent(name, namlen)) {
3300 + if (d_type == DT_WHT || (name && isdotent(name, namlen))) {
3301 cd->common.err = nfs_ok;
3302 return 0;
3303 }
3304 --- a/fs/nfsd/nfsxdr.c
3305 +++ b/fs/nfsd/nfsxdr.c
3306 @@ -513,6 +513,10 @@
3307 namlen, name, offset, ino);
3308 */
3309
3310 + if (d_type == DT_WHT) {
3311 + cd->common.err = nfs_ok;
3312 + return 0;
3313 + }
3314 if (offset > ~((u32) 0)) {
3315 cd->common.err = nfserr_fbig;
3316 return -EINVAL;
3317 --- a/fs/open.c
3318 +++ b/fs/open.c
3319 @@ -30,6 +30,7 @@
3320 #include <linux/audit.h>
3321 #include <linux/falloc.h>
3322 #include <linux/fs_struct.h>
3323 +#include <linux/union.h>
3324
3325 int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
3326 {
3327 @@ -222,69 +223,69 @@
3328 return err;
3329 }
3330
3331 -static long do_sys_truncate(const char __user *pathname, loff_t length)
3332 +static int __do_ftruncate(struct file *file, unsigned long length, int small)
3333 {
3334 - struct path path;
3335 - struct inode *inode;
3336 + struct inode * inode;
3337 + struct dentry *dentry;
3338 int error;
3339
3340 error = -EINVAL;
3341 - if (length < 0) /* sorry, but loff_t says... */
3342 + if (length < 0)
3343 goto out;
3344 + /* explicitly opened as large or we are on 64-bit box */
3345 + if (file->f_flags & O_LARGEFILE)
3346 + small = 0;
3347
3348 - error = user_path(pathname, &path);
3349 - if (error)
3350 + dentry = file->f_path.dentry;
3351 + inode = dentry->d_inode;
3352 + error = -EINVAL;
3353 + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
3354 goto out;
3355 - inode = path.dentry->d_inode;
3356 -
3357 - /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
3358 - error = -EISDIR;
3359 - if (S_ISDIR(inode->i_mode))
3360 - goto dput_and_out;
3361
3362 error = -EINVAL;
3363 - if (!S_ISREG(inode->i_mode))
3364 - goto dput_and_out;
3365 -
3366 - error = mnt_want_write(path.mnt);
3367 - if (error)
3368 - goto dput_and_out;
3369 + /* Cannot ftruncate over 2^31 bytes without large file support */
3370 + if (small && length > MAX_NON_LFS)
3371
3372 - error = inode_permission(inode, MAY_WRITE);
3373 - if (error)
3374 - goto mnt_drop_write_and_out;
3375 + goto out;
3376
3377 error = -EPERM;
3378 if (IS_APPEND(inode))
3379 - goto mnt_drop_write_and_out;
3380 + goto out;
3381
3382 - error = get_write_access(inode);
3383 - if (error)
3384 - goto mnt_drop_write_and_out;
3385 + error = locks_verify_truncate(inode, file, length);
3386 + if (!error)
3387 + error = security_path_truncate(&file->f_path, length,
3388 + ATTR_MTIME|ATTR_CTIME);
3389 + if (!error)
3390 + /* Already copied up for union, opened with write */
3391 + error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
3392 +out:
3393 + return error;
3394 +}
3395
3396 - /*
3397 - * Make sure that there are no leases. get_write_access() protects
3398 - * against the truncate racing with a lease-granting setlease().
3399 - */
3400 - error = break_lease(inode, FMODE_WRITE);
3401 - if (error)
3402 - goto put_write_and_out;
3403 +static long do_sys_truncate(const char __user *pathname, loff_t length)
3404 +{
3405 + struct file *file;
3406 + char *tmp;
3407 + int error;
3408
3409 - error = locks_verify_truncate(inode, NULL, length);
3410 - if (!error)
3411 - error = security_path_truncate(&path, length, 0);
3412 - if (!error) {
3413 - vfs_dq_init(inode);
3414 - error = do_truncate(path.dentry, length, 0, NULL);
3415 - }
3416 + error = -EINVAL;
3417 + if (length < 0) /* sorry, but loff_t says... */
3418 + return error;
3419
3420 -put_write_and_out:
3421 - put_write_access(inode);
3422 -mnt_drop_write_and_out:
3423 - mnt_drop_write(path.mnt);
3424 -dput_and_out:
3425 - path_put(&path);
3426 -out:
3427 + tmp = getname(pathname);
3428 + if (IS_ERR(tmp))
3429 + return PTR_ERR(tmp);
3430 +
3431 + file = filp_open(tmp, O_RDWR | O_LARGEFILE, 0);
3432 + putname(tmp);
3433 +
3434 + if (IS_ERR(file))
3435 + return PTR_ERR(file);
3436 +
3437 + error = __do_ftruncate(file, length, 0);
3438 +
3439 + fput(file);
3440 return error;
3441 }
3442
3443 @@ -296,45 +297,16 @@
3444
3445 static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
3446 {
3447 - struct inode * inode;
3448 - struct dentry *dentry;
3449 struct file * file;
3450 int error;
3451
3452 - error = -EINVAL;
3453 - if (length < 0)
3454 - goto out;
3455 error = -EBADF;
3456 file = fget(fd);
3457 if (!file)
3458 goto out;
3459
3460 - /* explicitly opened as large or we are on 64-bit box */
3461 - if (file->f_flags & O_LARGEFILE)
3462 - small = 0;
3463 -
3464 - dentry = file->f_path.dentry;
3465 - inode = dentry->d_inode;
3466 - error = -EINVAL;
3467 - if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
3468 - goto out_putf;
3469 -
3470 - error = -EINVAL;
3471 - /* Cannot ftruncate over 2^31 bytes without large file support */
3472 - if (small && length > MAX_NON_LFS)
3473 - goto out_putf;
3474 + error = __do_ftruncate(file, length, small);
3475
3476 - error = -EPERM;
3477 - if (IS_APPEND(inode))
3478 - goto out_putf;
3479 -
3480 - error = locks_verify_truncate(inode, file, length);
3481 - if (!error)
3482 - error = security_path_truncate(&file->f_path, length,
3483 - ATTR_MTIME|ATTR_CTIME);
3484 - if (!error)
3485 - error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
3486 -out_putf:
3487 fput(file);
3488 out:
3489 return error;
3490 @@ -493,7 +465,8 @@
3491 goto out_path_release;
3492 }
3493
3494 - res = inode_permission(inode, mode | MAY_ACCESS);
3495 + res = union_permission(&path, mode | MAY_ACCESS);
3496 +
3497 /* SuS v2 requires we report a read only fs too */
3498 if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
3499 goto out_path_release;
3500 @@ -507,7 +480,8 @@
3501 * inherently racy and know that the fs may change
3502 * state before we even see this result.
3503 */
3504 - if (__mnt_is_readonly(path.mnt))
3505 + if ((!is_unionized(path.dentry, path.mnt) &&
3506 + (__mnt_is_readonly(path.mnt))))
3507 res = -EROFS;
3508
3509 out_path_release:
3510 @@ -553,20 +527,19 @@
3511 error = -EBADF;
3512 file = fget(fd);
3513 if (!file)
3514 - goto out;
3515 + return error;
3516
3517 inode = file->f_path.dentry->d_inode;
3518
3519 error = -ENOTDIR;
3520 if (!S_ISDIR(inode->i_mode))
3521 - goto out_putf;
3522 + goto out;
3523
3524 error = inode_permission(inode, MAY_EXEC | MAY_ACCESS);
3525 if (!error)
3526 set_fs_pwd(current->fs, &file->f_path);
3527 -out_putf:
3528 - fput(file);
3529 out:
3530 + fput(file);
3531 return error;
3532 }
3533
3534 --- a/fs/readdir.c
3535 +++ b/fs/readdir.c
3536 @@ -16,6 +16,7 @@
3537 #include <linux/security.h>
3538 #include <linux/syscalls.h>
3539 #include <linux/unistd.h>
3540 +#include <linux/union.h>
3541
3542 #include <asm/uaccess.h>
3543
3544 @@ -36,9 +37,24 @@
3545
3546 res = -ENOENT;
3547 if (!IS_DEADDIR(inode)) {
3548 + /*
3549 + * XXX Think harder about locking for
3550 + * union_copyup_dir. Currently we lock the topmost
3551 + * directory and hold that lock while sequentially
3552 + * acquiring and dropping locks for the directories
3553 + * below this one in the union stack.
3554 + */
3555 + if (is_unionized(file->f_path.dentry, file->f_path.mnt) &&
3556 + !IS_OPAQUE(inode)) {
3557 + res = union_copyup_dir(&file->f_path);
3558 + if (res)
3559 + goto out_unlock;
3560 + }
3561 +
3562 res = file->f_op->readdir(file, buf, filler);
3563 file_accessed(file);
3564 }
3565 +out_unlock:
3566 mutex_unlock(&inode->i_mutex);
3567 out:
3568 return res;
3569 @@ -77,6 +93,9 @@
3570 struct old_linux_dirent __user * dirent;
3571 unsigned long d_ino;
3572
3573 + if (d_type == DT_WHT)
3574 + return 0;
3575 +
3576 if (buf->result)
3577 return -EINVAL;
3578 d_ino = ino;
3579 @@ -154,6 +173,9 @@
3580 unsigned long d_ino;
3581 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(long));
3582
3583 + if (d_type == DT_WHT)
3584 + return 0;
3585 +
3586 buf->error = -EINVAL; /* only used if we fail.. */
3587 if (reclen > buf->count)
3588 return -EINVAL;
3589 @@ -239,6 +261,9 @@
3590 struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
3591 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(u64));
3592
3593 + if (d_type == DT_WHT)
3594 + return 0;
3595 +
3596 buf->error = -EINVAL; /* only used if we fail.. */
3597 if (reclen > buf->count)
3598 return -EINVAL;
3599 --- a/fs/super.c
3600 +++ b/fs/super.c
3601 @@ -553,6 +553,15 @@
3602 }
3603 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
3604
3605 + /* If we are remounting read/write, make sure that none of the
3606 + users require read-only for correct operation (such as
3607 + union mounts). */
3608 + if (remount_rw && sb->s_readonly_users) {
3609 + printk(KERN_INFO "%s: In use by %d read-only user(s)\n",
3610 + sb->s_id, sb->s_readonly_users);
3611 + return -EROFS;
3612 + }
3613 +
3614 if (sb->s_op->remount_fs) {
3615 retval = sb->s_op->remount_fs(sb, &flags, data);
3616 if (retval)
3617 @@ -889,6 +898,11 @@
3618 if (error)
3619 goto out_sb;
3620
3621 + error = -EROFS;
3622 + if (!(flags & MS_RDONLY) &&
3623 + (mnt->mnt_sb->s_readonly_users))
3624 + goto out_sb;
3625 +
3626 mnt->mnt_mountpoint = mnt->mnt_root;
3627 mnt->mnt_parent = mnt;
3628 up_write(&mnt->mnt_sb->s_umount);
3629 --- /dev/null
3630 +++ b/fs/union.c
3631 @@ -0,0 +1,981 @@
3632 +/*
3633 + * VFS based union mount for Linux
3634 + *
3635 + * Copyright (C) 2004-2007 IBM Corporation, IBM Deutschland Entwicklung GmbH.
3636 + * Copyright (C) 2007-2009 Novell Inc.
3637 + *
3638 + * Author(s): Jan Blunck (j.blunck@tu-harburg.de)
3639 + * Valerie Aurora <vaurora@redhat.com>
3640 + *
3641 + * This program is free software; you can redistribute it and/or modify it
3642 + * under the terms of the GNU General Public License as published by the Free
3643 + * Software Foundation; either version 2 of the License, or (at your option)
3644 + * any later version.
3645 + */
3646 +
3647 +#include <linux/bootmem.h>
3648 +#include <linux/init.h>
3649 +#include <linux/module.h>
3650 +#include <linux/types.h>
3651 +#include <linux/hash.h>
3652 +#include <linux/fs.h>
3653 +#include <linux/mount.h>
3654 +#include <linux/fs_struct.h>
3655 +#include <linux/union.h>
3656 +#include <linux/namei.h>
3657 +#include <linux/file.h>
3658 +#include <linux/mm.h>
3659 +#include <linux/quotaops.h>
3660 +#include <linux/dnotify.h>
3661 +#include <linux/security.h>
3662 +#include <linux/pipe_fs_i.h>
3663 +#include <linux/splice.h>
3664 +
3665 +/*
3666 + * This is borrowed from fs/inode.c. The hashtable for lookups. Somebody
3667 + * should try to make this good - I've just made it work.
3668 + */
3669 +static unsigned int union_hash_mask __read_mostly;
3670 +static unsigned int union_hash_shift __read_mostly;
3671 +static struct hlist_head *union_hashtable __read_mostly;
3672 +static unsigned int union_rhash_mask __read_mostly;
3673 +static unsigned int union_rhash_shift __read_mostly;
3674 +static struct hlist_head *union_rhashtable __read_mostly;
3675 +
3676 +/*
3677 + * Locking Rules:
3678 + * - dcache_lock (for union_rlookup() only)
3679 + * - union_lock
3680 + */
3681 +DEFINE_SPINLOCK(union_lock);
3682 +
3683 +static struct kmem_cache *union_cache __read_mostly;
3684 +
3685 +static unsigned long hash(struct dentry *dentry, struct vfsmount *mnt)
3686 +{
3687 + unsigned long tmp;
3688 +
3689 + tmp = ((unsigned long)mnt * (unsigned long)dentry) ^
3690 + (GOLDEN_RATIO_PRIME + (unsigned long)mnt) / L1_CACHE_BYTES;
3691 + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> union_hash_shift);
3692 + return tmp & union_hash_mask;
3693 +}
3694 +
3695 +static __initdata unsigned long union_hash_entries;
3696 +
3697 +static int __init set_union_hash_entries(char *str)
3698 +{
3699 + if (!str)
3700 + return 0;
3701 + union_hash_entries = simple_strtoul(str, &str, 0);
3702 + return 1;
3703 +}
3704 +
3705 +__setup("union_hash_entries=", set_union_hash_entries);
3706 +
3707 +static int __init init_union(void)
3708 +{
3709 + int loop;
3710 +
3711 + union_cache = KMEM_CACHE(union_mount, SLAB_PANIC | SLAB_MEM_SPREAD);
3712 + union_hashtable = alloc_large_system_hash("Union-cache",
3713 + sizeof(struct hlist_head),
3714 + union_hash_entries,
3715 + 14,
3716 + 0,
3717 + &union_hash_shift,
3718 + &union_hash_mask,
3719 + 0);
3720 +
3721 + for (loop = 0; loop < (1 << union_hash_shift); loop++)
3722 + INIT_HLIST_HEAD(&union_hashtable[loop]);
3723 +
3724 +
3725 + union_rhashtable = alloc_large_system_hash("rUnion-cache",
3726 + sizeof(struct hlist_head),
3727 + union_hash_entries,
3728 + 14,
3729 + 0,
3730 + &union_rhash_shift,
3731 + &union_rhash_mask,
3732 + 0);
3733 +
3734 + for (loop = 0; loop < (1 << union_rhash_shift); loop++)
3735 + INIT_HLIST_HEAD(&union_rhashtable[loop]);
3736 +
3737 + return 0;
3738 +}
3739 +
3740 +fs_initcall(init_union);
3741 +
3742 +struct union_mount *union_alloc(struct dentry *this, struct vfsmount *this_mnt,
3743 + struct dentry *next, struct vfsmount *next_mnt)
3744 +{
3745 + struct union_mount *um;
3746 +
3747 + BUG_ON(!S_ISDIR(this->d_inode->i_mode));
3748 + BUG_ON(!S_ISDIR(next->d_inode->i_mode));
3749 +
3750 + um = kmem_cache_alloc(union_cache, GFP_ATOMIC);
3751 + if (!um)
3752 + return NULL;
3753 +
3754 + atomic_set(&um->u_count, 1);
3755 + INIT_LIST_HEAD(&um->u_unions);
3756 + INIT_LIST_HEAD(&um->u_list);
3757 + INIT_HLIST_NODE(&um->u_hash);
3758 + INIT_HLIST_NODE(&um->u_rhash);
3759 +
3760 + um->u_this.mnt = this_mnt;
3761 + um->u_this.dentry = this;
3762 + um->u_next.mnt = mntget(next_mnt);
3763 + um->u_next.dentry = dget(next);
3764 +
3765 + return um;
3766 +}
3767 +
3768 +struct union_mount *union_get(struct union_mount *um)
3769 +{
3770 + BUG_ON(!atomic_read(&um->u_count));
3771 + atomic_inc(&um->u_count);
3772 + return um;
3773 +}
3774 +
3775 +static int __union_put(struct union_mount *um)
3776 +{
3777 + if (!atomic_dec_and_test(&um->u_count))
3778 + return 0;
3779 +
3780 + BUG_ON(!hlist_unhashed(&um->u_hash));
3781 + BUG_ON(!hlist_unhashed(&um->u_rhash));
3782 +
3783 + kmem_cache_free(union_cache, um);
3784 + return 1;
3785 +}
3786 +
3787 +void union_put(struct union_mount *um)
3788 +{
3789 + struct path tmp = um->u_next;
3790 +
3791 + if (__union_put(um))
3792 + path_put(&tmp);
3793 +}
3794 +
3795 +static void __union_hash(struct union_mount *um)
3796 +{
3797 + hlist_add_head(&um->u_hash, union_hashtable +
3798 + hash(um->u_this.dentry, um->u_this.mnt));
3799 + hlist_add_head(&um->u_rhash, union_rhashtable +
3800 + hash(um->u_next.dentry, um->u_next.mnt));
3801 +}
3802 +
3803 +static void __union_unhash(struct union_mount *um)
3804 +{
3805 + hlist_del_init(&um->u_hash);
3806 + hlist_del_init(&um->u_rhash);
3807 +}
3808 +
3809 +struct union_mount *union_lookup(struct dentry *dentry, struct vfsmount *mnt)
3810 +{
3811 + struct hlist_head *head = union_hashtable + hash(dentry, mnt);
3812 + struct hlist_node *node;
3813 + struct union_mount *um;
3814 +
3815 + hlist_for_each_entry(um, node, head, u_hash) {
3816 + if ((um->u_this.dentry == dentry) &&
3817 + (um->u_this.mnt == mnt))
3818 + return um;
3819 + }
3820 +
3821 + return NULL;
3822 +}
3823 +
3824 +struct union_mount *union_rlookup(struct dentry *dentry, struct vfsmount *mnt)
3825 +{
3826 + struct hlist_head *head = union_rhashtable + hash(dentry, mnt);
3827 + struct hlist_node *node;
3828 + struct union_mount *um;
3829 +
3830 + hlist_for_each_entry(um, node, head, u_rhash) {
3831 + if ((um->u_next.dentry == dentry) &&
3832 + (um->u_next.mnt == mnt))
3833 + return um;
3834 + }
3835 +
3836 + return NULL;
3837 +}
3838 +
3839 +/*
3840 + * is_unionized - check if a dentry lives on a union mounted file system
3841 + *
3842 + * This tests whether a dentry lives on a union mounted file system by walking
3843 + * the file system hierarchy.
3844 + */
3845 +int is_unionized(struct dentry *dentry, struct vfsmount *mnt)
3846 +{
3847 + struct path this = { .mnt = mntget(mnt),
3848 + .dentry = dget(dentry) };
3849 + struct vfsmount *tmp;
3850 +
3851 + do {
3852 + /* check if there is an union mounted on top of us */
3853 + spin_lock(&vfsmount_lock);
3854 + list_for_each_entry(tmp, &this.mnt->mnt_mounts, mnt_child) {
3855 + if (!(tmp->mnt_flags & MNT_UNION))
3856 + continue;
3857 + /* Isn't this a bug? */
3858 + if (this.dentry->d_sb != tmp->mnt_mountpoint->d_sb)
3859 + continue;
3860 + if (is_subdir(this.dentry, tmp->mnt_mountpoint)) {
3861 + spin_unlock(&vfsmount_lock);
3862 + path_put(&this);
3863 + return 1;
3864 + }
3865 + }
3866 + spin_unlock(&vfsmount_lock);
3867 +
3868 + /* check our mountpoint next */
3869 + tmp = mntget(this.mnt->mnt_parent);
3870 + dput(this.dentry);
3871 + this.dentry = dget(this.mnt->mnt_mountpoint);
3872 + mntput(this.mnt);
3873 + this.mnt = tmp;
3874 + } while (this.mnt != this.mnt->mnt_parent);
3875 +
3876 + path_put(&this);
3877 + return 0;
3878 +}
3879 +
3880 +int append_to_union(struct vfsmount *mnt, struct dentry *dentry,
3881 + struct vfsmount *dest_mnt, struct dentry *dest_dentry)
3882 +{
3883 + struct union_mount *this, *um;
3884 +
3885 + BUG_ON(!IS_MNT_UNION(mnt));
3886 +
3887 + this = union_alloc(dentry, mnt, dest_dentry, dest_mnt);
3888 + if (!this)
3889 + return -ENOMEM;
3890 +
3891 + spin_lock(&union_lock);
3892 + um = union_lookup(dentry, mnt);
3893 + if (um) {
3894 + BUG_ON((um->u_next.dentry != dest_dentry) ||
3895 + (um->u_next.mnt != dest_mnt));
3896 + spin_unlock(&union_lock);
3897 + union_put(this);
3898 + return 0;
3899 + }
3900 + list_add(&this->u_list, &mnt->mnt_unions);
3901 + list_add(&this->u_unions, &dentry->d_unions);
3902 + dest_dentry->d_unionized++;
3903 + __union_hash(this);
3904 + spin_unlock(&union_lock);
3905 + return 0;
3906 +}
3907 +
3908 +/*
3909 + * follow_union_down - follow the union stack one layer down
3910 + *
3911 + * This is called to traverse the union stack from one layer to the next
3912 + * overlaid one. follow_union_down() is called by various lookup functions
3913 + * that are aware of union mounts.
3914 + *
3915 + * Returns non-zero if followed to the next layer, zero otherwise.
3916 + */
3917 +int follow_union_down(struct path *path)
3918 +{
3919 + struct union_mount *um;
3920 +
3921 + if (!IS_MNT_UNION(path->mnt))
3922 + return 0;
3923 +
3924 + spin_lock(&union_lock);
3925 + um = union_lookup(path->dentry, path->mnt);
3926 + spin_unlock(&union_lock);
3927 + if (um) {
3928 + path_get(&um->u_next);
3929 + dput(path->dentry);
3930 + path->dentry = um->u_next.dentry;
3931 + mntput(path->mnt);
3932 + path->mnt = um->u_next.mnt;
3933 + return 1;
3934 + }
3935 + return 0;
3936 +}
3937 +
3938 +/*
3939 + * follow_union_mount - follow the union stack to the topmost layer
3940 + *
3941 + * This is called to traverse the union stack to the topmost layer. This is
3942 + * necessary for following parent pointers in a union mount.
3943 + *
3944 + * Returns non-zero if followed to the topmost layer, zero otherwise.
3945 + */
3946 +int follow_union_mount(struct path *path)
3947 +{
3948 + struct union_mount *um;
3949 + int res = 0;
3950 +
3951 + while (IS_UNION(path->dentry)) {
3952 + spin_lock(&dcache_lock);
3953 + spin_lock(&union_lock);
3954 + um = union_rlookup(path->dentry, path->mnt);
3955 + if (um)
3956 + path_get(&um->u_this);
3957 + spin_unlock(&union_lock);
3958 + spin_unlock(&dcache_lock);
3959 +
3960 + /*
3961 + * Q: Aaargh, how do I validate the topmost dentry pointer?
3962 + * A: Eeeeasy! We took the dcache_lock and union_lock. Since
3963 + * this protects from any dput'ng going on, we know that the
3964 + * dentry is valid since the union is unhashed under
3965 + * dcache_lock too.
3966 + */
3967 + if (!um)
3968 + break;
3969 + dput(path->dentry);
3970 + path->dentry = um->u_this.dentry;
3971 + mntput(path->mnt);
3972 + path->mnt = um->u_this.mnt;
3973 + res = 1;
3974 + }
3975 +
3976 + return res;
3977 +}
3978 +
3979 +/*
3980 + * Union mount copyup support
3981 + */
3982 +
3983 +extern int hash_lookup_union(struct nameidata *, struct qstr *, struct path *);
3984 +extern void follow_mount(struct path *);
3985 +
3986 +/*
3987 + * union_relookup_topmost - lookup and create the topmost path to dentry
3988 + * @nd: pointer to nameidata
3989 + * @flags: lookup flags
3990 + */
3991 +static int union_relookup_topmost(struct nameidata *nd, int flags)
3992 +{
3993 + int err;
3994 + char *kbuf, *name;
3995 + struct nameidata this;
3996 +
3997 + kbuf = (char *)__get_free_page(GFP_KERNEL);
3998 + if (!kbuf)
3999 + return -ENOMEM;
4000 +
4001 + name = d_path(&nd->path, kbuf, PAGE_SIZE);
4002 + err = PTR_ERR(name);
4003 + if (IS_ERR(name))
4004 + goto free_page;
4005 +
4006 + err = path_lookup(name, flags|LOOKUP_CREATE|LOOKUP_TOPMOST, &this);
4007 + if (err)
4008 + goto free_page;
4009 +
4010 + path_put(&nd->path);
4011 + nd->path.dentry = this.path.dentry;
4012 + nd->path.mnt = this.path.mnt;
4013 +
4014 + /*
4015 + * the nd->flags should be unchanged
4016 + */
4017 + BUG_ON(this.um_flags & LAST_LOWLEVEL);
4018 + nd->um_flags &= ~LAST_LOWLEVEL;
4019 + free_page:
4020 + free_page((unsigned long)kbuf);
4021 + return err;
4022 +}
4023 +
4024 +static void __update_fs_pwd(struct path *path, struct dentry *dentry,
4025 + struct vfsmount *mnt)
4026 +{
4027 + struct path old = { NULL, NULL };
4028 +
4029 + write_lock(&current->fs->lock);
4030 + if (current->fs->pwd.dentry == path->dentry) {
4031 + old = current->fs->pwd;
4032 + path_get(&current->fs->pwd);
4033 + }
4034 + write_unlock(&current->fs->lock);
4035 +
4036 + if (old.dentry)
4037 + path_put(&old);
4038 +
4039 + return;
4040 +}
4041 +
4042 +/**
4043 + * union_permission - check for access rights to a given inode
4044 + * @inode: inode to check permission on
4045 + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
4046 + *
4047 + * In a union mount, the top layer is always read-write and the bottom
4048 + * is always read-only. Ignore the read-only flag on the lower fs.
4049 + *
4050 + * Only needed for certain activities, such as checking whether write
4051 + * access is OK.
4052 + */
4053 +
4054 +int union_permission(struct path *path, int mask)
4055 +{
4056 + struct inode *inode = path->dentry->d_inode;
4057 +
4058 + if (!is_unionized(path->dentry, path->mnt))
4059 + return inode_permission(inode, mask);
4060 +
4061 + /* Tell __inode_permission to ignore MS_RDONLY */
4062 + return __inode_permission(inode, mask, 0);
4063 +}
4064 +
4065 +/*
4066 + * union_create_topmost - create the topmost path component
4067 + * @nd: pointer to nameidata of the base directory
4068 + * @name: pointer to file name
4069 + * @path: pointer to path of the overlaid file
4070 + *
4071 + * This is called by __link_path_walk() to create the directories on a path
4072 + * when it is called with LOOKUP_TOPMOST.
4073 + */
4074 +struct dentry *union_create_topmost(struct nameidata *nd, struct qstr *name,
4075 + struct path *path)
4076 +{
4077 + struct dentry *dentry, *parent = nd->path.dentry;
4078 + int res, mode = path->dentry->d_inode->i_mode;
4079 +
4080 + if (parent->d_sb == path->dentry->d_sb)
4081 + return ERR_PTR(-EEXIST);
4082 +
4083 + mutex_lock(&parent->d_inode->i_mutex);
4084 + dentry = lookup_one_len(name->name, nd->path.dentry, name->len);
4085 + if (IS_ERR(dentry))
4086 + goto out_unlock;
4087 +
4088 + switch (mode & S_IFMT) {
4089 + case S_IFREG:
4090 + /*
4091 + * FIXME: Does this make any sense in this case?
4092 + * Special case - lookup gave negative, but... we had foo/bar/
4093 + * From the vfs_mknod() POV we just have a negative dentry -
4094 + * all is fine. Let's be bastards - you had / on the end, you've
4095 + * been asking for a (non-existent) directory. -ENOENT for you.
4096 + */
4097 + if (name->name[name->len] && !dentry->d_inode) {
4098 + dput(dentry);
4099 + dentry = ERR_PTR(-ENOENT);
4100 + goto out_unlock;
4101 + }
4102 +
4103 + res = vfs_create(parent->d_inode, dentry, mode, nd);
4104 + if (res) {
4105 + dput(dentry);
4106 + dentry = ERR_PTR(res);
4107 + goto out_unlock;
4108 + }
4109 + break;
4110 + case S_IFDIR:
4111 + res = vfs_mkdir(parent->d_inode, dentry, mode);
4112 + if (res) {
4113 + dput(dentry);
4114 + dentry = ERR_PTR(res);
4115 + goto out_unlock;
4116 + }
4117 +
4118 + res = append_to_union(nd->path.mnt, dentry, path->mnt,
4119 + path->dentry);
4120 + if (res) {
4121 + dput(dentry);
4122 + dentry = ERR_PTR(res);
4123 + goto out_unlock;
4124 + }
4125 + break;
4126 + default:
4127 + dput(dentry);
4128 + dentry = ERR_PTR(-EINVAL);
4129 + goto out_unlock;
4130 + }
4131 +
4132 + /* FIXME: Really necessary ??? */
4133 +/* __update_fs_pwd(path, dentry, nd->path.mnt); */
4134 +
4135 + out_unlock:
4136 + mutex_unlock(&parent->d_inode->i_mutex);
4137 + return dentry;
4138 +}
4139 +
4140 +static int union_copy_file(struct dentry *old_dentry, struct vfsmount *old_mnt,
4141 + struct dentry *new_dentry, struct vfsmount *new_mnt)
4142 +{
4143 + int ret;
4144 + size_t size;
4145 + loff_t offset;
4146 + struct file *old_file, *new_file;
4147 + const struct cred *cred = current_cred();
4148 +
4149 + dget(old_dentry);
4150 + mntget(old_mnt);
4151 + old_file = dentry_open(old_dentry, old_mnt, O_RDONLY, cred);
4152 + if (IS_ERR(old_file))
4153 + return PTR_ERR(old_file);
4154 +
4155 + dget(new_dentry);
4156 + mntget(new_mnt);
4157 + new_file = dentry_open(new_dentry, new_mnt, O_WRONLY, cred);
4158 + ret = PTR_ERR(new_file);
4159 + if (IS_ERR(new_file))
4160 + goto fput_old;
4161 +
4162 + /* XXX be smart by using a length param, which indicates max
4163 + * data we'll want (e.g., we are about to truncate to 0 or 10
4164 + * bytes or something) */
4165 + size = i_size_read(old_file->f_path.dentry->d_inode);
4166 + if (((size_t)size != size) || ((ssize_t)size != size)) {
4167 + ret = -EFBIG;
4168 + goto fput_new;
4169 + }
4170 +
4171 + offset = 0;
4172 + ret = do_splice_direct(old_file, &offset, new_file, size,
4173 + SPLICE_F_MOVE);
4174 + if (ret >= 0)
4175 + ret = 0;
4176 + fput_new:
4177 + fput(new_file);
4178 + fput_old:
4179 + fput(old_file);
4180 + return ret;
4181 +}
4182 +
4183 +/**
4184 + * __union_copyup - copy a file to the topmost directory
4185 + * @old: pointer to path of the old file name
4186 + * @new_nd: pointer to nameidata of the topmost directory
4187 + * @new: pointer to path of the new file name
4188 + *
4189 + * The topmost directory @new_nd must already be locked. Creates the topmost
4190 + * file if it doesn't exist yet.
4191 + */
4192 +int __union_copyup(struct path *old, struct nameidata *new_nd,
4193 + struct path *new)
4194 +{
4195 + struct dentry *dentry;
4196 + int error;
4197 +
4198 + /* Maybe this should be -EINVAL */
4199 + if (S_ISDIR(old->dentry->d_inode->i_mode))
4200 + return -EISDIR;
4201 +
4202 + if (new_nd->path.dentry != new->dentry->d_parent) {
4203 + mutex_lock(&new_nd->path.dentry->d_inode->i_mutex);
4204 + dentry = lookup_one_len(new->dentry->d_name.name,
4205 + new_nd->path.dentry,
4206 + new->dentry->d_name.len);
4207 + mutex_unlock(&new_nd->path.dentry->d_inode->i_mutex);
4208 + if (IS_ERR(dentry))
4209 + return PTR_ERR(dentry);
4210 + error = -EEXIST;
4211 + if (dentry->d_inode)
4212 + goto out_dput;
4213 + } else
4214 + dentry = dget(new->dentry);
4215 +
4216 + if (!dentry->d_inode) {
4217 + error = vfs_create(new_nd->path.dentry->d_inode, dentry,
4218 + old->dentry->d_inode->i_mode, new_nd);
4219 + if (error)
4220 + goto out_dput;
4221 + }
4222 +
4223 + BUG_ON(!S_ISREG(old->dentry->d_inode->i_mode));
4224 + error = union_copy_file(old->dentry, old->mnt, dentry,
4225 + new_nd->path.mnt);
4226 + if (error) {
4227 + /* FIXME: are there return values we should not
4228 + * BUG() on? */
4229 + BUG_ON(vfs_unlink(new_nd->path.dentry->d_inode,
4230 + dentry));
4231 + goto out_dput;
4232 + }
4233 +
4234 + dput(new->dentry);
4235 + new->dentry = dentry;
4236 + if (new->mnt != new_nd->path.mnt)
4237 + mntput(new->mnt);
4238 + new->mnt = new_nd->path.mnt;
4239 + return error;
4240 +
4241 +out_dput:
4242 + dput(dentry);
4243 + return error;
4244 +}
4245 +
4246 +/*
4247 + * union_copyup - copy a file to the topmost layer of the union stack
4248 + * @nd: nameidata pointer to the file
4249 + * @flags: flags given to open_namei
4250 + */
4251 +int union_copyup(struct nameidata *nd, int flags /* XXX not used */)
4252 +{
4253 + struct qstr this;
4254 + char *name;
4255 + struct dentry *dir;
4256 + struct path path;
4257 + int err;
4258 +
4259 + if (!is_unionized(nd->path.dentry, nd->path.mnt))
4260 + return 0;
4261 + if (!S_ISREG(nd->path.dentry->d_inode->i_mode))
4262 + return 0;
4263 +
4264 + /* save the name for hash_lookup_union() */
4265 + this.len = nd->path.dentry->d_name.len;
4266 + this.hash = nd->path.dentry->d_name.hash;
4267 + name = kmalloc(this.len + 1, GFP_KERNEL);
4268 + if (!name)
4269 + return -ENOMEM;
4270 + this.name = name;
4271 + memcpy(name, nd->path.dentry->d_name.name, nd->path.dentry->d_name.len);
4272 + name[this.len] = 0;
4273 +
4274 + err = union_relookup_topmost(nd, nd->flags|LOOKUP_PARENT);
4275 + if (err) {
4276 + kfree(name);
4277 + return err;
4278 + }
4279 + nd->flags &= ~LOOKUP_PARENT;
4280 +
4281 + dir = nd->path.dentry;
4282 + mutex_lock(&dir->d_inode->i_mutex);
4283 + err = hash_lookup_union(nd, &this, &path);
4284 + mutex_unlock(&dir->d_inode->i_mutex);
4285 + kfree(name);
4286 + if (err)
4287 + return err;
4288 +
4289 + err = -ENOENT;
4290 + if (!path.dentry->d_inode)
4291 + goto exit_dput;
4292 +
4293 + /* Necessary?! I guess not ... */
4294 + follow_mount(&path);
4295 +
4296 + err = -ENOENT;
4297 + if (!path.dentry->d_inode)
4298 + goto exit_dput;
4299 +
4300 + err = -EISDIR;
4301 + if (!S_ISREG(path.dentry->d_inode->i_mode))
4302 + goto exit_dput;
4303 +
4304 + if (path.dentry->d_parent != nd->path.dentry) {
4305 + err = __union_copyup(&path, nd, &path);
4306 + if (err)
4307 + goto exit_dput;
4308 + }
4309 +
4310 + dput(nd->path.dentry);
4311 + if (nd->path.mnt != path.mnt)
4312 + mntput(nd->path.mnt);
4313 + nd->path = path;
4314 + return 0;
4315 +
4316 +exit_dput:
4317 + dput(path.dentry);
4318 + if (path.mnt != nd->path.mnt)
4319 + mntput(path.mnt);
4320 + return err;
4321 +}
4322 +
4323 +/*
4324 + * This must be called when unhashing a dentry. This is called with dcache_lock
4325 + * and unhashes all unions this dentry is in.
4326 + */
4327 +void __d_drop_unions(struct dentry *dentry)
4328 +{
4329 + struct union_mount *this, *next;
4330 +
4331 + spin_lock(&union_lock);
4332 + list_for_each_entry_safe(this, next, &dentry->d_unions, u_unions)
4333 + __union_unhash(this);
4334 + spin_unlock(&union_lock);
4335 +}
4336 +EXPORT_SYMBOL_GPL(__d_drop_unions);
4337 +
4338 +/*
4339 + * This must be called after __d_drop_unions() without holding any locks.
4340 + * Note: The dentry might still be reachable via a lookup but at that time it
4341 + * is already a negative dentry. Otherwise it would be unhashed. The union_mount
4342 + * structure itself is still reachable through mnt->mnt_unions (which we
4343 + * protect against with union_lock).
4344 + */
4345 +void shrink_d_unions(struct dentry *dentry)
4346 +{
4347 + struct union_mount *this, *next;
4348 +
4349 +repeat:
4350 + spin_lock(&union_lock);
4351 + list_for_each_entry_safe(this, next, &dentry->d_unions, u_unions) {
4352 + BUG_ON(!hlist_unhashed(&this->u_hash));
4353 + BUG_ON(!hlist_unhashed(&this->u_rhash));
4354 + list_del(&this->u_list);
4355 + list_del(&this->u_unions);
4356 + this->u_next.dentry->d_unionized--;
4357 + spin_unlock(&union_lock);
4358 + union_put(this);
4359 + goto repeat;
4360 + }
4361 + spin_unlock(&union_lock);
4362 +}
4363 +
4364 +extern void __dput(struct dentry *, struct list_head *, int);
4365 +
4366 +/*
4367 + * This is the special variant for use in dput() only.
4368 + */
4369 +void __shrink_d_unions(struct dentry *dentry, struct list_head *list)
4370 +{
4371 + struct union_mount *this, *next;
4372 +
4373 + BUG_ON(!d_unhashed(dentry));
4374 +
4375 +repeat:
4376 + spin_lock(&union_lock);
4377 + list_for_each_entry_safe(this, next, &dentry->d_unions, u_unions) {
4378 + struct dentry *n_dentry = this->u_next.dentry;
4379 + struct vfsmount *n_mnt = this->u_next.mnt;
4380 +
4381 + BUG_ON(!hlist_unhashed(&this->u_hash));
4382 + BUG_ON(!hlist_unhashed(&this->u_rhash));
4383 + list_del(&this->u_list);
4384 + list_del(&this->u_unions);
4385 + this->u_next.dentry->d_unionized--;
4386 + spin_unlock(&union_lock);
4387 + if (__union_put(this)) {
4388 + __dput(n_dentry, list, 0);
4389 + mntput(n_mnt);
4390 + }
4391 + goto repeat;
4392 + }
4393 + spin_unlock(&union_lock);
4394 +}
4395 +
4396 +/*
4397 + * Remove all union_mounts structures belonging to this vfsmount from the
4398 + * union lookup hashtable and so on ...
4399 + */
4400 +void shrink_mnt_unions(struct vfsmount *mnt)
4401 +{
4402 + struct union_mount *this, *next;
4403 +
4404 +repeat:
4405 + spin_lock(&union_lock);
4406 + list_for_each_entry_safe(this, next, &mnt->mnt_unions, u_list) {
4407 + if (this->u_this.dentry == mnt->mnt_root)
4408 + continue;
4409 + __union_unhash(this);
4410 + list_del(&this->u_list);
4411 + list_del(&this->u_unions);
4412 + this->u_next.dentry->d_unionized--;
4413 + spin_unlock(&union_lock);
4414 + union_put(this);
4415 + goto repeat;
4416 + }
4417 + spin_unlock(&union_lock);
4418 +}
4419 +
4420 +int attach_mnt_union(struct vfsmount *mnt, struct vfsmount *dest_mnt,
4421 + struct dentry *dest_dentry)
4422 +{
4423 + if (!IS_MNT_UNION(mnt))
4424 + return 0;
4425 +
4426 + return append_to_union(mnt, mnt->mnt_root, dest_mnt, dest_dentry);
4427 +}
4428 +
4429 +void detach_mnt_union(struct vfsmount *mnt)
4430 +{
4431 + struct union_mount *um;
4432 +
4433 + if (!IS_MNT_UNION(mnt))
4434 + return;
4435 +
4436 + shrink_mnt_unions(mnt);
4437 +
4438 + spin_lock(&union_lock);
4439 + um = union_lookup(mnt->mnt_root, mnt);
4440 + __union_unhash(um);
4441 + list_del(&um->u_list);
4442 + list_del(&um->u_unions);
4443 + um->u_next.dentry->d_unionized--;
4444 + spin_unlock(&union_lock);
4445 + union_put(um);
4446 + return;
4447 +}
4448 +
4449 +/**
4450 + * union_copyup_dir_one - copy up a single directory entry
4451 + *
4452 + * Individual directory entry copyup function for union_copyup_dir.
4453 + * We get the entries from higher level layers first.
4454 + */
4455 +
4456 +static int union_copyup_dir_one(void *buf, const char *name, int namlen,
4457 + loff_t offset, u64 ino, unsigned int d_type)
4458 +{
4459 + struct dentry *topmost_dentry = (struct dentry *) buf;
4460 + struct dentry *dentry;
4461 + int err = 0;
4462 +
4463 + switch (namlen) {
4464 + case 2:
4465 + if (name[1] != '.')
4466 + break;
4467 + case 1:
4468 + if (name[0] != '.')
4469 + break;
4470 + return 0;
4471 + }
4472 +
4473 + /* Lookup this entry in the topmost directory */
4474 + dentry = lookup_one_len(name, topmost_dentry, namlen);
4475 +
4476 + if (IS_ERR(dentry)) {
4477 + printk(KERN_INFO "error looking up %.*s\n", namlen, name);
4478 + goto out;
4479 + }
4480 +
4481 + /*
4482 + * If the entry already exists, one of the following is true:
4483 + * it was already copied up (due to an earlier lookup), an
4484 + * entry with the same name already exists on the topmost file
4485 + * system, it is a whiteout, or it is a fallthru. In each
4486 + * case, the top level entry masks any entries from lower file
4487 + * systems, so don't copy up this entry.
4488 + */
4489 + if (dentry->d_inode || d_is_whiteout(dentry) ||
4490 + d_is_fallthru(dentry)) {
4491 + printk(KERN_INFO "skipping copy of %s\n", dentry->d_name.name);
4492 + goto out_dput;
4493 + }
4494 +
4495 + /*
4496 + * If the entry doesn't exist, create a fallthru entry in the
4497 + * topmost file system. All possible directory types are
4498 + * used, so each file system must implement its own way of
4499 + * storing a fallthru entry.
4500 + */
4501 + printk(KERN_INFO "creating fallthru for %s\n", dentry->d_name.name);
4502 + err = topmost_dentry->d_inode->i_op->fallthru(topmost_dentry->d_inode,
4503 + dentry);
4504 + /* FIXME */
4505 + BUG_ON(err);
4506 + /*
4507 + * At this point, we have a negative dentry marked as fallthru
4508 + * in the cache. We could potentially lookup the entry lower
4509 + * level file system and turn this into a positive dentry
4510 + * right now, but it is not clear that would be a performance
4511 + * win and adds more opportunities to fail.
4512 + */
4513 +out_dput:
4514 + dput(dentry);
4515 +out:
4516 + return 0;
4517 +}
4518 +
4519 +/**
4520 + * union_copyup_dir - copy up low-level directory entries to topmost dir
4521 + *
4522 + * readdir() is difficult to support on union file systems for two
4523 + * reasons: We must eliminate duplicates and apply whiteouts, and we
4524 + * must return something in f_pos that lets us restart in the same
4525 + * place when we return. Our solution is to, on first readdir() of
4526 + * the directory, copy up all visible entries from the low-level file
4527 + * systems and mark the entries that refer to low-level file system
4528 + * objects as "fallthru" entries.
4529 + */
4530 +
4531 +int union_copyup_dir(struct path *topmost_path)
4532 +{
4533 + struct dentry *topmost_dentry = topmost_path->dentry;
4534 + struct path path = *topmost_path;
4535 + int res = 0;
4536 +
4537 + /*
4538 + * Skip opaque dirs.
4539 + */
4540 + if (IS_OPAQUE(topmost_dentry->d_inode))
4541 + return 0;
4542 +
4543 + /*
4544 + * Mark this dir opaque to show that we have already copied up
4545 + * the lower entries. Only fallthru entries pass through to
4546 + * the underlying file system.
4547 + *
4548 + * XXX Deal with the lower file system changing. This could
4549 + * be through running a tool over the top level file system to
4550 + * make directories transparent again, or we could check the
4551 + * mtime of the underlying directory.
4552 + */
4553 +
4554 + topmost_dentry->d_inode->i_flags |= S_OPAQUE;
4555 + mark_inode_dirty(topmost_dentry->d_inode);
4556 +
4557 + /*
4558 + * Loop through each dir on each level copying up the entries
4559 + * to the topmost.
4560 + */
4561 +
4562 + /* Don't drop the caller's reference to the topmost path */
4563 + path_get(&path);
4564 + while (follow_union_down(&path)) {
4565 + struct file * ftmp;
4566 + struct inode * inode;
4567 +
4568 + /* XXX Permit fallthrus on lower-level? Would need to
4569 + * pass in opaque flag to union_copyup_dir_one() and
4570 + * only copy up fallthru entries there. We allow
4571 + * fallthrus in lower level opaque directories on
4572 + * lookup, so for consistency we should do one or the
4573 + * other in both places. */
4574 + if (IS_OPAQUE(path.dentry->d_inode))
4575 + break;
4576 +
4577 + /* dentry_open() doesn't get a path reference itself */
4578 + path_get(&path);
4579 + ftmp = dentry_open(path.dentry, path.mnt,
4580 + O_RDONLY | O_DIRECTORY | O_NOATIME,
4581 + current_cred());
4582 + if (IS_ERR(ftmp)) {
4583 + printk (KERN_ERR "unable to open dir %s for "
4584 + "directory copyup: %ld\n",
4585 + path.dentry->d_name.name, PTR_ERR(ftmp));
4586 + continue;
4587 + }
4588 +
4589 + inode = path.dentry->d_inode;
4590 + mutex_lock(&inode->i_mutex);
4591 +
4592 + res = -ENOENT;
4593 + if (IS_DEADDIR(inode))
4594 + goto out_fput;
4595 + /*
4596 + * Read the whole directory, calling our directory
4597 + * entry copyup function on each entry. Pass in the
4598 + * topmost dentry as our private data so we can create
4599 + * new entries in the topmost directory.
4600 + */
4601 + res = ftmp->f_op->readdir(ftmp, topmost_dentry,
4602 + union_copyup_dir_one);
4603 +out_fput:
4604 + mutex_unlock(&inode->i_mutex);
4605 + fput(ftmp);
4606 +
4607 + if (res)
4608 + break;
4609 + }
4610 + path_put(&path);
4611 + return res;
4612 +}
4613 --- a/include/linux/dcache.h
4614 +++ b/include/linux/dcache.h
4615 @@ -101,6 +101,15 @@
4616 struct dentry *d_parent; /* parent directory */
4617 struct qstr d_name;
4618
4619 +#ifdef CONFIG_UNION_MOUNT
4620 + /*
4621 + * The following fields are used by the VFS based union mount
4622 + * implementation. Both are protected by union_lock!
4623 + */
4624 + struct list_head d_unions; /* list of union_mount's */
4625 + unsigned int d_unionized; /* unions referencing this dentry */
4626 +#endif
4627 +
4628 struct list_head d_lru; /* LRU list */
4629 /*
4630 * d_child and d_rcu can share memory
4631 @@ -186,6 +195,9 @@
4632
4633 #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */
4634
4635 +#define DCACHE_WHITEOUT 0x0100 /* This negative dentry is a whiteout */
4636 +#define DCACHE_FALLTHRU 0x0200 /* Keep looking in the file system below */
4637 +
4638 extern spinlock_t dcache_lock;
4639 extern seqlock_t rename_lock;
4640
4641 @@ -205,12 +217,20 @@
4642 * __d_drop requires dentry->d_lock.
4643 */
4644
4645 +#ifdef CONFIG_UNION_MOUNT
4646 +extern void __d_drop_unions(struct dentry *);
4647 +#endif
4648 +
4649 static inline void __d_drop(struct dentry *dentry)
4650 {
4651 if (!(dentry->d_flags & DCACHE_UNHASHED)) {
4652 dentry->d_flags |= DCACHE_UNHASHED;
4653 hlist_del_rcu(&dentry->d_hash);
4654 }
4655 +#ifdef CONFIG_UNION_MOUNT
4656 + /* remove dentry from the union hashtable */
4657 + __d_drop_unions(dentry);
4658 +#endif
4659 }
4660
4661 static inline void d_drop(struct dentry *dentry)
4662 @@ -358,6 +378,16 @@
4663 return d_unhashed(dentry) && !IS_ROOT(dentry);
4664 }
4665
4666 +static inline int d_is_whiteout(struct dentry *dentry)
4667 +{
4668 + return (dentry->d_flags & DCACHE_WHITEOUT);
4669 +}
4670 +
4671 +static inline int d_is_fallthru(struct dentry *dentry)
4672 +{
4673 + return (dentry->d_flags & DCACHE_FALLTHRU);
4674 +}
4675 +
4676 static inline struct dentry *dget_parent(struct dentry *dentry)
4677 {
4678 struct dentry *ret;
4679 --- a/include/linux/ext2_fs.h
4680 +++ b/include/linux/ext2_fs.h
4681 @@ -189,6 +189,7 @@
4682 #define EXT2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
4683 #define EXT2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
4684 #define EXT2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
4685 +#define EXT2_OPAQUE_FL 0x00040000
4686 #define EXT2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
4687
4688 #define EXT2_FL_USER_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
4689 @@ -503,10 +504,12 @@
4690 #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004
4691 #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008
4692 #define EXT2_FEATURE_INCOMPAT_META_BG 0x0010
4693 +#define EXT2_FEATURE_INCOMPAT_WHITEOUT 0x0020
4694 #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff
4695
4696 #define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
4697 #define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_FILETYPE| \
4698 + EXT2_FEATURE_INCOMPAT_WHITEOUT| \
4699 EXT2_FEATURE_INCOMPAT_META_BG)
4700 #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
4701 EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
4702 @@ -573,6 +576,8 @@
4703 EXT2_FT_FIFO,
4704 EXT2_FT_SOCK,
4705 EXT2_FT_SYMLINK,
4706 + EXT2_FT_WHT,
4707 + EXT2_FT_FALLTHRU,
4708 EXT2_FT_MAX
4709 };
4710
4711 --- a/include/linux/fs.h
4712 +++ b/include/linux/fs.h
4713 @@ -188,6 +188,7 @@
4714 #define MS_REMOUNT 32 /* Alter flags of a mounted FS */
4715 #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
4716 #define MS_DIRSYNC 128 /* Directory modifications are synchronous */
4717 +#define MS_UNION 256
4718 #define MS_NOATIME 1024 /* Do not update access times. */
4719 #define MS_NODIRATIME 2048 /* Do not update directory access times */
4720 #define MS_BIND 4096
4721 @@ -205,6 +206,7 @@
4722 #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
4723 #define MS_I_VERSION (1<<23) /* Update inode I_version field */
4724 #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
4725 +#define MS_WHITEOUT (1<<26) /* fs does support white-out filetype */
4726 #define MS_ACTIVE (1<<30)
4727 #define MS_NOUSER (1<<31)
4728
4729 @@ -231,6 +233,7 @@
4730 #define S_NOCMTIME 128 /* Do not update file c/mtime */
4731 #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
4732 #define S_PRIVATE 512 /* Inode is fs-internal */
4733 +#define S_OPAQUE 1024 /* Directory is opaque */
4734
4735 /*
4736 * Note that nosuid etc flags are inode-specific: setting some file-system
4737 @@ -266,6 +269,8 @@
4738 #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
4739 #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
4740
4741 +#define IS_OPAQUE(inode) ((inode)->i_flags & S_OPAQUE)
4742 +
4743 /* the read-only stuff doesn't really belong here, but any other place is
4744 probably as bad and I don't want to create yet another include file. */
4745
4746 @@ -1379,6 +1384,11 @@
4747 * generic_show_options()
4748 */
4749 char *s_options;
4750 +
4751 + /*
4752 + * Users who require read-only access - e.g., union mounts
4753 + */
4754 + int s_readonly_users;
4755 };
4756
4757 extern struct timespec current_fs_time(struct super_block *sb);
4758 @@ -1521,6 +1531,8 @@
4759 int (*mkdir) (struct inode *,struct dentry *,int);
4760 int (*rmdir) (struct inode *,struct dentry *);
4761 int (*mknod) (struct inode *,struct dentry *,int,dev_t);
4762 + int (*whiteout) (struct inode *, struct dentry *, struct dentry *);
4763 + int (*fallthru) (struct inode *, struct dentry *);
4764 int (*rename) (struct inode *, struct dentry *,
4765 struct inode *, struct dentry *);
4766 int (*readlink) (struct dentry *, char __user *,int);
4767 @@ -2094,6 +2106,7 @@
4768 extern sector_t bmap(struct inode *, sector_t);
4769 #endif
4770 extern int notify_change(struct dentry *, struct iattr *);
4771 +extern int __inode_permission(struct inode *inode, int mask, int rofs);
4772 extern int inode_permission(struct inode *, int);
4773 extern int generic_permission(struct inode *, int,
4774 int (*check_acl)(struct inode *, int));
4775 @@ -2121,7 +2134,7 @@
4776
4777 extern struct file *do_filp_open(int dfd, const char *pathname,
4778 int open_flag, int mode, int acc_mode);
4779 -extern int may_open(struct path *, int, int);
4780 +extern int may_open(struct nameidata *, int, int);
4781
4782 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
4783 extern struct file * open_exec(const char *);
4784 --- a/include/linux/mount.h
4785 +++ b/include/linux/mount.h
4786 @@ -35,6 +35,7 @@
4787 #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
4788 #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
4789 #define MNT_PNODE_MASK 0x3000 /* propagation flag mask */
4790 +#define MNT_UNION 0x4000 /* if the vfsmount is a union mount */
4791
4792 struct vfsmount {
4793 struct list_head mnt_hash;
4794 @@ -53,6 +54,9 @@
4795 struct list_head mnt_slave_list;/* list of slave mounts */
4796 struct list_head mnt_slave; /* slave list entry */
4797 struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
4798 +#ifdef CONFIG_UNION_MOUNT
4799 + struct list_head mnt_unions; /* list of union_mount structures */
4800 +#endif
4801 struct mnt_namespace *mnt_ns; /* containing namespace */
4802 int mnt_id; /* mount identifier */
4803 int mnt_group_id; /* peer group identifier */
4804 --- a/include/linux/namei.h
4805 +++ b/include/linux/namei.h
4806 @@ -20,6 +20,7 @@
4807 struct qstr last;
4808 struct path root;
4809 unsigned int flags;
4810 + unsigned int um_flags;
4811 int last_type;
4812 unsigned depth;
4813 char *saved_names[MAX_NESTED_LINKS + 1];
4814 @@ -35,6 +36,9 @@
4815 */
4816 enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
4817
4818 +#define LAST_UNION 0x01
4819 +#define LAST_LOWLEVEL 0x02
4820 +
4821 /*
4822 * The bitmask for a lookup event:
4823 * - follow links at the end
4824 @@ -49,6 +53,8 @@
4825 #define LOOKUP_CONTINUE 4
4826 #define LOOKUP_PARENT 16
4827 #define LOOKUP_REVAL 64
4828 +#define LOOKUP_TOPMOST 128
4829 +
4830 /*
4831 * Intent data
4832 */
4833 --- /dev/null
4834 +++ b/include/linux/union.h
4835 @@ -0,0 +1,84 @@
4836 +/*
4837 + * VFS based union mount for Linux
4838 + *
4839 + * Copyright (C) 2004-2007 IBM Corporation, IBM Deutschland Entwicklung GmbH.
4840 + * Copyright (C) 2007 Novell Inc.
4841 + * Author(s): Jan Blunck (j.blunck@tu-harburg.de)
4842 + *
4843 + * This program is free software; you can redistribute it and/or modify it
4844 + * under the terms of the GNU General Public License as published by the Free
4845 + * Software Foundation; either version 2 of the License, or (at your option)
4846 + * any later version.
4847 + *
4848 + */
4849 +#ifndef __LINUX_UNION_H
4850 +#define __LINUX_UNION_H
4851 +#ifdef __KERNEL__
4852 +
4853 +#include <linux/list.h>
4854 +#include <asm/atomic.h>
4855 +
4856 +struct dentry;
4857 +struct vfsmount;
4858 +
4859 +#ifdef CONFIG_UNION_MOUNT
4860 +
4861 +/*
4862 + * The new union mount structure.
4863 + */
4864 +struct union_mount {
4865 + atomic_t u_count; /* reference count */
4866 + struct mutex u_mutex;
4867 + struct list_head u_unions; /* list head for d_unions */
4868 + struct list_head u_list; /* list head for mnt_unions */
4869 + struct hlist_node u_hash; /* list head for searching */
4870 + struct hlist_node u_rhash; /* list head for reverse searching */
4871 +
4872 + struct path u_this; /* this is me */
4873 + struct path u_next; /* this is what I overlay */
4874 +};
4875 +
4876 +#define IS_UNION(dentry) (!list_empty(&(dentry)->d_unions) || \
4877 + (dentry)->d_unionized)
4878 +#define IS_MNT_UNION(mnt) ((mnt)->mnt_flags & MNT_UNION)
4879 +
4880 +extern int is_unionized(struct dentry *, struct vfsmount *);
4881 +extern int append_to_union(struct vfsmount *, struct dentry *,
4882 + struct vfsmount *, struct dentry *);
4883 +extern int follow_union_down(struct path *);
4884 +extern int follow_union_mount(struct path *);
4885 +extern void __d_drop_unions(struct dentry *);
4886 +extern void shrink_d_unions(struct dentry *);
4887 +extern void __shrink_d_unions(struct dentry *, struct list_head *);
4888 +extern int attach_mnt_union(struct vfsmount *, struct vfsmount *,
4889 + struct dentry *);
4890 +extern void detach_mnt_union(struct vfsmount *);
4891 +extern struct dentry *union_create_topmost(struct nameidata *, struct qstr *,
4892 + struct path *);
4893 +extern int __union_copyup(struct path *, struct nameidata *, struct path *);
4894 +extern int union_copyup(struct nameidata *, int);
4895 +extern int union_copyup_dir(struct path *path);
4896 +extern int union_permission(struct path *, int);
4897 +
4898 +#else /* CONFIG_UNION_MOUNT */
4899 +
4900 +#define IS_UNION(x) (0)
4901 +#define IS_MNT_UNION(x) (0)
4902 +#define is_unionized(x, y) (0)
4903 +#define append_to_union(x1, y1, x2, y2) ({ BUG(); (0); })
4904 +#define follow_union_down(x) ({ (0); })
4905 +#define follow_union_mount(x) ({ (0); })
4906 +#define __d_drop_unions(x) do { } while (0)
4907 +#define shrink_d_unions(x) do { } while (0)
4908 +#define __shrink_d_unions(x,y) do { } while (0)
4909 +#define attach_mnt_union(x, y, z) do { } while (0)
4910 +#define detach_mnt_union(x) do { } while (0)
4911 +#define union_create_topmost(x, y, z) ({ BUG(); (NULL); })
4912 +#define __union_copyup(x, y, z) ({ BUG(); (0); })
4913 +#define union_copyup(x, y) ({ (0); })
4914 +#define union_copyup_dir(x) ({ BUG(); (0); })
4915 +#define union_permission(x, y) inode_permission((x)->dentry->d_inode, (y))
4916 +
4917 +#endif /* CONFIG_UNION_MOUNT */
4918 +#endif /* __KERNEL__ */
4919 +#endif /* __LINUX_UNION_H */
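With CONFIG_UNION_MOUNT disabled, the stubs above let generic VFS code call the
union helpers unconditionally; IS_MNT_UNION() and friends collapse to constants
and the compiler discards the dead branches. A small sketch of the intended
calling style (the caller is hypothetical, for illustration only):

	/* Sketch only: a union-aware step as a lookup-style caller would write it. */
	static int step_down_union(struct path *path)
	{
		if (!IS_MNT_UNION(path->mnt))	/* constant 0 when unions are compiled out */
			return 0;
		return follow_union_down(path);	/* descend one layer if one is stacked below */
	}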
4920 --- a/mm/shmem.c
4921 +++ b/mm/shmem.c
4922 @@ -1794,6 +1794,118 @@
4923 return 0;
4924 }
4925
4926 +static int shmem_rmdir(struct inode *dir, struct dentry *dentry);
4927 +static int shmem_unlink(struct inode *dir, struct dentry *dentry);
4928 +
4929 +/*
4930 + * Create a dentry to signify a whiteout.
4931 + */
4932 +static int shmem_whiteout(struct inode *dir, struct dentry *old_dentry,
4933 + struct dentry *new_dentry)
4934 +{
4935 + struct shmem_sb_info *sbinfo = SHMEM_SB(dir->i_sb);
4936 + struct dentry *dentry;
4937 +
4938 + if (!(dir->i_sb->s_flags & MS_WHITEOUT))
4939 + return -EPERM;
4940 +
4941 + /* This gives us a proper initialized negative dentry */
4942 + dentry = simple_lookup(dir, new_dentry, NULL);
4943 + if (dentry && IS_ERR(dentry))
4944 + return PTR_ERR(dentry);
4945 +
4946 + /*
4947 + * No ordinary (disk based) filesystem counts whiteouts as inodes;
4948 + * but each new link needs a new dentry, pinning lowmem, and
4949 + * tmpfs dentries cannot be pruned until they are unlinked.
4950 + */
4951 + if (sbinfo->max_inodes) {
4952 + spin_lock(&sbinfo->stat_lock);
4953 + if (!sbinfo->free_inodes) {
4954 + spin_unlock(&sbinfo->stat_lock);
4955 + return -ENOSPC;
4956 + }
4957 + sbinfo->free_inodes--;
4958 + spin_unlock(&sbinfo->stat_lock);
4959 + }
4960 +
4961 + if (old_dentry->d_inode || d_is_fallthru(old_dentry)) {
4962 + if (old_dentry->d_inode && S_ISDIR(old_dentry->d_inode->i_mode))
4963 + shmem_rmdir(dir, old_dentry);
4964 + else
4965 + shmem_unlink(dir, old_dentry);
4966 + }
4967 +
4968 + dir->i_size += BOGO_DIRENT_SIZE;
4969 + dir->i_ctime = dir->i_mtime = CURRENT_TIME;
4970 + /* Extra pinning count for the created dentry */
4971 + dget(new_dentry);
4972 + spin_lock(&new_dentry->d_lock);
4973 + new_dentry->d_flags |= DCACHE_WHITEOUT;
4974 + spin_unlock(&new_dentry->d_lock);
4975 + return 0;
4976 +}
4977 +
4978 +static void shmem_d_instantiate(struct inode *dir, struct dentry *dentry,
4979 + struct inode *inode);
4980 +
4981 +/*
4982 + * Create a dentry to signify a fallthru. A fallthru lets us read the
4983 + * low-level dentries into the dcache once on the first readdir() and
4984 + * then
4985 + */
4986 +static int shmem_fallthru(struct inode *dir, struct dentry *dentry)
4987 +{
4988 + struct shmem_sb_info *sbinfo = SHMEM_SB(dir->i_sb);
4989 +
4990 + /* FIXME: this is stupid */
4991 + if (!(dir->i_sb->s_flags & MS_WHITEOUT))
4992 + return -EPERM;
4993 +
4994 + if (dentry->d_inode || d_is_fallthru(dentry) || d_is_whiteout(dentry))
4995 + return -EEXIST;
4996 +
4997 + /*
4998 + * Each new link needs a new dentry, pinning lowmem, and tmpfs
4999 + * dentries cannot be pruned until they are unlinked.
5000 + */
5001 + if (sbinfo->max_inodes) {
5002 + spin_lock(&sbinfo->stat_lock);
5003 + if (!sbinfo->free_inodes) {
5004 + spin_unlock(&sbinfo->stat_lock);
5005 + return -ENOSPC;
5006 + }
5007 + sbinfo->free_inodes--;
5008 + spin_unlock(&sbinfo->stat_lock);
5009 + }
5010 +
5011 + shmem_d_instantiate(dir, dentry, NULL);
5012 + dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5013 +
5014 + spin_lock(&dentry->d_lock);
5015 + dentry->d_flags |= DCACHE_FALLTHRU;
5016 + spin_unlock(&dentry->d_lock);
5017 + return 0;
5018 +}
5019 +
5020 +static void shmem_d_instantiate(struct inode *dir, struct dentry *dentry,
5021 + struct inode *inode)
5022 +{
5023 + if (d_is_whiteout(dentry)) {
5024 + /* Re-using an existing whiteout */
5025 + shmem_free_inode(dir->i_sb);
5026 + if (S_ISDIR(inode->i_mode))
5027 + inode->i_mode |= S_OPAQUE;
5028 + } else if (d_is_fallthru(dentry)) {
5029 + shmem_free_inode(dir->i_sb);
5030 + } else {
5031 + /* New dentry */
5032 + dir->i_size += BOGO_DIRENT_SIZE;
5033 + dget(dentry); /* Extra count - pin the dentry in core */
5034 + }
5035 + /* Will clear DCACHE_WHITEOUT and DCACHE_FALLTHRU flags */
5036 + d_instantiate(dentry, inode);
5037 +}
5038 /*
5039 * File creation. Allocate an inode, and we're done..
5040 */
5041 @@ -1818,15 +1930,16 @@
5042 iput(inode);
5043 return error;
5044 }
5045 +
5046 if (dir->i_mode & S_ISGID) {
5047 inode->i_gid = dir->i_gid;
5048 if (S_ISDIR(mode))
5049 inode->i_mode |= S_ISGID;
5050 }
5051 - dir->i_size += BOGO_DIRENT_SIZE;
5052 +
5053 + shmem_d_instantiate(dir, dentry, inode);
5054 +
5055 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5056 - d_instantiate(dentry, inode);
5057 - dget(dentry); /* Extra count - pin the dentry in core */
5058 }
5059 return error;
5060 }
5061 @@ -1864,12 +1977,11 @@
5062 if (ret)
5063 goto out;
5064
5065 - dir->i_size += BOGO_DIRENT_SIZE;
5066 + shmem_d_instantiate(dir, dentry, inode);
5067 +
5068 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5069 inc_nlink(inode);
5070 atomic_inc(&inode->i_count); /* New dentry reference */
5071 - dget(dentry); /* Extra pinning count for the created dentry */
5072 - d_instantiate(dentry, inode);
5073 out:
5074 return ret;
5075 }
5076 @@ -1878,21 +1990,63 @@
5077 {
5078 struct inode *inode = dentry->d_inode;
5079
5080 - if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
5081 - shmem_free_inode(inode->i_sb);
5082 + if (d_is_whiteout(dentry) || d_is_fallthru(dentry) ||
5083 + (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)))
5084 + shmem_free_inode(dir->i_sb);
5085
5086 + if (inode) {
5087 + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5088 + drop_nlink(inode);
5089 + }
5090 dir->i_size -= BOGO_DIRENT_SIZE;
5091 - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5092 - drop_nlink(inode);
5093 dput(dentry); /* Undo the count from "create" - this does all the work */
5094 return 0;
5095 }
5096
5097 +static void shmem_dir_unlink_whiteouts(struct inode *dir, struct dentry *dentry)
5098 +{
5099 + if (!dentry->d_inode)
5100 + return;
5101 +
5102 + /* Remove whiteouts from a logically empty directory */
5103 + if (S_ISDIR(dentry->d_inode->i_mode) &&
5104 + dentry->d_inode->i_sb->s_flags & MS_WHITEOUT) {
5105 + struct dentry *child, *next;
5106 + LIST_HEAD(list);
5107 +
5108 + spin_lock(&dcache_lock);
5109 + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
5110 + spin_lock(&child->d_lock);
5111 + /* Unlink fallthrus too */
5112 + if (d_is_whiteout(child) || d_is_fallthru(child)) {
5113 + __d_drop(child);
5114 + if (!list_empty(&child->d_lru)) {
5115 + list_del(&child->d_lru);
5116 + dentry_stat.nr_unused--;
5117 + }
5118 + list_add(&child->d_lru, &list);
5119 + }
5120 + spin_unlock(&child->d_lock);
5121 + }
5122 + spin_unlock(&dcache_lock);
5123 +
5124 + list_for_each_entry_safe(child, next, &list, d_lru) {
5125 + spin_lock(&child->d_lock);
5126 + list_del_init(&child->d_lru);
5127 + spin_unlock(&child->d_lock);
5128 +
5129 + shmem_unlink(dentry->d_inode, child);
5130 + }
5131 + }
5132 +}
5133 +
5134 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
5135 {
5136 if (!simple_empty(dentry))
5137 return -ENOTEMPTY;
5138
5139 + /* Remove whiteouts from a logically empty directory */
5140 + shmem_dir_unlink_whiteouts(dir, dentry);
5141 drop_nlink(dentry->d_inode);
5142 drop_nlink(dir);
5143 return shmem_unlink(dir, dentry);
5144 @@ -1901,7 +2055,7 @@
5145 /*
5146 * The VFS layer already does all the dentry stuff for rename,
5147 * we just have to decrement the usage count for the target if
5148 - * it exists so that the VFS layer correctly free's it when it
5149 + * it exists so that the VFS layer correctly frees it when it
5150 * gets overwritten.
5151 */
5152 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
5153 @@ -1912,7 +2066,12 @@
5154 if (!simple_empty(new_dentry))
5155 return -ENOTEMPTY;
5156
5157 + if (d_is_whiteout(new_dentry))
5158 + shmem_unlink(new_dir, new_dentry);
5159 +
5160 if (new_dentry->d_inode) {
5161 + /* Remove whiteouts from a logically empty directory */
5162 + shmem_dir_unlink_whiteouts(new_dir, new_dentry);
5163 (void) shmem_unlink(new_dir, new_dentry);
5164 if (they_are_dirs)
5165 drop_nlink(old_dir);
5166 @@ -1977,12 +2136,12 @@
5167 set_page_dirty(page);
5168 page_cache_release(page);
5169 }
5170 +
5171 + shmem_d_instantiate(dir, dentry, inode);
5172 +
5173 if (dir->i_mode & S_ISGID)
5174 inode->i_gid = dir->i_gid;
5175 - dir->i_size += BOGO_DIRENT_SIZE;
5176 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
5177 - d_instantiate(dentry, inode);
5178 - dget(dentry);
5179 return 0;
5180 }
5181
5182 @@ -2363,6 +2522,12 @@
5183 if (!root)
5184 goto failed_iput;
5185 sb->s_root = root;
5186 +
5187 +#ifdef CONFIG_TMPFS
5188 + if (!(sb->s_flags & MS_NOUSER))
5189 + sb->s_flags |= MS_WHITEOUT;
5190 +#endif
5191 +
5192 return 0;
5193
5194 failed_iput:
5195 @@ -2462,6 +2627,8 @@
5196 .rmdir = shmem_rmdir,
5197 .mknod = shmem_mknod,
5198 .rename = shmem_rename,
5199 + .whiteout = shmem_whiteout,
5200 + .fallthru = shmem_fallthru,
5201 #endif
5202 #ifdef CONFIG_TMPFS_POSIX_ACL
5203 .setattr = shmem_notify_change,