Merge tag 'ofs-pull-tag-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux

Pull orangefs filesystem from Mike Marshall. This finally merges the long-pending orangefs filesystem, which has been much cleaned up with input from Al Viro over the last six months. From the documentation file: "OrangeFS is an LGPL userspace scale-out parallel storage system. It is ideal for large storage problems faced by HPC, BigData, Streaming Video, Genomics, Bioinformatics. Orangefs, originally called PVFS, was first developed in 1993 by Walt Ligon and Eric Blumer as a parallel file system for Parallel Virtual Machine (PVM) as part of a NASA grant to study the I/O patterns of parallel programs. Orangefs features include: - Distributes file data among multiple file servers - Supports simultaneous access by multiple clients - Stores file data and metadata on servers using local file system and access methods - Userspace implementation is easy to install and maintain - Direct MPI support - Stateless" see Documentation/filesystems/orangefs.txt for more in-depth details. * tag 'ofs-pull-tag-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux: (174 commits) orangefs: fix orangefs_superblock locking orangefs: fix do_readv_writev() handling of error halfway through orangefs: have ->kill_sb() evict the VFS side of things first orangefs: sanitize ->llseek() orangefs-bufmap.h: trim unused junk orangefs: saner calling conventions for getting a slot orangefs_copy_{to,from}_bufmap(): don't pass bufmap pointer orangefs: get rid of readdir_handle_s ornagefs: ensure that truncate has an up to date inode size orangefs: move code which sets i_link to orangefs_inode_getattr orangefs: remove needless wrapper around GFP_KERNEL orangefs: remove wrapper around mutex_lock(&inode->i_mutex) orangefs: refactor inode type or link_target change detection orangefs: use new getattr for revalidate and remove old getattr orangefs: use new getattr in inode getattr and permission orangefs: use new orangefs_inode_getattr to get size in write and llseek orangefs: use new orangefs_inode_getattr to create new inodes orangefs: rename orangefs_inode_getattr to orangefs_inode_old_getattr orangefs: remove inode->i_lock wrapper orangefs: put register_chrdev immediately before register_filesystem ...
2026-03-06 15:25:10 -08:00 · 2016-03-26 12:59:04 -07:00
parent b4cec5f668 45996492e5
commit 698f415cf5
33 changed files with 11243 additions and 0 deletions
--- a/Documentation/ABI/stable/sysfs-fs-orangefs
+++ b/Documentation/ABI/stable/sysfs-fs-orangefs
@@ -0,0 +1,87 @@
+What:			/sys/fs/orangefs/perf_counters/*
+Date:			Jun 2015
+Contact:		Mike Marshall <hubcap@omnibond.com>
+Description:
+			Counters and settings for various caches.
+			Read only.
+
+
+What:			/sys/fs/orangefs/perf_counter_reset
+Date:			June 2015
+Contact:		Mike Marshall <hubcap@omnibond.com>
+Description:
+			echo a 0 or a 1 into perf_counter_reset to
+			reset all the counters in
+			/sys/fs/orangefs/perf_counters
+			except ones with PINT_PERF_PRESERVE set.
+
+
+What:			/sys/fs/orangefs/perf_time_interval_secs
+Date:			Jun 2015
+Contact:		Mike Marshall <hubcap@omnibond.com>
+Description:
+			Length of perf counter intervals in
+			seconds.
+
+
+What:			/sys/fs/orangefs/perf_history_size
+Date:			Jun 2015
+Contact:		Mike Marshall <hubcap@omnibond.com>
+Description:
+			The perf_counters cache statistics have N, or
+			perf_history_size, samples. The default is
+			one.
+
+			Every perf_time_interval_secs the (first)
+			samples are reset.
+
+			If N is greater than one, the "current" set
+			of samples is reset, and the samples from the
+			other N-1 intervals remain available.
+
+
+What:			/sys/fs/orangefs/op_timeout_secs
+Date:			Jun 2015
+Contact:		Mike Marshall <hubcap@omnibond.com>
+Description:
+			Service operation timeout in seconds.
+
+
+What:			/sys/fs/orangefs/slot_timeout_secs
+Date:			Jun 2015
+Contact:		Mike Marshall <hubcap@omnibond.com>
+Description:
+			"Slot" timeout in seconds. A "slot"
+			is an indexed buffer in the shared
+			memory segment used for communication
+			between the kernel module and userspace.
+			Slots are requested and waited for,
+			the wait times out after slot_timeout_secs.
+
+
+What:			/sys/fs/orangefs/acache/*
+Date:			Jun 2015
+Contact:		Mike Marshall <hubcap@omnibond.com>
+Description:
+			Attribute cache configurable settings.
+
+
+What:			/sys/fs/orangefs/ncache/*
+Date:			Jun 2015
+Contact:		Mike Marshall <hubcap@omnibond.com>
+Description:
+			Name cache configurable settings.
+
+
+What:			/sys/fs/orangefs/capcache/*
+Date:			Jun 2015
+Contact:		Mike Marshall <hubcap@omnibond.com>
+Description:
+			Capability cache configurable settings.
+
+
+What:			/sys/fs/orangefs/ccache/*
+Date:			Jun 2015
+Contact:		Mike Marshall <hubcap@omnibond.com>
+Description:
+			Credential cache configurable settings.
--- a/Documentation/filesystems/orangefs.txt
+++ b/Documentation/filesystems/orangefs.txt
@@ -0,0 +1,406 @@
+ORANGEFS
+========
+
+OrangeFS is an LGPL userspace scale-out parallel storage system. It is ideal
+for large storage problems faced by HPC, BigData, Streaming Video,
+Genomics, Bioinformatics.
+
+Orangefs, originally called PVFS, was first developed in 1993 by
+Walt Ligon and Eric Blumer as a parallel file system for Parallel
+Virtual Machine (PVM) as part of a NASA grant to study the I/O patterns
+of parallel programs.
+
+Orangefs features include:
+
+  * Distributes file data among multiple file servers
+  * Supports simultaneous access by multiple clients
+  * Stores file data and metadata on servers using local file system
+    and access methods
+  * Userspace implementation is easy to install and maintain
+  * Direct MPI support
+  * Stateless
+
+
+MAILING LIST
+============
+
+http://beowulf-underground.org/mailman/listinfo/pvfs2-users
+
+
+DOCUMENTATION
+=============
+
+http://www.orangefs.org/documentation/
+
+
+USERSPACE FILESYSTEM SOURCE
+===========================
+
+http://www.orangefs.org/download
+
+Orangefs versions prior to 2.9.3 would not be compatible with the
+upstream version of the kernel client.
+
+
+BUILDING THE USERSPACE FILESYSTEM ON A SINGLE SERVER
+====================================================
+
+When Orangefs is upstream, "--with-kernel" shouldn't be needed, but
+until then the path to where the kernel with the Orangefs kernel client
+patch was built is needed to ensure that pvfs2-client-core (the bridge
+between kernel space and user space) will build properly. You can omit
+--prefix if you don't care that things are sprinkled around in
+/usr/local.
+
+./configure --prefix=/opt/ofs --with-kernel=/path/to/orangefs/kernel
+
+make
+
+make install
+
+Create an orangefs config file:
+/opt/ofs/bin/pvfs2-genconfig /etc/pvfs2.conf
+
+  for "Enter hostnames", use the hostname, don't let it default to
+  localhost.
+
+create a pvfs2tab file in /etc:
+cat /etc/pvfs2tab
+tcp://myhostname:3334/orangefs /mymountpoint pvfs2 defaults,noauto 0 0
+
+create the mount point you specified in the tab file if needed:
+mkdir /mymountpoint
+
+bootstrap the server:
+/opt/ofs/sbin/pvfs2-server /etc/pvfs2.conf -f
+
+start the server:
+/opt/osf/sbin/pvfs2-server /etc/pvfs2.conf
+
+Now the server is running. At this point you might like to
+prove things are working with:
+
+/opt/osf/bin/pvfs2-ls /mymountpoint
+
+You might not want to enforce selinux, it doesn't seem to matter by
+linux 3.11...
+
+If stuff seems to be working, turn on the client core:
+/opt/osf/sbin/pvfs2-client -p /opt/osf/sbin/pvfs2-client-core
+
+Mount your filesystem.
+mount -t pvfs2 tcp://myhostname:3334/orangefs /mymountpoint
+
+
+OPTIONS
+=======
+
+The following mount options are accepted:
+
+  acl
+    Allow the use of Access Control Lists on files and directories.
+
+  intr
+    Some operations between the kernel client and the user space
+    filesystem can be interruptible, such as changes in debug levels
+    and the setting of tunable parameters.
+
+  local_lock
+    Enable posix locking from the perspective of "this" kernel. The
+    default file_operations lock action is to return ENOSYS. Posix
+    locking kicks in if the filesystem is mounted with -o local_lock.
+    Distributed locking is being worked on for the future.
+
+
+DEBUGGING
+=========
+
+If you want the debug (GOSSIP) statements in a particular
+source file (inode.c for example) go to syslog:
+
+  echo inode > /sys/kernel/debug/orangefs/kernel-debug
+
+No debugging (the default):
+
+  echo none > /sys/kernel/debug/orangefs/kernel-debug
+
+Debugging from several source files:
+
+  echo inode,dir > /sys/kernel/debug/orangefs/kernel-debug
+
+All debugging:
+
+  echo all > /sys/kernel/debug/orangefs/kernel-debug
+
+Get a list of all debugging keywords:
+
+  cat /sys/kernel/debug/orangefs/debug-help
+
+
+PROTOCOL BETWEEN KERNEL MODULE AND USERSPACE
+============================================
+
+Orangefs is a user space filesystem and an associated kernel module.
+We'll just refer to the user space part of Orangefs as "userspace"
+from here on out. Orangefs descends from PVFS, and userspace code
+still uses PVFS for function and variable names. Userspace typedefs
+many of the important structures. Function and variable names in
+the kernel module have been transitioned to "orangefs", and The Linux
+Coding Style avoids typedefs, so kernel module structures that
+correspond to userspace structures are not typedefed.
+
+The kernel module implements a pseudo device that userspace
+can read from and write to. Userspace can also manipulate the
+kernel module through the pseudo device with ioctl.
+
+THE BUFMAP:
+
+At startup userspace allocates two page-size-aligned (posix_memalign)
+mlocked memory buffers, one is used for IO and one is used for readdir
+operations. The IO buffer is 41943040 bytes and the readdir buffer is
+4194304 bytes. Each buffer contains logical chunks, or partitions, and
+a pointer to each buffer is added to its own PVFS_dev_map_desc structure
+which also describes its total size, as well as the size and number of
+the partitions.
+
+A pointer to the IO buffer's PVFS_dev_map_desc structure is sent to a
+mapping routine in the kernel module with an ioctl. The structure is
+copied from user space to kernel space with copy_from_user and is used
+to initialize the kernel module's "bufmap" (struct orangefs_bufmap), which
+then contains:
+
+  * refcnt - a reference counter
+  * desc_size - PVFS2_BUFMAP_DEFAULT_DESC_SIZE (4194304) - the IO buffer's
+    partition size, which represents the filesystem's block size and
+    is used for s_blocksize in super blocks.
+  * desc_count - PVFS2_BUFMAP_DEFAULT_DESC_COUNT (10) - the number of
+    partitions in the IO buffer.
+  * desc_shift - log2(desc_size), used for s_blocksize_bits in super blocks.
+  * total_size - the total size of the IO buffer.
+  * page_count - the number of 4096 byte pages in the IO buffer.
+  * page_array - a pointer to page_count * (sizeof(struct page*)) bytes
+    of kcalloced memory. This memory is used as an array of pointers
+    to each of the pages in the IO buffer through a call to get_user_pages.
+  * desc_array - a pointer to desc_count * (sizeof(struct orangefs_bufmap_desc))
+    bytes of kcalloced memory. This memory is further intialized:
+
+      user_desc is the kernel's copy of the IO buffer's ORANGEFS_dev_map_desc
+      structure. user_desc->ptr points to the IO buffer.
+
+      pages_per_desc = bufmap->desc_size / PAGE_SIZE
+      offset = 0
+
+        bufmap->desc_array[0].page_array = &bufmap->page_array[offset]
+        bufmap->desc_array[0].array_count = pages_per_desc = 1024
+        bufmap->desc_array[0].uaddr = (user_desc->ptr) + (0 * 1024 * 4096)
+        offset += 1024
+                           .
+                           .
+                           .
+        bufmap->desc_array[9].page_array = &bufmap->page_array[offset]
+        bufmap->desc_array[9].array_count = pages_per_desc = 1024
+        bufmap->desc_array[9].uaddr = (user_desc->ptr) +
+                                               (9 * 1024 * 4096)
+        offset += 1024
+
+  * buffer_index_array - a desc_count sized array of ints, used to
+    indicate which of the IO buffer's partitions are available to use.
+  * buffer_index_lock - a spinlock to protect buffer_index_array during update.
+  * readdir_index_array - a five (ORANGEFS_READDIR_DEFAULT_DESC_COUNT) element
+    int array used to indicate which of the readdir buffer's partitions are
+    available to use.
+  * readdir_index_lock - a spinlock to protect readdir_index_array during
+    update.
+
+OPERATIONS:
+
+The kernel module builds an "op" (struct orangefs_kernel_op_s) when it
+needs to communicate with userspace. Part of the op contains the "upcall"
+which expresses the request to userspace. Part of the op eventually
+contains the "downcall" which expresses the results of the request.
+
+The slab allocator is used to keep a cache of op structures handy.
+
+At init time the kernel module defines and initializes a request list
+and an in_progress hash table to keep track of all the ops that are
+in flight at any given time.
+
+Ops are stateful:
+
+ * unknown  - op was just initialized
+ * waiting  - op is on request_list (upward bound)
+ * inprogr  - op is in progress (waiting for downcall)
+ * serviced - op has matching downcall; ok
+ * purged   - op has to start a timer since client-core
+              exited uncleanly before servicing op
+ * given up - submitter has given up waiting for it
+
+When some arbitrary userspace program needs to perform a
+filesystem operation on Orangefs (readdir, I/O, create, whatever)
+an op structure is initialized and tagged with a distinguishing ID
+number. The upcall part of the op is filled out, and the op is
+passed to the "service_operation" function.
+
+Service_operation changes the op's state to "waiting", puts
+it on the request list, and signals the Orangefs file_operations.poll
+function through a wait queue. Userspace is polling the pseudo-device
+and thus becomes aware of the upcall request that needs to be read.
+
+When the Orangefs file_operations.read function is triggered, the
+request list is searched for an op that seems ready-to-process.
+The op is removed from the request list. The tag from the op and
+the filled-out upcall struct are copy_to_user'ed back to userspace.
+
+If any of these (and some additional protocol) copy_to_users fail,
+the op's state is set to "waiting" and the op is added back to
+the request list. Otherwise, the op's state is changed to "in progress",
+and the op is hashed on its tag and put onto the end of a list in the
+in_progress hash table at the index the tag hashed to.
+
+When userspace has assembled the response to the upcall, it
+writes the response, which includes the distinguishing tag, back to
+the pseudo device in a series of io_vecs. This triggers the Orangefs
+file_operations.write_iter function to find the op with the associated
+tag and remove it from the in_progress hash table. As long as the op's
+state is not "canceled" or "given up", its state is set to "serviced".
+The file_operations.write_iter function returns to the waiting vfs,
+and back to service_operation through wait_for_matching_downcall.
+
+Service operation returns to its caller with the op's downcall
+part (the response to the upcall) filled out.
+
+The "client-core" is the bridge between the kernel module and
+userspace. The client-core is a daemon. The client-core has an
+associated watchdog daemon. If the client-core is ever signaled
+to die, the watchdog daemon restarts the client-core. Even though
+the client-core is restarted "right away", there is a period of
+time during such an event that the client-core is dead. A dead client-core
+can't be triggered by the Orangefs file_operations.poll function.
+Ops that pass through service_operation during a "dead spell" can timeout
+on the wait queue and one attempt is made to recycle them. Obviously,
+if the client-core stays dead too long, the arbitrary userspace processes
+trying to use Orangefs will be negatively affected. Waiting ops
+that can't be serviced will be removed from the request list and
+have their states set to "given up". In-progress ops that can't 
+be serviced will be removed from the in_progress hash table and
+have their states set to "given up".
+
+Readdir and I/O ops are atypical with respect to their payloads.
+
+  - readdir ops use the smaller of the two pre-allocated pre-partitioned
+    memory buffers. The readdir buffer is only available to userspace.
+    The kernel module obtains an index to a free partition before launching
+    a readdir op. Userspace deposits the results into the indexed partition
+    and then writes them to back to the pvfs device.
+
+  - io (read and write) ops use the larger of the two pre-allocated
+    pre-partitioned memory buffers. The IO buffer is accessible from
+    both userspace and the kernel module. The kernel module obtains an
+    index to a free partition before launching an io op. The kernel module
+    deposits write data into the indexed partition, to be consumed
+    directly by userspace. Userspace deposits the results of read
+    requests into the indexed partition, to be consumed directly
+    by the kernel module.
+
+Responses to kernel requests are all packaged in pvfs2_downcall_t
+structs. Besides a few other members, pvfs2_downcall_t contains a
+union of structs, each of which is associated with a particular
+response type.
+
+The several members outside of the union are:
+ - int32_t type - type of operation.
+ - int32_t status - return code for the operation.
+ - int64_t trailer_size - 0 unless readdir operation.
+ - char *trailer_buf - initialized to NULL, used during readdir operations.
+
+The appropriate member inside the union is filled out for any
+particular response.
+
+  PVFS2_VFS_OP_FILE_IO
+    fill a pvfs2_io_response_t
+
+  PVFS2_VFS_OP_LOOKUP
+    fill a PVFS_object_kref
+
+  PVFS2_VFS_OP_CREATE
+    fill a PVFS_object_kref
+
+  PVFS2_VFS_OP_SYMLINK
+    fill a PVFS_object_kref
+
+  PVFS2_VFS_OP_GETATTR
+    fill in a PVFS_sys_attr_s (tons of stuff the kernel doesn't need)
+    fill in a string with the link target when the object is a symlink.
+
+  PVFS2_VFS_OP_MKDIR
+    fill a PVFS_object_kref
+
+  PVFS2_VFS_OP_STATFS
+    fill a pvfs2_statfs_response_t with useless info <g>. It is hard for
+    us to know, in a timely fashion, these statistics about our
+    distributed network filesystem. 
+
+  PVFS2_VFS_OP_FS_MOUNT
+    fill a pvfs2_fs_mount_response_t which is just like a PVFS_object_kref
+    except its members are in a different order and "__pad1" is replaced
+    with "id".
+
+  PVFS2_VFS_OP_GETXATTR
+    fill a pvfs2_getxattr_response_t
+
+  PVFS2_VFS_OP_LISTXATTR
+    fill a pvfs2_listxattr_response_t
+
+  PVFS2_VFS_OP_PARAM
+    fill a pvfs2_param_response_t
+
+  PVFS2_VFS_OP_PERF_COUNT
+    fill a pvfs2_perf_count_response_t
+
+  PVFS2_VFS_OP_FSKEY
+    file a pvfs2_fs_key_response_t
+
+  PVFS2_VFS_OP_READDIR
+    jamb everything needed to represent a pvfs2_readdir_response_t into
+    the readdir buffer descriptor specified in the upcall.
+
+Userspace uses writev() on /dev/pvfs2-req to pass responses to the requests
+made by the kernel side.
+
+A buffer_list containing:
+  - a pointer to the prepared response to the request from the
+    kernel (struct pvfs2_downcall_t).
+  - and also, in the case of a readdir request, a pointer to a
+    buffer containing descriptors for the objects in the target
+    directory.
+... is sent to the function (PINT_dev_write_list) which performs
+the writev.
+
+PINT_dev_write_list has a local iovec array: struct iovec io_array[10];
+
+The first four elements of io_array are initialized like this for all
+responses:
+
+  io_array[0].iov_base = address of local variable "proto_ver" (int32_t)
+  io_array[0].iov_len = sizeof(int32_t)
+
+  io_array[1].iov_base = address of global variable "pdev_magic" (int32_t)
+  io_array[1].iov_len = sizeof(int32_t)
+  
+  io_array[2].iov_base = address of parameter "tag" (PVFS_id_gen_t)
+  io_array[2].iov_len = sizeof(int64_t)
+
+  io_array[3].iov_base = address of out_downcall member (pvfs2_downcall_t)
+                         of global variable vfs_request (vfs_request_t)
+  io_array[3].iov_len = sizeof(pvfs2_downcall_t)
+
+Readdir responses initialize the fifth element io_array like this:
+
+  io_array[4].iov_base = contents of member trailer_buf (char *)
+                         from out_downcall member of global variable
+                         vfs_request
+  io_array[4].iov_len = contents of member trailer_size (PVFS_size)
+                        from out_downcall member of global variable
+                        vfs_request
+  
+
--- a/8
+++ b/8
@@ -8251,6 +8251,14 @@ S:	Supported
 F:	fs/overlayfs/
 F:	Documentation/filesystems/overlayfs.txt

+ORANGEFS FILESYSTEM
+M:	Mike Marshall <hubcap@omnibond.com>
+L:	pvfs2-developers@beowulf-underground.org
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux.git
+S:	Supported
+F:	fs/orangefs/
+F:	Documentation/filesystems/orangefs.txt
+
 P54 WIRELESS DRIVER
 M:	Christian Lamparter <chunkeey@googlemail.com>
 L:	linux-wireless@vger.kernel.org
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -209,6 +209,7 @@ menuconfig MISC_FILESYSTEMS

 if MISC_FILESYSTEMS

+source "fs/orangefs/Kconfig"
 source "fs/adfs/Kconfig"
 source "fs/affs/Kconfig"
 source "fs/ecryptfs/Kconfig"
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
 obj-$(CONFIG_OVERLAY_FS)	+= overlayfs/
+obj-$(CONFIG_ORANGEFS_FS)       += orangefs/
 obj-$(CONFIG_UDF_FS)		+= udf/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_OMFS_FS)		+= omfs/
--- a/fs/orangefs/Kconfig
+++ b/fs/orangefs/Kconfig
@@ -0,0 +1,6 @@
+config ORANGEFS_FS
+	tristate "ORANGEFS (Powered by PVFS) support"
+	select FS_POSIX_ACL
+	help
+	   Orange is a parallel file system designed for use on high end
+	   computing (HEC) systems.
--- a/fs/orangefs/Makefile
+++ b/fs/orangefs/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the ORANGEFS filesystem.
+#
+
+obj-$(CONFIG_ORANGEFS_FS) += orangefs.o
+
+orangefs-objs := acl.o file.o orangefs-cache.o orangefs-utils.o xattr.o \
+		 dcache.o inode.o orangefs-sysfs.o orangefs-mod.o super.o \
+		 devorangefs-req.o namei.o symlink.o dir.o orangefs-bufmap.o \
+		 orangefs-debugfs.o waitqueue.o
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -0,0 +1,175 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/fs_struct.h>
+
+struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	int ret;
+	char *key = NULL, *value = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		key = ORANGEFS_XATTR_NAME_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		key = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
+		break;
+	default:
+		gossip_err("orangefs_get_acl: bogus value of type %d\n", type);
+		return ERR_PTR(-EINVAL);
+	}
+	/*
+	 * Rather than incurring a network call just to determine the exact
+	 * length of the attribute, I just allocate a max length to save on
+	 * the network call. Conceivably, we could pass NULL to
+	 * orangefs_inode_getxattr() to probe the length of the value, but
+	 * I don't do that for now.
+	 */
+	value = kmalloc(ORANGEFS_MAX_XATTR_VALUELEN, GFP_KERNEL);
+	if (value == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "inode %pU, key %s, type %d\n",
+		     get_khandle_from_ino(inode),
+		     key,
+		     type);
+	ret = orangefs_inode_getxattr(inode,
+				   "",
+				   key,
+				   value,
+				   ORANGEFS_MAX_XATTR_VALUELEN);
+	/* if the key exists, convert it to an in-memory rep */
+	if (ret > 0) {
+		acl = posix_acl_from_xattr(&init_user_ns, value, ret);
+	} else if (ret == -ENODATA || ret == -ENOSYS) {
+		acl = NULL;
+	} else {
+		gossip_err("inode %pU retrieving acl's failed with error %d\n",
+			   get_khandle_from_ino(inode),
+			   ret);
+		acl = ERR_PTR(ret);
+	}
+	/* kfree(NULL) is safe, so don't worry if value ever got used */
+	kfree(value);
+	return acl;
+}
+
+int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	int error = 0;
+	void *value = NULL;
+	size_t size = 0;
+	const char *name = NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = ORANGEFS_XATTR_NAME_ACL_ACCESS;
+		if (acl) {
+			umode_t mode = inode->i_mode;
+			/*
+			 * can we represent this with the traditional file
+			 * mode permission bits?
+			 */
+			error = posix_acl_equiv_mode(acl, &mode);
+			if (error < 0) {
+				gossip_err("%s: posix_acl_equiv_mode err: %d\n",
+					   __func__,
+					   error);
+				return error;
+			}
+
+			if (inode->i_mode != mode)
+				SetModeFlag(orangefs_inode);
+			inode->i_mode = mode;
+			mark_inode_dirty_sync(inode);
+			if (error == 0)
+				acl = NULL;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
+		break;
+	default:
+		gossip_err("%s: invalid type %d!\n", __func__, type);
+		return -EINVAL;
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: inode %pU, key %s type %d\n",
+		     __func__, get_khandle_from_ino(inode),
+		     name,
+		     type);
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmalloc(size, GFP_KERNEL);
+		if (!value)
+			return -ENOMEM;
+
+		error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+		if (error < 0)
+			goto out;
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: name %s, value %p, size %zd, acl %p\n",
+		     __func__, name, value, size, acl);
+	/*
+	 * Go ahead and set the extended attribute now. NOTE: Suppose acl
+	 * was NULL, then value will be NULL and size will be 0 and that
+	 * will xlate to a removexattr. However, we don't want removexattr
+	 * complain if attributes does not exist.
+	 */
+	error = orangefs_inode_setxattr(inode, "", name, value, size, 0);
+
+out:
+	kfree(value);
+	if (!error)
+		set_cached_acl(inode, type, acl);
+	return error;
+}
+
+int orangefs_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	struct posix_acl *default_acl, *acl;
+	umode_t mode = inode->i_mode;
+	int error = 0;
+
+	ClearModeFlag(orangefs_inode);
+
+	error = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (error)
+		return error;
+
+	if (default_acl) {
+		error = orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		posix_acl_release(default_acl);
+	}
+
+	if (acl) {
+		if (!error)
+			error = orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		posix_acl_release(acl);
+	}
+
+	/* If mode of the inode was changed, then do a forcible ->setattr */
+	if (mode != inode->i_mode) {
+		SetModeFlag(orangefs_inode);
+		inode->i_mode = mode;
+		orangefs_flush_inode(inode);
+	}
+
+	return error;
+}
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -0,0 +1,138 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Implementation of dentry (directory cache) functions.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/* Returns 1 if dentry can still be trusted, else 0. */
+static int orangefs_revalidate_lookup(struct dentry *dentry)
+{
+	struct dentry *parent_dentry = dget_parent(dentry);
+	struct inode *parent_inode = parent_dentry->d_inode;
+	struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode);
+	struct inode *inode = dentry->d_inode;
+	struct orangefs_kernel_op_s *new_op;
+	int ret = 0;
+	int err = 0;
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
+	if (!new_op)
+		goto out_put_parent;
+
+	new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
+	new_op->upcall.req.lookup.parent_refn = parent->refn;
+	strncpy(new_op->upcall.req.lookup.d_name,
+		dentry->d_name.name,
+		ORANGEFS_NAME_MAX);
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		     "%s:%s:%d interrupt flag [%d]\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     get_interruptible_flag(parent_inode));
+
+	err = service_operation(new_op, "orangefs_lookup",
+			get_interruptible_flag(parent_inode));
+
+	/* Positive dentry: reject if error or not the same inode. */
+	if (inode) {
+		if (err) {
+			gossip_debug(GOSSIP_DCACHE_DEBUG,
+			    "%s:%s:%d lookup failure.\n",
+			    __FILE__, __func__, __LINE__);
+			goto out_drop;
+		}
+		if (!match_handle(new_op->downcall.resp.lookup.refn.khandle,
+		    inode)) {
+			gossip_debug(GOSSIP_DCACHE_DEBUG,
+			    "%s:%s:%d no match.\n",
+			    __FILE__, __func__, __LINE__);
+			goto out_drop;
+		}
+
+	/* Negative dentry: reject if success or error other than ENOENT. */
+	} else {
+		gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: negative dentry.\n",
+		    __func__);
+		if (!err || err != -ENOENT) {
+			if (new_op->downcall.status != 0)
+				gossip_debug(GOSSIP_DCACHE_DEBUG,
+				    "%s:%s:%d lookup failure.\n",
+				    __FILE__, __func__, __LINE__);
+			goto out_drop;
+		}
+	}
+
+	ret = 1;
+out_release_op:
+	op_release(new_op);
+out_put_parent:
+	dput(parent_dentry);
+	return ret;
+out_drop:
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n",
+	    __FILE__, __func__, __LINE__);
+	goto out_release_op;
+}
+
+/*
+ * Verify that dentry is valid.
+ *
+ * Should return 1 if dentry can still be trusted, else 0.
+ */
+static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	int ret;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: called on dentry %p.\n",
+		     __func__, dentry);
+
+	/* skip root handle lookups. */
+	if (dentry->d_inode && is_root_handle(dentry->d_inode))
+		return 1;
+
+	/*
+	 * If this passes, the positive dentry still exists or the negative
+	 * dentry still does not exist.
+	 */
+	if (!orangefs_revalidate_lookup(dentry))
+		return 0;
+
+	/* We do not need to continue with negative dentries. */
+	if (!dentry->d_inode)
+		goto out;
+
+	/* Now we must perform a getattr to validate the inode contents. */
+
+	ret = orangefs_inode_check_changed(dentry->d_inode);
+	if (ret < 0) {
+		gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d getattr failure.\n",
+		    __FILE__, __func__, __LINE__);
+		return 0;
+	}
+	if (ret == 0)
+		return 0;
+
+out:
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+	    "%s: negative dentry or positive dentry and inode valid.\n",
+	    __func__);
+	return 1;
+}
+
+const struct dentry_operations orangefs_dentry_operations = {
+	.d_revalidate = orangefs_d_revalidate,
+};
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -0,0 +1,400 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+/*
+ * decode routine used by kmod to deal with the blob sent from
+ * userspace for readdirs. The blob contains zero or more of these
+ * sub-blobs:
+ *   __u32 - represents length of the character string that follows.
+ *   string - between 1 and ORANGEFS_NAME_MAX bytes long.
+ *   padding - (if needed) to cause the __u32 plus the string to be
+ *             eight byte aligned.
+ *   khandle - sizeof(khandle) bytes.
+ */
+static long decode_dirents(char *ptr, size_t size,
+                           struct orangefs_readdir_response_s *readdir)
+{
+	int i;
+	struct orangefs_readdir_response_s *rd =
+		(struct orangefs_readdir_response_s *) ptr;
+	char *buf = ptr;
+	int khandle_size = sizeof(struct orangefs_khandle);
+	size_t offset = offsetof(struct orangefs_readdir_response_s,
+				dirent_array);
+	/* 8 reflects eight byte alignment */
+	int smallest_blob = khandle_size + 8;
+	__u32 len;
+	int aligned_len;
+	int sizeof_u32 = sizeof(__u32);
+	long ret;
+
+	gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size);
+
+	/* size is = offset on empty dirs, > offset on non-empty dirs... */
+	if (size < offset) {
+		gossip_err("%s: size:%zu: offset:%zu:\n",
+			   __func__,
+			   size,
+			   offset);
+		ret = -EINVAL;
+		goto out;
+	}
+
+        if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) {
+		gossip_err("%s: size:%zu: dirent_outcount:%d:\n",
+			   __func__,
+			   size,
+			   readdir->orangefs_dirent_outcount);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	readdir->token = rd->token;
+	readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount;
+	readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount,
+					sizeof(*readdir->dirent_array),
+					GFP_KERNEL);
+	if (readdir->dirent_array == NULL) {
+		gossip_err("%s: kcalloc failed.\n", __func__);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	buf += offset;
+	size -= offset;
+
+	for (i = 0; i < readdir->orangefs_dirent_outcount; i++) {
+		if (size < smallest_blob) {
+			gossip_err("%s: size:%zu: smallest_blob:%d:\n",
+				   __func__,
+				   size,
+				   smallest_blob);
+			ret = -EINVAL;
+			goto free;
+		}
+
+		len = *(__u32 *)buf;
+		if ((len < 1) || (len > ORANGEFS_NAME_MAX)) {
+			gossip_err("%s: len:%d:\n", __func__, len);
+			ret = -EINVAL;
+			goto free;
+		}
+
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: size:%zu: len:%d:\n",
+			     __func__,
+			     size,
+			     len);
+
+		readdir->dirent_array[i].d_name = buf + sizeof_u32;
+		readdir->dirent_array[i].d_length = len;
+
+		/*
+		 * Calculate "aligned" length of this string and its
+		 * associated __u32 descriptor.
+		 */
+		aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7;
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: aligned_len:%d:\n",
+			     __func__,
+			     aligned_len);
+
+		/*
+		 * The end of the blob should coincide with the end
+		 * of the last sub-blob.
+		 */
+		if (size < aligned_len + khandle_size) {
+			gossip_err("%s: ran off the end of the blob.\n",
+				   __func__);
+			ret = -EINVAL;
+			goto free;
+		}
+		size -= aligned_len + khandle_size;
+
+		buf += aligned_len;
+
+		readdir->dirent_array[i].khandle =
+			*(struct orangefs_khandle *) buf;
+		buf += khandle_size;
+	}
+	ret = buf - ptr;
+	gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret);
+	goto out;
+
+free:
+	kfree(readdir->dirent_array);
+	readdir->dirent_array = NULL;
+
+out:
+	return ret;
+}
+
+/*
+ * Read directory entries from an instance of an open directory.
+ */
+static int orangefs_readdir(struct file *file, struct dir_context *ctx)
+{
+	int ret = 0;
+	int buffer_index;
+	/*
+	 * ptoken supports Orangefs' distributed directory logic, added
+	 * in 2.9.2.
+	 */
+	__u64 *ptoken = file->private_data;
+	__u64 pos = 0;
+	ino_t ino = 0;
+	struct dentry *dentry = file->f_path.dentry;
+	struct orangefs_kernel_op_s *new_op = NULL;
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
+	int buffer_full = 0;
+	struct orangefs_readdir_response_s readdir_response;
+	void *dents_buf;
+	int i = 0;
+	int len = 0;
+	ino_t current_ino = 0;
+	char *current_entry = NULL;
+	long bytes_decoded;
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "%s: ctx->pos:%lld, ptoken = %llu\n",
+		     __func__,
+		     lld(ctx->pos),
+		     llu(*ptoken));
+
+	pos = (__u64) ctx->pos;
+
+	/* are we done? */
+	if (pos == ORANGEFS_READDIR_END) {
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "Skipping to termination path\n");
+		return 0;
+	}
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "orangefs_readdir called on %s (pos=%llu)\n",
+		     dentry->d_name.name, llu(pos));
+
+	memset(&readdir_response, 0, sizeof(readdir_response));
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+	if (!new_op)
+		return -ENOMEM;
+
+	/*
+	 * Only the indices are shared. No memory is actually shared, but the
+	 * mechanism is used.
+	 */
+	new_op->uses_shared_memory = 1;
+	new_op->upcall.req.readdir.refn = orangefs_inode->refn;
+	new_op->upcall.req.readdir.max_dirent_count =
+	    ORANGEFS_MAX_DIRENT_COUNT_READDIR;
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "%s: upcall.req.readdir.refn.khandle: %pU\n",
+		     __func__,
+		     &new_op->upcall.req.readdir.refn.khandle);
+
+	new_op->upcall.req.readdir.token = *ptoken;
+
+get_new_buffer_index:
+	buffer_index = orangefs_readdir_index_get();
+	if (buffer_index < 0) {
+		ret = buffer_index;
+		gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n",
+			    ret);
+		goto out_free_op;
+	}
+	new_op->upcall.req.readdir.buf_index = buffer_index;
+
+	ret = service_operation(new_op,
+				"orangefs_readdir",
+				get_interruptible_flag(dentry->d_inode));
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "Readdir downcall status is %d.  ret:%d\n",
+		     new_op->downcall.status,
+		     ret);
+
+	orangefs_readdir_index_put(buffer_index);
+
+	if (ret == -EAGAIN && op_state_purged(new_op)) {
+		/* Client-core indices are invalid after it restarted. */
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			"%s: Getting new buffer_index for retry of readdir..\n",
+			 __func__);
+		goto get_new_buffer_index;
+	}
+
+	if (ret == -EIO && op_state_purged(new_op)) {
+		gossip_err("%s: Client is down. Aborting readdir call.\n",
+			__func__);
+		goto out_slot;
+	}
+
+	if (ret < 0 || new_op->downcall.status != 0) {
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "Readdir request failed.  Status:%d\n",
+			     new_op->downcall.status);
+		if (ret >= 0)
+			ret = new_op->downcall.status;
+		goto out_slot;
+	}
+
+	dents_buf = new_op->downcall.trailer_buf;
+	if (dents_buf == NULL) {
+		gossip_err("Invalid NULL buffer in readdir response\n");
+		ret = -ENOMEM;
+		goto out_slot;
+	}
+
+	bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
+					&readdir_response);
+	if (bytes_decoded < 0) {
+		ret = bytes_decoded;
+		gossip_err("Could not decode readdir from buffer %d\n", ret);
+		goto out_vfree;
+	}
+
+	if (bytes_decoded != new_op->downcall.trailer_size) {
+		gossip_err("orangefs_readdir: # bytes decoded (%ld) "
+			   "!= trailer size (%ld)\n",
+			   bytes_decoded,
+			   (long)new_op->downcall.trailer_size);
+		ret = -EINVAL;
+		goto out_destroy_handle;
+	}
+
+	/*
+	 *  orangefs doesn't actually store dot and dot-dot, but
+	 *  we need to have them represented.
+	 */
+	if (pos == 0) {
+		ino = get_ino_from_khandle(dentry->d_inode);
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: calling dir_emit of \".\" with pos = %llu\n",
+			     __func__,
+			     llu(pos));
+		ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
+		pos += 1;
+	}
+
+	if (pos == 1) {
+		ino = get_parent_ino_from_dentry(dentry);
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: calling dir_emit of \"..\" with pos = %llu\n",
+			     __func__,
+			     llu(pos));
+		ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
+		pos += 1;
+	}
+
+	/*
+	 * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around
+	 * to prevent "finding" dot and dot-dot on any iteration
+	 * other than the first.
+	 */
+	if (ctx->pos == ORANGEFS_ITERATE_NEXT)
+		ctx->pos = 0;
+
+	gossip_debug(GOSSIP_DIR_DEBUG,
+		     "%s: dirent_outcount:%d:\n",
+		     __func__,
+		     readdir_response.orangefs_dirent_outcount);
+	for (i = ctx->pos;
+	     i < readdir_response.orangefs_dirent_outcount;
+	     i++) {
+		len = readdir_response.dirent_array[i].d_length;
+		current_entry = readdir_response.dirent_array[i].d_name;
+		current_ino = orangefs_khandle_to_ino(
+			&readdir_response.dirent_array[i].khandle);
+
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "calling dir_emit for %s with len %d"
+			     ", ctx->pos %ld\n",
+			     current_entry,
+			     len,
+			     (unsigned long)ctx->pos);
+		/*
+		 * type is unknown. We don't return object type
+		 * in the dirent_array. This leaves getdents
+		 * clueless about type.
+		 */
+		ret =
+		    dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
+		if (!ret)
+			break;
+		ctx->pos++;
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			      "%s: ctx->pos:%lld\n",
+			      __func__,
+			      lld(ctx->pos));
+
+	}
+
+	/*
+	 * we ran all the way through the last batch, set up for
+	 * getting another batch...
+	 */
+	if (ret) {
+		*ptoken = readdir_response.token;
+		ctx->pos = ORANGEFS_ITERATE_NEXT;
+	}
+
+	/*
+	 * Did we hit the end of the directory?
+	 */
+	if (readdir_response.token == ORANGEFS_READDIR_END &&
+	    !buffer_full) {
+		gossip_debug(GOSSIP_DIR_DEBUG,
+		"End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
+		ctx->pos = ORANGEFS_READDIR_END;
+	}
+
+out_destroy_handle:
+	/* kfree(NULL) is safe */
+	kfree(readdir_response.dirent_array);
+out_vfree:
+	gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
+	vfree(dents_buf);
+out_slot:
+	orangefs_readdir_index_put(buffer_index);
+out_free_op:
+	op_release(new_op);
+	gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
+	return ret;
+}
+
+static int orangefs_dir_open(struct inode *inode, struct file *file)
+{
+	__u64 *ptoken;
+
+	file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL);
+	if (!file->private_data)
+		return -ENOMEM;
+
+	ptoken = file->private_data;
+	*ptoken = ORANGEFS_READDIR_START;
+	return 0;
+}
+
+static int orangefs_dir_release(struct inode *inode, struct file *file)
+{
+	orangefs_flush_inode(inode);
+	kfree(file->private_data);
+	return 0;
+}
+
+/** ORANGEFS implementation of VFS directory operations */
+const struct file_operations orangefs_dir_operations = {
+	.read = generic_read_dir,
+	.iterate = orangefs_readdir,
+	.open = orangefs_dir_open,
+	.release = orangefs_dir_release,
+};
--- a/fs/orangefs/downcall.h
+++ b/fs/orangefs/downcall.h
@@ -0,0 +1,133 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Definitions of downcalls used in Linux kernel module.
+ */
+
+#ifndef __DOWNCALL_H
+#define __DOWNCALL_H
+
+/*
+ * Sanitized the device-client core interaction
+ * for clean 32-64 bit usage
+ */
+struct orangefs_io_response {
+	__s64 amt_complete;
+};
+
+struct orangefs_lookup_response {
+	struct orangefs_object_kref refn;
+};
+
+struct orangefs_create_response {
+	struct orangefs_object_kref refn;
+};
+
+struct orangefs_symlink_response {
+	struct orangefs_object_kref refn;
+};
+
+struct orangefs_getattr_response {
+	struct ORANGEFS_sys_attr_s attributes;
+	char link_target[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_mkdir_response {
+	struct orangefs_object_kref refn;
+};
+
+/*
+ * duplication of some system interface structures so that I don't have
+ * to allocate extra memory
+ */
+struct orangefs_dirent {
+	char *d_name;
+	int d_length;
+	struct orangefs_khandle khandle;
+};
+
+struct orangefs_statfs_response {
+	__s64 block_size;
+	__s64 blocks_total;
+	__s64 blocks_avail;
+	__s64 files_total;
+	__s64 files_avail;
+};
+
+struct orangefs_fs_mount_response {
+	__s32 fs_id;
+	__s32 id;
+	struct orangefs_khandle root_khandle;
+};
+
+/* the getxattr response is the attribute value */
+struct orangefs_getxattr_response {
+	__s32 val_sz;
+	__s32 __pad1;
+	char val[ORANGEFS_MAX_XATTR_VALUELEN];
+};
+
+/* the listxattr response is an array of attribute names */
+struct orangefs_listxattr_response {
+	__s32 returned_count;
+	__s32 __pad1;
+	__u64 token;
+	char key[ORANGEFS_MAX_XATTR_LISTLEN * ORANGEFS_MAX_XATTR_NAMELEN];
+	__s32 keylen;
+	__s32 __pad2;
+	__s32 lengths[ORANGEFS_MAX_XATTR_LISTLEN];
+};
+
+struct orangefs_param_response {
+	__s64 value;
+};
+
+#define PERF_COUNT_BUF_SIZE 4096
+struct orangefs_perf_count_response {
+	char buffer[PERF_COUNT_BUF_SIZE];
+};
+
+#define FS_KEY_BUF_SIZE 4096
+struct orangefs_fs_key_response {
+	__s32 fs_keylen;
+	__s32 __pad1;
+	char fs_key[FS_KEY_BUF_SIZE];
+};
+
+struct orangefs_downcall_s {
+	__s32 type;
+	__s32 status;
+	/* currently trailer is used only by readdir */
+	__s64 trailer_size;
+	char *trailer_buf;
+
+	union {
+		struct orangefs_io_response io;
+		struct orangefs_lookup_response lookup;
+		struct orangefs_create_response create;
+		struct orangefs_symlink_response sym;
+		struct orangefs_getattr_response getattr;
+		struct orangefs_mkdir_response mkdir;
+		struct orangefs_statfs_response statfs;
+		struct orangefs_fs_mount_response fs_mount;
+		struct orangefs_getxattr_response getxattr;
+		struct orangefs_listxattr_response listxattr;
+		struct orangefs_param_response param;
+		struct orangefs_perf_count_response perf_count;
+		struct orangefs_fs_key_response fs_key;
+	} resp;
+};
+
+struct orangefs_readdir_response_s {
+	__u64 token;
+	__u64 directory_version;
+	__u32 __pad2;
+	__u32 orangefs_dirent_outcount;
+	struct orangefs_dirent *dirent_array;
+};
+
+#endif /* __DOWNCALL_H */
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -0,0 +1,475 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Linux VFS inode operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+static int read_one_page(struct page *page)
+{
+	int ret;
+	int max_block;
+	ssize_t bytes_read = 0;
+	struct inode *inode = page->mapping->host;
+	const __u32 blocksize = PAGE_CACHE_SIZE;	/* inode->i_blksize */
+	const __u32 blockbits = PAGE_CACHE_SHIFT;	/* inode->i_blkbits */
+	struct iov_iter to;
+	struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
+
+	iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		    "orangefs_readpage called with page %p\n",
+		     page);
+
+	max_block = ((inode->i_size / blocksize) + 1);
+
+	if (page->index < max_block) {
+		loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
+
+		bytes_read = orangefs_inode_read(inode,
+						 &to,
+						 &blockptr_offset,
+						 inode->i_size);
+	}
+	/* this will only zero remaining unread portions of the page data */
+	iov_iter_zero(~0U, &to);
+	/* takes care of potential aliasing */
+	flush_dcache_page(page);
+	if (bytes_read < 0) {
+		ret = bytes_read;
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+		ret = 0;
+	}
+	/* unlock the page after the ->readpage() routine completes */
+	unlock_page(page);
+	return ret;
+}
+
+static int orangefs_readpage(struct file *file, struct page *page)
+{
+	return read_one_page(page);
+}
+
+static int orangefs_readpages(struct file *file,
+			   struct address_space *mapping,
+			   struct list_head *pages,
+			   unsigned nr_pages)
+{
+	int page_idx;
+	int ret;
+
+	gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpages called\n");
+
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page;
+
+		page = list_entry(pages->prev, struct page, lru);
+		list_del(&page->lru);
+		if (!add_to_page_cache(page,
+				       mapping,
+				       page->index,
+				       GFP_KERNEL)) {
+			ret = read_one_page(page);
+			gossip_debug(GOSSIP_INODE_DEBUG,
+				"failure adding page to cache, read_one_page returned: %d\n",
+				ret);
+	      } else {
+			page_cache_release(page);
+	      }
+	}
+	BUG_ON(!list_empty(pages));
+	return 0;
+}
+
+static void orangefs_invalidatepage(struct page *page,
+				 unsigned int offset,
+				 unsigned int length)
+{
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "orangefs_invalidatepage called on page %p "
+		     "(offset is %u)\n",
+		     page,
+		     offset);
+
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+	return;
+
+}
+
+static int orangefs_releasepage(struct page *page, gfp_t foo)
+{
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "orangefs_releasepage called on page %p\n",
+		     page);
+	return 0;
+}
+
+/*
+ * Having a direct_IO entry point in the address_space_operations
+ * struct causes the kernel to allows us to use O_DIRECT on
+ * open. Nothing will ever call this thing, but in the future we
+ * will need to be able to use O_DIRECT on open in order to support
+ * AIO. Modeled after NFS, they do this too.
+ */
+/*
+ * static ssize_t orangefs_direct_IO(int rw,
+ *			struct kiocb *iocb,
+ *			struct iov_iter *iter,
+ *			loff_t offset)
+ *{
+ *	gossip_debug(GOSSIP_INODE_DEBUG,
+ *		     "orangefs_direct_IO: %s\n",
+ *		     iocb->ki_filp->f_path.dentry->d_name.name);
+ *
+ *	return -EINVAL;
+ *}
+ */
+
+struct backing_dev_info orangefs_backing_dev_info = {
+	.name = "orangefs",
+	.ra_pages = 0,
+	.capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+/** ORANGEFS2 implementation of address space operations */
+const struct address_space_operations orangefs_address_operations = {
+	.readpage = orangefs_readpage,
+	.readpages = orangefs_readpages,
+	.invalidatepage = orangefs_invalidatepage,
+	.releasepage = orangefs_releasepage,
+/*	.direct_IO = orangefs_direct_IO */
+};
+
+static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	struct orangefs_kernel_op_s *new_op;
+	loff_t orig_size;
+	int ret = -EINVAL;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
+		     __func__,
+		     get_khandle_from_ino(inode),
+		     &orangefs_inode->refn.khandle,
+		     orangefs_inode->refn.fs_id,
+		     iattr->ia_size);
+
+	/* Ensure that we have a up to date size, so we know if it changed. */
+	ret = orangefs_inode_getattr(inode, 0, 1);
+	if (ret == -ESTALE)
+		ret = -EIO;
+	if (ret) {
+		gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
+		    __func__, ret);
+		return ret;
+	}
+	orig_size = i_size_read(inode);
+
+	truncate_setsize(inode, iattr->ia_size);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.truncate.refn = orangefs_inode->refn;
+	new_op->upcall.req.truncate.size = (__s64) iattr->ia_size;
+
+	ret = service_operation(new_op, __func__,
+				get_interruptible_flag(inode));
+
+	/*
+	 * the truncate has no downcall members to retrieve, but
+	 * the status value tells us if it went through ok or not
+	 */
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "orangefs: orangefs_truncate got return value of %d\n",
+		     ret);
+
+	op_release(new_op);
+
+	if (ret != 0)
+		return ret;
+
+	/*
+	 * Only change the c/mtime if we are changing the size or we are
+	 * explicitly asked to change it.  This handles the semantic difference
+	 * between truncate() and ftruncate() as implemented in the VFS.
+	 *
+	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+	 * special case where we need to update the times despite not having
+	 * these flags set.  For all other operations the VFS set these flags
+	 * explicitly if it wants a timestamp update.
+	 */
+	if (orig_size != i_size_read(inode) &&
+	    !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
+		iattr->ia_ctime = iattr->ia_mtime =
+			current_fs_time(inode->i_sb);
+		iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+	}
+
+	return ret;
+}
+
+/*
+ * Change attributes of an object referenced by dentry.
+ */
+int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	int ret = -EINVAL;
+	struct inode *inode = dentry->d_inode;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "orangefs_setattr: called on %s\n",
+		     dentry->d_name.name);
+
+	ret = inode_change_ok(inode, iattr);
+	if (ret)
+		goto out;
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		ret = orangefs_setattr_size(inode, iattr);
+		if (ret)
+			goto out;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	ret = orangefs_inode_setattr(inode, iattr);
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "orangefs_setattr: inode_setattr returned %d\n",
+		     ret);
+
+	if (!ret && (iattr->ia_valid & ATTR_MODE))
+		/* change mod on a file that has ACLs */
+		ret = posix_acl_chmod(inode, inode->i_mode);
+
+out:
+	gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", ret);
+	return ret;
+}
+
+/*
+ * Obtain attributes of an object given a dentry
+ */
+int orangefs_getattr(struct vfsmount *mnt,
+		  struct dentry *dentry,
+		  struct kstat *kstat)
+{
+	int ret = -ENOENT;
+	struct inode *inode = dentry->d_inode;
+	struct orangefs_inode_s *orangefs_inode = NULL;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "orangefs_getattr: called on %s\n",
+		     dentry->d_name.name);
+
+	ret = orangefs_inode_getattr(inode, 0, 1);
+	if (ret == 0) {
+		generic_fillattr(inode, kstat);
+
+		/* override block size reported to stat */
+		orangefs_inode = ORANGEFS_I(inode);
+		kstat->blksize = orangefs_inode->blksize;
+	}
+	return ret;
+}
+
+int orangefs_permission(struct inode *inode, int mask)
+{
+	int ret;
+
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
+
+	/* Make sure the permission (and other common attrs) are up to date. */
+	ret = orangefs_inode_getattr(inode, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	return generic_permission(inode, mask);
+}
+
+/* ORANGEDS2 implementation of VFS inode operations for files */
+struct inode_operations orangefs_file_inode_operations = {
+	.get_acl = orangefs_get_acl,
+	.set_acl = orangefs_set_acl,
+	.setattr = orangefs_setattr,
+	.getattr = orangefs_getattr,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.listxattr = orangefs_listxattr,
+	.removexattr = generic_removexattr,
+	.permission = orangefs_permission,
+};
+
+static int orangefs_init_iops(struct inode *inode)
+{
+	inode->i_mapping->a_ops = &orangefs_address_operations;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &orangefs_file_inode_operations;
+		inode->i_fop = &orangefs_file_operations;
+		inode->i_blkbits = PAGE_CACHE_SHIFT;
+		break;
+	case S_IFLNK:
+		inode->i_op = &orangefs_symlink_inode_operations;
+		break;
+	case S_IFDIR:
+		inode->i_op = &orangefs_dir_inode_operations;
+		inode->i_fop = &orangefs_dir_operations;
+		break;
+	default:
+		gossip_debug(GOSSIP_INODE_DEBUG,
+			     "%s: unsupported mode\n",
+			     __func__);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Given a ORANGEFS object identifier (fsid, handle), convert it into a ino_t type
+ * that will be used as a hash-index from where the handle will
+ * be searched for in the VFS hash table of inodes.
+ */
+static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref)
+{
+	if (!ref)
+		return 0;
+	return orangefs_khandle_to_ino(&(ref->khandle));
+}
+
+/*
+ * Called to set up an inode from iget5_locked.
+ */
+static int orangefs_set_inode(struct inode *inode, void *data)
+{
+	struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
+	ORANGEFS_I(inode)->refn.fs_id = ref->fs_id;
+	ORANGEFS_I(inode)->refn.khandle = ref->khandle;
+	return 0;
+}
+
+/*
+ * Called to determine if handles match.
+ */
+static int orangefs_test_inode(struct inode *inode, void *data)
+{
+	struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
+	struct orangefs_inode_s *orangefs_inode = NULL;
+
+	orangefs_inode = ORANGEFS_I(inode);
+	return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), &(ref->khandle))
+		&& orangefs_inode->refn.fs_id == ref->fs_id);
+}
+
+/*
+ * Front-end to lookup the inode-cache maintained by the VFS using the ORANGEFS
+ * file handle.
+ *
+ * @sb: the file system super block instance.
+ * @ref: The ORANGEFS object for which we are trying to locate an inode structure.
+ */
+struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref *ref)
+{
+	struct inode *inode = NULL;
+	unsigned long hash;
+	int error;
+
+	hash = orangefs_handle_hash(ref);
+	inode = iget5_locked(sb, hash, orangefs_test_inode, orangefs_set_inode, ref);
+	if (!inode || !(inode->i_state & I_NEW))
+		return inode;
+
+	error = orangefs_inode_getattr(inode, 1, 0);
+	if (error) {
+		iget_failed(inode);
+		return ERR_PTR(error);
+	}
+
+	inode->i_ino = hash;	/* needed for stat etc */
+	orangefs_init_iops(inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "iget handle %pU, fsid %d hash %ld i_ino %lu\n",
+		     &ref->khandle,
+		     ref->fs_id,
+		     hash,
+		     inode->i_ino);
+
+	return inode;
+}
+
+/*
+ * Allocate an inode for a newly created file and insert it into the inode hash.
+ */
+struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
+		int mode, dev_t dev, struct orangefs_object_kref *ref)
+{
+	unsigned long hash = orangefs_handle_hash(ref);
+	struct inode *inode;
+	int error;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "%s:(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n",
+		     __func__,
+		     sb,
+		     MAJOR(dev),
+		     MINOR(dev),
+		     mode);
+
+	inode = new_inode(sb);
+	if (!inode)
+		return NULL;
+
+	orangefs_set_inode(inode, ref);
+	inode->i_ino = hash;	/* needed for stat etc */
+
+	error = orangefs_inode_getattr(inode, 1, 0);
+	if (error)
+		goto out_iput;
+
+	orangefs_init_iops(inode);
+
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_size = PAGE_CACHE_SIZE;
+	inode->i_rdev = dev;
+
+	error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
+	if (error < 0)
+		goto out_iput;
+
+	gossip_debug(GOSSIP_INODE_DEBUG,
+		     "Initializing ACL's for inode %pU\n",
+		     get_khandle_from_ino(inode));
+	orangefs_init_acl(inode, dir);
+	return inode;
+
+out_iput:
+	iput(inode);
+	return ERR_PTR(error);
+}
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -0,0 +1,462 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ *  Linux VFS namei operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/*
+ * Get a newly allocated inode to go with a negative dentry.
+ */
+static int orangefs_create(struct inode *dir,
+			struct dentry *dentry,
+			umode_t mode,
+			bool exclusive)
+{
+	struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+	struct orangefs_kernel_op_s *new_op;
+	struct inode *inode;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: %s\n",
+		     __func__,
+		     dentry->d_name.name);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_CREATE);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.create.parent_refn = parent->refn;
+
+	fill_default_sys_attrs(new_op->upcall.req.create.attributes,
+			       ORANGEFS_TYPE_METAFILE, mode);
+
+	strncpy(new_op->upcall.req.create.d_name,
+		dentry->d_name.name, ORANGEFS_NAME_MAX);
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: %s: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n",
+		     __func__,
+		     dentry->d_name.name,
+		     &new_op->downcall.resp.create.refn.khandle,
+		     new_op->downcall.resp.create.refn.fs_id,
+		     new_op,
+		     ret);
+
+	if (ret < 0)
+		goto out;
+
+	inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0,
+				&new_op->downcall.resp.create.refn);
+	if (IS_ERR(inode)) {
+		gossip_err("%s: Failed to allocate inode for file :%s:\n",
+			   __func__,
+			   dentry->d_name.name);
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: Assigned inode :%pU: for file :%s:\n",
+		     __func__,
+		     get_khandle_from_ino(inode),
+		     dentry->d_name.name);
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: dentry instantiated for %s\n",
+		     __func__,
+		     dentry->d_name.name);
+
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+	ret = 0;
+out:
+	op_release(new_op);
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: %s: returning %d\n",
+		     __func__,
+		     dentry->d_name.name,
+		     ret);
+	return ret;
+}
+
+/*
+ * Attempt to resolve an object name (dentry->d_name), parent handle, and
+ * fsid into a handle for the object.
+ */
+static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+	struct orangefs_kernel_op_s *new_op;
+	struct inode *inode;
+	struct dentry *res;
+	int ret = -EINVAL;
+
+	/*
+	 * in theory we could skip a lookup here (if the intent is to
+	 * create) in order to avoid a potentially failed lookup, but
+	 * leaving it in can skip a valid lookup and try to create a file
+	 * that already exists (e.g. the vfs already handles checking for
+	 * -EEXIST on O_EXCL opens, which is broken if we skip this lookup
+	 * in the create path)
+	 */
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n",
+		     __func__, dentry->d_name.name);
+
+	if (dentry->d_name.len > (ORANGEFS_NAME_MAX - 1))
+		return ERR_PTR(-ENAMETOOLONG);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
+	if (!new_op)
+		return ERR_PTR(-ENOMEM);
+
+	new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d using parent %pU\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     &parent->refn.khandle);
+	new_op->upcall.req.lookup.parent_refn = parent->refn;
+
+	strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name,
+		ORANGEFS_NAME_MAX);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: doing lookup on %s under %pU,%d\n",
+		     __func__,
+		     new_op->upcall.req.lookup.d_name,
+		     &new_op->upcall.req.lookup.parent_refn.khandle,
+		     new_op->upcall.req.lookup.parent_refn.fs_id);
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Lookup Got %pU, fsid %d (ret=%d)\n",
+		     &new_op->downcall.resp.lookup.refn.khandle,
+		     new_op->downcall.resp.lookup.refn.fs_id,
+		     ret);
+
+	if (ret < 0) {
+		if (ret == -ENOENT) {
+			/*
+			 * if no inode was found, add a negative dentry to
+			 * dcache anyway; if we don't, we don't hold expected
+			 * lookup semantics and we most noticeably break
+			 * during directory renames.
+			 *
+			 * however, if the operation failed or exited, do not
+			 * add the dentry (e.g. in the case that a touch is
+			 * issued on a file that already exists that was
+			 * interrupted during this lookup -- no need to add
+			 * another negative dentry for an existing file)
+			 */
+
+			gossip_debug(GOSSIP_NAME_DEBUG,
+				     "orangefs_lookup: Adding *negative* dentry "
+				     "%p for %s\n",
+				     dentry,
+				     dentry->d_name.name);
+
+			d_add(dentry, NULL);
+			res = NULL;
+			goto out;
+		}
+
+		/* must be a non-recoverable error */
+		res = ERR_PTR(ret);
+		goto out;
+	}
+
+	inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
+	if (IS_ERR(inode)) {
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			"error %ld from iget\n", PTR_ERR(inode));
+		res = ERR_CAST(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s:%s:%d "
+		     "Found good inode [%lu] with count [%d]\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     inode->i_ino,
+		     (int)atomic_read(&inode->i_count));
+
+	/* update dentry/inode pair into dcache */
+	res = d_splice_alias(inode, dentry);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Lookup success (inode ct = %d)\n",
+		     (int)atomic_read(&inode->i_count));
+out:
+	op_release(new_op);
+	return res;
+}
+
+/* return 0 on success; non-zero otherwise */
+static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+	struct orangefs_kernel_op_s *new_op;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: called on %s\n"
+		     "  (inode %pU): Parent is %pU | fs_id %d\n",
+		     __func__,
+		     dentry->d_name.name,
+		     get_khandle_from_ino(inode),
+		     &parent->refn.khandle,
+		     parent->refn.fs_id);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_REMOVE);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.remove.parent_refn = parent->refn;
+	strncpy(new_op->upcall.req.remove.d_name, dentry->d_name.name,
+		ORANGEFS_NAME_MAX);
+
+	ret = service_operation(new_op, "orangefs_unlink",
+				get_interruptible_flag(inode));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "%s: service_operation returned:%d:\n",
+		     __func__,
+		     ret);
+
+	op_release(new_op);
+
+	if (!ret) {
+		drop_nlink(inode);
+
+		SetMtimeFlag(parent);
+		dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+		mark_inode_dirty_sync(dir);
+	}
+	return ret;
+}
+
+static int orangefs_symlink(struct inode *dir,
+			 struct dentry *dentry,
+			 const char *symname)
+{
+	struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+	struct orangefs_kernel_op_s *new_op;
+	struct inode *inode;
+	int mode = 755;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
+
+	if (!symname)
+		return -EINVAL;
+
+	if (strlen(symname)+1 > ORANGEFS_NAME_MAX)
+		return -ENAMETOOLONG;
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_SYMLINK);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.sym.parent_refn = parent->refn;
+
+	fill_default_sys_attrs(new_op->upcall.req.sym.attributes,
+			       ORANGEFS_TYPE_SYMLINK,
+			       mode);
+
+	strncpy(new_op->upcall.req.sym.entry_name,
+		dentry->d_name.name,
+		ORANGEFS_NAME_MAX);
+	strncpy(new_op->upcall.req.sym.target, symname, ORANGEFS_NAME_MAX);
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Symlink Got ORANGEFS handle %pU on fsid %d (ret=%d)\n",
+		     &new_op->downcall.resp.sym.refn.khandle,
+		     new_op->downcall.resp.sym.refn.fs_id, ret);
+
+	if (ret < 0) {
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			    "%s: failed with error code %d\n",
+			    __func__, ret);
+		goto out;
+	}
+
+	inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0,
+				&new_op->downcall.resp.sym.refn);
+	if (IS_ERR(inode)) {
+		gossip_err
+		    ("*** Failed to allocate orangefs symlink inode\n");
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Assigned symlink inode new number of %pU\n",
+		     get_khandle_from_ino(inode));
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Inode (Symlink) %pU -> %s\n",
+		     get_khandle_from_ino(inode),
+		     dentry->d_name.name);
+
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+	ret = 0;
+out:
+	op_release(new_op);
+	return ret;
+}
+
+static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+	struct orangefs_kernel_op_s *new_op;
+	struct inode *inode;
+	int ret;
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR);
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->upcall.req.mkdir.parent_refn = parent->refn;
+
+	fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes,
+			      ORANGEFS_TYPE_DIRECTORY, mode);
+
+	strncpy(new_op->upcall.req.mkdir.d_name,
+		dentry->d_name.name, ORANGEFS_NAME_MAX);
+
+	ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Mkdir Got ORANGEFS handle %pU on fsid %d\n",
+		     &new_op->downcall.resp.mkdir.refn.khandle,
+		     new_op->downcall.resp.mkdir.refn.fs_id);
+
+	if (ret < 0) {
+		gossip_debug(GOSSIP_NAME_DEBUG,
+			     "%s: failed with error code %d\n",
+			     __func__, ret);
+		goto out;
+	}
+
+	inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0,
+				&new_op->downcall.resp.mkdir.refn);
+	if (IS_ERR(inode)) {
+		gossip_err("*** Failed to allocate orangefs dir inode\n");
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Assigned dir inode new number of %pU\n",
+		     get_khandle_from_ino(inode));
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "Inode (Directory) %pU -> %s\n",
+		     get_khandle_from_ino(inode),
+		     dentry->d_name.name);
+
+	/*
+	 * NOTE: we have no good way to keep nlink consistent for directories
+	 * across clients; keep constant at 1.
+	 */
+	SetMtimeFlag(parent);
+	dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+	mark_inode_dirty_sync(dir);
+out:
+	op_release(new_op);
+	return ret;
+}
+
+static int orangefs_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	struct orangefs_kernel_op_s *new_op;
+	int ret;
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "orangefs_rename: called (%s/%s => %s/%s) ct=%d\n",
+		     old_dentry->d_parent->d_name.name,
+		     old_dentry->d_name.name,
+		     new_dentry->d_parent->d_name.name,
+		     new_dentry->d_name.name,
+		     d_count(new_dentry));
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
+	if (!new_op)
+		return -EINVAL;
+
+	new_op->upcall.req.rename.old_parent_refn = ORANGEFS_I(old_dir)->refn;
+	new_op->upcall.req.rename.new_parent_refn = ORANGEFS_I(new_dir)->refn;
+
+	strncpy(new_op->upcall.req.rename.d_old_name,
+		old_dentry->d_name.name,
+		ORANGEFS_NAME_MAX);
+	strncpy(new_op->upcall.req.rename.d_new_name,
+		new_dentry->d_name.name,
+		ORANGEFS_NAME_MAX);
+
+	ret = service_operation(new_op,
+				"orangefs_rename",
+				get_interruptible_flag(old_dentry->d_inode));
+
+	gossip_debug(GOSSIP_NAME_DEBUG,
+		     "orangefs_rename: got downcall status %d\n",
+		     ret);
+
+	if (new_dentry->d_inode)
+		new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+	op_release(new_op);
+	return ret;
+}
+
+/* ORANGEFS implementation of VFS inode operations for directories */
+struct inode_operations orangefs_dir_inode_operations = {
+	.lookup = orangefs_lookup,
+	.get_acl = orangefs_get_acl,
+	.set_acl = orangefs_set_acl,
+	.create = orangefs_create,
+	.unlink = orangefs_unlink,
+	.symlink = orangefs_symlink,
+	.mkdir = orangefs_mkdir,
+	.rmdir = orangefs_unlink,
+	.rename = orangefs_rename,
+	.setattr = orangefs_setattr,
+	.getattr = orangefs_getattr,
+	.setxattr = generic_setxattr,
+	.getxattr = generic_getxattr,
+	.removexattr = generic_removexattr,
+	.listxattr = orangefs_listxattr,
+	.permission = orangefs_permission,
+};
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
--- a/fs/orangefs/orangefs-bufmap.h
+++ b/fs/orangefs/orangefs-bufmap.h
@@ -0,0 +1,36 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#ifndef __ORANGEFS_BUFMAP_H
+#define __ORANGEFS_BUFMAP_H
+
+int orangefs_bufmap_size_query(void);
+
+int orangefs_bufmap_shift_query(void);
+
+int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc);
+
+void orangefs_bufmap_finalize(void);
+
+void orangefs_bufmap_run_down(void);
+
+int orangefs_bufmap_get(void);
+
+void orangefs_bufmap_put(int buffer_index);
+
+int orangefs_readdir_index_get(void);
+
+void orangefs_readdir_index_put(int buffer_index);
+
+int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
+				int buffer_index,
+				size_t size);
+
+int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
+			      int buffer_index,
+			      size_t size);
+
+#endif /* __ORANGEFS_BUFMAP_H */
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -0,0 +1,161 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/* tags assigned to kernel upcall operations */
+static __u64 next_tag_value;
+static DEFINE_SPINLOCK(next_tag_value_lock);
+
+/* the orangefs memory caches */
+
+/* a cache for orangefs upcall/downcall operations */
+static struct kmem_cache *op_cache;
+
+int op_cache_initialize(void)
+{
+	op_cache = kmem_cache_create("orangefs_op_cache",
+				     sizeof(struct orangefs_kernel_op_s),
+				     0,
+				     ORANGEFS_CACHE_CREATE_FLAGS,
+				     NULL);
+
+	if (!op_cache) {
+		gossip_err("Cannot create orangefs_op_cache\n");
+		return -ENOMEM;
+	}
+
+	/* initialize our atomic tag counter */
+	spin_lock(&next_tag_value_lock);
+	next_tag_value = 100;
+	spin_unlock(&next_tag_value_lock);
+	return 0;
+}
+
+int op_cache_finalize(void)
+{
+	kmem_cache_destroy(op_cache);
+	return 0;
+}
+
+char *get_opname_string(struct orangefs_kernel_op_s *new_op)
+{
+	if (new_op) {
+		__s32 type = new_op->upcall.type;
+
+		if (type == ORANGEFS_VFS_OP_FILE_IO)
+			return "OP_FILE_IO";
+		else if (type == ORANGEFS_VFS_OP_LOOKUP)
+			return "OP_LOOKUP";
+		else if (type == ORANGEFS_VFS_OP_CREATE)
+			return "OP_CREATE";
+		else if (type == ORANGEFS_VFS_OP_GETATTR)
+			return "OP_GETATTR";
+		else if (type == ORANGEFS_VFS_OP_REMOVE)
+			return "OP_REMOVE";
+		else if (type == ORANGEFS_VFS_OP_MKDIR)
+			return "OP_MKDIR";
+		else if (type == ORANGEFS_VFS_OP_READDIR)
+			return "OP_READDIR";
+		else if (type == ORANGEFS_VFS_OP_READDIRPLUS)
+			return "OP_READDIRPLUS";
+		else if (type == ORANGEFS_VFS_OP_SETATTR)
+			return "OP_SETATTR";
+		else if (type == ORANGEFS_VFS_OP_SYMLINK)
+			return "OP_SYMLINK";
+		else if (type == ORANGEFS_VFS_OP_RENAME)
+			return "OP_RENAME";
+		else if (type == ORANGEFS_VFS_OP_STATFS)
+			return "OP_STATFS";
+		else if (type == ORANGEFS_VFS_OP_TRUNCATE)
+			return "OP_TRUNCATE";
+		else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH)
+			return "OP_MMAP_RA_FLUSH";
+		else if (type == ORANGEFS_VFS_OP_FS_MOUNT)
+			return "OP_FS_MOUNT";
+		else if (type == ORANGEFS_VFS_OP_FS_UMOUNT)
+			return "OP_FS_UMOUNT";
+		else if (type == ORANGEFS_VFS_OP_GETXATTR)
+			return "OP_GETXATTR";
+		else if (type == ORANGEFS_VFS_OP_SETXATTR)
+			return "OP_SETXATTR";
+		else if (type == ORANGEFS_VFS_OP_LISTXATTR)
+			return "OP_LISTXATTR";
+		else if (type == ORANGEFS_VFS_OP_REMOVEXATTR)
+			return "OP_REMOVEXATTR";
+		else if (type == ORANGEFS_VFS_OP_PARAM)
+			return "OP_PARAM";
+		else if (type == ORANGEFS_VFS_OP_PERF_COUNT)
+			return "OP_PERF_COUNT";
+		else if (type == ORANGEFS_VFS_OP_CANCEL)
+			return "OP_CANCEL";
+		else if (type == ORANGEFS_VFS_OP_FSYNC)
+			return "OP_FSYNC";
+		else if (type == ORANGEFS_VFS_OP_FSKEY)
+			return "OP_FSKEY";
+	}
+	return "OP_UNKNOWN?";
+}
+
+void orangefs_new_tag(struct orangefs_kernel_op_s *op)
+{
+	spin_lock(&next_tag_value_lock);
+	op->tag = next_tag_value++;
+	if (next_tag_value == 0)
+		next_tag_value = 100;
+	spin_unlock(&next_tag_value_lock);
+}
+
+struct orangefs_kernel_op_s *op_alloc(__s32 type)
+{
+	struct orangefs_kernel_op_s *new_op = NULL;
+
+	new_op = kmem_cache_zalloc(op_cache, GFP_KERNEL);
+	if (new_op) {
+		INIT_LIST_HEAD(&new_op->list);
+		spin_lock_init(&new_op->lock);
+		init_completion(&new_op->waitq);
+
+		new_op->upcall.type = ORANGEFS_VFS_OP_INVALID;
+		new_op->downcall.type = ORANGEFS_VFS_OP_INVALID;
+		new_op->downcall.status = -1;
+
+		new_op->op_state = OP_VFS_STATE_UNKNOWN;
+
+		/* initialize the op specific tag and upcall credentials */
+		orangefs_new_tag(new_op);
+		new_op->upcall.type = type;
+		new_op->attempts = 0;
+		gossip_debug(GOSSIP_CACHE_DEBUG,
+			     "Alloced OP (%p: %llu %s)\n",
+			     new_op,
+			     llu(new_op->tag),
+			     get_opname_string(new_op));
+
+		new_op->upcall.uid = from_kuid(current_user_ns(),
+					       current_fsuid());
+
+		new_op->upcall.gid = from_kgid(current_user_ns(),
+					       current_fsgid());
+	} else {
+		gossip_err("op_alloc: kmem_cache_zalloc failed!\n");
+	}
+	return new_op;
+}
+
+void op_release(struct orangefs_kernel_op_s *orangefs_op)
+{
+	if (orangefs_op) {
+		gossip_debug(GOSSIP_CACHE_DEBUG,
+			     "Releasing OP (%p: %llu)\n",
+			     orangefs_op,
+			     llu(orangefs_op->tag));
+		kmem_cache_free(op_cache, orangefs_op);
+	} else {
+		gossip_err("NULL pointer in op_release\n");
+	}
+}
--- a/fs/orangefs/orangefs-debug.h
+++ b/fs/orangefs/orangefs-debug.h
@@ -0,0 +1,92 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* This file just defines debugging masks to be used with the gossip
+ * logging utility.  All debugging masks for ORANGEFS are kept here to make
+ * sure we don't have collisions.
+ */
+
+#ifndef __ORANGEFS_DEBUG_H
+#define __ORANGEFS_DEBUG_H
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+#define	GOSSIP_NO_DEBUG			(__u64)0
+
+#define GOSSIP_SUPER_DEBUG		((__u64)1 << 0)
+#define GOSSIP_INODE_DEBUG		((__u64)1 << 1)
+#define GOSSIP_FILE_DEBUG		((__u64)1 << 2)
+#define GOSSIP_DIR_DEBUG		((__u64)1 << 3)
+#define GOSSIP_UTILS_DEBUG		((__u64)1 << 4)
+#define GOSSIP_WAIT_DEBUG		((__u64)1 << 5)
+#define GOSSIP_ACL_DEBUG		((__u64)1 << 6)
+#define GOSSIP_DCACHE_DEBUG		((__u64)1 << 7)
+#define GOSSIP_DEV_DEBUG		((__u64)1 << 8)
+#define GOSSIP_NAME_DEBUG		((__u64)1 << 9)
+#define GOSSIP_BUFMAP_DEBUG		((__u64)1 << 10)
+#define GOSSIP_CACHE_DEBUG		((__u64)1 << 11)
+#define GOSSIP_DEBUGFS_DEBUG		((__u64)1 << 12)
+#define GOSSIP_XATTR_DEBUG		((__u64)1 << 13)
+#define GOSSIP_INIT_DEBUG		((__u64)1 << 14)
+#define GOSSIP_SYSFS_DEBUG		((__u64)1 << 15)
+
+#define GOSSIP_MAX_NR                 16
+#define GOSSIP_MAX_DEBUG              (((__u64)1 << GOSSIP_MAX_NR) - 1)
+
+/*function prototypes*/
+__u64 ORANGEFS_kmod_eventlog_to_mask(const char *event_logging);
+__u64 ORANGEFS_debug_eventlog_to_mask(const char *event_logging);
+char *ORANGEFS_debug_mask_to_eventlog(__u64 mask);
+char *ORANGEFS_kmod_mask_to_eventlog(__u64 mask);
+
+/* a private internal type */
+struct __keyword_mask_s {
+	const char *keyword;
+	__u64 mask_val;
+};
+
+/*
+ * Map all kmod keywords to kmod debug masks here. Keep this
+ * structure "packed":
+ *
+ *   "all" is always last...
+ *
+ *   keyword     mask_val     index
+ *     foo          1           0
+ *     bar          2           1
+ *     baz          4           2
+ *     qux          8           3
+ *      .           .           .
+ */
+static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
+	{"super", GOSSIP_SUPER_DEBUG},
+	{"inode", GOSSIP_INODE_DEBUG},
+	{"file", GOSSIP_FILE_DEBUG},
+	{"dir", GOSSIP_DIR_DEBUG},
+	{"utils", GOSSIP_UTILS_DEBUG},
+	{"wait", GOSSIP_WAIT_DEBUG},
+	{"acl", GOSSIP_ACL_DEBUG},
+	{"dcache", GOSSIP_DCACHE_DEBUG},
+	{"dev", GOSSIP_DEV_DEBUG},
+	{"name", GOSSIP_NAME_DEBUG},
+	{"bufmap", GOSSIP_BUFMAP_DEBUG},
+	{"cache", GOSSIP_CACHE_DEBUG},
+	{"debugfs", GOSSIP_DEBUGFS_DEBUG},
+	{"xattr", GOSSIP_XATTR_DEBUG},
+	{"init", GOSSIP_INIT_DEBUG},
+	{"sysfs", GOSSIP_SYSFS_DEBUG},
+	{"none", GOSSIP_NO_DEBUG},
+	{"all", GOSSIP_MAX_DEBUG}
+};
+
+static const int num_kmod_keyword_mask_map = (int)
+	(sizeof(s_kmod_keyword_mask_map) / sizeof(struct __keyword_mask_s));
+
+#endif /* __ORANGEFS_DEBUG_H */
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -0,0 +1,455 @@
+/*
+ * What:		/sys/kernel/debug/orangefs/debug-help
+ * Date:		June 2015
+ * Contact:		Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * 			List of client and kernel debug keywords.
+ *
+ *
+ * What:		/sys/kernel/debug/orangefs/client-debug
+ * Date:		June 2015
+ * Contact:		Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * 			Debug setting for "the client", the userspace
+ * 			helper for the kernel module.
+ *
+ *
+ * What:		/sys/kernel/debug/orangefs/kernel-debug
+ * Date:		June 2015
+ * Contact:		Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * 			Debug setting for the orangefs kernel module.
+ *
+ * 			Any of the keywords, or comma-separated lists
+ * 			of keywords, from debug-help can be catted to
+ * 			client-debug or kernel-debug.
+ *
+ * 			"none", "all" and "verbose" are special keywords
+ * 			for client-debug. Setting client-debug to "all"
+ * 			is kind of like trying to drink water from a
+ * 			fire hose, "verbose" triggers most of the same
+ * 			output except for the constant flow of output
+ * 			from the main wait loop.
+ *
+ * 			"none" and "all" are similar settings for kernel-debug
+ * 			no need for a "verbose".
+ */
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+
+#include <linux/uaccess.h>
+
+#include "orangefs-debugfs.h"
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+static int orangefs_debug_disabled = 1;
+
+static int orangefs_debug_help_open(struct inode *, struct file *);
+
+const struct file_operations debug_help_fops = {
+	.open           = orangefs_debug_help_open,
+	.read           = seq_read,
+	.release        = seq_release,
+	.llseek         = seq_lseek,
+};
+
+static void *help_start(struct seq_file *, loff_t *);
+static void *help_next(struct seq_file *, void *, loff_t *);
+static void help_stop(struct seq_file *, void *);
+static int help_show(struct seq_file *, void *);
+
+static const struct seq_operations help_debug_ops = {
+	.start	= help_start,
+	.next	= help_next,
+	.stop	= help_stop,
+	.show	= help_show,
+};
+
+/*
+ * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
+ * ORANGEFS_KMOD_DEBUG_FILE.
+ */
+static DEFINE_MUTEX(orangefs_debug_lock);
+
+int orangefs_debug_open(struct inode *, struct file *);
+
+static ssize_t orangefs_debug_read(struct file *,
+				 char __user *,
+				 size_t,
+				 loff_t *);
+
+static ssize_t orangefs_debug_write(struct file *,
+				  const char __user *,
+				  size_t,
+				  loff_t *);
+
+static const struct file_operations kernel_debug_fops = {
+	.open           = orangefs_debug_open,
+	.read           = orangefs_debug_read,
+	.write		= orangefs_debug_write,
+	.llseek         = generic_file_llseek,
+};
+
+/*
+ * initialize kmod debug operations, create orangefs debugfs dir and
+ * ORANGEFS_KMOD_DEBUG_HELP_FILE.
+ */
+int orangefs_debugfs_init(void)
+{
+
+	int rc = -ENOMEM;
+
+	debug_dir = debugfs_create_dir("orangefs", NULL);
+	if (!debug_dir) {
+		pr_info("%s: debugfs_create_dir failed.\n", __func__);
+		goto out;
+	}
+
+	help_file_dentry = debugfs_create_file(ORANGEFS_KMOD_DEBUG_HELP_FILE,
+				  0444,
+				  debug_dir,
+				  debug_help_string,
+				  &debug_help_fops);
+	if (!help_file_dentry) {
+		pr_info("%s: debugfs_create_file failed.\n", __func__);
+		goto out;
+	}
+
+	orangefs_debug_disabled = 0;
+	rc = 0;
+
+out:
+
+	return rc;
+}
+
+void orangefs_debugfs_cleanup(void)
+{
+	if (debug_dir)
+		debugfs_remove_recursive(debug_dir);
+}
+
+/* open ORANGEFS_KMOD_DEBUG_HELP_FILE */
+static int orangefs_debug_help_open(struct inode *inode, struct file *file)
+{
+	int rc = -ENODEV;
+	int ret;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_help_open: start\n");
+
+	if (orangefs_debug_disabled)
+		goto out;
+
+	ret = seq_open(file, &help_debug_ops);
+	if (ret)
+		goto out;
+
+	((struct seq_file *)(file->private_data))->private = inode->i_private;
+
+	rc = 0;
+
+out:
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_help_open: rc:%d:\n",
+		     rc);
+	return rc;
+}
+
+/*
+ * I think start always gets called again after stop. Start
+ * needs to return NULL when it is done. The whole "payload"
+ * in this case is a single (long) string, so by the second
+ * time we get to start (pos = 1), we're done.
+ */
+static void *help_start(struct seq_file *m, loff_t *pos)
+{
+	void *payload = NULL;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_start: start\n");
+
+	if (*pos == 0)
+		payload = m->private;
+
+	return payload;
+}
+
+static void *help_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_next: start\n");
+
+	return NULL;
+}
+
+static void help_stop(struct seq_file *m, void *p)
+{
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_stop: start\n");
+}
+
+static int help_show(struct seq_file *m, void *v)
+{
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_show: start\n");
+
+	seq_puts(m, v);
+
+	return 0;
+}
+
+/*
+ * initialize the kernel-debug file.
+ */
+int orangefs_kernel_debug_init(void)
+{
+	int rc = -ENOMEM;
+	struct dentry *ret;
+	char *k_buffer = NULL;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
+
+	k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+	if (!k_buffer)
+		goto out;
+
+	if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+		strcpy(k_buffer, kernel_debug_string);
+		strcat(k_buffer, "\n");
+	} else {
+		strcpy(k_buffer, "none\n");
+		pr_info("%s: overflow 1!\n", __func__);
+	}
+
+	ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
+				  0444,
+				  debug_dir,
+				  k_buffer,
+				  &kernel_debug_fops);
+	if (!ret) {
+		pr_info("%s: failed to create %s.\n",
+			__func__,
+			ORANGEFS_KMOD_DEBUG_FILE);
+		goto out;
+	}
+
+	rc = 0;
+
+out:
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+	return rc;
+}
+
+/*
+ * initialize the client-debug file.
+ */
+int orangefs_client_debug_init(void)
+{
+
+	int rc = -ENOMEM;
+	char *c_buffer = NULL;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
+
+	c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+	if (!c_buffer)
+		goto out;
+
+	if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+		strcpy(c_buffer, client_debug_string);
+		strcat(c_buffer, "\n");
+	} else {
+		strcpy(c_buffer, "none\n");
+		pr_info("%s: overflow! 2\n", __func__);
+	}
+
+	client_debug_dentry = debugfs_create_file(ORANGEFS_CLIENT_DEBUG_FILE,
+						  0444,
+						  debug_dir,
+						  c_buffer,
+						  &kernel_debug_fops);
+	if (!client_debug_dentry) {
+		pr_info("%s: failed to create updated %s.\n",
+			__func__,
+			ORANGEFS_CLIENT_DEBUG_FILE);
+		goto out;
+	}
+
+	rc = 0;
+
+out:
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+	return rc;
+}
+
+/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
+int orangefs_debug_open(struct inode *inode, struct file *file)
+{
+	int rc = -ENODEV;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "%s: orangefs_debug_disabled: %d\n",
+		     __func__,
+		     orangefs_debug_disabled);
+
+	if (orangefs_debug_disabled)
+		goto out;
+
+	rc = 0;
+	mutex_lock(&orangefs_debug_lock);
+	file->private_data = inode->i_private;
+	mutex_unlock(&orangefs_debug_lock);
+
+out:
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_open: rc: %d\n",
+		     rc);
+	return rc;
+}
+
+static ssize_t orangefs_debug_read(struct file *file,
+				 char __user *ubuf,
+				 size_t count,
+				 loff_t *ppos)
+{
+	char *buf;
+	int sprintf_ret;
+	ssize_t read_ret = -ENOMEM;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "orangefs_debug_read: start\n");
+
+	buf = kmalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	mutex_lock(&orangefs_debug_lock);
+	sprintf_ret = sprintf(buf, "%s", (char *)file->private_data);
+	mutex_unlock(&orangefs_debug_lock);
+
+	read_ret = simple_read_from_buffer(ubuf, count, ppos, buf, sprintf_ret);
+
+	kfree(buf);
+
+out:
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_read: ret: %zu\n",
+		     read_ret);
+
+	return read_ret;
+}
+
+static ssize_t orangefs_debug_write(struct file *file,
+				  const char __user *ubuf,
+				  size_t count,
+				  loff_t *ppos)
+{
+	char *buf;
+	int rc = -EFAULT;
+	size_t silly = 0;
+	char *debug_string;
+	struct orangefs_kernel_op_s *new_op = NULL;
+	struct client_debug_mask c_mask = { NULL, 0, 0 };
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		"orangefs_debug_write: %s\n",
+		file->f_path.dentry->d_name.name);
+
+	/*
+	 * Thwart users who try to jamb a ridiculous number
+	 * of bytes into the debug file...
+	 */
+	if (count > ORANGEFS_MAX_DEBUG_STRING_LEN + 1) {
+		silly = count;
+		count = ORANGEFS_MAX_DEBUG_STRING_LEN + 1;
+	}
+
+	buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	if (copy_from_user(buf, ubuf, count - 1)) {
+		gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+			     "%s: copy_from_user failed!\n",
+			     __func__);
+		goto out;
+	}
+
+	/*
+	 * Map the keyword string from userspace into a valid debug mask.
+	 * The mapping process involves mapping the human-inputted string
+	 * into a valid mask, and then rebuilding the string from the
+	 * verified valid mask.
+	 *
+	 * A service operation is required to set a new client-side
+	 * debug mask.
+	 */
+	if (!strcmp(file->f_path.dentry->d_name.name,
+		    ORANGEFS_KMOD_DEBUG_FILE)) {
+		debug_string_to_mask(buf, &gossip_debug_mask, 0);
+		debug_mask_to_string(&gossip_debug_mask, 0);
+		debug_string = kernel_debug_string;
+		gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+			     "New kernel debug string is %s\n",
+			     kernel_debug_string);
+	} else {
+		/* Can't reset client debug mask if client is not running. */
+		if (is_daemon_in_service()) {
+			pr_info("%s: Client not running :%d:\n",
+				__func__,
+				is_daemon_in_service());
+			goto out;
+		}
+
+		debug_string_to_mask(buf, &c_mask, 1);
+		debug_mask_to_string(&c_mask, 1);
+		debug_string = client_debug_string;
+
+		new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
+		if (!new_op) {
+			pr_info("%s: op_alloc failed!\n", __func__);
+			goto out;
+		}
+
+		new_op->upcall.req.param.op =
+			ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES;
+		new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
+		memset(new_op->upcall.req.param.s_value,
+		       0,
+		       ORANGEFS_MAX_DEBUG_STRING_LEN);
+		sprintf(new_op->upcall.req.param.s_value,
+			"%llx %llx\n",
+			c_mask.mask1,
+			c_mask.mask2);
+
+		/* service_operation returns 0 on success... */
+		rc = service_operation(new_op,
+				       "orangefs_param",
+					ORANGEFS_OP_INTERRUPTIBLE);
+
+		if (rc)
+			gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+				     "%s: service_operation failed! rc:%d:\n",
+				     __func__,
+				     rc);
+
+		op_release(new_op);
+	}
+
+	mutex_lock(&orangefs_debug_lock);
+	memset(file->f_inode->i_private, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
+	sprintf((char *)file->f_inode->i_private, "%s\n", debug_string);
+	mutex_unlock(&orangefs_debug_lock);
+
+	*ppos += count;
+	if (silly)
+		rc = silly;
+	else
+		rc = count;
+
+out:
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_write: rc: %d\n",
+		     rc);
+	kfree(buf);
+	return rc;
+}
--- a/Show More
+++ b/Show More