[kernel/rawhide/user/steved/pnfs-rawhide] Updated to the latest pNFS tag: pnfs-all-2.6.38-rc2-2011-01-27

Steve Dickson steved at fedoraproject.org
Fri Jan 28 16:34:08 UTC 2011


commit 9403ba017df4a19087fcb98a55a3f7af804e32da
Author: Steve Dickson <steved at redhat.com>
Date:   Fri Jan 28 11:31:42 2011 -0500

    Updated to the latest pNFS tag: pnfs-all-2.6.38-rc2-2011-01-27
    
    Signed-off-by: Steve Dickson <steved at redhat.com>

 kernel.spec                                        |   39 +-
 ...3.patch => pnfs-all-2.6.38-rc2-2011-01-27.patch |40823 ++++++++++----------
 2 files changed, 19552 insertions(+), 21310 deletions(-)
---
diff --git a/kernel.spec b/kernel.spec
index 0e99ad8..ef7f451 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -23,7 +23,7 @@ Summary: The Linux kernel
 #
 # (Uncomment the '#' and both spaces below to set the buildid.)
 #
-%define buildid .pnfs_all_2010_11_03
+%define buildid .pnfs_38_rc2_0100107
 ###################################################################
 
 # The buildid can also be specified on the rpmbuild command line
@@ -731,19 +731,7 @@ Patch12303: dmar-disable-when-ricoh-multifunction.patch
 
 Patch12421: fs-call-security_d_instantiate-in-d_obtain_alias.patch
 
-Patch30000: pnfs-all-2.6.36-rc3-2010-08-30.patch
-Patch30001: linux-2.6-pnfs-compile.patch
-Patch30002: linux-2.6.35-inline.patch
-
-Patch30000: pnfs-all-latest.v2.6.36-rc6-f15.patch
-Patch30001: linux-2.6-pnfs-compile.patch
-Patch30002: linux-2.6.35-inline.patch
-
-Patch30000: pnfs-all-2.6.36-2010-11-03.patch
-Patch30001: linux-2.6-pnfs-compile.patch
-Patch30002: linux-2.6.35-inline.patch
-
-Patch30000: pnfs-all-2.6.36-2010-11-03.patch
+Patch30000: pnfs-all-2.6.38-rc2-2011-01-27.patch
 Patch30001: linux-2.6-pnfs-compile.patch
 Patch30002: linux-2.6.35-inline.patch
 
@@ -1359,19 +1347,7 @@ ApplyPatch dmar-disable-when-ricoh-multifunction.patch
 # rhbz#662344,600690
 ApplyPatch fs-call-security_d_instantiate-in-d_obtain_alias.patch
 
-ApplyPatch pnfs-all-2.6.36-rc3-2010-08-30.patch
-ApplyPatch linux-2.6-pnfs-compile.patch
-ApplyPatch linux-2.6.35-inline.patch
-
-ApplyPatch pnfs-all-latest.v2.6.36-rc6-f15.patch
-ApplyPatch linux-2.6-pnfs-compile.patch
-ApplyPatch linux-2.6.35-inline.patch
-
-ApplyPatch pnfs-all-2.6.36-2010-11-03.patch 
-ApplyPatch linux-2.6-pnfs-compile.patch
-ApplyPatch linux-2.6.35-inline.patch
-
-ApplyPatch pnfs-all-2.6.36-2010-11-03.patch 
+ApplyPatch pnfs-all-2.6.38-rc2-2011-01-27.patch
 ApplyPatch linux-2.6-pnfs-compile.patch
 ApplyPatch linux-2.6.35-inline.patch
 
@@ -1987,6 +1963,9 @@ fi
 #                 ||----w |
 #                 ||     ||
 %changelog
+* Fri Jan 28 2011  Steve Dickson <steved at redhat.com> 2.6.38-0.rc2.git5.1
+- Updated to the latest pNFS tag: pnfs-all-2.6.38-rc2-2011-01-27
+
 * Wed Jan 26 2011 Kyle McMartin <kmcmartin at redhat.com> 2.6.38-0.rc2.git5.1
 - Linux 2.6.38-rc2-git5
 - [x86] Re-enable TRANSPARENT_HUGEPAGE, should be fixed by cacf061c.
@@ -2254,6 +2233,9 @@ fi
 * Mon Nov 08 2010 Kyle McMartin <kyle at redhat.com>
 - Cherry-pick utrace-ptrace fixes from mayoung. Thanks!
 
+* Mon Nov 08 2010 Steve Dickson <steved at redhat.com> 
+- Updated to the latest pNFS tag: pnfs-all-2.6.36-2010-11-03
+
 * Tue Nov 02 2010 Kyle McMartin <kyle at redhat.com> 2.6.37-0.1.rc1.git0
 - Linux 2.6.37-rc1
 
@@ -2264,9 +2246,6 @@ fi
 - Switch to tracking git snapshots of what will become 2.6.37.
 - Fix context rejects in utrace and a few other patches.
 
-* Mon Nov  8 2010 Steve Dickson <steved at redhat.com> 
-- Updated to the latest pNFS tag: pnfs-all-2.6.36-2010-11-03
-
 * Wed Oct 20 2010 Chuck Ebbert <cebbert at redhat.com> 2.6.36-1
 - Linux 2.6.36
 
diff --git a/pnfs-all-2.6.36-2010-11-03.patch b/pnfs-all-2.6.38-rc2-2011-01-27.patch
similarity index 82%
rename from pnfs-all-2.6.36-2010-11-03.patch
rename to pnfs-all-2.6.38-rc2-2011-01-27.patch
index 5c46203..5f476b5 100644
--- a/pnfs-all-2.6.36-2010-11-03.patch
+++ b/pnfs-all-2.6.38-rc2-2011-01-27.patch
@@ -1,74 +1,6 @@
-diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
-index 2f68cd6..e474827 100644
---- a/Documentation/filesystems/nfs/00-INDEX
-+++ b/Documentation/filesystems/nfs/00-INDEX
-@@ -12,5 +12,7 @@ nfs-rdma.txt
- 	- how to install and setup the Linux NFS/RDMA client and server software
- nfsroot.txt
- 	- short guide on setting up a diskless box with NFS root filesystem.
-+pnfs.txt
-+	- short explanation of some of the internals of the pnfs client code
- rpc-cache.txt
- 	- introduction to the caching mechanisms in the sunrpc layer.
-diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
-new file mode 100644
-index 0000000..bc0b9cf
---- /dev/null
-+++ b/Documentation/filesystems/nfs/pnfs.txt
-@@ -0,0 +1,48 @@
-+Reference counting in pnfs:
-+==========================
-+
-+The are several inter-related caches.  We have layouts which can
-+reference multiple devices, each of which can reference multiple data servers.
-+Each data server can be referenced by multiple devices.  Each device
-+can be referenced by multiple layouts.  To keep all of this straight,
-+we need to reference count.
-+
-+
-+struct pnfs_layout_hdr
-+----------------------
-+The on-the-wire command LAYOUTGET corresponds to struct
-+pnfs_layout_segment, usually referred to by the variable name lseg.
-+Each nfs_inode may hold a pointer to a cache of of these layout
-+segments in nfsi->layout, of type struct pnfs_layout_hdr.
-+
-+We reference the header for the inode pointing to it, across each
-+outstanding RPC call that references it (LAYOUTGET, LAYOUTRETURN,
-+LAYOUTCOMMIT), and for each lseg held within.
-+
-+Each header is also (when non-empty) put on a list associated with
-+struct nfs_client (cl_layouts).  Being put on this list does not bump
-+the reference count, as the layout is kept around by the lseg that
-+keeps it in the list.
-+
-+deviceid_cache
-+--------------
-+lsegs reference device ids, which are resolved per nfs_client and
-+layout driver type.  The device ids are held in a RCU cache (struct
-+nfs4_deviceid_cache).  The cache itself is referenced across each
-+mount.  The entries (struct nfs4_deviceid) themselves are held across
-+the lifetime of each lseg referencing them.
-+
-+RCU is used because the deviceid is basically a write once, read many
-+data structure.  The hlist size of 32 buckets needs better
-+justification, but seems reasonable given that we can have multiple
-+deviceid's per filesystem, and multiple filesystems per nfs_client.
-+
-+The hash code is copied from the nfsd code base.  A discussion of
-+hashing and variations of this algorithm can be found at:
-+http://groups.google.com/group/comp.lang.c/browse_thread/thread/9522965e2b8d3809
-+
-+data server cache
-+-----------------
-+file driver devices refer to data servers, which are kept in a module
-+level cache.  Its reference is held over the lifetime of the deviceid
-+pointing to it.
-diff --git a/Documentation/filesystems/spnfs.txt b/Documentation/filesystems/spnfs.txt
-new file mode 100644
-index 0000000..e1d2864
---- /dev/null
-+++ b/Documentation/filesystems/spnfs.txt
+diff -up linux-2.6.37.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.37.noarch/Documentation/filesystems/spnfs.txt
+--- linux-2.6.37.noarch/Documentation/filesystems/spnfs.txt.orig	2011-01-28 09:43:53.292780695 -0500
++++ linux-2.6.37.noarch/Documentation/filesystems/spnfs.txt	2011-01-28 09:43:53.292780695 -0500
 @@ -0,0 +1,211 @@
 +(c) 2007 Network Appliance Inc.
 +
@@ -281,11 +213,10 @@ index 0000000..e1d2864
 +Bugs, enhancements, compliments, complaints to: dmuntz at netapp.com
 +
 +
-diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
-index 3e39193..92ce1fb 100644
---- a/drivers/md/dm-ioctl.c
-+++ b/drivers/md/dm-ioctl.c
-@@ -663,6 +663,12 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
+diff -up linux-2.6.37.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.37.noarch/drivers/md/dm-ioctl.c
+--- linux-2.6.37.noarch/drivers/md/dm-ioctl.c.orig	2011-01-28 09:37:28.883106954 -0500
++++ linux-2.6.37.noarch/drivers/md/dm-ioctl.c	2011-01-28 09:43:53.293780446 -0500
+@@ -713,6 +713,12 @@ static int dev_create(struct dm_ioctl *p
  	return 0;
  }
  
@@ -298,7 +229,7 @@ index 3e39193..92ce1fb 100644
  /*
   * Always use UUID for lookups if it's present, otherwise use name or dev.
   */
-@@ -758,6 +764,12 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
+@@ -808,6 +814,12 @@ static int dev_remove(struct dm_ioctl *p
  	return 0;
  }
  
@@ -311,7 +242,7 @@ index 3e39193..92ce1fb 100644
  /*
   * Check a string doesn't overrun the chunk of
   * memory we copied from userland.
-@@ -937,6 +949,12 @@ static int do_resume(struct dm_ioctl *param)
+@@ -990,6 +1002,12 @@ static int do_resume(struct dm_ioctl *pa
  	return r;
  }
  
@@ -324,7 +255,7 @@ index 3e39193..92ce1fb 100644
  /*
   * Set or unset the suspension state of a device.
   * If the device already is in the requested state we just return its status.
-@@ -1203,6 +1221,12 @@ out:
+@@ -1256,6 +1274,12 @@ out:
  	return r;
  }
  
@@ -337,11 +268,10 @@ index 3e39193..92ce1fb 100644
  static int table_clear(struct dm_ioctl *param, size_t param_size)
  {
  	struct hash_cell *hc;
-diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
-index 8a8f803..7958885 100644
---- a/drivers/scsi/hosts.c
-+++ b/drivers/scsi/hosts.c
-@@ -50,10 +50,11 @@ static void scsi_host_cls_release(struct device *dev)
+diff -up linux-2.6.37.noarch/drivers/scsi/hosts.c.orig linux-2.6.37.noarch/drivers/scsi/hosts.c
+--- linux-2.6.37.noarch/drivers/scsi/hosts.c.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/drivers/scsi/hosts.c	2011-01-28 09:43:53.294780201 -0500
+@@ -50,10 +50,11 @@ static void scsi_host_cls_release(struct
  	put_device(&class_to_shost(dev)->shost_gendev);
  }
  
@@ -354,68 +284,9 @@ index 8a8f803..7958885 100644
  
  /**
   *	scsi_host_set_state - Take the given host through the host state model.
-diff --git a/fs/Kconfig b/fs/Kconfig
-index 3d18530..82b6696 100644
---- a/fs/Kconfig
-+++ b/fs/Kconfig
-@@ -224,6 +224,31 @@ config LOCKD_V4
- config EXPORTFS
- 	tristate
- 
-+config EXPORTFS_FILE_LAYOUT
-+	bool
-+	depends on PNFSD && EXPORTFS
-+	help
-+	  Exportfs support for the NFSv4.1 files layout type.
-+	  Must be automatically selected by supporting filesystems.
-+
-+config EXPORTFS_OSD_LAYOUT
-+	bool
-+	depends on PNFSD && EXPORTFS
-+	help
-+	  Exportfs support for the NFSv4.1 objects layout type.
-+	  Must be automatically selected by supporting osd
-+	  filesystems.
-+
-+	  If unsure, say N.
-+
-+config EXPORTFS_BLOCK_LAYOUT
-+	bool
-+	depends on PNFSD && EXPORTFS
-+	help
-+	  Exportfs support for the NFSv4.1 blocks layout type.
-+	  Must be automatically selected by supporting filesystems.
-+
-+
- config NFS_ACL_SUPPORT
- 	tristate
- 	select FS_POSIX_ACL
-diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
-index 2d0f757..5458546 100644
---- a/fs/exofs/Kbuild
-+++ b/fs/exofs/Kbuild
-@@ -13,4 +13,5 @@
- #
- 
- exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
-+exofs-$(CONFIG_PNFSD) +=  export.o
- obj-$(CONFIG_EXOFS_FS) += exofs.o
-diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
-index 86194b2f..77c677f 100644
---- a/fs/exofs/Kconfig
-+++ b/fs/exofs/Kconfig
-@@ -1,6 +1,7 @@
- config EXOFS_FS
- 	tristate "exofs: OSD based file system support"
- 	depends on SCSI_OSD_ULD
-+	select EXPORTFS_OSD_LAYOUT if PNFSD
- 	help
- 	  EXOFS is a file system that uses an OSD storage device,
- 	  as its backing storage.
-diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
-index 2dc925f..3c03f2d 100644
---- a/fs/exofs/exofs.h
-+++ b/fs/exofs/exofs.h
+diff -up linux-2.6.37.noarch/fs/exofs/exofs.h.orig linux-2.6.37.noarch/fs/exofs/exofs.h
+--- linux-2.6.37.noarch/fs/exofs/exofs.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/exofs/exofs.h	2011-01-28 09:43:53.296779718 -0500
 @@ -36,13 +36,9 @@
  #include <linux/fs.h>
  #include <linux/time.h>
@@ -439,7 +310,7 @@ index 2dc925f..3c03f2d 100644
  	unsigned long  i_flags;            /* various atomic flags            */
  	uint32_t       i_data[EXOFS_IDATA];/*short symlink names and device #s*/
  	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
-@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_size(unsigned numdevs)
+@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si
   */
  #define OBJ_2BCREATED	0	/* object will be created soon*/
  #define OBJ_CREATED	1	/* object has been created on the osd*/
@@ -449,7 +320,7 @@ index 2dc925f..3c03f2d 100644
  
  static inline int obj_2bcreated(struct exofs_i_info *oi)
  {
-@@ -303,4 +303,21 @@ extern const struct inode_operations exofs_special_inode_operations;
+@@ -303,4 +303,21 @@ extern const struct inode_operations exo
  extern const struct inode_operations exofs_symlink_inode_operations;
  extern const struct inode_operations exofs_fast_symlink_inode_operations;
  
@@ -471,11 +342,9 @@ index 2dc925f..3c03f2d 100644
 +#endif
 +
  #endif
-diff --git a/fs/exofs/export.c b/fs/exofs/export.c
-new file mode 100644
-index 0000000..69bce46
---- /dev/null
-+++ b/fs/exofs/export.c
+diff -up linux-2.6.37.noarch/fs/exofs/export.c.orig linux-2.6.37.noarch/fs/exofs/export.c
+--- linux-2.6.37.noarch/fs/exofs/export.c.orig	2011-01-28 09:43:53.297779480 -0500
++++ linux-2.6.37.noarch/fs/exofs/export.c	2011-01-28 09:43:53.297779480 -0500
 @@ -0,0 +1,396 @@
 +/*
 + * export.c - Implementation of the pnfs_export_operations
@@ -873,11 +742,10 @@ index 0000000..69bce46
 +{
 +	sb->s_pnfs_op = &exofs_pnfs_ops;
 +}
-diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
-index 3eadd97..1cf2286 100644
---- a/fs/exofs/inode.c
-+++ b/fs/exofs/inode.c
-@@ -826,8 +826,9 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode)
+diff -up linux-2.6.37.noarch/fs/exofs/inode.c.orig linux-2.6.37.noarch/fs/exofs/inode.c
+--- linux-2.6.37.noarch/fs/exofs/inode.c.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/exofs/inode.c	2011-01-28 09:43:53.298779243 -0500
+@@ -820,8 +820,9 @@ static inline int exofs_inode_is_fast_sy
  const struct osd_attr g_attr_logical_length = ATTR_DEF(
  	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
  
@@ -888,7 +756,7 @@ index 3eadd97..1cf2286 100644
  	struct exofs_i_info *oi = exofs_i(inode);
  	int ret;
  
-@@ -864,7 +865,8 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
+@@ -858,7 +859,8 @@ int exofs_setattr(struct dentry *dentry,
  
  	if ((iattr->ia_valid & ATTR_SIZE) &&
  	    iattr->ia_size != i_size_read(inode)) {
@@ -898,7 +766,7 @@ index 3eadd97..1cf2286 100644
  		if (unlikely(error))
  			return error;
  	}
-@@ -977,6 +979,7 @@ static void __oi_init(struct exofs_i_info *oi)
+@@ -971,6 +973,7 @@ static void __oi_init(struct exofs_i_inf
  {
  	init_waitqueue_head(&oi->i_wq);
  	oi->i_flags = 0;
@@ -906,62 +774,30 @@ index 3eadd97..1cf2286 100644
  }
  /*
   * Fill in an inode read from the OSD and set it up for use
-diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
-deleted file mode 100644
-index c52e988..0000000
---- a/fs/exofs/pnfs.h
-+++ /dev/null
-@@ -1,45 +0,0 @@
--/*
-- * Copyright (C) 2008, 2009
-- * Boaz Harrosh <bharrosh at panasas.com>
-- *
-- * This file is part of exofs.
-- *
-- * exofs is free software; you can redistribute it and/or modify it under the
-- * terms of the GNU General Public License  version 2 as published by the Free
-- * Software Foundation.
-- *
-- */
--
--/* FIXME: Remove this file once pnfs hits mainline */
--
--#ifndef __EXOFS_PNFS_H__
--#define __EXOFS_PNFS_H__
--
--#if ! defined(__PNFS_OSD_XDR_H__)
--
--enum pnfs_iomode {
--	IOMODE_READ = 1,
--	IOMODE_RW = 2,
--	IOMODE_ANY = 3,
--};
--
--/* Layout Structure */
--enum pnfs_osd_raid_algorithm4 {
--	PNFS_OSD_RAID_0		= 1,
--	PNFS_OSD_RAID_4		= 2,
--	PNFS_OSD_RAID_5		= 3,
--	PNFS_OSD_RAID_PQ	= 4     /* Reed-Solomon P+Q */
--};
--
--struct pnfs_osd_data_map {
--	u32	odm_num_comps;
--	u64	odm_stripe_unit;
--	u32	odm_group_width;
--	u32	odm_group_depth;
--	u32	odm_mirror_cnt;
--	u32	odm_raid_algorithm;
--};
--
--#endif /* ! defined(__PNFS_OSD_XDR_H__) */
--
--#endif /* __EXOFS_PNFS_H__ */
-diff --git a/fs/exofs/super.c b/fs/exofs/super.c
-index 047e92f..623aa55 100644
---- a/fs/exofs/super.c
-+++ b/fs/exofs/super.c
-@@ -620,6 +620,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
+diff -up linux-2.6.37.noarch/fs/exofs/Kbuild.orig linux-2.6.37.noarch/fs/exofs/Kbuild
+--- linux-2.6.37.noarch/fs/exofs/Kbuild.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/exofs/Kbuild	2011-01-28 09:43:53.295779958 -0500
+@@ -13,4 +13,5 @@
+ #
+ 
+ exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
++exofs-$(CONFIG_PNFSD) +=  export.o
+ obj-$(CONFIG_EXOFS_FS) += exofs.o
+diff -up linux-2.6.37.noarch/fs/exofs/Kconfig.orig linux-2.6.37.noarch/fs/exofs/Kconfig
+--- linux-2.6.37.noarch/fs/exofs/Kconfig.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/exofs/Kconfig	2011-01-28 09:43:53.295779958 -0500
+@@ -1,6 +1,7 @@
+ config EXOFS_FS
+ 	tristate "exofs: OSD based file system support"
+ 	depends on SCSI_OSD_ULD
++	select EXPORTFS_OSD_LAYOUT if PNFSD
+ 	help
+ 	  EXOFS is a file system that uses an OSD storage device,
+ 	  as its backing storage.
+diff -up linux-2.6.37.noarch/fs/exofs/super.c.orig linux-2.6.37.noarch/fs/exofs/super.c
+--- linux-2.6.37.noarch/fs/exofs/super.c.orig	2011-01-28 09:37:32.381985535 -0500
++++ linux-2.6.37.noarch/fs/exofs/super.c	2011-01-28 09:43:53.300778781 -0500
+@@ -627,6 +627,7 @@ static int exofs_fill_super(struct super
  	sb->s_fs_info = sbi;
  	sb->s_op = &exofs_sops;
  	sb->s_export_op = &exofs_export_ops;
@@ -969,23 +805,9 @@ index 047e92f..623aa55 100644
  	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
  	if (IS_ERR(root)) {
  		EXOFS_ERR("ERROR: exofs_iget failed\n");
-diff --git a/fs/exportfs/Makefile b/fs/exportfs/Makefile
-index d7c5d4d..51e8ee4 100644
---- a/fs/exportfs/Makefile
-+++ b/fs/exportfs/Makefile
-@@ -3,4 +3,7 @@
- 
- obj-$(CONFIG_EXPORTFS) += exportfs.o
- 
--exportfs-objs := expfs.o
-+exportfs-y				:= expfs.o
-+exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT)	+= nfs4filelayoutxdr.o
-+exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT)	+= pnfs_osd_xdr_srv.o
-+exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o
-diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
-index e9e1759..a10949a 100644
---- a/fs/exportfs/expfs.c
-+++ b/fs/exportfs/expfs.c
+diff -up linux-2.6.37.noarch/fs/exportfs/expfs.c.orig linux-2.6.37.noarch/fs/exportfs/expfs.c
+--- linux-2.6.37.noarch/fs/exportfs/expfs.c.orig	2011-01-28 09:37:32.382985501 -0500
++++ linux-2.6.37.noarch/fs/exportfs/expfs.c	2011-01-28 09:43:53.301778557 -0500
 @@ -16,6 +16,13 @@
  #include <linux/namei.h>
  #include <linux/sched.h>
@@ -1000,11 +822,21 @@ index e9e1759..a10949a 100644
  #define dprintk(fmt, args...) do{}while(0)
  
  
-diff --git a/fs/exportfs/nfs4blocklayoutxdr.c b/fs/exportfs/nfs4blocklayoutxdr.c
-new file mode 100644
-index 0000000..439e647
---- /dev/null
-+++ b/fs/exportfs/nfs4blocklayoutxdr.c
+diff -up linux-2.6.37.noarch/fs/exportfs/Makefile.orig linux-2.6.37.noarch/fs/exportfs/Makefile
+--- linux-2.6.37.noarch/fs/exportfs/Makefile.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/exportfs/Makefile	2011-01-28 09:43:53.300778781 -0500
+@@ -3,4 +3,7 @@
+ 
+ obj-$(CONFIG_EXPORTFS) += exportfs.o
+ 
+-exportfs-objs := expfs.o
++exportfs-y				:= expfs.o
++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT)	+= nfs4filelayoutxdr.o
++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT)	+= pnfs_osd_xdr_srv.o
++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o
+diff -up linux-2.6.37.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.37.noarch/fs/exportfs/nfs4blocklayoutxdr.c
+--- linux-2.6.37.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig	2011-01-28 09:43:53.301778557 -0500
++++ linux-2.6.37.noarch/fs/exportfs/nfs4blocklayoutxdr.c	2011-01-28 09:43:53.301778557 -0500
 @@ -0,0 +1,158 @@
 +/*
 + *  linux/fs/nfsd/nfs4blocklayoutxdr.c
@@ -1164,11 +996,9 @@ index 0000000..439e647
 +	return NFS4_OK;
 +}
 +EXPORT_SYMBOL_GPL(blocklayout_encode_layout);
-diff --git a/fs/exportfs/nfs4filelayoutxdr.c b/fs/exportfs/nfs4filelayoutxdr.c
-new file mode 100644
-index 0000000..f63c311
---- /dev/null
-+++ b/fs/exportfs/nfs4filelayoutxdr.c
+diff -up linux-2.6.37.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.37.noarch/fs/exportfs/nfs4filelayoutxdr.c
+--- linux-2.6.37.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig	2011-01-28 09:43:53.302778335 -0500
++++ linux-2.6.37.noarch/fs/exportfs/nfs4filelayoutxdr.c	2011-01-28 09:43:53.302778335 -0500
 @@ -0,0 +1,218 @@
 +/*
 + *  Copyright (c) 2006 The Regents of the University of Michigan.
@@ -1388,11 +1218,9 @@ index 0000000..f63c311
 +	return nfserr;
 +}
 +EXPORT_SYMBOL(filelayout_encode_layout);
-diff --git a/fs/exportfs/pnfs_osd_xdr_srv.c b/fs/exportfs/pnfs_osd_xdr_srv.c
-new file mode 100644
-index 0000000..60df0df
---- /dev/null
-+++ b/fs/exportfs/pnfs_osd_xdr_srv.c
+diff -up linux-2.6.37.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.37.noarch/fs/exportfs/pnfs_osd_xdr_srv.c
+--- linux-2.6.37.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig	2011-01-28 09:43:53.303778113 -0500
++++ linux-2.6.37.noarch/fs/exportfs/pnfs_osd_xdr_srv.c	2011-01-28 09:43:53.303778113 -0500
 @@ -0,0 +1,289 @@
 +/*
 + *  pnfs_osd_xdr_enc.c
@@ -1683,10 +1511,9 @@ index 0000000..60df0df
 +	return p;
 +}
 +EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr);
-diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
-index 4d4b1e8..efb0a44 100644
---- a/fs/gfs2/ops_fstype.c
-+++ b/fs/gfs2/ops_fstype.c
+diff -up linux-2.6.37.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.37.noarch/fs/gfs2/ops_fstype.c
+--- linux-2.6.37.noarch/fs/gfs2/ops_fstype.c.orig	2011-01-28 09:37:32.445983313 -0500
++++ linux-2.6.37.noarch/fs/gfs2/ops_fstype.c	2011-01-28 09:43:53.303778113 -0500
 @@ -18,6 +18,7 @@
  #include <linux/mount.h>
  #include <linux/gfs2_ondisk.h>
@@ -1695,9 +1522,9 @@ index 4d4b1e8..efb0a44 100644
  
  #include "gfs2.h"
  #include "incore.h"
-@@ -1166,6 +1167,9 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
- 	sb->s_magic = GFS2_MAGIC;
+@@ -1107,6 +1108,9 @@ static int fill_super(struct super_block
  	sb->s_op = &gfs2_super_ops;
+ 	sb->s_d_op = &gfs2_dops;
  	sb->s_export_op = &gfs2_export_ops;
 +#if defined(CONFIG_PNFSD)
 +	sb->s_pnfs_op = &pnfs_dlm_export_ops;
@@ -1705,123 +1532,41 @@ index 4d4b1e8..efb0a44 100644
  	sb->s_xattr = gfs2_xattr_handlers;
  	sb->s_qcop = &gfs2_quotactl_ops;
  	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
-diff --git a/fs/inode.c b/fs/inode.c
-index 8646433..e415be4 100644
---- a/fs/inode.c
-+++ b/fs/inode.c
-@@ -172,15 +172,21 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
- 	mapping->writeback_index = 0;
+diff -up linux-2.6.37.noarch/fs/Kconfig.orig linux-2.6.37.noarch/fs/Kconfig
+--- linux-2.6.37.noarch/fs/Kconfig.orig	2011-01-28 09:37:32.257989837 -0500
++++ linux-2.6.37.noarch/fs/Kconfig	2011-01-28 09:43:53.295779958 -0500
+@@ -49,6 +49,28 @@ config FS_POSIX_ACL
+ config EXPORTFS
+ 	tristate
  
- 	/*
--	 * If the block_device provides a backing_dev_info for client
--	 * inodes then use that.  Otherwise the inode share the bdev's
--	 * backing_dev_info.
-+	 * If the filesystem provides a backing_dev_info for client inodes
-+	 * then use that. Otherwise inodes share default_backing_dev_info.
- 	 */
--	if (sb->s_bdev) {
--		struct backing_dev_info *bdi;
--
--		bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
--		mapping->backing_dev_info = bdi;
-+	if (sb->s_bdi && sb->s_bdi != &noop_backing_dev_info) {
-+		/*
-+		 * Catch cases where filesystem might be bitten by using s_bdi
-+		 * instead of sb->s_bdev. Can be removed in 2.6.38.
-+		 */
-+		if (sb->s_bdev) {
-+			struct backing_dev_info *bdi =
-+			  sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
-+			WARN(bdi != sb->s_bdi, "s_bdev bdi %s != s_bdi %s\n",
-+			     bdi->name, sb->s_bdi->name);
-+		}
-+		mapping->backing_dev_info = sb->s_bdi;
- 	}
- 	inode->i_private = NULL;
- 	inode->i_mapping = mapping;
-diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
-index f7e13db..0da8d28 100644
---- a/fs/nfs/Kconfig
-+++ b/fs/nfs/Kconfig
-@@ -76,10 +76,42 @@ config NFS_V4
- 
- config NFS_V4_1
- 	bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
--	depends on NFS_V4 && EXPERIMENTAL
-+	depends on NFS_FS && NFS_V4 && EXPERIMENTAL
-+	select PNFS_FILE_LAYOUT
- 	help
- 	  This option enables support for minor version 1 of the NFSv4 protocol
--	  (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
-+	  (RFC 5661) in the kernel's NFS client.
-+
-+	  If unsure, say N.
-+
-+config PNFS_FILE_LAYOUT
-+	tristate
-+
-+config PNFS_OBJLAYOUT
-+	tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
-+	depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
++config EXPORTFS_FILE_LAYOUT
++	bool
++	depends on PNFSD && EXPORTFS
 +	help
-+	  Say M here if you want your pNFS client to support the Objects Layout Driver.
-+	  Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
-+	  upper level driver (SCSI_OSD_ULD).
-+
-+	  If unsure, say N.
++	  Exportfs support for the NFSv4.1 files layout type.
++	  Must be automatically selected by supporting filesystems.
 +
-+config PNFS_PANLAYOUT
-+	tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
-+	depends on PNFS_OBJLAYOUT
++config EXPORTFS_OSD_LAYOUT
++	bool
++	depends on PNFSD && EXPORTFS
 +	help
-+	  Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver.
-+
-+	  If unsure, say N.
++	  Exportfs support for the NFSv4.1 objects layout type.
++	  Must be automatically selected by supporting osd
++	  filesystems.
 +
-+config PNFS_BLOCK
-+	tristate "Provide a pNFS block client (EXPERIMENTAL)"
-+	depends on NFS_FS && NFS_V4_1
-+	select MD
-+	select BLK_DEV_DM
++config EXPORTFS_BLOCK_LAYOUT
++	bool
++	depends on PNFSD && EXPORTFS
 +	help
-+	  Say M or y here if you want your pNfs client to support the block protocol
- 
- 	  If unsure, say N.
- 
-diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
-index da7fda6..e68c498 100644
---- a/fs/nfs/Makefile
-+++ b/fs/nfs/Makefile
-@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
- 			   delegation.o idmap.o \
- 			   callback.o callback_xdr.o callback_proc.o \
- 			   nfs4namespace.o
-+nfs-$(CONFIG_NFS_V4_1)	+= pnfs.o
- nfs-$(CONFIG_SYSCTL) += sysctl.o
- nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
-+
-+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
-+nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
++	  Exportfs support for the NFSv4.1 blocks layout type.
++	  Must be automatically selected by supporting filesystems.
 +
-+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
-+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
-diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
-new file mode 100644
-index 0000000..5a4bf3d
---- /dev/null
-+++ b/fs/nfs/blocklayout/Makefile
-@@ -0,0 +1,6 @@
-+#
-+# Makefile for the pNFS block layout driver kernel module
-+#
-+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-+blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \
-+			extents.o block-device-discovery-pipe.o
-diff --git a/fs/nfs/blocklayout/block-device-discovery-pipe.c b/fs/nfs/blocklayout/block-device-discovery-pipe.c
-new file mode 100644
-index 0000000..e4c199f
---- /dev/null
-+++ b/fs/nfs/blocklayout/block-device-discovery-pipe.c
+ config FILE_LOCKING
+ 	bool "Enable POSIX file locking API" if EXPERT
+ 	default y
+diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.37.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c
+--- linux-2.6.37.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig	2011-01-28 09:43:53.306777474 -0500
++++ linux-2.6.37.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c	2011-01-28 09:43:53.306777474 -0500
 @@ -0,0 +1,66 @@
 +#include <linux/module.h>
 +#include <linux/uaccess.h>
@@ -1889,11 +1634,9 @@ index 0000000..e4c199f
 +	pipefs_closepipe(bl_device_pipe);
 +	return;
 +}
-diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
-new file mode 100644
-index 0000000..b3ab4cb
---- /dev/null
-+++ b/fs/nfs/blocklayout/blocklayout.c
+diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.c
+--- linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.c.orig	2011-01-28 09:43:53.307777263 -0500
++++ linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.c	2011-01-28 09:43:53.307777263 -0500
 @@ -0,0 +1,1146 @@
 +/*
 + *  linux/fs/nfs/blocklayout/blocklayout.c
@@ -2550,11 +2293,11 @@ index 0000000..b3ab4cb
 +
 +static void
 +bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo,
-+			struct nfs4_layoutcommit_args *arg, int status)
++			struct nfs4_layoutcommit_data *lcdata)
 +{
 +	dprintk("%s enter\n", __func__);
-+	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), arg, status);
-+	kfree(arg->layoutdriver_data);
++	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
++	kfree(lcdata->args.layoutdriver_data);
 +}
 +
 +static void free_blk_mountid(struct block_mount_id *mid)
@@ -2649,7 +2392,7 @@ index 0000000..b3ab4cb
 + * Retrieve the list of available devices for the mountpoint.
 + */
 +static int
-+bl_initialize_mountpoint(struct nfs_server *server, const struct nfs_fh *fh)
++bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
 +{
 +	struct block_mount_id *b_mt_id = NULL;
 +	struct pnfs_mount_type *mtype = NULL;
@@ -2716,7 +2459,7 @@ index 0000000..b3ab4cb
 +}
 +
 +static int
-+bl_uninitialize_mountpoint(struct nfs_server *server)
++bl_clear_layoutdriver(struct nfs_server *server)
 +{
 +	struct block_mount_id *b_mt_id = server->pnfs_ld_data;
 +
@@ -3013,8 +2756,8 @@ index 0000000..b3ab4cb
 +	.setup_layoutcommit		= bl_setup_layoutcommit,
 +	.encode_layoutcommit		= bl_encode_layoutcommit,
 +	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
-+	.initialize_mountpoint		= bl_initialize_mountpoint,
-+	.uninitialize_mountpoint	= bl_uninitialize_mountpoint,
++	.set_layoutdriver		= bl_set_layoutdriver,
++	.clear_layoutdriver		= bl_clear_layoutdriver,
 +	.pg_test			= bl_pg_test,
 +};
 +
@@ -3041,16 +2784,14 @@ index 0000000..b3ab4cb
 +
 +module_init(nfs4blocklayout_init);
 +module_exit(nfs4blocklayout_exit);
-diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
-new file mode 100644
-index 0000000..9e7bd62
---- /dev/null
-+++ b/fs/nfs/blocklayout/blocklayout.h
-@@ -0,0 +1,302 @@
+diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdev.c
+--- linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig	2011-01-28 09:43:53.309776857 -0500
++++ linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdev.c	2011-01-28 09:43:53.309776857 -0500
+@@ -0,0 +1,334 @@
 +/*
-+ *  linux/fs/nfs/blocklayout/blocklayout.h
++ *  linux/fs/nfs/blocklayout/blocklayoutdev.c
 + *
-+ *  Module for the NFSv4.1 pNFS block layout driver.
++ *  Device operations for the pnfs nfs4 file layout driver.
 + *
 + *  Copyright (c) 2006 The Regents of the University of Michigan.
 + *  All rights reserved.
@@ -3078,293 +2819,323 @@ index 0000000..9e7bd62
 + * of the software, even if it has been or is hereafter advised of the
 + * possibility of such damages.
 + */
-+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
-+#define FS_NFS_NFS4BLOCKLAYOUT_H
++#include <linux/module.h>
++#include <linux/buffer_head.h> /* __bread */
 +
-+#include <linux/nfs_fs.h>
-+#include <linux/dm-ioctl.h> /* Needed for struct dm_ioctl*/
-+#include "../pnfs.h"
++#include <linux/genhd.h>
++#include <linux/blkdev.h>
++#include <linux/hash.h>
 +
-+#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9)
++#include "blocklayout.h"
 +
-+#define PG_pnfserr PG_owner_priv_1
-+#define PagePnfsErr(page)	test_bit(PG_pnfserr, &(page)->flags)
-+#define SetPagePnfsErr(page)	set_bit(PG_pnfserr, &(page)->flags)
-+#define ClearPagePnfsErr(page)	clear_bit(PG_pnfserr, &(page)->flags)
++#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 +
-+extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */
-+extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */
-+extern int dm_do_resume(struct dm_ioctl *param);
-+extern int dm_table_load(struct dm_ioctl *param, size_t param_size);
++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes)
++{
++	uint32_t *q = p + XDR_QUADLEN(nbytes);
++	if (unlikely(q > end || q < p))
++		return NULL;
++	return p;
++}
++EXPORT_SYMBOL(blk_overflow);
 +
-+struct block_mount_id {
-+	spinlock_t			bm_lock;    /* protects list */
-+	struct list_head		bm_devlist; /* holds pnfs_block_dev */
-+};
++/* Open a block_device by device number. */
++struct block_device *nfs4_blkdev_get(dev_t dev)
++{
++	struct block_device *bd;
 +
-+struct pnfs_block_dev {
-+	struct list_head		bm_node;
-+	struct nfs4_deviceid		bm_mdevid;    /* associated devid */
-+	struct block_device		*bm_mdev;     /* meta device itself */
-+};
++	dprintk("%s enter\n", __func__);
++	bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
++	if (IS_ERR(bd))
++		goto fail;
++	return bd;
++fail:
++	dprintk("%s failed to open device : %ld\n",
++			__func__, PTR_ERR(bd));
++	return NULL;
++}
 +
-+/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */
-+struct visible_block_device {
-+	struct list_head	vi_node;
-+	struct block_device	*vi_bdev;
-+	int			vi_mapped;
-+	int			vi_put_done;
-+};
++/*
++ * Release the block device
++ */
++int nfs4_blkdev_put(struct block_device *bdev)
++{
++	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
++			MINOR(bdev->bd_dev));
++	return blkdev_put(bdev, FMODE_READ);
++}
 +
-+enum blk_vol_type {
-+	PNFS_BLOCK_VOLUME_SIMPLE   = 0,	/* maps to a single LU */
-+	PNFS_BLOCK_VOLUME_SLICE    = 1,	/* slice of another volume */
-+	PNFS_BLOCK_VOLUME_CONCAT   = 2,	/* concatenation of multiple volumes */
-+	PNFS_BLOCK_VOLUME_STRIPE   = 3	/* striped across multiple volumes */
-+};
++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded
++ * in dev->dev_addr_buf.
++ */
++struct pnfs_block_dev *
++nfs4_blk_decode_device(struct nfs_server *server,
++		       struct pnfs_device *dev,
++		       struct list_head *sdlist)
++{
++	struct pnfs_block_dev *rv = NULL;
++	struct block_device *bd = NULL;
++	struct pipefs_hdr *msg = NULL, *reply = NULL;
++	uint32_t major, minor;
 +
-+/* All disk offset/lengths are stored in 512-byte sectors */
-+struct pnfs_blk_volume {
-+	uint32_t		bv_type;
-+	sector_t 		bv_size;
-+	struct pnfs_blk_volume 	**bv_vols;
-+	int 			bv_vol_n;
-+	union {
-+		dev_t			bv_dev;
-+		sector_t		bv_stripe_unit;
-+		sector_t 		bv_offset;
-+	};
-+};
++	dprintk("%s enter\n", __func__);
 +
-+/* Since components need not be aligned, cannot use sector_t */
-+struct pnfs_blk_sig_comp {
-+	int64_t 	bs_offset;  /* In bytes */
-+	uint32_t   	bs_length;  /* In bytes */
-+	char 		*bs_string;
-+};
-+
-+/* Maximum number of signatures components in a simple volume */
-+# define PNFS_BLOCK_MAX_SIG_COMP 16
-+
-+struct pnfs_blk_sig {
-+	int 				si_num_comps;
-+	struct pnfs_blk_sig_comp	si_comps[PNFS_BLOCK_MAX_SIG_COMP];
-+};
-+
-+enum exstate4 {
-+	PNFS_BLOCK_READWRITE_DATA	= 0,
-+	PNFS_BLOCK_READ_DATA		= 1,
-+	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */
-+	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
-+};
-+
-+#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
-+
-+struct my_tree_t {
-+	sector_t		mtt_step_size;	/* Internal sector alignment */
-+	struct list_head	mtt_stub; /* Should be a radix tree */
-+};
++	if (IS_ERR(bl_device_pipe))
++		return NULL;
++	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
++	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
++		dev->mincount);
++	msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area,
++				    dev->mincount);
++	if (IS_ERR(msg)) {
++		dprintk("ERROR: couldn't make pipefs message.\n");
++		goto out_err;
++	}
++	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
++	msg->status = BL_DEVICE_REQUEST_INIT;
 +
-+struct pnfs_inval_markings {
-+	spinlock_t	im_lock;
-+	struct my_tree_t im_tree;	/* Sectors that need LAYOUTCOMMIT */
-+	sector_t	im_block_size;	/* Server blocksize in sectors */
-+};
++	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
++	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
++					      &bl_device_list, 0, 0);
 +
-+struct pnfs_inval_tracking {
-+	struct list_head it_link;
-+	int		 it_sector;
-+	int		 it_tags;
-+};
++	if (IS_ERR(reply)) {
++		dprintk("ERROR: upcall_waitreply failed\n");
++		goto out_err;
++	}
++	if (reply->status != BL_DEVICE_REQUEST_PROC) {
++		dprintk("%s upcall failed, reply status %d\n",
++			__func__, (int)reply->status);
++		goto out_err;
++	}
++	memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t));
++	memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)),
++		sizeof(uint32_t));
++	bd = nfs4_blkdev_get(MKDEV(major, minor));
++	if (!bd) {
++		/* nfs4_blkdev_get() logs the errno itself and returns
++		 * NULL (never an ERR_PTR) on failure. */
++		goto out_err;
++	}
 +
-+/* sector_t fields are all in 512-byte sectors */
-+struct pnfs_block_extent {
-+	struct kref	be_refcnt;
-+	struct list_head be_node;	/* link into lseg list */
-+	struct nfs4_deviceid be_devid;  /* STUB - remevable??? */
-+	struct block_device *be_mdev;
-+	sector_t	be_f_offset;	/* the starting offset in the file */
-+	sector_t	be_length;	/* the size of the extent */
-+	sector_t	be_v_offset;	/* the starting offset in the volume */
-+	enum exstate4	be_state;	/* the state of this extent */
-+	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
-+};
++	rv = kzalloc(sizeof(*rv), GFP_KERNEL);
++	if (!rv)
++		goto out_err;
 +
-+/* Shortened extent used by LAYOUTCOMMIT */
-+struct pnfs_block_short_extent {
-+	struct list_head bse_node;
-+	struct nfs4_deviceid bse_devid;	/* STUB - removable??? */
-+	struct block_device *bse_mdev;
-+	sector_t	bse_f_offset;	/* the starting offset in the file */
-+	sector_t	bse_length;	/* the size of the extent */
-+};
++	rv->bm_mdev = bd;
++	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
++	dprintk("%s Created device %s with bd_block_size %u\n",
++		__func__,
++		bd->bd_disk->disk_name,
++		bd->bd_block_size);
++	kfree(reply);
++	kfree(msg);
++	return rv;
 +
-+static inline void
-+INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
-+{
-+	spin_lock_init(&marks->im_lock);
-+	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
-+	marks->im_block_size = blocksize;
-+	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
-+					   blocksize);
++out_err:
++	kfree(rv);
++	if (!IS_ERR(reply))
++		kfree(reply);
++	if (!IS_ERR(msg))
++		kfree(msg);
++	return NULL;
 +}
 +
-+enum extentclass4 {
-+	RW_EXTENT	= 0, /* READWRTE and INVAL */
-+	RO_EXTENT	= 1, /* READ and NONE */
-+	EXTENT_LISTS	= 2,
-+};
-+
-+static inline int choose_list(enum exstate4 state)
++/* Map deviceid returned by the server to constructed block_device */
++static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
++					    struct nfs4_deviceid *id)
 +{
-+	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
-+		return RO_EXTENT;
-+	else
-+		return RW_EXTENT;
++	struct block_device *rv = NULL;
++	struct block_mount_id *mid;
++	struct pnfs_block_dev *dev;
++
++	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
++	mid = BLK_ID(lo);
++	spin_lock(&mid->bm_lock);
++	list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
++		if (memcmp(id->data, dev->bm_mdevid.data,
++			   NFS4_DEVICEID4_SIZE) == 0) {
++			rv = dev->bm_mdev;
++			goto out;
++		}
++	}
++ out:
++	spin_unlock(&mid->bm_lock);
++	dprintk("%s returning %p\n", __func__, rv);
++	return rv;
 +}
 +
-+struct pnfs_block_layout {
-+	struct pnfs_layout_hdr bl_layout;
-+	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
-+	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
-+	struct list_head	bl_extents[EXTENT_LISTS]; /* R and RW extents */
-+	struct list_head	bl_commit;	/* Needs layout commit */
-+	unsigned int		bl_count;	/* entries in bl_commit */
-+	sector_t		bl_blocksize;  /* Server blocksize in sectors */
++/* Tracks info needed to ensure extents in layout obey constraints of spec */
++struct layout_verification {
++	u32 mode;	/* R or RW */
++	u64 start;	/* Expected start of next non-COW extent */
++	u64 inval;	/* Start of INVAL coverage */
++	u64 cowread;	/* End of COW read coverage */
 +};
 +
-+/* this struct is comunicated between:
-+ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit
++/* Verify the extent meets the layout requirements of the pnfs-block draft,
++ * section 2.3.1.
 + */
-+struct bl_layoutupdate_data {
-+	struct list_head ranges;
-+};
-+
-+#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->inode)->pnfs_ld_data))
-+
-+static inline struct pnfs_block_layout *
-+BLK_LO2EXT(struct pnfs_layout_hdr *lo)
++static int verify_extent(struct pnfs_block_extent *be,
++			 struct layout_verification *lv)
 +{
-+	return container_of(lo, struct pnfs_block_layout, bl_layout);
++	if (lv->mode == IOMODE_READ) {
++		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
++		    be->be_state == PNFS_BLOCK_INVALID_DATA)
++			return -EIO;
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		return 0;
++	}
++	/* lv->mode == IOMODE_RW */
++	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		if (lv->cowread > lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		lv->inval = lv->start;
++		return 0;
++	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
++		if (be->be_f_offset != lv->start)
++			return -EIO;
++		lv->start += be->be_length;
++		return 0;
++	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
++		if (be->be_f_offset > lv->start)
++			return -EIO;
++		if (be->be_f_offset < lv->inval)
++			return -EIO;
++		if (be->be_f_offset < lv->cowread)
++			return -EIO;
++		/* It looks like you might want to min this with lv->start,
++		 * but you really don't.
++		 */
++		lv->inval = lv->inval + be->be_length;
++		lv->cowread = be->be_f_offset + be->be_length;
++		return 0;
++	} else
++		return -EIO;
 +}
 +
-+static inline struct pnfs_block_layout *
-+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
++/* XDR decode pnfs_block_layout4 structure */
++int
++nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
++			   struct nfs4_layoutget_res *lgr)
 +{
-+	return BLK_LO2EXT(lseg->layout);
-+}
++	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
++	uint32_t *p = (uint32_t *)lgr->layout.buf;
++	uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len);
++	int i, status = -EIO;
++	uint32_t count;
++	struct pnfs_block_extent *be = NULL, *save;
++	uint64_t tmp; /* Used by READ_SECTOR */
++	struct layout_verification lv = {
++		.mode = lgr->range.iomode,
++		.start = lgr->range.offset >> 9,
++		.inval = lgr->range.offset >> 9,
++		.cowread = lgr->range.offset >> 9,
++	};
 +
-+uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes);
++	LIST_HEAD(extents);
 +
-+#define BLK_READBUF(p, e, nbytes)  do { \
-+	p = blk_overflow(p, e, nbytes); \
-+	if (!p) { \
-+		printk(KERN_WARNING \
-+			"%s: reply buffer overflowed in line %d.\n", \
-+			__func__, __LINE__); \
-+		goto out_err; \
-+	} \
-+} while (0)
++	BLK_READBUF(p, end, 4);
++	READ32(count);
 +
-+#define READ32(x)         (x) = ntohl(*p++)
-+#define READ64(x)         do {                  \
-+	(x) = (uint64_t)ntohl(*p++) << 32;           \
-+	(x) |= ntohl(*p++);                     \
-+} while (0)
-+#define COPYMEM(x, nbytes) do {                 \
-+	memcpy((x), p, nbytes);                 \
-+	p += XDR_QUADLEN(nbytes);               \
-+} while (0)
-+#define READ_DEVID(x)	COPYMEM((x)->data, NFS4_DEVICEID4_SIZE)
-+#define READ_SECTOR(x)     do { \
-+	READ64(tmp); \
-+	if (tmp & 0x1ff) { \
-+		printk(KERN_WARNING \
-+		       "%s Value not 512-byte aligned at line %d\n", \
-+		       __func__, __LINE__);			     \
-+		goto out_err; \
-+	} \
-+	(x) = tmp >> 9; \
-+} while (0)
-+
-+#define WRITE32(n)               do { \
-+	*p++ = htonl(n); \
-+	} while (0)
-+#define WRITE64(n)               do {                           \
-+	*p++ = htonl((uint32_t)((n) >> 32));			\
-+	*p++ = htonl((uint32_t)(n));				\
-+} while (0)
-+#define WRITEMEM(ptr, nbytes)     do {                          \
-+	p = xdr_encode_opaque_fixed(p, ptr, nbytes);	\
-+} while (0)
-+#define WRITE_DEVID(x)  WRITEMEM((x)->data, NFS4_DEVICEID4_SIZE)
-+
-+/* blocklayoutdev.c */
-+struct block_device *nfs4_blkdev_get(dev_t dev);
-+int nfs4_blkdev_put(struct block_device *bdev);
-+struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
-+					      struct pnfs_device *dev,
-+					      struct list_head *sdlist);
-+int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
-+			       struct nfs4_layoutget_res *lgr);
-+int nfs4_blk_create_block_disk_list(struct list_head *);
-+void nfs4_blk_destroy_disk_list(struct list_head *);
-+/* blocklayoutdm.c */
-+int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *);
-+void free_block_dev(struct pnfs_block_dev *bdev);
-+/* extents.c */
-+struct pnfs_block_extent *
-+find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
-+		struct pnfs_block_extent **cow_read);
-+int mark_initialized_sectors(struct pnfs_inval_markings *marks,
-+			     sector_t offset, sector_t length,
-+			     sector_t **pages);
-+void put_extent(struct pnfs_block_extent *be);
-+struct pnfs_block_extent *alloc_extent(void);
-+struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be);
-+int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect);
-+int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-+				   struct xdr_stream *xdr,
-+				   const struct nfs4_layoutcommit_args *arg);
-+void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-+				   const struct nfs4_layoutcommit_args *arg,
-+				   int status);
-+int add_and_merge_extent(struct pnfs_block_layout *bl,
-+			 struct pnfs_block_extent *new);
-+int mark_for_commit(struct pnfs_block_extent *be,
-+		    sector_t offset, sector_t length);
-+
-+#include <linux/sunrpc/simple_rpc_pipefs.h>
-+
-+extern struct pipefs_list bl_device_list;
-+extern struct dentry *bl_device_pipe;
-+
-+int bl_pipe_init(void);
-+void bl_pipe_exit(void);
++	dprintk("%s enter, number of extents %i\n", __func__, count);
++	BLK_READBUF(p, end, (28 + NFS4_DEVICEID4_SIZE) * count);
 +
-+#define BL_DEVICE_UMOUNT               0x0 /* Umount--delete devices */
-+#define BL_DEVICE_MOUNT                0x1 /* Mount--create devices*/
-+#define BL_DEVICE_REQUEST_INIT         0x0 /* Start request */
-+#define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */
-+#define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
++	/* Decode individual extents, putting them in temporary
++	 * staging area until whole layout is decoded to make error
++	 * recovery easier.
++	 */
++	for (i = 0; i < count; i++) {
++		be = alloc_extent();
++		if (!be) {
++			status = -ENOMEM;
++			goto out_err;
++		}
++		READ_DEVID(&be->be_devid);
++		be->be_mdev = translate_devid(lo, &be->be_devid);
++		if (!be->be_mdev)
++			goto out_err;
++		/* The next three values are read in as bytes,
++		 * but stored as 512-byte sector lengths
++		 */
++		READ_SECTOR(be->be_f_offset);
++		READ_SECTOR(be->be_length);
++		READ_SECTOR(be->be_v_offset);
++		READ32(be->be_state);
++		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
++			be->be_inval = &bl->bl_inval;
++		if (verify_extent(be, &lv)) {
++			dprintk("%s verify failed\n", __func__);
++			goto out_err;
++		}
++		list_add_tail(&be->be_node, &extents);
++	}
++	if (p != end) {
++		dprintk("%s Undecoded cruft at end of opaque\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	if (lgr->range.offset + lgr->range.length != lv.start << 9) {
++		dprintk("%s Final length mismatch\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	if (lv.start < lv.cowread) {
++		dprintk("%s Final uncovered COW extent\n", __func__);
++		be = NULL;
++		goto out_err;
++	}
++	/* Extents decoded properly, now try to merge them in to
++	 * existing layout extents.
++	 */
++	spin_lock(&bl->bl_ext_lock);
++	list_for_each_entry_safe(be, save, &extents, be_node) {
++		list_del(&be->be_node);
++		status = add_and_merge_extent(bl, be);
++		if (status) {
++			spin_unlock(&bl->bl_ext_lock);
++			/* This is a fairly catastrophic error, as the
++			 * entire layout extent lists are now corrupted.
++			 * We should have some way to distinguish this.
++			 */
++			be = NULL;
++			goto out_err;
++		}
++	}
++	spin_unlock(&bl->bl_ext_lock);
++	status = 0;
++ out:
++	dprintk("%s returns %i\n", __func__, status);
++	return status;
 +
-+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
-diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
-new file mode 100644
-index 0000000..17bd25a
---- /dev/null
-+++ b/fs/nfs/blocklayout/blocklayoutdev.c
-@@ -0,0 +1,335 @@
++ out_err:
++	put_extent(be);
++	while (!list_empty(&extents)) {
++		be = list_first_entry(&extents, struct pnfs_block_extent,
++				      be_node);
++		list_del(&be->be_node);
++		put_extent(be);
++	}
++	goto out;
++}
+diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdm.c
+--- linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig	2011-01-28 09:43:53.309776857 -0500
++++ linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayoutdm.c	2011-01-28 09:43:53.309776857 -0500
+@@ -0,0 +1,120 @@
 +/*
-+ *  linux/fs/nfs/blocklayout/blocklayoutdev.c
++ *  linux/fs/nfs/blocklayout/blocklayoutdm.c
 + *
-+ *  Device operations for the pnfs nfs4 file layout driver.
++ *  Module for the NFSv4.1 pNFS block layout driver.
 + *
-+ *  Copyright (c) 2006 The Regents of the University of Michigan.
++ *  Copyright (c) 2007 The Regents of the University of Michigan.
 + *  All rights reserved.
 + *
-+ *  Andy Adamson <andros at citi.umich.edu>
 + *  Fred Isaman <iisaman at umich.edu>
++ *  Andy Adamson <andros at citi.umich.edu>
 + *
 + * permission is granted to use, copy, create derivative works and
 + * redistribute this software and such derivative works for any purpose,
@@ -3386,326 +3157,109 @@ index 0000000..17bd25a
 + * of the software, even if it has been or is hereafter advised of the
 + * possibility of such damages.
 + */
-+#include <linux/module.h>
-+#include <linux/buffer_head.h> /* __bread */
 +
-+#include <linux/genhd.h>
-+#include <linux/blkdev.h>
++#include <linux/genhd.h> /* gendisk - used in a dprintk*/
++#include <linux/sched.h>
 +#include <linux/hash.h>
 +
 +#include "blocklayout.h"
 +
 +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 +
-+uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes)
-+{
-+	uint32_t *q = p + XDR_QUADLEN(nbytes);
-+	if (unlikely(q > end || q < p))
-+		return NULL;
-+	return p;
-+}
-+EXPORT_SYMBOL(blk_overflow);
-+
-+/* Open a block_device by device number. */
-+struct block_device *nfs4_blkdev_get(dev_t dev)
-+{
-+	struct block_device *bd;
-+
-+	dprintk("%s enter\n", __func__);
-+	bd = open_by_devnum(dev, FMODE_READ);
-+	if (IS_ERR(bd))
-+		goto fail;
-+	return bd;
-+fail:
-+	dprintk("%s failed to open device : %ld\n",
-+			__func__, PTR_ERR(bd));
-+	return NULL;
-+}
-+
-+/*
-+ * Release the block device
-+ */
-+int nfs4_blkdev_put(struct block_device *bdev)
-+{
-+	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
-+			MINOR(bdev->bd_dev));
-+	bd_release(bdev);
-+	return blkdev_put(bdev, FMODE_READ);
-+}
++/* Defines used for calculating memory usage in nfs4_blk_flatten() */
++#define ARGSIZE   24    /* Max bytes needed for linear target arg string */
++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE)
++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE)
++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \
++			    (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE)
++#define roundup8(x) (((x)+7) & ~7)
++#define sizeof8(x) roundup8(sizeof(x))
 +
-+/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded
-+ * in dev->dev_addr_buf.
-+ */
-+struct pnfs_block_dev *
-+nfs4_blk_decode_device(struct nfs_server *server,
-+		       struct pnfs_device *dev,
-+		       struct list_head *sdlist)
++static int dev_remove(dev_t dev)
 +{
-+	struct pnfs_block_dev *rv = NULL;
-+	struct block_device *bd = NULL;
++	int ret = 1;
 +	struct pipefs_hdr *msg = NULL, *reply = NULL;
-+	uint32_t major, minor;
++	uint64_t bl_dev;
++	uint32_t major = MAJOR(dev), minor = MINOR(dev);
 +
-+	dprintk("%s enter\n", __func__);
++	dprintk("Entering %s\n", __func__);
 +
 +	if (IS_ERR(bl_device_pipe))
-+		return NULL;
-+	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
-+	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
-+		dev->mincount);
-+	msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area,
-+				    dev->mincount);
++		return ret;
++
++	memcpy((void *)&bl_dev, &major, sizeof(uint32_t));
++	memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t));
++	msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev,
++				    sizeof(uint64_t));
 +	if (IS_ERR(msg)) {
 +		dprintk("ERROR: couldn't make pipefs message.\n");
-+		goto out_err;
++		goto out;
 +	}
 +	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
 +	msg->status = BL_DEVICE_REQUEST_INIT;
 +
-+	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
 +	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
 +					      &bl_device_list, 0, 0);
-+
 +	if (IS_ERR(reply)) {
 +		dprintk("ERROR: upcall_waitreply failed\n");
-+		goto out_err;
-+	}
-+	if (reply->status != BL_DEVICE_REQUEST_PROC) {
-+		dprintk("%s failed to open device: %ld\n",
-+			__func__, PTR_ERR(bd));
-+		goto out_err;
-+	}
-+	memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t));
-+	memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)),
-+		sizeof(uint32_t));
-+	bd = nfs4_blkdev_get(MKDEV(major, minor));
-+	if (IS_ERR(bd)) {
-+		dprintk("%s failed to open device : %ld\n",
-+			__func__, PTR_ERR(bd));
-+		goto out_err;
++		goto out;
 +	}
 +
-+	rv = kzalloc(sizeof(*rv), GFP_KERNEL);
-+	if (!rv)
-+		goto out_err;
-+
-+	rv->bm_mdev = bd;
-+	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
-+	dprintk("%s Created device %s with bd_block_size %u\n",
-+		__func__,
-+		bd->bd_disk->disk_name,
-+		bd->bd_block_size);
-+	kfree(reply);
-+	kfree(msg);
-+	return rv;
-+
-+out_err:
-+	kfree(rv);
++	if (reply->status == BL_DEVICE_REQUEST_PROC)
++		ret = 0; /*TODO: what to return*/
++out:
 +	if (!IS_ERR(reply))
 +		kfree(reply);
 +	if (!IS_ERR(msg))
 +		kfree(msg);
-+	return NULL;
++	return ret;
 +}
 +
-+/* Map deviceid returned by the server to constructed block_device */
-+static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
-+					    struct nfs4_deviceid *id)
++/*
++ * Release meta device
++ */
++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
 +{
-+	struct block_device *rv = NULL;
-+	struct block_mount_id *mid;
-+	struct pnfs_block_dev *dev;
-+
-+	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
-+	mid = BLK_ID(lo);
-+	spin_lock(&mid->bm_lock);
-+	list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
-+		if (memcmp(id->data, dev->bm_mdevid.data,
-+			   NFS4_DEVICEID4_SIZE) == 0) {
-+			rv = dev->bm_mdev;
-+			goto out;
-+		}
-+	}
-+ out:
-+	spin_unlock(&mid->bm_lock);
-+	dprintk("%s returning %p\n", __func__, rv);
-+	return rv;
-+}
++	int rv;
 +
-+/* Tracks info needed to ensure extents in layout obey constraints of spec */
-+struct layout_verification {
-+	u32 mode;	/* R or RW */
-+	u64 start;	/* Expected start of next non-COW extent */
-+	u64 inval;	/* Start of INVAL coverage */
-+	u64 cowread;	/* End of COW read coverage */
-+};
++	dprintk("%s Releasing\n", __func__);
++	/* XXX Check return? */
++	rv = nfs4_blkdev_put(bdev->bm_mdev);
++	dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv);
 +
-+/* Verify the extent meets the layout requirements of the pnfs-block draft,
-+ * section 2.3.1.
-+ */
-+static int verify_extent(struct pnfs_block_extent *be,
-+			 struct layout_verification *lv)
-+{
-+	if (lv->mode == IOMODE_READ) {
-+		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
-+		    be->be_state == PNFS_BLOCK_INVALID_DATA)
-+			return -EIO;
-+		if (be->be_f_offset != lv->start)
-+			return -EIO;
-+		lv->start += be->be_length;
-+		return 0;
-+	}
-+	/* lv->mode == IOMODE_RW */
-+	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
-+		if (be->be_f_offset != lv->start)
-+			return -EIO;
-+		if (lv->cowread > lv->start)
-+			return -EIO;
-+		lv->start += be->be_length;
-+		lv->inval = lv->start;
-+		return 0;
-+	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-+		if (be->be_f_offset != lv->start)
-+			return -EIO;
-+		lv->start += be->be_length;
-+		return 0;
-+	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
-+		if (be->be_f_offset > lv->start)
-+			return -EIO;
-+		if (be->be_f_offset < lv->inval)
-+			return -EIO;
-+		if (be->be_f_offset < lv->cowread)
-+			return -EIO;
-+		/* It looks like you might want to min this with lv->start,
-+		 * but you really don't.
-+		 */
-+		lv->inval = lv->inval + be->be_length;
-+		lv->cowread = be->be_f_offset + be->be_length;
-+		return 0;
-+	} else
-+		return -EIO;
++	rv = dev_remove(bdev->bm_mdev->bd_dev);
++	dprintk("%s Returns %d\n", __func__, rv);
++	return rv;
 +}
 +
-+/* XDR decode pnfs_block_layout4 structure */
-+int
-+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
-+			   struct nfs4_layoutget_res *lgr)
++void free_block_dev(struct pnfs_block_dev *bdev)
 +{
-+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
-+	uint32_t *p = (uint32_t *)lgr->layout.buf;
-+	uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len);
-+	int i, status = -EIO;
-+	uint32_t count;
-+	struct pnfs_block_extent *be = NULL, *save;
-+	uint64_t tmp; /* Used by READSECTOR */
-+	struct layout_verification lv = {
-+		.mode = lgr->range.iomode,
-+		.start = lgr->range.offset >> 9,
-+		.inval = lgr->range.offset >> 9,
-+		.cowread = lgr->range.offset >> 9,
-+	};
-+
-+	LIST_HEAD(extents);
-+
-+	BLK_READBUF(p, end, 4);
-+	READ32(count);
-+
-+	dprintk("%s enter, number of extents %i\n", __func__, count);
-+	BLK_READBUF(p, end, (28 + NFS4_DEVICEID4_SIZE) * count);
-+
-+	/* Decode individual extents, putting them in temporary
-+	 * staging area until whole layout is decoded to make error
-+	 * recovery easier.
-+	 */
-+	for (i = 0; i < count; i++) {
-+		be = alloc_extent();
-+		if (!be) {
-+			status = -ENOMEM;
-+			goto out_err;
-+		}
-+		READ_DEVID(&be->be_devid);
-+		be->be_mdev = translate_devid(lo, &be->be_devid);
-+		if (!be->be_mdev)
-+			goto out_err;
-+		/* The next three values are read in as bytes,
-+		 * but stored as 512-byte sector lengths
-+		 */
-+		READ_SECTOR(be->be_f_offset);
-+		READ_SECTOR(be->be_length);
-+		READ_SECTOR(be->be_v_offset);
-+		READ32(be->be_state);
-+		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
-+			be->be_inval = &bl->bl_inval;
-+		if (verify_extent(be, &lv)) {
-+			dprintk("%s verify failed\n", __func__);
-+			goto out_err;
-+		}
-+		list_add_tail(&be->be_node, &extents);
-+	}
-+	if (p != end) {
-+		dprintk("%s Undecoded cruft at end of opaque\n", __func__);
-+		be = NULL;
-+		goto out_err;
-+	}
-+	if (lgr->range.offset + lgr->range.length != lv.start << 9) {
-+		dprintk("%s Final length mismatch\n", __func__);
-+		be = NULL;
-+		goto out_err;
-+	}
-+	if (lv.start < lv.cowread) {
-+		dprintk("%s Final uncovered COW extent\n", __func__);
-+		be = NULL;
-+		goto out_err;
-+	}
-+	/* Extents decoded properly, now try to merge them in to
-+	 * existing layout extents.
-+	 */
-+	spin_lock(&bl->bl_ext_lock);
-+	list_for_each_entry_safe(be, save, &extents, be_node) {
-+		list_del(&be->be_node);
-+		status = add_and_merge_extent(bl, be);
-+		if (status) {
-+			spin_unlock(&bl->bl_ext_lock);
-+			/* This is a fairly catastrophic error, as the
-+			 * entire layout extent lists are now corrupted.
-+			 * We should have some way to distinguish this.
-+			 */
-+			be = NULL;
-+			goto out_err;
++	if (bdev) {
++		if (bdev->bm_mdev) {
++			dprintk("%s Removing DM device: %d:%d\n",
++				__func__,
++				MAJOR(bdev->bm_mdev->bd_dev),
++				MINOR(bdev->bm_mdev->bd_dev));
++			/* XXX Check status ?? */
++			nfs4_blk_metadev_release(bdev);
 +		}
++		kfree(bdev);
 +	}
-+	spin_unlock(&bl->bl_ext_lock);
-+	status = 0;
-+ out:
-+	dprintk("%s returns %i\n", __func__, status);
-+	return status;
-+
-+ out_err:
-+	put_extent(be);
-+	while (!list_empty(&extents)) {
-+		be = list_first_entry(&extents, struct pnfs_block_extent,
-+				      be_node);
-+		list_del(&be->be_node);
-+		put_extent(be);
-+	}
-+	goto out;
 +}
-diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
-new file mode 100644
-index 0000000..097dd05
---- /dev/null
-+++ b/fs/nfs/blocklayout/blocklayoutdm.c
-@@ -0,0 +1,120 @@
+diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.h
+--- linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.h.orig	2011-01-28 09:43:53.308777059 -0500
++++ linux-2.6.37.noarch/fs/nfs/blocklayout/blocklayout.h	2011-01-28 09:43:53.308777059 -0500
+@@ -0,0 +1,302 @@
 +/*
-+ *  linux/fs/nfs/blocklayout/blocklayoutdm.c
++ *  linux/fs/nfs/blocklayout/blocklayout.h
 + *
 + *  Module for the NFSv4.1 pNFS block layout driver.
 + *
-+ *  Copyright (c) 2007 The Regents of the University of Michigan.
++ *  Copyright (c) 2006 The Regents of the University of Michigan.
 + *  All rights reserved.
 + *
-+ *  Fred Isaman <iisaman at umich.edu>
 + *  Andy Adamson <andros at citi.umich.edu>
++ *  Fred Isaman <iisaman at umich.edu>
 + *
 + * permission is granted to use, copy, create derivative works and
 + * redistribute this software and such derivative works for any purpose,
@@ -3727,181 +3281,361 @@ index 0000000..097dd05
 + * of the software, even if it has been or is hereafter advised of the
 + * possibility of such damages.
 + */
++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
++#define FS_NFS_NFS4BLOCKLAYOUT_H
 +
-+#include <linux/genhd.h> /* gendisk - used in a dprintk*/
-+#include <linux/sched.h>
-+#include <linux/hash.h>
++#include <linux/nfs_fs.h>
++#include <linux/dm-ioctl.h> /* Needed for struct dm_ioctl*/
++#include "../pnfs.h"
 +
-+#include "blocklayout.h"
++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9)
 +
-+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
++#define PG_pnfserr PG_owner_priv_1
++#define PagePnfsErr(page)	test_bit(PG_pnfserr, &(page)->flags)
++#define SetPagePnfsErr(page)	set_bit(PG_pnfserr, &(page)->flags)
++#define ClearPagePnfsErr(page)	clear_bit(PG_pnfserr, &(page)->flags)
 +
-+/* Defines used for calculating memory usage in nfs4_blk_flatten() */
-+#define ARGSIZE   24    /* Max bytes needed for linear target arg string */
-+#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE)
-+#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE)
-+#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \
-+			    (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE)
-+#define roundup8(x) (((x)+7) & ~7)
-+#define sizeof8(x) roundup8(sizeof(x))
++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */
++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */
++extern int dm_do_resume(struct dm_ioctl *param);
++extern int dm_table_load(struct dm_ioctl *param, size_t param_size);
 +
-+static int dev_remove(dev_t dev)
-+{
-+	int ret = 1;
-+	struct pipefs_hdr *msg = NULL, *reply = NULL;
-+	uint64_t bl_dev;
-+	uint32_t major = MAJOR(dev), minor = MINOR(dev);
++struct block_mount_id {
++	spinlock_t			bm_lock;    /* protects list */
++	struct list_head		bm_devlist; /* holds pnfs_block_dev */
++};
 +
-+	dprintk("Entering %s\n", __func__);
++struct pnfs_block_dev {
++	struct list_head		bm_node;
++	struct nfs4_deviceid		bm_mdevid;    /* associated devid */
++	struct block_device		*bm_mdev;     /* meta device itself */
++};
 +
-+	if (IS_ERR(bl_device_pipe))
-+		return ret;
++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */
++struct visible_block_device {
++	struct list_head	vi_node;
++	struct block_device	*vi_bdev;
++	int			vi_mapped;
++	int			vi_put_done;
++};
 +
-+	memcpy((void *)&bl_dev, &major, sizeof(uint32_t));
-+	memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t));
-+	msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev,
-+				    sizeof(uint64_t));
-+	if (IS_ERR(msg)) {
-+		dprintk("ERROR: couldn't make pipefs message.\n");
-+		goto out;
-+	}
-+	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
-+	msg->status = BL_DEVICE_REQUEST_INIT;
++enum blk_vol_type {
++	PNFS_BLOCK_VOLUME_SIMPLE   = 0,	/* maps to a single LU */
++	PNFS_BLOCK_VOLUME_SLICE    = 1,	/* slice of another volume */
++	PNFS_BLOCK_VOLUME_CONCAT   = 2,	/* concatenation of multiple volumes */
++	PNFS_BLOCK_VOLUME_STRIPE   = 3	/* striped across multiple volumes */
++};
 +
-+	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
-+					      &bl_device_list, 0, 0);
-+	if (IS_ERR(reply)) {
-+		dprintk("ERROR: upcall_waitreply failed\n");
-+		goto out;
-+	}
++/* All disk offset/lengths are stored in 512-byte sectors */
++struct pnfs_blk_volume {
++	uint32_t		bv_type;
++	sector_t 		bv_size;
++	struct pnfs_blk_volume 	**bv_vols;
++	int 			bv_vol_n;
++	union {
++		dev_t			bv_dev;
++		sector_t		bv_stripe_unit;
++		sector_t 		bv_offset;
++	};
++};
 +
-+	if (reply->status == BL_DEVICE_REQUEST_PROC)
-+		ret = 0; /*TODO: what to return*/
-+out:
-+	if (!IS_ERR(reply))
-+		kfree(reply);
-+	if (!IS_ERR(msg))
-+		kfree(msg);
-+	return ret;
-+}
++/* Since components need not be aligned, cannot use sector_t */
++struct pnfs_blk_sig_comp {
++	int64_t 	bs_offset;  /* In bytes */
++	uint32_t   	bs_length;  /* In bytes */
++	char 		*bs_string;
++};
 +
-+/*
-+ * Release meta device
-+ */
-+static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
-+{
-+	int rv;
++/* Maximum number of signature components in a simple volume */
++# define PNFS_BLOCK_MAX_SIG_COMP 16
 +
-+	dprintk("%s Releasing\n", __func__);
-+	/* XXX Check return? */
-+	rv = nfs4_blkdev_put(bdev->bm_mdev);
-+	dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv);
++struct pnfs_blk_sig {
++	int 				si_num_comps;
++	struct pnfs_blk_sig_comp	si_comps[PNFS_BLOCK_MAX_SIG_COMP];
++};
 +
-+	rv = dev_remove(bdev->bm_mdev->bd_dev);
-+	dprintk("%s Returns %d\n", __func__, rv);
-+	return rv;
-+}
++enum exstate4 {
++	PNFS_BLOCK_READWRITE_DATA	= 0,
++	PNFS_BLOCK_READ_DATA		= 1,
++	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */
++	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
++};
 +
-+void free_block_dev(struct pnfs_block_dev *bdev)
-+{
-+	if (bdev) {
-+		if (bdev->bm_mdev) {
-+			dprintk("%s Removing DM device: %d:%d\n",
-+				__func__,
-+				MAJOR(bdev->bm_mdev->bd_dev),
-+				MINOR(bdev->bm_mdev->bd_dev));
-+			/* XXX Check status ?? */
-+			nfs4_blk_metadev_release(bdev);
-+		}
-+		kfree(bdev);
-+	}
-+}
-diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
-new file mode 100644
-index 0000000..40dff82
---- /dev/null
-+++ b/fs/nfs/blocklayout/extents.c
-@@ -0,0 +1,948 @@
-+/*
-+ *  linux/fs/nfs/blocklayout/blocklayout.h
-+ *
-+ *  Module for the NFSv4.1 pNFS block layout driver.
-+ *
-+ *  Copyright (c) 2006 The Regents of the University of Michigan.
-+ *  All rights reserved.
-+ *
-+ *  Andy Adamson <andros at citi.umich.edu>
-+ *  Fred Isaman <iisaman at umich.edu>
-+ *
-+ * permission is granted to use, copy, create derivative works and
-+ * redistribute this software and such derivative works for any purpose,
-+ * so long as the name of the university of michigan is not used in
-+ * any advertising or publicity pertaining to the use or distribution
-+ * of this software without specific, written prior authorization.  if
-+ * the above copyright notice or any other identification of the
-+ * university of michigan is included in any copy of any portion of
-+ * this software, then the disclaimer below must also be included.
-+ *
-+ * this software is provided as is, without representation from the
-+ * university of michigan as to its fitness for any purpose, and without
-+ * warranty by the university of michigan of any kind, either express
-+ * or implied, including without limitation the implied warranties of
-+ * merchantability and fitness for a particular purpose.  the regents
-+ * of the university of michigan shall not be liable for any damages,
-+ * including special, indirect, incidental, or consequential damages,
-+ * with respect to any claim arising out or in connection with the use
-+ * of the software, even if it has been or is hereafter advised of the
-+ * possibility of such damages.
-+ */
++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
 +
-+#include "blocklayout.h"
-+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
++struct my_tree_t {
++	sector_t		mtt_step_size;	/* Internal sector alignment */
++	struct list_head	mtt_stub; /* Should be a radix tree */
++};
 +
-+/* Bit numbers */
-+#define EXTENT_INITIALIZED 0
-+#define EXTENT_WRITTEN     1
-+#define EXTENT_IN_COMMIT   2
-+#define INTERNAL_EXISTS    MY_MAX_TAGS
-+#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1)
++struct pnfs_inval_markings {
++	spinlock_t	im_lock;
++	struct my_tree_t im_tree;	/* Sectors that need LAYOUTCOMMIT */
++	sector_t	im_block_size;	/* Server blocksize in sectors */
++};
 +
-+/* Returns largest t<=s s.t. t%base==0 */
-+static inline sector_t normalize(sector_t s, int base)
++struct pnfs_inval_tracking {
++	struct list_head it_link;
++	int		 it_sector;
++	int		 it_tags;
++};
++
++/* sector_t fields are all in 512-byte sectors */
++struct pnfs_block_extent {
++	struct kref	be_refcnt;
++	struct list_head be_node;	/* link into lseg list */
++	struct nfs4_deviceid be_devid;  /* STUB - removable??? */
++	struct block_device *be_mdev;
++	sector_t	be_f_offset;	/* the starting offset in the file */
++	sector_t	be_length;	/* the size of the extent */
++	sector_t	be_v_offset;	/* the starting offset in the volume */
++	enum exstate4	be_state;	/* the state of this extent */
++	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
++};
++
++/* Shortened extent used by LAYOUTCOMMIT */
++struct pnfs_block_short_extent {
++	struct list_head bse_node;
++	struct nfs4_deviceid bse_devid;	/* STUB - removable??? */
++	struct block_device *bse_mdev;
++	sector_t	bse_f_offset;	/* the starting offset in the file */
++	sector_t	bse_length;	/* the size of the extent */
++};
++
++static inline void
++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
 +{
-+	sector_t tmp = s; /* Since do_div modifies its argument */
-+	return s - do_div(tmp, base);
++	spin_lock_init(&marks->im_lock);
++	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
++	marks->im_block_size = blocksize;
++	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
++					   blocksize);
 +}
 +
-+static inline sector_t normalize_up(sector_t s, int base)
++enum extentclass4 {
++	RW_EXTENT	= 0, /* READWRITE and INVAL */
++	RO_EXTENT	= 1, /* READ and NONE */
++	EXTENT_LISTS	= 2,
++};
++
++static inline int choose_list(enum exstate4 state)
 +{
-+	return normalize(s + base - 1, base);
++	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
++		return RO_EXTENT;
++	else
++		return RW_EXTENT;
 +}
 +
-+/* Complete stub using list while determine API wanted */
++struct pnfs_block_layout {
++	struct pnfs_layout_hdr bl_layout;
++	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
++	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
++	struct list_head	bl_extents[EXTENT_LISTS]; /* R and RW extents */
++	struct list_head	bl_commit;	/* Needs layout commit */
++	unsigned int		bl_count;	/* entries in bl_commit */
++	sector_t		bl_blocksize;  /* Server blocksize in sectors */
++};
 +
-+/* Returns tags, or negative */
-+static int32_t _find_entry(struct my_tree_t *tree, u64 s)
-+{
-+	struct pnfs_inval_tracking *pos;
++/* this struct is communicated between:
++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit
++ */
++struct bl_layoutupdate_data {
++	struct list_head ranges;
++};
 +
-+	dprintk("%s(%llu) enter\n", __func__, s);
-+	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
-+		if (pos->it_sector > s)
-+			continue;
-+		else if (pos->it_sector == s)
-+			return pos->it_tags & INTERNAL_MASK;
-+		else
-+			break;
-+	}
-+	return -ENOENT;
++#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->inode)->pnfs_ld_data))
++
++static inline struct pnfs_block_layout *
++BLK_LO2EXT(struct pnfs_layout_hdr *lo)
++{
++	return container_of(lo, struct pnfs_block_layout, bl_layout);
 +}
 +
-+static inline
-+int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag)
++static inline struct pnfs_block_layout *
++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
 +{
-+	int32_t tags;
++	return BLK_LO2EXT(lseg->layout);
++}
 +
-+	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
-+	s = normalize(s, tree->mtt_step_size);
++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes);
++
++#define BLK_READBUF(p, e, nbytes)  do { \
++	p = blk_overflow(p, e, nbytes); \
++	if (!p) { \
++		printk(KERN_WARNING \
++			"%s: reply buffer overflowed in line %d.\n", \
++			__func__, __LINE__); \
++		goto out_err; \
++	} \
++} while (0)
++
++#define READ32(x)         (x) = ntohl(*p++)
++#define READ64(x)         do {                  \
++	(x) = (uint64_t)ntohl(*p++) << 32;           \
++	(x) |= ntohl(*p++);                     \
++} while (0)
++#define COPYMEM(x, nbytes) do {                 \
++	memcpy((x), p, nbytes);                 \
++	p += XDR_QUADLEN(nbytes);               \
++} while (0)
++#define READ_DEVID(x)	COPYMEM((x)->data, NFS4_DEVICEID4_SIZE)
++#define READ_SECTOR(x)     do { \
++	READ64(tmp); \
++	if (tmp & 0x1ff) { \
++		printk(KERN_WARNING \
++		       "%s Value not 512-byte aligned at line %d\n", \
++		       __func__, __LINE__);			     \
++		goto out_err; \
++	} \
++	(x) = tmp >> 9; \
++} while (0)
++
++#define WRITE32(n)               do { \
++	*p++ = htonl(n); \
++	} while (0)
++#define WRITE64(n)               do {                           \
++	*p++ = htonl((uint32_t)((n) >> 32));			\
++	*p++ = htonl((uint32_t)(n));				\
++} while (0)
++#define WRITEMEM(ptr, nbytes)     do {                          \
++	p = xdr_encode_opaque_fixed(p, ptr, nbytes);	\
++} while (0)
++#define WRITE_DEVID(x)  WRITEMEM((x)->data, NFS4_DEVICEID4_SIZE)
++
++/* blocklayoutdev.c */
++struct block_device *nfs4_blkdev_get(dev_t dev);
++int nfs4_blkdev_put(struct block_device *bdev);
++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
++					      struct pnfs_device *dev,
++					      struct list_head *sdlist);
++int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
++			       struct nfs4_layoutget_res *lgr);
++int nfs4_blk_create_block_disk_list(struct list_head *);
++void nfs4_blk_destroy_disk_list(struct list_head *);
++/* blocklayoutdm.c */
++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *);
++void free_block_dev(struct pnfs_block_dev *bdev);
++/* extents.c */
++struct pnfs_block_extent *
++find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
++		struct pnfs_block_extent **cow_read);
++int mark_initialized_sectors(struct pnfs_inval_markings *marks,
++			     sector_t offset, sector_t length,
++			     sector_t **pages);
++void put_extent(struct pnfs_block_extent *be);
++struct pnfs_block_extent *alloc_extent(void);
++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be);
++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect);
++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
++				   struct xdr_stream *xdr,
++				   const struct nfs4_layoutcommit_args *arg);
++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
++				   const struct nfs4_layoutcommit_args *arg,
++				   int status);
++int add_and_merge_extent(struct pnfs_block_layout *bl,
++			 struct pnfs_block_extent *new);
++int mark_for_commit(struct pnfs_block_extent *be,
++		    sector_t offset, sector_t length);
++
++#include <linux/sunrpc/simple_rpc_pipefs.h>
++
++extern struct pipefs_list bl_device_list;
++extern struct dentry *bl_device_pipe;
++
++int bl_pipe_init(void);
++void bl_pipe_exit(void);
++
++#define BL_DEVICE_UMOUNT               0x0 /* Umount--delete devices */
++#define BL_DEVICE_MOUNT                0x1 /* Mount--create devices*/
++#define BL_DEVICE_REQUEST_INIT         0x0 /* Start request */
++#define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */
++#define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
++
++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
+diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.37.noarch/fs/nfs/blocklayout/extents.c
+--- linux-2.6.37.noarch/fs/nfs/blocklayout/extents.c.orig	2011-01-28 09:43:53.310776657 -0500
++++ linux-2.6.37.noarch/fs/nfs/blocklayout/extents.c	2011-01-28 09:43:53.311776459 -0500
+@@ -0,0 +1,948 @@
++/*
++ *  linux/fs/nfs/blocklayout/extents.c
++ *
++ *  Module for the NFSv4.1 pNFS block layout driver.
++ *
++ *  Copyright (c) 2006 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Andy Adamson <andros at citi.umich.edu>
++ *  Fred Isaman <iisaman at umich.edu>
++ *
++ * permission is granted to use, copy, create derivative works and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the university of michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization.  if
++ * the above copyright notice or any other identification of the
++ * university of michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ *
++ * this software is provided as is, without representation from the
++ * university of michigan as to its fitness for any purpose, and without
++ * warranty by the university of michigan of any kind, either express
++ * or implied, including without limitation the implied warranties of
++ * merchantability and fitness for a particular purpose.  the regents
++ * of the university of michigan shall not be liable for any damages,
++ * including special, indirect, incidental, or consequential damages,
++ * with respect to any claim arising out or in connection with the use
++ * of the software, even if it has been or is hereafter advised of the
++ * possibility of such damages.
++ */
++
++#include "blocklayout.h"
++#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
++
++/* Bit numbers */
++#define EXTENT_INITIALIZED 0
++#define EXTENT_WRITTEN     1
++#define EXTENT_IN_COMMIT   2
++#define INTERNAL_EXISTS    MY_MAX_TAGS
++#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1)
++
++/* Returns largest t<=s s.t. t%base==0 */
++static inline sector_t normalize(sector_t s, int base)
++{
++	sector_t tmp = s; /* Since do_div modifies its argument */
++	return s - do_div(tmp, base);
++}
++
++static inline sector_t normalize_up(sector_t s, int base)
++{
++	return normalize(s + base - 1, base);
++}
++
++/* Complete stub using a list while we determine the API wanted */
++
++/* Returns tags, or negative */
++static int32_t _find_entry(struct my_tree_t *tree, u64 s)
++{
++	struct pnfs_inval_tracking *pos;
++
++	dprintk("%s(%llu) enter\n", __func__, s);
++	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
++		if (pos->it_sector > s)
++			continue;
++		else if (pos->it_sector == s)
++			return pos->it_tags & INTERNAL_MASK;
++		else
++			break;
++	}
++	return -ENOENT;
++}
++
++static inline
++int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag)
++{
++	int32_t tags;
++
++	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
++	s = normalize(s, tree->mtt_step_size);
 +	tags = _find_entry(tree, s);
 +	if ((tags < 0) || !(tags & (1 << tag)))
 +		return 0;
@@ -4770,51 +4504,30 @@ index 0000000..40dff82
 +		}
 +	}
 +}
-diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
-index 85a7cfd..1f92ceb 100644
---- a/fs/nfs/callback.h
-+++ b/fs/nfs/callback.h
-@@ -8,6 +8,8 @@
- #ifndef __LINUX_FS_NFS_CALLBACK_H
- #define __LINUX_FS_NFS_CALLBACK_H
- 
-+#include "pnfs.h"
-+
- #define NFS4_CALLBACK 0x40000000
- #define NFS4_CALLBACK_XDRSIZE 2048
- #define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
-@@ -111,6 +113,13 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
- 
- #define RCA4_TYPE_MASK_RDATA_DLG	0
- #define RCA4_TYPE_MASK_WDATA_DLG	1
-+#define RCA4_TYPE_MASK_DIR_DLG         2
-+#define RCA4_TYPE_MASK_FILE_LAYOUT     3
-+#define RCA4_TYPE_MASK_BLK_LAYOUT      4
-+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN  8
-+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX  9
-+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
-+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
- 
- struct cb_recallanyargs {
- 	struct sockaddr	*craa_addr;
-@@ -127,6 +136,39 @@ struct cb_recallslotargs {
- extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
- 					  void *dummy);
- 
-+struct cb_layoutrecallargs {
-+	struct sockaddr		*cbl_addr;
-+	struct nfs_fh		cbl_fh;
-+	struct pnfs_layout_range cbl_seg;
-+	struct nfs_fsid		cbl_fsid;
-+	uint32_t		cbl_recall_type;
-+	uint32_t		cbl_layout_type;
-+	uint32_t		cbl_layoutchanged;
-+	nfs4_stateid		cbl_stateid;
-+};
-+
-+extern unsigned nfs4_callback_layoutrecall(
-+	struct cb_layoutrecallargs *args,
-+	void *dummy);
+diff -up linux-2.6.37.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.37.noarch/fs/nfs/blocklayout/Makefile
+--- linux-2.6.37.noarch/fs/nfs/blocklayout/Makefile.orig	2011-01-28 09:43:53.305777685 -0500
++++ linux-2.6.37.noarch/fs/nfs/blocklayout/Makefile	2011-01-28 09:43:53.305777685 -0500
+@@ -0,0 +1,6 @@
++#
++# Makefile for the pNFS block layout driver kernel module
++#
++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \
++			extents.o block-device-discovery-pipe.o
+diff -up linux-2.6.37.noarch/fs/nfs/callback.h.orig linux-2.6.37.noarch/fs/nfs/callback.h
+--- linux-2.6.37.noarch/fs/nfs/callback.h.orig	2011-01-28 09:37:32.520980712 -0500
++++ linux-2.6.37.noarch/fs/nfs/callback.h	2011-01-28 09:43:53.311776459 -0500
+@@ -164,9 +164,39 @@ struct cb_layoutrecallargs {
+ extern unsigned nfs4_callback_layoutrecall(
+ 	struct cb_layoutrecallargs *args,
+ 	void *dummy, struct cb_process_state *cps);
++extern bool matches_outstanding_recall(struct inode *ino,
++				       struct pnfs_layout_range *range);
++extern void notify_drained(struct nfs_client *clp, u64 mask);
++extern void nfs_client_return_layouts(struct nfs_client *clp);
+ 
+ extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
+ extern void nfs4_cb_take_slot(struct nfs_client *clp);
 +
 +struct cb_devicenotifyitem {
 +	uint32_t		cbd_notify_type;
@@ -4831,513 +4544,430 @@ index 85a7cfd..1f92ceb 100644
 +	struct cb_devicenotifyitem	 devs[NFS4_DEV_NOTIFY_MAXENTRIES];
 +};
 +
-+extern unsigned nfs4_callback_devicenotify(
++extern __be32 nfs4_callback_devicenotify(
 +	struct cb_devicenotifyargs *args,
-+	void *dummy);
++	void *dummy, struct cb_process_state *cps);
++
++#else /* CONFIG_NFS_V4_1 */
++
++static inline void nfs_client_return_layouts(struct nfs_client *clp)
++{
++}
++
  #endif /* CONFIG_NFS_V4_1 */
  
- extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
-diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
-index 930d10f..28816ab 100644
---- a/fs/nfs/callback_proc.c
-+++ b/fs/nfs/callback_proc.c
-@@ -8,10 +8,14 @@
- #include <linux/nfs4.h>
- #include <linux/nfs_fs.h>
- #include <linux/slab.h>
-+#include <linux/kthread.h>
-+#include <linux/module.h>
-+#include <linux/writeback.h>
- #include "nfs4_fs.h"
- #include "callback.h"
- #include "delegation.h"
- #include "internal.h"
-+#include "pnfs.h"
- 
- #ifdef NFS_DEBUG
- #define NFSDBG_FACILITY NFSDBG_CALLBACK
-@@ -113,16 +117,338 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
+ extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
+diff -up linux-2.6.37.noarch/fs/nfs/callback_proc.c.orig linux-2.6.37.noarch/fs/nfs/callback_proc.c
+--- linux-2.6.37.noarch/fs/nfs/callback_proc.c.orig	2011-01-28 09:37:32.521980677 -0500
++++ linux-2.6.37.noarch/fs/nfs/callback_proc.c	2011-01-28 09:43:53.312776264 -0500
+@@ -108,106 +108,277 @@ int nfs4_validate_delegation_stateid(str
  
  #if defined(CONFIG_NFS_V4_1)
  
+-static u32 initiate_file_draining(struct nfs_client *clp,
+-				  struct cb_layoutrecallargs *args)
 +static bool
-+pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo,
-+			    const nfs4_stateid stateid)
++_recall_matches_lget(struct pnfs_cb_lrecall_info *cb_info,
++		     struct inode *ino, struct pnfs_layout_range *range)
+ {
+-	struct pnfs_layout_hdr *lo;
+-	struct inode *ino;
+-	bool found = false;
+-	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+-	LIST_HEAD(free_me_list);
++	struct cb_layoutrecallargs *cb_args = &cb_info->pcl_args;
+ 
+-	spin_lock(&clp->cl_lock);
+-	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+-		if (nfs_compare_fh(&args->cbl_fh,
+-				   &NFS_I(lo->plh_inode)->fh))
+-			continue;
+-		ino = igrab(lo->plh_inode);
+-		if (!ino)
+-			continue;
+-		found = true;
+-		/* Without this, layout can be freed as soon
+-		 * as we release cl_lock.
++	switch (cb_args->cbl_recall_type) {
++	case RETURN_ALL:
++		return true;
++	case RETURN_FSID:
++		return !memcmp(&NFS_SERVER(ino)->fsid, &cb_args->cbl_fsid,
++			       sizeof(struct nfs_fsid));
++	case RETURN_FILE:
++		return (ino == cb_info->pcl_ino) &&
++			should_free_lseg(range, &cb_args->cbl_range);
++	default:
++		/* Should never hit here, as decode_layoutrecall_args()
++		 * will verify cb_info from server.
+ 		 */
+-		get_layout_hdr(lo);
+-		break;
++		BUG();
+ 	}
+-	spin_unlock(&clp->cl_lock);
+-	if (!found)
+-		return NFS4ERR_NOMATCHING_LAYOUT;
++}
+ 
+-	spin_lock(&ino->i_lock);
+-	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+-	    mark_matching_lsegs_invalid(lo, &free_me_list,
+-					args->cbl_range.iomode))
+-		rv = NFS4ERR_DELAY;
+-	else
+-		rv = NFS4ERR_NOMATCHING_LAYOUT;
+-	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+-	spin_unlock(&ino->i_lock);
+-	pnfs_free_lseg_list(&free_me_list);
+-	put_layout_hdr(lo);
+-	iput(ino);
++bool
++matches_outstanding_recall(struct inode *ino, struct pnfs_layout_range *range)
 +{
-+	int seqlock;
-+	bool res;
-+	u32 oldseqid, newseqid;
++	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
++	struct pnfs_cb_lrecall_info *cb_info;
++	bool rv = false;
 +
-+	do {
-+		seqlock = read_seqbegin(&lo->seqlock);
-+		oldseqid = be32_to_cpu(lo->stateid.stateid.seqid);
-+		newseqid = be32_to_cpu(stateid.stateid.seqid);
-+		res = !memcmp(lo->stateid.stateid.other,
-+			      stateid.stateid.other,
-+			      NFS4_STATEID_OTHER_SIZE);
-+		if (res) { /* comparing layout stateids */
-+			if (oldseqid == ~0)
-+				res = (newseqid == 1);
-+			else
-+				res = (newseqid == oldseqid + 1);
-+		} else { /* open stateid */
-+			res = !memcmp(lo->stateid.data,
-+				      &zero_stateid,
-+				      NFS4_STATEID_SIZE);
-+			if (res)
-+				res = (newseqid == 1);
++	assert_spin_locked(&clp->cl_lock);
++	list_for_each_entry(cb_info, &clp->cl_layoutrecalls, pcl_list) {
++		if (_recall_matches_lget(cb_info, ino, range)) {
++			rv = true;
++			break;
 +		}
-+	} while (read_seqretry(&lo->seqlock, seqlock));
++	}
+ 	return rv;
+ }
+ 
+-static u32 initiate_bulk_draining(struct nfs_client *clp,
+-				  struct cb_layoutrecallargs *args)
++/* Send a synchronous LAYOUTRETURN.  By the time this is called, we know
++ * all IO has been drained, any matching lsegs deleted, and that no
++ * overlapping LAYOUTGETs will be sent or processed for the duration
++ * of this call.
++ * Note that it is possible that when this is called, the stateid has
++ * been invalidated, but it will not be cleared, so it can still be used.
++ */
++static int
++pnfs_send_layoutreturn(struct nfs_client *clp,
++		       struct pnfs_cb_lrecall_info *cb_info)
++{
++	struct cb_layoutrecallargs *args = &cb_info->pcl_args;
++	struct nfs4_layoutreturn *lrp;
 +
-+	return res;
++	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
++	if (!lrp)
++		return -ENOMEM;
++	lrp->args.reclaim = 0;
++	lrp->args.layout_type = args->cbl_layout_type;
++	lrp->args.return_type = args->cbl_recall_type;
++	lrp->clp = clp;
++	if (args->cbl_recall_type == RETURN_FILE) {
++		lrp->args.range = args->cbl_range;
++		lrp->args.inode = cb_info->pcl_ino;
++	} else {
++		lrp->args.range.iomode = IOMODE_ANY;
++		lrp->args.inode = NULL;
++	}
++	return nfs4_proc_layoutreturn(lrp, true);
 +}
 +
-+/*
-+ * Retrieve an inode based on layout recall parameters
-+ *
-+ * Note: caller must iput(inode) to dereference the inode.
++/* Called by state manager to finish CB_LAYOUTRECALLS initiated by
++ * nfs4_callback_layoutrecall().
 + */
-+static struct inode *
-+nfs_layoutrecall_find_inode(struct nfs_client *clp,
-+			    const struct cb_layoutrecallargs *args)
++void nfs_client_return_layouts(struct nfs_client *clp)
+ {
+-	struct pnfs_layout_hdr *lo;
+-	struct inode *ino;
+-	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+-	struct pnfs_layout_hdr *tmp;
+-	LIST_HEAD(recall_list);
+-	LIST_HEAD(free_me_list);
+-	struct pnfs_layout_range range = {
+-		.iomode = IOMODE_ANY,
+-		.offset = 0,
+-		.length = NFS4_MAX_UINT64,
+-	};
++	struct pnfs_cb_lrecall_info *cb_info;
+ 
+ 	spin_lock(&clp->cl_lock);
+-	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+-		if ((args->cbl_recall_type == RETURN_FSID) &&
+-		    memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
+-			   &args->cbl_fsid, sizeof(struct nfs_fsid)))
+-			continue;
+-		if (!igrab(lo->plh_inode))
+-			continue;
+-		get_layout_hdr(lo);
+-		BUG_ON(!list_empty(&lo->plh_bulk_recall));
+-		list_add(&lo->plh_bulk_recall, &recall_list);
++	while (true) {
++		if (list_empty(&clp->cl_layoutrecalls)) {
++			spin_unlock(&clp->cl_lock);
++			break;
++		}
++		cb_info = list_first_entry(&clp->cl_layoutrecalls,
++					   struct pnfs_cb_lrecall_info,
++					   pcl_list);
++		spin_unlock(&clp->cl_lock);
++		if (atomic_read(&cb_info->pcl_count) != 0)
++			break;
++		/* What to do on error return?  These layoutreturns are
++		 * required by the protocol, so if we do not get a
++		 * successful reply, we probably have to do something
++		 * more drastic.
++		 */
++		pnfs_send_layoutreturn(clp, cb_info);
++		spin_lock(&clp->cl_lock);
++		/* Removing from the list unblocks LAYOUTGETs */
++		list_del(&cb_info->pcl_list);
++		clp->cl_cb_lrecall_count--;
++		clp->cl_drain_notification[1 << cb_info->pcl_notify_bit] = NULL;
++		rpc_wake_up(&clp->cl_rpcwaitq_recall);
++		kfree(cb_info);
+ 	}
+-	spin_unlock(&clp->cl_lock);
+-	list_for_each_entry_safe(lo, tmp,
+-				 &recall_list, plh_bulk_recall) {
+-		ino = lo->plh_inode;
+-		spin_lock(&ino->i_lock);
+-		set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+-		if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+-			rv = NFS4ERR_DELAY;
+-		list_del_init(&lo->plh_bulk_recall);
+-		spin_unlock(&ino->i_lock);
+-		put_layout_hdr(lo);
+-		iput(ino);
++}
++
++void notify_drained(struct nfs_client *clp, u64 mask)
 +{
-+	struct nfs_inode *nfsi;
-+	struct pnfs_layout_hdr *lo;
-+	struct nfs_server *server;
-+	struct inode *ino = NULL;
++	atomic_t **ptr = clp->cl_drain_notification;
++	bool done = false;
 +
-+	dprintk("%s: Begin recall_type=%d clp %p\n",
-+		__func__, args->cbl_recall_type, clp);
++	/* clp lock not needed except to remove used up entries */
++	/* Should probably use functions defined in bitmap.h */
++	while (mask) {
++		if ((mask & 1) && (atomic_dec_and_test(*ptr)))
++			done = true;
++		mask >>= 1;
++		ptr++;
++	}
++	if (done) {
++		set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
++		nfs4_schedule_state_manager(clp);
++	}
++}
 +
-+	spin_lock(&clp->cl_lock);
-+	list_for_each_entry(lo, &clp->cl_layouts, layouts) {
-+		nfsi = NFS_I(lo->inode);
-+		if (!nfsi)
-+			continue;
++static int initiate_layout_draining(struct pnfs_cb_lrecall_info *cb_info)
++{
++	struct nfs_client *clp = cb_info->pcl_clp;
++	struct pnfs_layout_hdr *lo;
++	int rv = NFS4ERR_NOMATCHING_LAYOUT;
++	struct cb_layoutrecallargs *args = &cb_info->pcl_args;
 +
-+		dprintk("%s: Searching inode=%lu\n",
-+			__func__, nfsi->vfs_inode.i_ino);
++	if (args->cbl_recall_type == RETURN_FILE) {
++		LIST_HEAD(free_me_list);
 +
-+		if (args->cbl_recall_type == RETURN_FILE) {
-+		    if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh))
-+			continue;
-+		} else if (args->cbl_recall_type == RETURN_FSID) {
-+			server = NFS_SERVER(&nfsi->vfs_inode);
-+			if (server->fsid.major != args->cbl_fsid.major ||
-+			    server->fsid.minor != args->cbl_fsid.minor)
++		spin_lock(&clp->cl_lock);
++		list_for_each_entry(lo, &clp->cl_layouts, layouts) {
++			if (nfs_compare_fh(&args->cbl_fh,
++					   &NFS_I(lo->inode)->fh))
 +				continue;
++			if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
++				rv = NFS4ERR_DELAY;
++			else {
++				/* FIXME I need to better understand igrab and
++				 * does having a layout ref keep ino around?
++				 *  It should.
++				 */
++				/* We need to hold the reference until any
++				 * potential LAYOUTRETURN is finished.
++				 */
++				get_layout_hdr(lo);
++				cb_info->pcl_ino = lo->inode;
++				rv = NFS4_OK;
++			}
++			break;
 +		}
++		spin_unlock(&clp->cl_lock);
 +
-+		/* Make sure client didn't clean up layout without
-+		 * telling the server */
-+		if (!has_layout(nfsi))
-+			continue;
-+
-+		ino = igrab(&nfsi->vfs_inode);
-+		dprintk("%s: Found inode=%p\n", __func__, ino);
-+		break;
-+	}
-+	spin_unlock(&clp->cl_lock);
-+	return ino;
-+}
-+
-+struct recall_layout_threadargs {
-+	struct inode *inode;
-+	struct nfs_client *clp;
-+	struct completion started;
-+	struct cb_layoutrecallargs *rl;
-+	int result;
-+};
-+
-+static int pnfs_recall_layout(void *data)
-+{
-+	struct inode *inode, *ino;
-+	struct nfs_client *clp;
-+	struct cb_layoutrecallargs rl;
-+	struct nfs4_layoutreturn *lrp;
-+	struct recall_layout_threadargs *args =
-+		(struct recall_layout_threadargs *)data;
-+	int status = 0;
-+
-+	daemonize("nfsv4-layoutreturn");
-+
-+	dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n",
-+		__func__, args->rl->cbl_recall_type,
-+		args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor);
-+
-+	clp = args->clp;
-+	inode = args->inode;
-+	rl = *args->rl;
-+
-+	/* support whole file layouts only */
-+	rl.cbl_seg.offset = 0;
-+	rl.cbl_seg.length = NFS4_MAX_UINT64;
++		spin_lock(&lo->inode->i_lock);
++		if (rv == NFS4_OK) {
++			lo->plh_block_lgets++;
++			nfs4_asynch_forget_layouts(lo, &args->cbl_range,
++						   cb_info->pcl_notify_bit,
++						   &cb_info->pcl_count,
++						   &free_me_list);
++		}
++		pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
++		spin_unlock(&lo->inode->i_lock);
++		pnfs_free_lseg_list(&free_me_list);
++	} else {
++		struct pnfs_layout_hdr *tmp;
++		LIST_HEAD(recall_list);
++		LIST_HEAD(free_me_list);
++		struct pnfs_layout_range range = {
++			.iomode = IOMODE_ANY,
++			.offset = 0,
++			.length = NFS4_MAX_UINT64,
++		};
 +
-+	if (rl.cbl_recall_type == RETURN_FILE) {
-+		if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout,
-+						rl.cbl_stateid))
-+			status = pnfs_return_layout(inode, &rl.cbl_seg,
-+						    &rl.cbl_stateid, RETURN_FILE,
-+						    false);
-+		else
-+			status = cpu_to_be32(NFS4ERR_DELAY);
-+		if (status)
-+			dprintk("%s RETURN_FILE error: %d\n", __func__, status);
-+		else
-+			status =  cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
-+		args->result = status;
-+		complete(&args->started);
++		spin_lock(&clp->cl_lock);
++		/* Per RFC 5661, 12.5.5.2.1.5, bulk recall must be serialized */
++		if (!list_is_singular(&clp->cl_layoutrecalls)) {
++			spin_unlock(&clp->cl_lock);
++			return NFS4ERR_DELAY;
++		}
++		list_for_each_entry(lo, &clp->cl_layouts, layouts) {
++			if ((args->cbl_recall_type == RETURN_FSID) &&
++			    memcmp(&NFS_SERVER(lo->inode)->fsid,
++				   &args->cbl_fsid, sizeof(struct nfs_fsid)))
++				continue;
++			get_layout_hdr(lo);
++			/* We could list_del(&lo->layouts) here */
++			BUG_ON(!list_empty(&lo->plh_bulk_recall));
++			list_add(&lo->plh_bulk_recall, &recall_list);
++		}
++		spin_unlock(&clp->cl_lock);
++		list_for_each_entry_safe(lo, tmp,
++					 &recall_list, plh_bulk_recall) {
++			spin_lock(&lo->inode->i_lock);
++			set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
++			nfs4_asynch_forget_layouts(lo, &range,
++						   cb_info->pcl_notify_bit,
++						   &cb_info->pcl_count,
++						   &free_me_list);
++			list_del_init(&lo->plh_bulk_recall);
++			spin_unlock(&lo->inode->i_lock);
++			put_layout_hdr(lo);
++			rv = NFS4_OK;
++		}
++		pnfs_free_lseg_list(&free_me_list);
+ 	}
+-	pnfs_free_lseg_list(&free_me_list);
+ 	return rv;
+ }
+ 
+ static u32 do_callback_layoutrecall(struct nfs_client *clp,
+ 				    struct cb_layoutrecallargs *args)
+ {
+-	u32 res = NFS4ERR_DELAY;
++	struct pnfs_cb_lrecall_info *new;
++	atomic_t **ptr;
++	int bit_num;
++	u32 res;
+ 
+ 	dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
+-	if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
++	new = kmalloc(sizeof(*new), GFP_KERNEL);
++	if (!new) {
++		res = NFS4ERR_DELAY;
+ 		goto out;
+-	if (args->cbl_recall_type == RETURN_FILE)
+-		res = initiate_file_draining(clp, args);
+-	else
+-		res = initiate_bulk_draining(clp, args);
+-	clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
++	}
++	memcpy(&new->pcl_args, args, sizeof(*args));
++	atomic_set(&new->pcl_count, 1);
++	new->pcl_clp = clp;
++	new->pcl_ino = NULL;
++	spin_lock(&clp->cl_lock);
++	if (clp->cl_cb_lrecall_count >= PNFS_MAX_CB_LRECALLS) {
++		kfree(new);
++		res = NFS4ERR_DELAY;
++		spin_unlock(&clp->cl_lock);
 +		goto out;
 +	}
-+
-+	status = cpu_to_be32(NFS4_OK);
-+	args->result = status;
-+	complete(&args->started);
-+	args = NULL;
-+
-+	/* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */
-+	while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
-+		/* FIXME: need to check status on pnfs_return_layout */
-+		pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false);
-+		iput(ino);
-+	}
-+
-+	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
-+	if (!lrp) {
-+		dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n",
-+			__func__);
-+		goto out;
++	clp->cl_cb_lrecall_count++;
++	/* Adding to the list will block conflicting LGET activity */
++	list_add_tail(&new->pcl_list, &clp->cl_layoutrecalls);
++	for (bit_num = 0, ptr = clp->cl_drain_notification; *ptr; ptr++)
++		bit_num++;
++	*ptr = &new->pcl_count;
++	new->pcl_notify_bit = bit_num;
++	spin_unlock(&clp->cl_lock);
++	res = initiate_layout_draining(new);
++	if (res || atomic_dec_and_test(&new->pcl_count)) {
++		spin_lock(&clp->cl_lock);
++		list_del(&new->pcl_list);
++		clp->cl_cb_lrecall_count--;
++		clp->cl_drain_notification[1 << bit_num] = NULL;
++		rpc_wake_up(&clp->cl_rpcwaitq_recall);
++		spin_unlock(&clp->cl_lock);
++		if (res == NFS4_OK) {
++			if (args->cbl_recall_type == RETURN_FILE) {
++				struct pnfs_layout_hdr *lo;
++
++				lo = NFS_I(new->pcl_ino)->layout;
++				spin_lock(&lo->inode->i_lock);
++				lo->plh_block_lgets--;
++				if (!pnfs_layoutgets_blocked(lo, NULL))
++					rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
++				spin_unlock(&lo->inode->i_lock);
++				put_layout_hdr(lo);
++			}
++			res = NFS4ERR_NOMATCHING_LAYOUT;
++		}
++		kfree(new);
 +	}
-+
-+	/* send final layoutreturn */
-+	lrp->args.reclaim = 0;
-+	lrp->args.layout_type = rl.cbl_layout_type;
-+	lrp->args.return_type = rl.cbl_recall_type;
-+	lrp->args.range = rl.cbl_seg;
-+	lrp->args.inode = inode;
-+	nfs4_proc_layoutreturn(lrp, true);
-+
-+out:
-+	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
-+	nfs_put_client(clp);
-+	module_put_and_exit(0);
-+	dprintk("%s: exit status %d\n", __func__, 0);
-+	return 0;
-+}
-+
-+/*
-+ * Asynchronous layout recall!
-+ */
-+static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
-+				    struct cb_layoutrecallargs *rl)
-+{
-+	struct recall_layout_threadargs data = {
-+		.clp = clp,
-+		.inode = inode,
-+		.rl = rl,
-+	};
-+	struct task_struct *t;
-+	int status = -EAGAIN;
-+
-+	dprintk("%s: -->\n", __func__);
-+
-+	/* FIXME: do not allow two concurrent layout recalls */
-+	if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
-+		return status;
-+
-+	init_completion(&data.started);
-+	__module_get(THIS_MODULE);
-+	atomic_inc(&clp->cl_count);
-+
-+	t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
-+	if (IS_ERR(t)) {
-+		printk(KERN_INFO "NFS: Layout recall callback thread failed "
-+			"for client (clientid %08x/%08x)\n",
-+			(unsigned)(clp->cl_clientid >> 32),
-+			(unsigned)(clp->cl_clientid));
-+		status = PTR_ERR(t);
-+		goto out_module_put;
-+	}
-+	wait_for_completion(&data.started);
-+	return data.result;
-+out_module_put:
-+	nfs_put_client(clp);
-+	clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
-+	module_put(THIS_MODULE);
-+	return status;
-+}
-+
-+static int pnfs_recall_all_layouts(struct nfs_client *clp)
-+{
-+	struct cb_layoutrecallargs rl;
-+	struct inode *inode;
-+	int status = 0;
-+
-+	rl.cbl_recall_type = RETURN_ALL;
-+	rl.cbl_seg.iomode = IOMODE_ANY;
-+	rl.cbl_seg.offset = 0;
-+	rl.cbl_seg.length = NFS4_MAX_UINT64;
-+
-+	/* we need the inode to get the nfs_server struct */
-+	inode = nfs_layoutrecall_find_inode(clp, &rl);
-+	if (!inode)
-+		return status;
-+	status = pnfs_async_return_layout(clp, inode, &rl);
-+	iput(inode);
-+
-+	return status;
-+}
-+
-+__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
-+				  void *dummy)
+ out:
+ 	dprintk("%s returning %i\n", __func__, res);
+ 	return res;
+@@ -241,6 +412,36 @@ static void pnfs_recall_all_layouts(stru
+ 	do_callback_layoutrecall(clp, &args);
+ }
+ 
++__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
++				  void *dummy, struct cb_process_state *cps)
 +{
-+	struct nfs_client *clp;
-+	struct inode *inode = NULL;
-+	__be32 res;
-+	int status;
-+	unsigned int num_client = 0;
++	int i;
++	u32 type, res = 0;
 +
 +	dprintk("%s: -->\n", __func__);
 +
-+	res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
-+	clp  = nfs_find_client(args->cbl_addr, 4);
-+	if (clp == NULL)
++	if (!cps->clp) {
++		res = NFS4ERR_OP_NOT_IN_SESSION;
 +		goto out;
-+
-+	res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
-+	do {
-+		struct nfs_client *prev = clp;
-+		num_client++;
-+		/* the callback must come from the MDS personality */
-+		if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
-+			goto loop;
-+		/* In the _ALL or _FSID case, we need the inode to get
-+		 * the nfs_server struct.
-+		 */
-+		inode = nfs_layoutrecall_find_inode(clp, args);
-+		if (!inode)
-+			goto loop;
-+		status = pnfs_async_return_layout(clp, inode, args);
-+		if (status)
-+			res = cpu_to_be32(NFS4ERR_DELAY);
-+		iput(inode);
-+loop:
-+		clp = nfs_find_client_next(prev);
-+		nfs_put_client(prev);
-+	} while (clp != NULL);
-+
-+out:
-+	dprintk("%s: exit with status = %d numclient %u\n",
-+		__func__, ntohl(res), num_client);
-+	return res;
-+}
-+
-+/* Remove the deviceid(s) from the nfs_client deviceid cache */
-+static __be32 pnfs_devicenotify_client(struct nfs_client *clp,
-+				       struct cb_devicenotifyargs *args)
-+{
-+	uint32_t type;
-+	int i;
-+
-+	dprintk("%s: --> clp %p\n", __func__, clp);
++	}
 +
 +	for (i = 0; i < args->ndevs; i++) {
 +		struct cb_devicenotifyitem *dev = &args->devs[i];
 +		type = dev->cbd_notify_type;
-+		if (type == NOTIFY_DEVICEID4_DELETE && clp->cl_devid_cache)
-+			pnfs_delete_deviceid(clp->cl_devid_cache,
++		if (type == NOTIFY_DEVICEID4_DELETE && cps->clp->cl_devid_cache)
++			pnfs_delete_deviceid(cps->clp->cl_devid_cache,
 +					     &dev->cbd_dev_id);
 +		else if (type == NOTIFY_DEVICEID4_CHANGE)
 +			printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE "
 +					"not supported\n", __func__);
 +	}
-+	return 0;
-+}
-+
-+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
-+				  void *dummy)
-+{
-+	struct nfs_client *clp;
-+	__be32 res = 0;
-+	unsigned int num_client = 0;
-+
-+	dprintk("%s: -->\n", __func__);
-+
-+	res = __constant_htonl(NFS4ERR_INVAL);
-+	clp = nfs_find_client(args->addr, 4);
-+	if (clp == NULL)
-+		goto out;
-+
-+	do {
-+		struct nfs_client *prev = clp;
-+		num_client++;
-+		res = pnfs_devicenotify_client(clp, args);
-+		clp = nfs_find_client_next(prev);
-+		nfs_put_client(prev);
-+	} while (clp != NULL);
 +
 +out:
-+	dprintk("%s: exit with status = %d numclient %u\n",
-+		__func__, ntohl(res), num_client);
-+	return res;
++	dprintk("%s: exit with status = %u\n",
++		__func__, res);
++	return cpu_to_be32(res);
 +}
 +
  int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
  {
  	if (delegation == NULL)
- 		return 0;
- 
--	/* seqid is 4-bytes long */
--	if (((u32 *) &stateid->data)[0] != 0)
-+	if (stateid->stateid.seqid != 0)
- 		return 0;
--	if (memcmp(&delegation->stateid.data[4], &stateid->data[4],
--		   sizeof(stateid->data)-4))
-+	if (memcmp(&delegation->stateid.stateid.other,
-+		   &stateid->stateid.other,
-+		   NFS4_STATEID_OTHER_SIZE))
- 		return 0;
- 
- 	return 1;
-@@ -324,13 +650,37 @@ out:
- 	return status;
- }
- 
-+static inline bool
-+validate_bitmap_values(const unsigned long *mask)
-+{
-+	int i;
-+
-+	if (*mask == 0)
-+		return true;
-+	if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, mask) ||
-+	    test_bit(RCA4_TYPE_MASK_WDATA_DLG, mask) ||
-+	    test_bit(RCA4_TYPE_MASK_DIR_DLG, mask) ||
-+	    test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, mask) ||
-+	    test_bit(RCA4_TYPE_MASK_BLK_LAYOUT, mask))
-+		return true;
-+	for (i = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN;
-+	     i <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; i++)
-+		if (test_bit(i, mask))
-+			return true;
-+	for (i = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN;
-+	     i <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; i++)
-+		if (test_bit(i, mask))
-+			return true;
-+	return false;
-+}
-+
- __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
- {
- 	struct nfs_client *clp;
- 	__be32 status;
- 	fmode_t flags = 0;
- 
--	status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
-+	status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
- 	clp = nfs_find_client(args->craa_addr, 4);
- 	if (clp == NULL)
- 		goto out;
-@@ -338,16 +688,27 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
- 	dprintk("NFS: RECALL_ANY callback request from %s\n",
- 		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
- 
-+	status = cpu_to_be32(NFS4ERR_INVAL);
-+	if (!validate_bitmap_values((const unsigned long *)
-+				    &args->craa_type_mask))
-+		goto out_put;
-+
-+	status = cpu_to_be32(NFS4_OK);
- 	if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
- 		     &args->craa_type_mask))
- 		flags = FMODE_READ;
- 	if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
- 		     &args->craa_type_mask))
- 		flags |= FMODE_WRITE;
-+	if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
-+		     &args->craa_type_mask))
-+		if (pnfs_recall_all_layouts(clp) == -EAGAIN)
-+			status = cpu_to_be32(NFS4ERR_DELAY);
- 
- 	if (flags)
- 		nfs_expire_all_delegation_types(clp, flags);
--	status = htonl(NFS4_OK);
-+out_put:
-+	nfs_put_client(clp);
- out:
- 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- 	return status;
-diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
-index 05af212..fbfa2b9 100644
---- a/fs/nfs/callback_xdr.c
-+++ b/fs/nfs/callback_xdr.c
-@@ -22,6 +22,8 @@
- #define CB_OP_RECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+diff -up linux-2.6.37.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.37.noarch/fs/nfs/callback_xdr.c
+--- linux-2.6.37.noarch/fs/nfs/callback_xdr.c.orig	2011-01-28 09:37:32.522980641 -0500
++++ linux-2.6.37.noarch/fs/nfs/callback_xdr.c	2011-01-28 09:43:53.313776069 -0500
+@@ -25,6 +25,7 @@
  
  #if defined(CONFIG_NFS_V4_1)
-+#define CB_OP_LAYOUTRECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+ #define CB_OP_LAYOUTRECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 +#define CB_OP_DEVICENOTIFY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
  #define CB_OP_SEQUENCE_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \
  					4 + 1 + 3)
  #define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
-@@ -220,6 +222,147 @@ out:
- 
- #if defined(CONFIG_NFS_V4_1)
+@@ -284,6 +285,93 @@ out:
+ 	return status;
+ }
  
-+static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
-+				       struct xdr_stream *xdr,
-+				       struct cb_layoutrecallargs *args)
-+{
-+	__be32 *p;
-+	__be32 status = 0;
-+
-+	args->cbl_addr = svc_addr(rqstp);
-+	p = read_buf(xdr, 4 * sizeof(uint32_t));
-+	if (unlikely(p == NULL)) {
-+		status = htonl(NFS4ERR_BADXDR);
-+		goto out;
-+	}
-+
-+	args->cbl_layout_type = ntohl(*p++);
-+	args->cbl_seg.iomode = ntohl(*p++);
-+	args->cbl_layoutchanged = ntohl(*p++);
-+	args->cbl_recall_type = ntohl(*p++);
-+
-+	if (likely(args->cbl_recall_type == RETURN_FILE)) {
-+		status = decode_fh(xdr, &args->cbl_fh);
-+		if (unlikely(status != 0))
-+			goto out;
-+
-+		p = read_buf(xdr, 2 * sizeof(uint64_t));
-+		if (unlikely(p == NULL)) {
-+			status = htonl(NFS4ERR_BADXDR);
-+			goto out;
-+		}
-+		p = xdr_decode_hyper(p, &args->cbl_seg.offset);
-+		p = xdr_decode_hyper(p, &args->cbl_seg.length);
-+		status = decode_stateid(xdr, &args->cbl_stateid);
-+		if (unlikely(status != 0))
-+			goto out;
-+	} else if (args->cbl_recall_type == RETURN_FSID) {
-+		p = read_buf(xdr, 2 * sizeof(uint64_t));
-+		if (unlikely(p == NULL)) {
-+			status = htonl(NFS4ERR_BADXDR);
-+			goto out;
-+		}
-+		p = xdr_decode_hyper(p, &args->cbl_fsid.major);
-+		p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
-+	}
-+	dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d "
-+		"fsid %llx-%llx fhsize %d\n", __func__,
-+		args->cbl_layout_type, args->cbl_seg.iomode,
-+		args->cbl_layoutchanged, args->cbl_recall_type,
-+		args->cbl_fsid.major, args->cbl_fsid.minor,
-+		args->cbl_fh.size);
-+out:
-+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
-+	return status;
-+}
-+
 +static
 +__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
 +				struct xdr_stream *xdr,
@@ -5428,30 +5058,22 @@ index 05af212..fbfa2b9 100644
  static __be32 decode_sessionid(struct xdr_stream *xdr,
  				 struct nfs4_sessionid *sid)
  {
-@@ -574,11 +717,11 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
- 	case OP_CB_SEQUENCE:
+@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned in
  	case OP_CB_RECALL_ANY:
  	case OP_CB_RECALL_SLOT:
-+	case OP_CB_LAYOUTRECALL:
+ 	case OP_CB_LAYOUTRECALL:
 +	case OP_CB_NOTIFY_DEVICEID:
  		*op = &callback_ops[op_nr];
  		break;
  
--	case OP_CB_LAYOUTRECALL:
 -	case OP_CB_NOTIFY_DEVICEID:
  	case OP_CB_NOTIFY:
  	case OP_CB_PUSH_DELEG:
  	case OP_CB_RECALLABLE_OBJ_AVAIL:
-@@ -739,6 +882,18 @@ static struct callback_op callback_ops[] = {
- 		.res_maxsize = CB_OP_RECALL_RES_MAXSZ,
+@@ -850,6 +938,12 @@ static struct callback_op callback_ops[]
+ 			(callback_decode_arg_t)decode_layoutrecall_args,
+ 		.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
  	},
- #if defined(CONFIG_NFS_V4_1)
-+	[OP_CB_LAYOUTRECALL] = {
-+		.process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
-+		.decode_args =
-+			(callback_decode_arg_t)decode_layoutrecall_args,
-+		.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
-+	},
 +	[OP_CB_NOTIFY_DEVICEID] = {
 +		.process_op = (callback_process_op_t)nfs4_callback_devicenotify,
 +		.decode_args =
@@ -5461,38 +5083,43 @@ index 05af212..fbfa2b9 100644
  	[OP_CB_SEQUENCE] = {
  		.process_op = (callback_process_op_t)nfs4_callback_sequence,
  		.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
-diff --git a/fs/nfs/client.c b/fs/nfs/client.c
-index e734072..9e1135e 100644
---- a/fs/nfs/client.c
-+++ b/fs/nfs/client.c
-@@ -48,6 +48,7 @@
- #include "iostat.h"
- #include "internal.h"
- #include "fscache.h"
-+#include "pnfs.h"
- 
- #define NFSDBG_FACILITY		NFSDBG_CLIENT
- 
-@@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
- 	cred = rpc_lookup_machine_cred();
- 	if (!IS_ERR(cred))
+diff -up linux-2.6.37.noarch/fs/nfs/client.c.orig linux-2.6.37.noarch/fs/nfs/client.c
+--- linux-2.6.37.noarch/fs/nfs/client.c.orig	2011-01-28 09:37:32.523980606 -0500
++++ linux-2.6.37.noarch/fs/nfs/client.c	2011-01-28 09:43:53.314775880 -0500
+@@ -185,6 +185,9 @@ static struct nfs_client *nfs_alloc_clie
  		clp->cl_machine_cred = cred;
--
-+#if defined(CONFIG_NFS_V4_1)
-+	INIT_LIST_HEAD(&clp->cl_layouts);
-+#endif
+ #if defined(CONFIG_NFS_V4_1)
+ 	INIT_LIST_HEAD(&clp->cl_layouts);
++	INIT_LIST_HEAD(&clp->cl_layoutrecalls);
++	rpc_init_wait_queue(&clp->cl_rpcwaitq_recall,
++			    "NFS client CB_LAYOUTRECALLS");
+ #endif
  	nfs_fscache_get_client_cookie(clp);
  
- 	return clp;
-@@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
- 		nfs_free_client(clp);
- 	}
+@@ -243,11 +246,6 @@ static void nfs_cb_idr_remove_locked(str
+ 		idr_remove(&cb_ident_idr, clp->cl_cb_ident);
+ }
+ 
+-static void pnfs_init_server(struct nfs_server *server)
+-{
+-	rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+-}
+-
+ #else
+ static void nfs4_shutdown_client(struct nfs_client *clp)
+ {
+@@ -261,10 +259,6 @@ static void nfs_cb_idr_remove_locked(str
+ {
  }
-+EXPORT_SYMBOL_GPL(nfs_put_client);
  
- #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+-static void pnfs_init_server(struct nfs_server *server)
+-{
+-}
+-
+ #endif /* CONFIG_NFS_V4 */
+ 
  /*
-@@ -344,7 +348,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+@@ -404,7 +398,7 @@ static int nfs_sockaddr_match_ipaddr(con
   * Test if two socket addresses represent the same actual socket,
   * by comparing (only) relevant fields, including the port number.
   */
@@ -5501,15 +5128,15 @@ index e734072..9e1135e 100644
  			    const struct sockaddr *sa2)
  {
  	if (sa1->sa_family != sa2->sa_family)
-@@ -358,6 +362,7 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
+@@ -418,6 +412,7 @@ static int nfs_sockaddr_cmp(const struct
  	}
  	return 0;
  }
 +EXPORT_SYMBOL(nfs_sockaddr_cmp);
  
- /*
-  * Find a client by IP address and protocol version
-@@ -549,6 +554,7 @@ int nfs4_check_client_ready(struct nfs_client *clp)
+ /* Common match routine for v4.0 and v4.1 callback services */
+ bool
+@@ -567,6 +562,7 @@ int nfs4_check_client_ready(struct nfs_c
  		return -EPROTONOSUPPORT;
  	return 0;
  }
@@ -5517,7 +5144,7 @@ index e734072..9e1135e 100644
  
  /*
   * Initialise the timeout values for a connection
-@@ -868,7 +874,7 @@ error:
+@@ -889,7 +885,7 @@ error:
  /*
   * Load up the server record from information gained in an fsinfo record
   */
@@ -5526,24 +5153,18 @@ index e734072..9e1135e 100644
  {
  	unsigned long max_rpc_payload;
  
-@@ -898,6 +904,10 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
+@@ -919,7 +915,9 @@ static void nfs_server_set_fsinfo(struct
  	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
  		server->wsize = NFS_MAX_FILE_IO_SIZE;
  	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+-	set_pnfs_layoutdriver(server, fsinfo->layouttype);
 +	server->pnfs_blksize = fsinfo->blksize;
 +	set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
 +	pnfs_set_ds_iosize(server);
-+
- 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
  
- 	server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
-@@ -934,12 +944,13 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
- 			goto out_error;
- 	}
+ 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
  
-+	memset(&fsinfo, 0, sizeof(fsinfo));
- 	fsinfo.fattr = fattr;
- 	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
+@@ -965,7 +963,7 @@ static int nfs_probe_fsinfo(struct nfs_s
  	if (error < 0)
  		goto out_error;
  
@@ -5552,15 +5173,16 @@ index e734072..9e1135e 100644
  
  	/* Get some general file system info */
  	if (server->namelen == 0) {
-@@ -1017,6 +1028,7 @@ void nfs_free_server(struct nfs_server *server)
- {
- 	dprintk("--> nfs_free_server()\n");
+@@ -1055,8 +1053,6 @@ static struct nfs_server *nfs_alloc_serv
+ 		return NULL;
+ 	}
  
-+	unset_pnfs_layoutdriver(server);
- 	spin_lock(&nfs_client_lock);
- 	list_del(&server->client_link);
- 	list_del(&server->master_link);
-@@ -1221,7 +1233,7 @@ error:
+-	pnfs_init_server(server);
+-
+ 	return server;
+ }
+ 
+@@ -1360,7 +1356,7 @@ error:
  /*
   * Set up an NFS4 client
   */
@@ -5569,7 +5191,7 @@ index e734072..9e1135e 100644
  		const char *hostname,
  		const struct sockaddr *addr,
  		const size_t addrlen,
-@@ -1264,6 +1276,7 @@ error:
+@@ -1403,6 +1399,7 @@ error:
  	dprintk("<-- nfs4_set_client() = xerror %d\n", error);
  	return error;
  }
@@ -5577,20758 +5199,19666 @@ index e734072..9e1135e 100644
  
  
  /*
-diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
-index 064a809..43786c2 100644
---- a/fs/nfs/direct.c
-+++ b/fs/nfs/direct.c
-@@ -271,6 +271,38 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
- 	.rpc_release = nfs_direct_read_release,
- };
- 
-+static long nfs_direct_read_execute(struct nfs_read_data *data,
-+				    struct rpc_task_setup *task_setup_data,
-+				    struct rpc_message *msg)
-+{
-+	struct inode *inode = data->inode;
-+	struct rpc_task *task;
+diff -up linux-2.6.37.noarch/fs/nfsd/bl_com.c.orig linux-2.6.37.noarch/fs/nfsd/bl_com.c
+--- linux-2.6.37.noarch/fs/nfsd/bl_com.c.orig	2011-01-28 09:43:53.347770803 -0500
++++ linux-2.6.37.noarch/fs/nfsd/bl_com.c	2011-01-28 09:43:53.347770803 -0500
+@@ -0,0 +1,292 @@
++#if defined(CONFIG_SPNFS_BLOCK)
 +
-+	nfs_fattr_init(&data->fattr);
-+	msg->rpc_argp = &data->args;
-+	msg->rpc_resp = &data->res;
++#include <linux/module.h>
++#include <linux/mutex.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/socket.h>
++#include <linux/in.h>
++#include <linux/sched.h>
++#include <linux/exportfs.h>
++#include <linux/namei.h>
++#include <linux/mount.h>
++#include <linux/path.h>
++#include <linux/sunrpc/clnt.h>
++#include <linux/workqueue.h>
++#include <linux/sunrpc/rpc_pipe_fs.h>
++#include <linux/proc_fs.h>
++#include <linux/nfs_fs.h>
 +
-+	task_setup_data->task = &data->task;
-+	task_setup_data->callback_data = data;
-+	NFS_PROTO(inode)->read_setup(data, msg);
++#include <linux/nfsd/debug.h>
++#include <linux/nfsd4_block.h>
 +
-+	task = rpc_run_task(task_setup_data);
-+	if (IS_ERR(task))
-+		return PTR_ERR(task);
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
 +
-+	rpc_put_task(task);
-+
-+	dprintk("NFS: %5u initiated direct read call "
-+		"(req %s/%lld, %u bytes @ offset %llu)\n",
-+		data->task.tk_pid,
-+		inode->i_sb->s_id,
-+		(long long)NFS_FILEID(inode),
-+		data->args.count,
-+		(unsigned long long)data->args.offset);
++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
++    char __user *, size_t);
++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
 +
-+	return 0;
-+}
++static struct rpc_pipe_ops bl_upcall_ops = {
++	.upcall		= bl_pipe_upcall,
++	.downcall	= bl_pipe_downcall,
++	.destroy_msg	= bl_pipe_destroy_msg,
++};
 +
- /*
-  * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
-  * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
-@@ -287,7 +319,6 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
- 	unsigned long user_addr = (unsigned long)iov->iov_base;
- 	size_t count = iov->iov_len;
- 	size_t rsize = NFS_SERVER(inode)->rsize;
--	struct rpc_task *task;
- 	struct rpc_message msg = {
- 		.rpc_cred = ctx->cred,
- 	};
-@@ -348,26 +379,9 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
- 		data->res.fattr = &data->fattr;
- 		data->res.eof = 0;
- 		data->res.count = bytes;
--		nfs_fattr_init(&data->fattr);
--		msg.rpc_argp = &data->args;
--		msg.rpc_resp = &data->res;
- 
--		task_setup_data.task = &data->task;
--		task_setup_data.callback_data = data;
--		NFS_PROTO(inode)->read_setup(data, &msg);
--
--		task = rpc_run_task(&task_setup_data);
--		if (IS_ERR(task))
-+		if (nfs_direct_read_execute(data, &task_setup_data, &msg))
- 			break;
--		rpc_put_task(task);
--
--		dprintk("NFS: %5u initiated direct read call "
--			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
--				data->task.tk_pid,
--				inode->i_sb->s_id,
--				(long long)NFS_FILEID(inode),
--				bytes,
--				(unsigned long long)data->args.offset);
- 
- 		started += bytes;
- 		user_addr += bytes;
-@@ -457,12 +471,15 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
- }
- 
- #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-+static long nfs_direct_write_execute(struct nfs_write_data *data,
-+				     struct rpc_task_setup *task_setup_data,
-+				     struct rpc_message *msg);
++bl_comm_t	*bl_comm_global;
 +
- static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
- {
- 	struct inode *inode = dreq->inode;
- 	struct list_head *p;
- 	struct nfs_write_data *data;
--	struct rpc_task *task;
- 	struct rpc_message msg = {
- 		.rpc_cred = dreq->ctx->cred,
- 	};
-@@ -496,25 +513,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
- 		 * Reuse data->task; data->args should not have changed
- 		 * since the original request was sent.
- 		 */
--		task_setup_data.task = &data->task;
--		task_setup_data.callback_data = data;
--		msg.rpc_argp = &data->args;
--		msg.rpc_resp = &data->res;
--		NFS_PROTO(inode)->write_setup(data, &msg);
--
--		/*
--		 * We're called via an RPC callback, so BKL is already held.
--		 */
--		task = rpc_run_task(&task_setup_data);
--		if (!IS_ERR(task))
--			rpc_put_task(task);
--
--		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
--				data->task.tk_pid,
--				inode->i_sb->s_id,
--				(long long)NFS_FILEID(inode),
--				data->args.count,
--				(unsigned long long)data->args.offset);
-+		nfs_direct_write_execute(data, &task_setup_data, &msg);
- 	}
- 
- 	if (put_dreq(dreq))
-@@ -557,10 +556,31 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
- 	.rpc_release = nfs_direct_commit_release,
- };
- 
-+static long nfs_direct_commit_execute(struct nfs_direct_req *dreq,
-+				      struct nfs_write_data *data,
-+				      struct rpc_task_setup *task_setup_data,
-+				      struct rpc_message *msg)
++int
++nfsd_bl_start(void)
 +{
-+	struct rpc_task *task;
++	bl_comm_t	*bl_comm = NULL;
++	struct path path;
++	struct nameidata nd;
++	int rc;
 +
-+	NFS_PROTO(data->inode)->commit_setup(data, msg);
++	dprintk("%s: starting pipe\n", __func__);
++	if (bl_comm_global)
++		return -EEXIST;
 +
-+	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
-+	dreq->commit_data = NULL;
++	path.mnt = rpc_get_mount();
++	if (IS_ERR(path.mnt))
++		return PTR_ERR(path.mnt);
 +
-+	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
++	/* FIXME: do not abuse rpc_pipefs/nfs */
++	rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
++	if (rc)
++		goto err;
 +
-+	task = rpc_run_task(task_setup_data);
-+	if (IS_ERR(task))
-+		return PTR_ERR(task);
++	bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL);
++	if (!bl_comm) {
++		rc = -ENOMEM;
++		goto err;
++	}
 +
-+	rpc_put_task(task);
++	/* FIXME: rename to "spnfs_block" */
++	bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm,
++					 &bl_upcall_ops, 0);
++	if (IS_ERR(bl_comm->pipe_dentry)) {
++		rc = -EPIPE;
++		goto err;
++	}
++	mutex_init(&bl_comm->lock);
++	mutex_init(&bl_comm->pipe_lock);
++	init_waitqueue_head(&bl_comm->pipe_wq);
++
++	bl_comm_global = bl_comm;
 +	return 0;
++err:
++	rpc_put_mount();
++	kfree(bl_comm);
++	return rc;
 +}
 +
- static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
- {
- 	struct nfs_write_data *data = dreq->commit_data;
--	struct rpc_task *task;
- 	struct rpc_message msg = {
- 		.rpc_argp = &data->args,
- 		.rpc_resp = &data->res,
-@@ -589,16 +609,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
- 	data->res.verf = &data->verf;
- 	nfs_fattr_init(&data->fattr);
- 
--	NFS_PROTO(data->inode)->commit_setup(data, &msg);
--
--	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
--	dreq->commit_data = NULL;
--
--	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
--
--	task = rpc_run_task(&task_setup_data);
--	if (!IS_ERR(task))
--		rpc_put_task(task);
-+	nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg);
- }
- 
- static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
-@@ -700,6 +711,36 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
- 	.rpc_release = nfs_direct_write_release,
- };
- 
-+static long nfs_direct_write_execute(struct nfs_write_data *data,
-+				     struct rpc_task_setup *task_setup_data,
-+				     struct rpc_message *msg)
++void
++nfsd_bl_stop(void)
 +{
-+	struct inode *inode = data->inode;
-+	struct rpc_task *task;
++	bl_comm_t	*c = bl_comm_global;
 +
-+	task_setup_data->task = &data->task;
-+	task_setup_data->callback_data = data;
-+	msg->rpc_argp = &data->args;
-+	msg->rpc_resp = &data->res;
-+	NFS_PROTO(inode)->write_setup(data, msg);
++	dprintk("%s: stopping pipe\n", __func__);
++	if (!c)
++		return;
++	rpc_unlink(c->pipe_dentry);
++	rpc_put_mount();
++	bl_comm_global = NULL;
++	kfree(c);
++}
 +
-+	task = rpc_run_task(task_setup_data);
-+	if (IS_ERR(task))
-+		return PTR_ERR(task);
++static ssize_t
++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst,
++    size_t buflen)
++{
++	char	*data	= (char *)msg->data + msg->copied;
++	ssize_t	mlen	= msg->len - msg->copied,
++		left;
 +
-+	rpc_put_task(task);
++	if (mlen > buflen)
++		mlen = buflen;
 +
-+	dprintk("NFS: %5u initiated direct write call "
-+		"(req %s/%lld, %u bytes @ offset %llu)\n",
-+		data->task.tk_pid,
-+		inode->i_sb->s_id,
-+		(long long)NFS_FILEID(inode),
-+		data->args.count,
-+		(unsigned long long)data->args.offset);
++	left = copy_to_user(dst, data, mlen);
++	if (left < 0) {
++		msg->errno = left;
++		return left;
++	}
++	mlen		-= left;
++	msg->copied	+= mlen;
++	msg->errno	= 0;
 +
-+	return 0;
++	return mlen;
 +}
 +
- /*
-  * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
-  * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
-@@ -715,7 +756,6 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
- 	struct inode *inode = ctx->path.dentry->d_inode;
- 	unsigned long user_addr = (unsigned long)iov->iov_base;
- 	size_t count = iov->iov_len;
--	struct rpc_task *task;
- 	struct rpc_message msg = {
- 		.rpc_cred = ctx->cred,
- 	};
-@@ -782,24 +822,8 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
- 		data->res.verf = &data->verf;
- 		nfs_fattr_init(&data->fattr);
- 
--		task_setup_data.task = &data->task;
--		task_setup_data.callback_data = data;
--		msg.rpc_argp = &data->args;
--		msg.rpc_resp = &data->res;
--		NFS_PROTO(inode)->write_setup(data, &msg);
--
--		task = rpc_run_task(&task_setup_data);
--		if (IS_ERR(task))
-+		if (nfs_direct_write_execute(data, &task_setup_data, &msg))
- 			break;
--		rpc_put_task(task);
--
--		dprintk("NFS: %5u initiated direct write call "
--			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
--				data->task.tk_pid,
--				inode->i_sb->s_id,
--				(long long)NFS_FILEID(inode),
--				bytes,
--				(unsigned long long)data->args.offset);
- 
- 		started += bytes;
- 		user_addr += bytes;
-diff --git a/fs/nfs/file.c b/fs/nfs/file.c
-index 05bf3c0..28d4aa3 100644
---- a/fs/nfs/file.c
-+++ b/fs/nfs/file.c
-@@ -36,6 +36,7 @@
- #include "internal.h"
- #include "iostat.h"
- #include "fscache.h"
-+#include "pnfs.h"
- 
- #define NFSDBG_FACILITY		NFSDBG_FILE
- 
-@@ -380,12 +381,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
- 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- 	struct page *page;
- 	int once_thru = 0;
-+	struct pnfs_layout_segment *lseg;
- 
- 	dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
- 		file->f_path.dentry->d_parent->d_name.name,
- 		file->f_path.dentry->d_name.name,
- 		mapping->host->i_ino, len, (long long) pos);
- 
-+	lseg = pnfs_update_layout(mapping->host,
-+				  nfs_file_open_context(file),
-+				  pos, len, IOMODE_RW);
- start:
- 	/*
- 	 * Prevent starvation issues if someone is doing a consistency
-@@ -394,17 +399,22 @@ start:
- 	ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
- 			nfs_wait_bit_killable, TASK_KILLABLE);
- 	if (ret)
--		return ret;
-+		goto out;
- 
- 	page = grab_cache_page_write_begin(mapping, index, flags);
--	if (!page)
--		return -ENOMEM;
-+	if (!page) {
-+		ret = -ENOMEM;
-+		goto out;
-+	}
- 	*pagep = page;
- 
--	ret = nfs_flush_incompatible(file, page);
-+	ret = nfs_flush_incompatible(file, page, lseg);
- 	if (ret) {
- 		unlock_page(page);
- 		page_cache_release(page);
-+		*pagep = NULL;
-+		*fsdata = NULL;
-+		goto out;
- 	} else if (!once_thru &&
- 		   nfs_want_read_modify_write(file, page, pos, len)) {
- 		once_thru = 1;
-@@ -413,6 +423,12 @@ start:
- 		if (!ret)
- 			goto start;
- 	}
-+	ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata);
-+ out:
-+	if (ret) {
-+		put_lseg(lseg);
-+		*fsdata = NULL;
-+	}
- 	return ret;
- }
- 
-@@ -422,6 +438,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
- {
- 	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
- 	int status;
-+	struct pnfs_layout_segment *lseg;
- 
- 	dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
- 		file->f_path.dentry->d_parent->d_name.name,
-@@ -448,10 +465,17 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
- 			zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
- 	}
- 
--	status = nfs_updatepage(file, page, offset, copied);
-+	lseg = nfs4_pull_lseg_from_fsdata(file, fsdata);
-+	status = pnfs_write_end(file, page, pos, len, copied, lseg);
-+	if (status)
-+		goto out;
-+	status = nfs_updatepage(file, page, offset, copied, lseg, fsdata);
- 
-+ out:
- 	unlock_page(page);
- 	page_cache_release(page);
-+	pnfs_write_end_cleanup(file, fsdata);
-+	put_lseg(lseg);
- 
- 	if (status < 0)
- 		return status;
-@@ -562,6 +586,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
- 	/* make sure the cache has finished storing the page */
- 	nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
- 
-+	/* XXX Do we want to call pnfs_update_layout here? */
-+
- 	lock_page(page);
- 	mapping = page->mapping;
- 	if (mapping != dentry->d_inode->i_mapping)
-@@ -572,11 +598,11 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
- 	if (pagelen == 0)
- 		goto out_unlock;
- 
--	ret = nfs_flush_incompatible(filp, page);
-+	ret = nfs_flush_incompatible(filp, page, NULL);
- 	if (ret != 0)
- 		goto out_unlock;
- 
--	ret = nfs_updatepage(filp, page, 0, pagelen);
-+	ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL);
- out_unlock:
- 	if (!ret)
- 		return VM_FAULT_LOCKED;
-diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
-index 7d2d6c7..437d9a6 100644
---- a/fs/nfs/inode.c
-+++ b/fs/nfs/inode.c
-@@ -48,6 +48,7 @@
- #include "internal.h"
- #include "fscache.h"
- #include "dns_resolve.h"
-+#include "pnfs.h"
- 
- #define NFSDBG_FACILITY		NFSDBG_VFS
- 
-@@ -648,6 +649,7 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
- 		atomic_inc(&ctx->lock_context.count);
- 	return ctx;
- }
-+EXPORT_SYMBOL(get_nfs_open_context);
- 
- static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
- {
-@@ -1000,6 +1002,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
- 	fattr->time_start = jiffies;
- 	fattr->gencount = nfs_inc_attr_generation_counter();
- }
-+EXPORT_SYMBOL(nfs_fattr_init);
- 
- struct nfs_fattr *nfs_alloc_fattr(void)
- {
-@@ -1209,6 +1212,14 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
- 		server->fsid = fattr->fsid;
- 
- 	/*
-+	 * file needs layout commit, server attributes may be stale
-+	 */
-+	if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) {
-+		dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n",
-+			__func__, inode->i_sb->s_id, inode->i_ino);
-+		return 0;
-+	}
-+	/*
- 	 * Update the read time so we don't revalidate too often.
- 	 */
- 	nfsi->read_cache_jiffies = fattr->time_start;
-@@ -1407,11 +1418,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
-  */
- void nfs4_evict_inode(struct inode *inode)
- {
-+	pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true);
- 	truncate_inode_pages(&inode->i_data, 0);
- 	end_writeback(inode);
-+	pnfs_destroy_layout(NFS_I(inode));
- 	/* If we are holding a delegation, return it! */
- 	nfs_inode_return_delegation_noreclaim(inode);
--	/* First call standard NFS clear_inode() code */
- 	nfs_clear_inode(inode);
- }
- #endif
-@@ -1446,6 +1458,8 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
- 	nfsi->delegation = NULL;
- 	nfsi->delegation_state = 0;
- 	init_rwsem(&nfsi->rwsem);
-+	rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layout");
-+	nfsi->layout = NULL;
- #endif
- }
- 
-diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
-index c961bc9..4e7a4c9 100644
---- a/fs/nfs/internal.h
-+++ b/fs/nfs/internal.h
-@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
- 					   struct nfs_fattr *);
- extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
- extern int nfs4_check_client_ready(struct nfs_client *clp);
-+extern int nfs_sockaddr_cmp(const struct sockaddr *sa1,
-+		const struct sockaddr *sa2);
-+extern int nfs4_set_client(struct nfs_server *server,
-+		const char *hostname,
-+		const struct sockaddr *addr,
-+		const size_t addrlen,
-+		const char *ip_addr,
-+		rpc_authflavor_t authflavour,
-+		int proto, const struct rpc_timeout *timeparms,
-+		u32 minorversion);
- #ifdef CONFIG_PROC_FS
- extern int __init nfs_fs_proc_init(void);
- extern void nfs_fs_proc_exit(void);
-@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead;
- extern struct rpc_procinfo nfs4_procedures[];
- #endif
- 
-+extern int nfs4_recover_expired_lease(struct nfs_client *clp);
-+
- /* proc.c */
- void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
- 
-@@ -249,10 +261,31 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
- #endif
- 
- /* read.c */
-+extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
-+			     const struct rpc_call_ops *call_ops);
-+extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
-+			     const struct rpc_call_ops *call_ops);
- extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
- 
- /* write.c */
-+extern int nfs_initiate_write(struct nfs_write_data *data,
-+			      struct rpc_clnt *clnt,
-+			      const struct rpc_call_ops *call_ops,
-+			      int how);
-+extern int pnfs_initiate_write(struct nfs_write_data *data,
-+			      struct rpc_clnt *clnt,
-+			      const struct rpc_call_ops *call_ops,
-+			      int how);
-+extern int nfs_initiate_commit(struct nfs_write_data *data,
-+			       struct rpc_clnt *clnt,
-+			       const struct rpc_call_ops *call_ops,
-+			       int how);
-+extern int pnfs_initiate_commit(struct nfs_write_data *data,
-+			       struct rpc_clnt *clnt,
-+			       const struct rpc_call_ops *call_ops,
-+				int how, int pnfs);
- extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
-+extern void nfs_mark_list_commit(struct list_head *head);
- #ifdef CONFIG_MIGRATION
- extern int nfs_migrate_page(struct address_space *,
- 		struct page *, struct page *);
-diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
-index 311e15c..cb390fb 100644
---- a/fs/nfs/nfs4_fs.h
-+++ b/fs/nfs/nfs4_fs.h
-@@ -46,6 +46,7 @@ enum nfs4_client_state {
- 	NFS4CLNT_DELEGRETURN,
- 	NFS4CLNT_SESSION_RESET,
- 	NFS4CLNT_RECALL_SLOT,
-+	NFS4CLNT_LAYOUT_RECALL,
- };
- 
- enum nfs4_session_state {
-@@ -256,10 +257,12 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
- }
- 
- extern int nfs4_setup_sequence(const struct nfs_server *server,
-+		struct nfs4_session *ds_session,
- 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
- 		int cache_reply, struct rpc_task *task);
- extern void nfs4_destroy_session(struct nfs4_session *session);
- extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
-+extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *);
- extern int nfs4_proc_create_session(struct nfs_client *);
- extern int nfs4_proc_destroy_session(struct nfs4_session *);
- extern int nfs4_init_session(struct nfs_server *server);
-@@ -272,6 +275,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
- }
- 
- static inline int nfs4_setup_sequence(const struct nfs_server *server,
-+		struct nfs4_session *ds_session,
- 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
- 		int cache_reply, struct rpc_task *task)
- {
-@@ -289,7 +293,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
- extern const u32 nfs4_fattr_bitmap[2];
- extern const u32 nfs4_statfs_bitmap[2];
- extern const u32 nfs4_pathconf_bitmap[2];
--extern const u32 nfs4_fsinfo_bitmap[2];
-+extern const u32 nfs4_fsinfo_bitmap[3];
- extern const u32 nfs4_fs_locations_bitmap[2];
- 
- /* nfs4renewd.c */
-@@ -299,13 +303,24 @@ extern void nfs4_kill_renewd(struct nfs_client *);
- extern void nfs4_renew_state(struct work_struct *);
- 
- /* nfs4state.c */
-+struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
- struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
- struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
- #if defined(CONFIG_NFS_V4_1)
--struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
- struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
- #endif /* CONFIG_NFS_V4_1 */
- 
-+static inline struct rpc_cred *
-+nfs4_get_machine_cred(struct nfs_client *clp)
-+{
-+	struct rpc_cred *cred;
-+
-+	spin_lock(&clp->cl_lock);
-+	cred = nfs4_get_machine_cred_locked(clp);
-+	spin_unlock(&clp->cl_lock);
-+	return cred;
-+}
-+
- extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
- extern void nfs4_put_state_owner(struct nfs4_state_owner *);
- extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
-diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
-new file mode 100644
-index 0000000..aaabe2f
---- /dev/null
-+++ b/fs/nfs/nfs4filelayout.c
-@@ -0,0 +1,679 @@
-+/*
-+ *  Module for the pnfs nfs4 file layout driver.
-+ *  Defines all I/O and Policy interface operations, plus code
-+ *  to register itself with the pNFS client.
-+ *
-+ *  Copyright (c) 2002
-+ *  The Regents of the University of Michigan
-+ *  All Rights Reserved
-+ *
-+ *  Dean Hildebrand <dhildebz at umich.edu>
-+ *
-+ *  Permission is granted to use, copy, create derivative works, and
-+ *  redistribute this software and such derivative works for any purpose,
-+ *  so long as the name of the University of Michigan is not used in
-+ *  any advertising or publicity pertaining to the use or distribution
-+ *  of this software without specific, written prior authorization. If
-+ *  the above copyright notice or any other identification of the
-+ *  University of Michigan is included in any copy of any portion of
-+ *  this software, then the disclaimer below must also be included.
-+ *
-+ *  This software is provided as is, without representation or warranty
-+ *  of any kind either express or implied, including without limitation
-+ *  the implied warranties of merchantability, fitness for a particular
-+ *  purpose, or noninfringement.  The Regents of the University of
-+ *  Michigan shall not be liable for any damages, including special,
-+ *  indirect, incidental, or consequential damages, with respect to any
-+ *  claim arising out of or in connection with the use of the software,
-+ *  even if it has been or is hereafter advised of the possibility of
-+ *  such damages.
-+ */
-+
-+#include <linux/nfs_fs.h>
-+
-+#include "internal.h"
-+#include "nfs4filelayout.h"
-+
-+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
-+
-+MODULE_LICENSE("GPL");
-+MODULE_AUTHOR("Dean Hildebrand <dhildebz at umich.edu>");
-+MODULE_DESCRIPTION("The NFSv4 file layout driver");
-+
-+int
-+filelayout_initialize_mountpoint(struct nfs_server *nfss,
-+				 const struct nfs_fh *mntfh)
-+{
-+	int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
-+						nfs4_fl_free_deviceid_callback);
-+	if (status) {
-+		printk(KERN_WARNING "%s: deviceid cache could not be "
-+			"initialized\n", __func__);
-+		return status;
-+	}
-+	dprintk("%s: deviceid cache has been initialized successfully\n",
-+		__func__);
-+	return 0;
-+}
-+
-+/* Uninitialize a mountpoint by destroying its device list */
-+int
-+filelayout_uninitialize_mountpoint(struct nfs_server *nfss)
-+{
-+	dprintk("--> %s\n", __func__);
-+
-+	if (nfss->nfs_client->cl_devid_cache)
-+		pnfs_put_deviceid_cache(nfss->nfs_client);
-+	return 0;
-+}
-+
-+/* This function is used by the layout driver to calculate the
-+ * offset of the file on the dserver based on whether the
-+ * layout type is STRIPE_DENSE or STRIPE_SPARSE
-+ */
-+static loff_t
-+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
-+{
-+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
-+
-+	switch (flseg->stripe_type) {
-+	case STRIPE_SPARSE:
-+		return offset;
-+
-+	case STRIPE_DENSE:
-+	{
-+		u32 stripe_width;
-+		u64 tmp, off;
-+		u32 unit = flseg->stripe_unit;
-+
-+		stripe_width = unit * flseg->dsaddr->stripe_count;
-+		tmp = off = offset - flseg->pattern_offset;
-+		do_div(tmp, stripe_width);
-+		return tmp * unit + do_div(off, unit);
-+	}
-+	default:
-+		BUG();
-+	}
-+
-+	/* We should never get here... just to stop the gcc warning */
-+	return 0;
-+}
-+
-+/*
-+ * Call ops for the async read/write cases
-+ * In the case of dense layouts, the offset needs to be reset to its
-+ * original value.
-+ */
-+static void filelayout_read_call_done(struct rpc_task *task, void *data)
-+{
-+	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
-+
-+	if (rdata->fldata.orig_offset) {
-+		dprintk("%s new off %llu orig offset %llu\n", __func__,
-+			rdata->args.offset, rdata->fldata.orig_offset);
-+		rdata->args.offset = rdata->fldata.orig_offset;
-+	}
-+
-+	/* Note this may cause RPC to be resent */
-+	rdata->pdata.call_ops->rpc_call_done(task, data);
-+}
-+
-+static void filelayout_read_release(void *data)
-+{
-+	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
-+
-+	put_lseg(rdata->pdata.lseg);
-+	rdata->pdata.lseg = NULL;
-+	rdata->pdata.call_ops->rpc_release(data);
-+}
-+
-+static void filelayout_write_call_done(struct rpc_task *task, void *data)
-+{
-+	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
-+
-+	if (wdata->fldata.orig_offset) {
-+		dprintk("%s new off %llu orig offset %llu\n", __func__,
-+			wdata->args.offset, wdata->fldata.orig_offset);
-+		wdata->args.offset = wdata->fldata.orig_offset;
-+	}
-+
-+	/* Note this may cause RPC to be resent */
-+	wdata->pdata.call_ops->rpc_call_done(task, data);
-+}
-+
-+static void filelayout_write_release(void *data)
-+{
-+	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
-+
-+	put_lseg(wdata->pdata.lseg);
-+	wdata->pdata.lseg = NULL;
-+	wdata->pdata.call_ops->rpc_release(data);
-+}
-+
-+struct rpc_call_ops filelayout_read_call_ops = {
-+	.rpc_call_prepare = nfs_read_prepare,
-+	.rpc_call_done = filelayout_read_call_done,
-+	.rpc_release = filelayout_read_release,
-+};
-+
-+struct rpc_call_ops filelayout_write_call_ops = {
-+	.rpc_call_prepare = nfs_write_prepare,
-+	.rpc_call_done = filelayout_write_call_done,
-+	.rpc_release = filelayout_write_release,
-+};
-+
-+/* Perform sync or async reads.
-+ *
-+ * An optimization for the NFS file layout driver
-+ * allows the original read/write data structs to be passed in the
-+ * last argument.
-+ *
-+ * TODO: join with write_pagelist?
-+ */
-+static enum pnfs_try_status
-+filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages)
-+{
-+	struct pnfs_layout_segment *lseg = data->pdata.lseg;
-+	struct nfs4_pnfs_ds *ds;
-+	loff_t offset = data->args.offset;
-+	u32 idx;
-+	struct nfs_fh *fh;
-+
-+	dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n",
-+		__func__, data->inode->i_ino, nr_pages,
-+		data->args.pgbase, (size_t)data->args.count, offset);
-+
-+	/* Retrieve the correct rpc_client for the byte range */
-+	idx = nfs4_fl_calc_ds_index(lseg, offset);
-+	ds = nfs4_fl_prepare_ds(lseg, idx);
-+	if (!ds) {
-+		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
-+		return PNFS_NOT_ATTEMPTED;
-+	}
-+	dprintk("%s USE DS:ip %x %hu\n", __func__,
-+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
-+
-+	/* just try the first data server for the index..*/
-+	data->fldata.ds_nfs_client = ds->ds_clp;
-+	fh = nfs4_fl_select_ds_fh(lseg, offset);
-+	if (fh)
-+		data->args.fh = fh;
-+
-+	/*
-+	 * Now get the file offset on the dserver
-+	 * Set the read offset to this offset, and
-+	 * save the original offset in orig_offset
-+	 * In the case of aync reads, the offset will be reset in the
-+	 * call_ops->rpc_call_done() routine.
-+	 */
-+	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
-+	data->fldata.orig_offset = offset;
-+
-+	/* Perform an asynchronous read */
-+	nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
-+			  &filelayout_read_call_ops);
-+
-+	data->pdata.pnfs_error = 0;
-+
-+	return PNFS_ATTEMPTED;
-+}
-+
-+/* Perform async writes. */
-+static enum pnfs_try_status
-+filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync)
-+{
-+	struct pnfs_layout_segment *lseg = data->pdata.lseg;
-+	struct nfs4_pnfs_ds *ds;
-+	loff_t offset = data->args.offset;
-+	u32 idx;
-+	struct nfs_fh *fh;
-+
-+	/* Retrieve the correct rpc_client for the byte range */
-+	idx = nfs4_fl_calc_ds_index(lseg, offset);
-+	ds = nfs4_fl_prepare_ds(lseg, idx);
-+	if (!ds) {
-+		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
-+		return PNFS_NOT_ATTEMPTED;
-+	}
-+	dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
-+		data->inode->i_ino, sync, (size_t) data->args.count, offset,
-+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
-+
-+	data->fldata.ds_nfs_client = ds->ds_clp;
-+	fh = nfs4_fl_select_ds_fh(lseg, offset);
-+	if (fh)
-+		data->args.fh = fh;
-+	/*
-+	 * Get the file offset on the dserver. Set the write offset to
-+	 * this offset and save the original offset.
-+	 */
-+	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
-+	data->fldata.orig_offset = offset;
-+
-+	/*
-+	 * Perform an asynchronous write The offset will be reset in the
-+	 * call_ops->rpc_call_done() routine
-+	 */
-+	nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
-+			   &filelayout_write_call_ops, sync);
-+
-+	data->pdata.pnfs_error = 0;
-+	return PNFS_ATTEMPTED;
-+}
-+
-+/*
-+ * filelayout_check_layout()
-+ *
-+ * Make sure layout segment parameters are sane WRT the device.
-+ * At this point no generic layer initialization of the lseg has occurred,
-+ * and nothing has been added to the layout_hdr cache.
-+ *
-+ */
-+static int
-+filelayout_check_layout(struct pnfs_layout_hdr *lo,
-+			struct nfs4_filelayout_segment *fl,
-+			struct nfs4_layoutget_res *lgr,
-+			struct nfs4_deviceid *id)
-+{
-+	struct nfs4_file_layout_dsaddr *dsaddr;
-+	int status = -EINVAL;
-+	struct nfs_server *nfss = NFS_SERVER(lo->inode);
-+
-+	dprintk("--> %s\n", __func__);
-+
-+	if (fl->pattern_offset > lgr->range.offset) {
-+		dprintk("%s pattern_offset %lld to large\n",
-+				__func__, fl->pattern_offset);
-+		goto out;
-+	}
-+
-+	if (fl->stripe_unit % PAGE_SIZE) {
-+		dprintk("%s Stripe unit (%u) not page aligned\n",
-+			__func__, fl->stripe_unit);
-+		goto out;
-+	}
-+
-+	/* find and reference the deviceid */
-+	dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
-+	if (dsaddr == NULL) {
-+		dsaddr = get_device_info(lo->inode, id);
-+		if (dsaddr == NULL)
-+			goto out;
-+	}
-+	fl->dsaddr = dsaddr;
-+
-+	if (fl->first_stripe_index < 0 ||
-+	    fl->first_stripe_index >= dsaddr->stripe_count) {
-+		dprintk("%s Bad first_stripe_index %d\n",
-+				__func__, fl->first_stripe_index);
-+		goto out_put;
-+	}
-+
-+	if ((fl->stripe_type == STRIPE_SPARSE &&
-+	    fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
-+	    (fl->stripe_type == STRIPE_DENSE &&
-+	    fl->num_fh != dsaddr->stripe_count)) {
-+		dprintk("%s num_fh %u not valid for given packing\n",
-+			__func__, fl->num_fh);
-+		goto out_put;
-+	}
-+
-+	if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
-+		dprintk("%s Stripe unit (%u) not aligned with rsize %u "
-+			"wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
-+			nfss->wsize);
-+	}
-+
-+	status = 0;
-+out:
-+	dprintk("--> %s returns %d\n", __func__, status);
-+	return status;
-+out_put:
-+	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
-+	goto out;
-+}
-+
-+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
-+{
-+	int i;
-+
-+	for (i = 0; i < fl->num_fh; i++) {
-+		if (!fl->fh_array[i])
-+			break;
-+		kfree(fl->fh_array[i]);
-+	}
-+	kfree(fl->fh_array);
-+	fl->fh_array = NULL;
-+}
-+
-+static void
-+_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
-+{
-+	filelayout_free_fh_array(fl);
-+	kfree(fl);
-+}
-+
-+static int
-+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
-+			 struct nfs4_filelayout_segment *fl,
-+			 struct nfs4_layoutget_res *lgr,
-+			 struct nfs4_deviceid *id)
-+{
-+	uint32_t *p = (uint32_t *)lgr->layout.buf;
-+	uint32_t nfl_util;
-+	int i;
-+
-+	dprintk("%s: set_layout_map Begin\n", __func__);
-+
-+	memcpy(id, p, sizeof(*id));
-+	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-+	print_deviceid(id);
-+
-+	nfl_util = be32_to_cpup(p++);
-+	if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
-+		fl->commit_through_mds = 1;
-+	if (nfl_util & NFL4_UFLG_DENSE)
-+		fl->stripe_type = STRIPE_DENSE;
-+	else
-+		fl->stripe_type = STRIPE_SPARSE;
-+	fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
-+
-+	fl->first_stripe_index = be32_to_cpup(p++);
-+	p = xdr_decode_hyper(p, &fl->pattern_offset);
-+	fl->num_fh = be32_to_cpup(p++);
-+
-+	dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
-+		__func__, nfl_util, fl->num_fh, fl->first_stripe_index,
-+		fl->pattern_offset);
-+
-+	fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
-+			       GFP_KERNEL);
-+	if (!fl->fh_array)
-+		return -ENOMEM;
-+
-+	for (i = 0; i < fl->num_fh; i++) {
-+		/* Do we want to use a mempool here? */
-+		fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
-+		if (!fl->fh_array[i]) {
-+			filelayout_free_fh_array(fl);
-+			return -ENOMEM;
-+		}
-+		fl->fh_array[i]->size = be32_to_cpup(p++);
-+		if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
-+			printk(KERN_ERR "Too big fh %d received %d\n",
-+			       i, fl->fh_array[i]->size);
-+			filelayout_free_fh_array(fl);
-+			return -EIO;
-+		}
-+		memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
-+		p += XDR_QUADLEN(fl->fh_array[i]->size);
-+		dprintk("DEBUG: %s: fh len %d\n", __func__,
-+			fl->fh_array[i]->size);
-+	}
-+
-+	return 0;
-+}
-+
-+static struct pnfs_layout_segment *
-+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
-+		      struct nfs4_layoutget_res *lgr)
-+{
-+	struct nfs4_filelayout_segment *fl;
-+	int rc;
-+	struct nfs4_deviceid id;
-+
-+	dprintk("--> %s\n", __func__);
-+	fl = kzalloc(sizeof(*fl), GFP_KERNEL);
-+	if (!fl)
-+		return NULL;
-+
-+	rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
-+	if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
-+		_filelayout_free_lseg(fl);
-+		return NULL;
-+	}
-+	return &fl->generic_hdr;
-+}
-+
-+static void
-+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
-+{
-+	struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
-+	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
-+
-+	dprintk("--> %s\n", __func__);
-+	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
-+			  &fl->dsaddr->deviceid);
-+	_filelayout_free_lseg(fl);
-+}
-+
-+/* Allocate a new nfs_write_data struct and initialize */
-+static struct nfs_write_data *
-+filelayout_clone_write_data(struct nfs_write_data *old)
-+{
-+	static struct nfs_write_data *new;
-+
-+	new = nfs_commitdata_alloc();
-+	if (!new)
-+		goto out;
-+	kref_init(&new->refcount);
-+	new->parent      = old;
-+	kref_get(&old->refcount);
-+	new->inode       = old->inode;
-+	new->cred        = old->cred;
-+	new->args.offset = 0;
-+	new->args.count  = 0;
-+	new->res.count   = 0;
-+	new->res.fattr   = &new->fattr;
-+	nfs_fattr_init(&new->fattr);
-+	new->res.verf    = &new->verf;
-+	new->args.context = get_nfs_open_context(old->args.context);
-+	new->pdata.lseg = NULL;
-+	new->pdata.call_ops = old->pdata.call_ops;
-+	new->pdata.how = old->pdata.how;
-+out:
-+	return new;
-+}
-+
-+static void filelayout_commit_call_done(struct rpc_task *task, void *data)
-+{
-+	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
-+
-+	wdata->pdata.call_ops->rpc_call_done(task, data);
-+}
-+
-+static struct rpc_call_ops filelayout_commit_call_ops = {
-+	.rpc_call_prepare = nfs_write_prepare,
-+	.rpc_call_done = filelayout_commit_call_done,
-+	.rpc_release = filelayout_write_release,
-+};
-+
-+/*
-+ * Execute a COMMIT op to the MDS or to each data server on which a page
-+ * in 'pages' exists.
-+ * Invoke the pnfs_commit_complete callback.
-+ */
-+enum pnfs_try_status
-+filelayout_commit(struct nfs_write_data *data, int sync)
-+{
-+	LIST_HEAD(head);
-+	struct nfs_page *req;
-+	loff_t file_offset = 0;
-+	u16 idx, i;
-+	struct list_head **ds_page_list = NULL;
-+	u16 *indices_used;
-+	int num_indices_seen = 0;
-+	const struct rpc_call_ops *call_ops;
-+	struct rpc_clnt *clnt;
-+	struct nfs_write_data **clone_list = NULL;
-+	struct nfs_write_data *dsdata;
-+	struct nfs4_pnfs_ds *ds;
-+
-+	dprintk("%s data %p sync %d\n", __func__, data, sync);
-+
-+	/* Alloc room for both in one go */
-+	ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) *
-+			       (sizeof(u16) + sizeof(struct list_head *)),
-+			       GFP_KERNEL);
-+	if (!ds_page_list)
-+		goto mem_error;
-+	indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1);
-+	/*
-+	 * Sort pages based on which ds to send to.
-+	 * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT.
-+	 * Note we are assuming there is only a single lseg in play.
-+	 * When that is not true, we could first sort on lseg, then
-+	 * sort within each as we do here.
-+	 */
-+	while (!list_empty(&data->pages)) {
-+		req = nfs_list_entry(data->pages.next);
-+		nfs_list_remove_request(req);
-+		if (!req->wb_lseg ||
-+		    ((struct nfs4_filelayout_segment *)
-+		     FILELAYOUT_LSEG(req->wb_lseg))->commit_through_mds)
-+			idx = NFS4_PNFS_MAX_MULTI_CNT;
-+		else {
-+			file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
-+			idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset);
-+		}
-+		if (ds_page_list[idx]) {
-+			/* Already seen this idx */
-+			list_add(&req->wb_list, ds_page_list[idx]);
-+		} else {
-+			/* New idx not seen so far */
-+			list_add_tail(&req->wb_list, &head);
-+			indices_used[num_indices_seen++] = idx;
-+		}
-+		ds_page_list[idx] = &req->wb_list;
-+	}
-+	/* Once created, clone must be released via call_op */
-+	clone_list = kzalloc(num_indices_seen *
-+			     sizeof(struct nfs_write_data *), GFP_KERNEL);
-+	if (!clone_list)
-+		goto mem_error;
-+	for (i = 0; i < num_indices_seen - 1; i++) {
-+		clone_list[i] = filelayout_clone_write_data(data);
-+		if (!clone_list[i])
-+			goto mem_error;
-+	}
-+	clone_list[i] = data;
-+	/*
-+	 * Now send off the RPCs to each ds.  Note that it is important
-+	 * that any RPC to the MDS be sent last (or at least after all
-+	 * clones have been made.)
-+	 */
-+	for (i = 0; i < num_indices_seen; i++) {
-+		dsdata = clone_list[i];
-+		idx = indices_used[i];
-+		list_cut_position(&dsdata->pages, &head, ds_page_list[idx]);
-+		if (idx == NFS4_PNFS_MAX_MULTI_CNT) {
-+			call_ops = data->pdata.call_ops;;
-+			clnt = NFS_CLIENT(dsdata->inode);
-+			ds = NULL;
-+		} else {
-+			struct nfs_fh *fh;
-+
-+			call_ops = &filelayout_commit_call_ops;
-+			req = nfs_list_entry(dsdata->pages.next);
-+			ds = nfs4_fl_prepare_ds(req->wb_lseg, idx);
-+			if (!ds) {
-+				/* Trigger retry of this chunk through MDS */
-+				dsdata->task.tk_status = -EIO;
-+				data->pdata.call_ops->rpc_release(dsdata);
-+				continue;
-+			}
-+			clnt = ds->ds_clp->cl_rpcclient;
-+			dsdata->fldata.ds_nfs_client = ds->ds_clp;
-+			file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
-+			fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset);
-+			if (fh)
-+				dsdata->args.fh = fh;
-+		}
-+		dprintk("%s: Initiating commit: %llu USE DS:\n",
-+			__func__, file_offset);
-+		ifdebug(FACILITY)
-+			print_ds(ds);
-+
-+		/* Send COMMIT to data server */
-+		nfs_initiate_commit(dsdata, clnt, call_ops, sync);
-+	}
-+	kfree(clone_list);
-+	kfree(ds_page_list);
-+	data->pdata.pnfs_error = 0;
-+	return PNFS_ATTEMPTED;
-+
-+ mem_error:
-+	if (clone_list) {
-+		for (i = 0; i < num_indices_seen - 1; i++) {
-+			if (!clone_list[i])
-+				break;
-+			data->pdata.call_ops->rpc_release(clone_list[i]);
-+		}
-+		kfree(clone_list);
-+	}
-+	kfree(ds_page_list);
-+	/* One of these will be empty, but doesn't hurt to do both */
-+	nfs_mark_list_commit(&head);
-+	nfs_mark_list_commit(&data->pages);
-+	data->pdata.call_ops->rpc_release(data);
-+	return PNFS_ATTEMPTED;
-+}
-+
-+/*
-+ * filelayout_pg_test - decide whether @req may be coalesced with @prev.
-+ *
-+ * Called by nfs_can_coalesce_requests().  By the time this is called,
-+ * we know req->wb_lseg == prev->wb_lseg.
-+ *
-+ * return 1 :  coalesce page (@req has no layout segment, or both pages
-+ *             start in the same stripe unit)
-+ * return 0 :  don't coalesce page
-+ */
-+int
-+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
-+		   struct nfs_page *req)
-+{
-+	u64 prev_unit, req_unit;
-+	u32 unit_size;
-+
-+	if (req->wb_lseg == NULL)
-+		return 1;
-+
-+	unit_size = FILELAYOUT_LSEG(req->wb_lseg)->stripe_unit;
-+
-+	/* do_div() divides in place: byte offset -> stripe unit number */
-+	prev_unit = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
-+	do_div(prev_unit, unit_size);
-+
-+	req_unit = (u64)req->wb_index << PAGE_CACHE_SHIFT;
-+	do_div(req_unit, unit_size);
-+
-+	return prev_unit == req_unit;
-+}
-+
-+/*
-+ * Layout driver descriptor for LAYOUT_NFSV4_1_FILES.  It is handed to
-+ * pnfs_register_layoutdriver()/pnfs_unregister_layoutdriver() by the module
-+ * init and exit routines and wires the filelayout_* callbacks into the
-+ * generic pNFS code.
-+ */
-+static struct pnfs_layoutdriver_type filelayout_type = {
-+	.id = LAYOUT_NFSV4_1_FILES,
-+	.name = "LAYOUT_NFSV4_1_FILES",
-+	.owner = THIS_MODULE,
-+	.flags                   = PNFS_USE_RPC_CODE,
-+	.initialize_mountpoint   = filelayout_initialize_mountpoint,
-+	.uninitialize_mountpoint = filelayout_uninitialize_mountpoint,
-+	.alloc_lseg              = filelayout_alloc_lseg,
-+	.free_lseg               = filelayout_free_lseg,
-+	.pg_test                 = filelayout_pg_test,
-+	.read_pagelist           = filelayout_read_pagelist,
-+	.write_pagelist          = filelayout_write_pagelist,
-+	.commit                  = filelayout_commit,
-+};
-+
-+/*
-+ * Module entry point: announce the driver and register filelayout_type
-+ * with the pNFS core.  Returns whatever pnfs_register_layoutdriver() returns.
-+ */
-+static int __init nfs4filelayout_init(void)
-+{
-+	int status;
-+
-+	printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
-+	       __func__);
-+	status = pnfs_register_layoutdriver(&filelayout_type);
-+	return status;
-+}
-+
-+/* Module exit point: announce the unload and unregister filelayout_type. */
-+static void __exit nfs4filelayout_exit(void)
-+{
-+	printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
-+	       __func__);
-+	pnfs_unregister_layoutdriver(&filelayout_type);
-+}
-+
-+module_init(nfs4filelayout_init);
-+module_exit(nfs4filelayout_exit);
-diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
-new file mode 100644
-index 0000000..f884b0c
---- /dev/null
-+++ b/fs/nfs/nfs4filelayout.h
-@@ -0,0 +1,100 @@
-+/*
-+ *  NFSv4 file layout driver data structures.
-+ *
-+ *  Copyright (c) 2002
-+ *  The Regents of the University of Michigan
-+ *  All Rights Reserved
-+ *
-+ *  Dean Hildebrand <dhildebz at umich.edu>
-+ *
-+ *  Permission is granted to use, copy, create derivative works, and
-+ *  redistribute this software and such derivative works for any purpose,
-+ *  so long as the name of the University of Michigan is not used in
-+ *  any advertising or publicity pertaining to the use or distribution
-+ *  of this software without specific, written prior authorization. If
-+ *  the above copyright notice or any other identification of the
-+ *  University of Michigan is included in any copy of any portion of
-+ *  this software, then the disclaimer below must also be included.
-+ *
-+ *  This software is provided as is, without representation or warranty
-+ *  of any kind either express or implied, including without limitation
-+ *  the implied warranties of merchantability, fitness for a particular
-+ *  purpose, or noninfringement.  The Regents of the University of
-+ *  Michigan shall not be liable for any damages, including special,
-+ *  indirect, incidental, or consequential damages, with respect to any
-+ *  claim arising out of or in connection with the use of the software,
-+ *  even if it has been or is hereafter advised of the possibility of
-+ *  such damages.
-+ */
-+
-+#ifndef FS_NFS_NFS4FILELAYOUT_H
-+#define FS_NFS_NFS4FILELAYOUT_H
-+
-+#include "pnfs.h"
-+
-+/*
-+ * Field testing shows we need to support up to 4096 stripe indices.
-+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
-+ * reasonable. This in turn means we support a maximum of 256
-+ * RFC 5661 multipath_list4 structures.
-+ */
-+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
-+#define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */
-+
-+/*
-+ * Values compared against nfs4_filelayout_segment.stripe_type when
-+ * selecting a data server file handle.
-+ * NOTE(review): presumably the RFC 5661 sparse/dense packing types - confirm.
-+ */
-+enum stripetype4 {
-+	STRIPE_SPARSE = 1,
-+	STRIPE_DENSE = 2
-+};
-+
-+/*
-+ * Individual ip address
-+ *
-+ * One entry of the data server cache, keyed by ds_ip_addr and ds_port
-+ * (both are passed through ntohl()/ntohs() when printed, i.e. they are
-+ * kept in network byte order).  ds_count is the reference count; ds_clp
-+ * is NULL until an nfs_client has been set up for this data server.
-+ */
-+struct nfs4_pnfs_ds {
-+	struct list_head	ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
-+	u32			ds_ip_addr;
-+	u32			ds_port;
-+	struct nfs_client	*ds_clp;
-+	atomic_t		ds_count;
-+};
-+
-+/*
-+ * Decoded device address for one device id.
-+ *
-+ * stripe_indices holds stripe_count u8 entries, each an index into
-+ * ds_list.  ds_list is declared with one element but the structure is
-+ * allocated with room for ds_num pointers (see decode_device()).
-+ */
-+struct nfs4_file_layout_dsaddr {
-+	struct pnfs_deviceid_node	deviceid;
-+	u32				stripe_count;
-+	u8				*stripe_indices;
-+	u32				ds_num;
-+	struct nfs4_pnfs_ds		*ds_list[1];
-+};
-+
-+/*
-+ * File-layout specific layout segment.  It embeds the generic
-+ * pnfs_layout_segment as generic_hdr so that FILELAYOUT_LSEG() can recover
-+ * the containing structure with container_of().
-+ *
-+ * stripe_unit, first_stripe_index and pattern_offset feed the stripe index
-+ * calculation; fh_array is the per-data-server file handle table.
-+ * NOTE(review): fh_array presumably has num_fh entries - confirm at the
-+ * allocation site.
-+ */
-+struct nfs4_filelayout_segment {
-+	struct pnfs_layout_segment generic_hdr;
-+	u32 stripe_type;
-+	u32 commit_through_mds;
-+	u32 stripe_unit;
-+	u32 first_stripe_index;
-+	u64 pattern_offset;
-+	struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
-+	unsigned int num_fh;
-+	struct nfs_fh **fh_array;
-+};
-+
-+/*
-+ * Convert a generic layout segment pointer into the file-layout segment
-+ * that embeds it (via its generic_hdr member).
-+ */
-+static inline struct nfs4_filelayout_segment *
-+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
-+{
-+	return container_of(lseg,
-+			    struct nfs4_filelayout_segment,
-+			    generic_hdr);
-+}
-+
-+extern struct nfs_fh *
-+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset);
-+
-+extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
-+extern void print_ds(struct nfs4_pnfs_ds *ds);
-+extern void print_deviceid(struct nfs4_deviceid *dev_id);
-+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset);
-+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
-+					u32 ds_idx);
-+extern struct nfs4_file_layout_dsaddr *
-+nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
-+struct nfs4_file_layout_dsaddr *
-+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
-+
-+#endif /* FS_NFS_NFS4FILELAYOUT_H */
-diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
-new file mode 100644
-index 0000000..1f0ab62
---- /dev/null
-+++ b/fs/nfs/nfs4filelayoutdev.c
-@@ -0,0 +1,620 @@
-+/*
-+ *  Device operations for the pnfs nfs4 file layout driver.
-+ *
-+ *  Copyright (c) 2002
-+ *  The Regents of the University of Michigan
-+ *  All Rights Reserved
-+ *
-+ *  Dean Hildebrand <dhildebz at umich.edu>
-+ *  Garth Goodson   <Garth.Goodson at netapp.com>
-+ *
-+ *  Permission is granted to use, copy, create derivative works, and
-+ *  redistribute this software and such derivative works for any purpose,
-+ *  so long as the name of the University of Michigan is not used in
-+ *  any advertising or publicity pertaining to the use or distribution
-+ *  of this software without specific, written prior authorization. If
-+ *  the above copyright notice or any other identification of the
-+ *  University of Michigan is included in any copy of any portion of
-+ *  this software, then the disclaimer below must also be included.
-+ *
-+ *  This software is provided as is, without representation or warranty
-+ *  of any kind either express or implied, including without limitation
-+ *  the implied warranties of merchantability, fitness for a particular
-+ *  purpose, or noninfringement.  The Regents of the University of
-+ *  Michigan shall not be liable for any damages, including special,
-+ *  indirect, incidental, or consequential damages, with respect to any
-+ *  claim arising out of or in connection with the use of the software,
-+ *  even if it has been or is hereafter advised of the possibility of
-+ *  such damages.
-+ */
-+
-+#include <linux/nfs_fs.h>
-+#include <linux/vmalloc.h>
-+
-+#include "internal.h"
-+#include "nfs4filelayout.h"
-+
-+#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
-+
-+/*
-+ * Data server cache
-+ *
-+ * Data servers can be mapped to different device ids.
-+ * nfs4_pnfs_ds reference counting
-+ *   - set to 1 on allocation
-+ *   - incremented when a device id maps a data server already in the cache.
-+ *   - decremented when deviceid is removed from the cache.
-+ */
-+DEFINE_SPINLOCK(nfs4_ds_cache_lock);
-+static LIST_HEAD(nfs4_data_server_cache);
-+
-+/* Debug routines */
-+
-+/*
-+ * Print the address, port, reference count and client of a data server.
-+ * A NULL @ds is reported instead of dereferenced; a data server with no
-+ * client yet prints 0 for the exchange flags.
-+ */
-+void
-+print_ds(struct nfs4_pnfs_ds *ds)
-+{
-+	u32 exchange_flags = 0;
-+
-+	if (!ds) {
-+		printk("%s NULL device\n", __func__);
-+		return;
-+	}
-+
-+	if (ds->ds_clp)
-+		exchange_flags = ds->ds_clp->cl_exchange_flags;
-+
-+	printk("        ip_addr %x port %hu\n"
-+		"        ref count %d\n"
-+		"        client %p\n"
-+		"        cl_exchange_flags %x\n",
-+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
-+		atomic_read(&ds->ds_count), ds->ds_clp,
-+		exchange_flags);
-+}
-+
-+/*
-+ * When the debug facility is enabled, print ds_num followed by every
-+ * entry of dsaddr->ds_list through print_ds() (which tolerates NULL).
-+ */
-+void
-+print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
-+{
-+	int i;
-+
-+	ifdebug(FACILITY) {
-+		printk("%s dsaddr->ds_num %d\n", __func__,
-+		       dsaddr->ds_num);
-+		for (i = 0; i < dsaddr->ds_num; i++)
-+			print_ds(dsaddr->ds_list[i]);
-+	}
-+}
-+
-+/* Emit the device id through dprintk() as four 32-bit hex words. */
-+void print_deviceid(struct nfs4_deviceid *id)
-+{
-+	/* view the id as an array of u32 words for printing */
-+	u32 *p = (u32 *)id;
-+
-+	dprintk("%s: device id= [%x%x%x%x]\n", __func__,
-+		p[0], p[1], p[2], p[3]);
-+}
-+
-+/*
-+ * Search the data server cache for an entry with the given address and
-+ * port.  nfs4_ds_cache_lock is held by the caller.
-+ *
-+ * Returns the matching entry, or NULL when none is cached.
-+ */
-+static struct nfs4_pnfs_ds *
-+_data_server_lookup_locked(u32 ip_addr, u32 port)
-+{
-+	struct nfs4_pnfs_ds *cur;
-+
-+	dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
-+			ntohl(ip_addr), ntohs(port));
-+
-+	list_for_each_entry(cur, &nfs4_data_server_cache, ds_node) {
-+		if (cur->ds_ip_addr != ip_addr)
-+			continue;
-+		if (cur->ds_port != port)
-+			continue;
-+		return cur;
-+	}
-+	return NULL;
-+}
-+
-+/*
-+ * Set up the nfs_client used to talk to data server @ds and store it in
-+ * ds->ds_clp.
-+ *
-+ * If the data server address equals the MDS address, the MDS nfs_client is
-+ * reused (with an extra cl_count reference) provided it advertises the
-+ * EXCHGID4_FLAG_USE_PNFS_DS role.  Otherwise a new client is created through
-+ * a temporary nfs_server, using the MDS timeout and auth flavor.
-+ *
-+ * Returns 0 on success and a negative errno on failure: -ENODEV when the
-+ * server is not a pNFS data server, -ENOMEM when the temporary nfs_server
-+ * cannot be allocated, or the error from the client setup calls.  On
-+ * failure ds->ds_clp is left untouched.
-+ */
-+static int
-+nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
-+{
-+	struct nfs_server	*tmp;
-+	struct sockaddr_in	sin;
-+	struct rpc_clnt		*mds_clnt = mds_srv->client;
-+	struct nfs_client	*clp = mds_srv->nfs_client;
-+	struct sockaddr		*mds_addr;
-+	int err = 0;
-+
-+	dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
-+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
-+		mds_clnt->cl_auth->au_flavor);
-+
-+	sin.sin_family = AF_INET;
-+	sin.sin_addr.s_addr = ds->ds_ip_addr;
-+	sin.sin_port = ds->ds_port;
-+
-+	/*
-+	 * If this DS is also the MDS, use the MDS session only if the
-+	 * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role.
-+	 */
-+	mds_addr = (struct sockaddr *)&clp->cl_addr;
-+	if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) {
-+		if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
-+			printk(KERN_INFO
-+			       "ip:port %x:%hu is not a pNFS Data Server\n",
-+			       ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
-+			err = -ENODEV;
-+		} else {
-+			atomic_inc(&clp->cl_count);
-+			ds->ds_clp = clp;
-+			dprintk("%s Using MDS Session for DS\n", __func__);
-+		}
-+		goto out;
-+	}
-+
-+	/* Temporary server for nfs4_set_client */
-+	tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
-+	if (!tmp) {
-+		/*
-+		 * Must not return 0 here: the caller would treat the data
-+		 * server as ready while ds->ds_clp is still NULL.
-+		 */
-+		err = -ENOMEM;
-+		goto out;
-+	}
-+
-+	/*
-+	 * Set a retrans, timeout interval, and authflavor equal to the MDS
-+	 * values. Use the MDS nfs_client cl_ipaddr field so as to use the
-+	 * same co_ownerid as the MDS.
-+	 */
-+	err = nfs4_set_client(tmp,
-+			      mds_srv->nfs_client->cl_hostname,
-+			      (struct sockaddr *)&sin,
-+			      sizeof(struct sockaddr),
-+			      mds_srv->nfs_client->cl_ipaddr,
-+			      mds_clnt->cl_auth->au_flavor,
-+			      IPPROTO_TCP,
-+			      mds_clnt->cl_xprt->timeout,
-+			      1 /* minorversion */);
-+	if (err < 0)
-+		goto out_free;
-+
-+	clp = tmp->nfs_client;
-+
-+	/* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */
-+	dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp);
-+	clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS;
-+
-+	err = nfs4_recover_expired_lease(clp);
-+	if (!err)
-+		err = nfs4_check_client_ready(clp);
-+	if (err)
-+		goto out_put;
-+
-+	if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
-+		printk(KERN_INFO "ip:port %x:%hu is not a pNFS Data Server\n",
-+		       ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
-+		err = -ENODEV;
-+		goto out_put;
-+	}
-+	/*
-+	 * Set DS lease equal to the MDS lease, renewal is scheduled in
-+	 * create_session
-+	 */
-+	spin_lock(&mds_srv->nfs_client->cl_lock);
-+	clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
-+	spin_unlock(&mds_srv->nfs_client->cl_lock);
-+	clp->cl_last_renewal = jiffies;
-+
-+	clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-+	ds->ds_clp = clp;
-+
-+	dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__,
-+				ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
-+				clp->cl_rpcclient);
-+out_free:
-+	/* only the temporary nfs_server is freed; the client lives on */
-+	kfree(tmp);
-+out:
-+	dprintk("%s Returns %d\n", __func__, err);
-+	return err;
-+out_put:
-+	nfs_put_client(clp);
-+	goto out_free;
-+}
-+
-+/*
-+ * Free a data server entry: drop its nfs_client reference, if one was
-+ * set up, and release the structure itself.
-+ */
-+static void
-+destroy_ds(struct nfs4_pnfs_ds *ds)
-+{
-+	dprintk("--> %s\n", __func__);
-+	ifdebug(FACILITY)
-+		print_ds(ds);
-+
-+	/* ds_clp stays NULL until nfs4_pnfs_ds_create() succeeds */
-+	if (ds->ds_clp)
-+		nfs_put_client(ds->ds_clp);
-+	kfree(ds);
-+}
-+
-+/*
-+ * Release a device address: drop one reference on every data server it
-+ * lists, destroying those whose count reaches zero, then free the stripe
-+ * index table and the structure itself.
-+ */
-+static void
-+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
-+{
-+	struct nfs4_pnfs_ds *server;
-+	int n;
-+
-+	print_deviceid(&dsaddr->deviceid.de_id);
-+
-+	for (n = 0; n < dsaddr->ds_num; n++) {
-+		server = dsaddr->ds_list[n];
-+		if (server == NULL)
-+			continue;
-+		/* takes the cache lock only when the count drops to zero */
-+		if (!atomic_dec_and_lock(&server->ds_count,
-+					 &nfs4_ds_cache_lock))
-+			continue;
-+		list_del_init(&server->ds_node);
-+		spin_unlock(&nfs4_ds_cache_lock);
-+		destroy_ds(server);
-+	}
-+	kfree(dsaddr->stripe_indices);
-+	kfree(dsaddr);
-+}
-+
-+/*
-+ * Free callback taking the generic device id node: recover the enclosing
-+ * nfs4_file_layout_dsaddr and release it.
-+ */
-+void
-+nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
-+{
-+	nfs4_fl_free_deviceid(container_of(device,
-+					   struct nfs4_file_layout_dsaddr,
-+					   deviceid));
-+}
-+
-+/*
-+ * Return the cached data server for ip_addr/port, adding a new entry with
-+ * a reference count of 1 when none exists, or taking an extra reference
-+ * on the existing one.  Returns NULL if the allocation fails.
-+ */
-+static struct nfs4_pnfs_ds *
-+nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
-+{
-+	struct nfs4_pnfs_ds *cached, *ds;
-+
-+	/* allocated up front, before the cache lock is taken */
-+	ds = kzalloc(sizeof(*ds), GFP_KERNEL);
-+	if (ds == NULL)
-+		return NULL;
-+
-+	spin_lock(&nfs4_ds_cache_lock);
-+	cached = _data_server_lookup_locked(ip_addr, port);
-+	if (cached != NULL) {
-+		/* already known: discard the spare and share the entry */
-+		kfree(ds);
-+		atomic_inc(&cached->ds_count);
-+		dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
-+			__func__, cached->ds_ip_addr,
-+			atomic_read(&cached->ds_count));
-+		ds = cached;
-+	} else {
-+		ds->ds_ip_addr = ip_addr;
-+		ds->ds_port = port;
-+		atomic_set(&ds->ds_count, 1);
-+		INIT_LIST_HEAD(&ds->ds_node);
-+		ds->ds_clp = NULL;
-+		list_add(&ds->ds_node, &nfs4_data_server_cache);
-+		dprintk("%s add new data server ip 0x%x\n", __func__,
-+			ds->ds_ip_addr);
-+	}
-+	spin_unlock(&nfs4_ds_cache_lock);
-+
-+	return ds;
-+}
-+
-+/*
-+ * Decode one netaddr4 (r_netid, r_addr) from the XDR stream at *pp and
-+ * return the matching data server cache entry.
-+ *
-+ * Currently only support ipv4, and one multi-path address.
-+ *
-+ * *pp is always advanced past both opaque fields, even on failure.
-+ * r_addr is expected in the universal address form "a.b.c.d.p1.p2".
-+ *
-+ * Returns the data server (with a reference taken by nfs4_pnfs_ds_add())
-+ * or NULL on a non-"tcp" netid, a malformed address, or allocation failure.
-+ */
-+static struct nfs4_pnfs_ds *
-+decode_and_add_ds(__be32 **pp, struct inode *inode)
-+{
-+	struct nfs4_pnfs_ds *ds = NULL;
-+	char *buf;
-+	const char *ipend, *pstr;
-+	u32 ip_addr, port;
-+	int nlen, rlen, i;
-+	int tmp[2];
-+	__be32 *r_netid, *r_addr, *p = *pp;
-+
-+	/* r_netid */
-+	nlen = be32_to_cpup(p++);
-+	r_netid = p;
-+	p += XDR_QUADLEN(nlen);
-+
-+	/* r_addr */
-+	rlen = be32_to_cpup(p++);
-+	r_addr = p;
-+	p += XDR_QUADLEN(rlen);
-+	*pp = p;
-+
-+	/* Check that netid is "tcp" */
-+	if (nlen != 3 ||  memcmp((char *)r_netid, "tcp", 3)) {
-+		dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
-+		goto out_err;
-+	}
-+
-+	/*
-+	 * ipv6 length plus port is legal.  A wire length above INT_MAX
-+	 * shows up as a negative rlen and must be rejected as well.
-+	 */
-+	if (rlen < 0 || rlen > INET6_ADDRSTRLEN + 8) {
-+		dprintk("%s Invalid address, length %d\n", __func__,
-+			rlen);
-+		goto out_err;
-+	}
-+	buf = kmalloc(rlen + 1, GFP_KERNEL);
-+	if (!buf) {
-+		dprintk("%s: Not enough memory\n", __func__);
-+		goto out_err;
-+	}
-+	buf[rlen] = '\0';
-+	memcpy(buf, r_addr, rlen);
-+
-+	/* replace the port dots with dashes for the in4_pton() delimiter*/
-+	for (i = 0; i < 2; i++) {
-+		char *res = strrchr(buf, '.');
-+
-+		/* fewer than two dots: not an address-plus-port string */
-+		if (!res) {
-+			dprintk("%s: Malformed address %s\n", __func__, buf);
-+			goto out_free;
-+		}
-+		*res = '-';
-+	}
-+
-+	/* Currently only support ipv4 address */
-+	if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
-+		dprintk("%s: Only ipv4 addresses supported\n", __func__);
-+		goto out_free;
-+	}
-+
-+	/* port: both bytes must parse, otherwise tmp[] is uninitialized */
-+	pstr = ipend;
-+	if (sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]) != 2) {
-+		dprintk("%s: Malformed port in %s\n", __func__, buf);
-+		goto out_free;
-+	}
-+	port = htons((tmp[0] << 8) | (tmp[1]));
-+
-+	ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
-+	dprintk("%s Decoded address and port %s\n", __func__, buf);
-+out_free:
-+	kfree(buf);
-+out_err:
-+	return ds;
-+}
-+
-+/*
-+ * Decode opaque device data and return the result
-+ *
-+ * The buffer at pdev->area is read as: stripe count, that many 32-bit
-+ * stripe indices, the multipath list count, then for every list a
-+ * multipath count followed by its addresses.  Only the first address of
-+ * each multipath list is used; the others are skipped.
-+ *
-+ * Returns a newly allocated nfs4_file_layout_dsaddr, or NULL when a count
-+ * is zero or above the supported maximum, a stripe index does not refer to
-+ * a listed data server, an address cannot be decoded, or memory runs out.
-+ */
-+static struct nfs4_file_layout_dsaddr*
-+decode_device(struct inode *ino, struct pnfs_device *pdev)
-+{
-+	int i, dummy;
-+	u32 cnt, num;
-+	__be32 *p = (__be32 *)pdev->area, *indicesp;
-+	struct nfs4_file_layout_dsaddr *dsaddr;
-+
-+	/* Get the stripe count (number of stripe index) */
-+	cnt = be32_to_cpup(p++);
-+	dprintk("%s stripe count  %d\n", __func__, cnt);
-+	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
-+		printk(KERN_WARNING "%s: stripe count %d greater than "
-+		       "supported maximum %d\n", __func__,
-+			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
-+		goto out_err;
-+	}
-+	/* stripe_count is later used as a divisor, so zero is unusable */
-+	if (cnt == 0) {
-+		printk(KERN_WARNING "%s: zero stripe count\n", __func__);
-+		goto out_err;
-+	}
-+
-+	/* Check the multipath list count */
-+	indicesp = p;
-+	p += XDR_QUADLEN(cnt << 2);
-+	num = be32_to_cpup(p++);
-+	dprintk("%s ds_num %u\n", __func__, num);
-+	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
-+		printk(KERN_WARNING "%s: multipath count %d greater than "
-+			"supported maximum %d\n", __func__,
-+			num, NFS4_PNFS_MAX_MULTI_CNT);
-+		goto out_err;
-+	}
-+	/* num - 1 below would wrap around for an empty list */
-+	if (num == 0) {
-+		printk(KERN_WARNING "%s: zero multipath count\n", __func__);
-+		goto out_err;
-+	}
-+	/* ds_list[] is declared with one element, hence num - 1 extra */
-+	dsaddr = kzalloc(sizeof(*dsaddr) +
-+			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
-+			GFP_KERNEL);
-+	if (!dsaddr)
-+		goto out_err;
-+
-+	dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
-+	if (!dsaddr->stripe_indices)
-+		goto out_err_free;
-+
-+	dsaddr->stripe_count = cnt;
-+	dsaddr->ds_num = num;
-+
-+	memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
-+
-+	/* Go back and read stripe indices */
-+	p = indicesp;
-+	for (i = 0; i < dsaddr->stripe_count; i++) {
-+		u32 idx = be32_to_cpup(p++);
-+
-+		/*
-+		 * Range-check the full 32-bit wire value; checking after
-+		 * the store would compare an already truncated u8.
-+		 */
-+		if (idx >= num)
-+			goto out_err_free;
-+		dsaddr->stripe_indices[i] = idx;
-+	}
-+	/* Skip already read multipath list count */
-+	p++;
-+
-+	for (i = 0; i < dsaddr->ds_num; i++) {
-+		int j;
-+
-+		dummy = be32_to_cpup(p++); /* multipath count */
-+		if (dummy > 1) {
-+			printk(KERN_WARNING
-+			       "%s: Multipath count %d not supported, "
-+			       "skipping all greater than 1\n", __func__,
-+				dummy);
-+		}
-+		for (j = 0; j < dummy; j++) {
-+			if (j == 0) {
-+				dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
-+				if (dsaddr->ds_list[i] == NULL)
-+					goto out_err_free;
-+			} else {
-+				u32 len;
-+				/* skip extra multipath */
-+				len = be32_to_cpup(p++);
-+				p += XDR_QUADLEN(len);
-+				len = be32_to_cpup(p++);
-+				p += XDR_QUADLEN(len);
-+				continue;
-+			}
-+		}
-+	}
-+	return dsaddr;
-+
-+out_err_free:
-+	/* also drops the references on data servers decoded so far */
-+	nfs4_fl_free_deviceid(dsaddr);
-+out_err:
-+	dprintk("%s ERROR: returning NULL\n", __func__);
-+	return NULL;
-+}
-+
-+/*
-+ * Decode the opaque device specified in 'dev'
-+ * and add it to the list of available devices.
-+ *
-+ * The decoded device is handed to pnfs_add_deviceid() together with the
-+ * client's cl_devid_cache, and the node that call returns is converted
-+ * back to its enclosing nfs4_file_layout_dsaddr.
-+ * NOTE(review): if the deviceid is already cached, pnfs_add_deviceid()
-+ * presumably returns the cached node and discards the new one - confirm
-+ * in pnfs.c.
-+ *
-+ * Returns NULL when the device cannot be decoded.
-+ */
-+static struct nfs4_file_layout_dsaddr*
-+decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
-+{
-+	struct nfs4_file_layout_dsaddr *dsaddr;
-+	struct pnfs_deviceid_node *d;
-+
-+	dsaddr = decode_device(inode, dev);
-+	if (!dsaddr) {
-+		printk(KERN_WARNING "%s: Could not decode or add device\n",
-+			__func__);
-+		return NULL;
-+	}
-+
-+	d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
-+			      &dsaddr->deviceid);
-+
-+	return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
-+}
-+
-+/*
-+ * Retrieve the information for dev_id, add it to the list
-+ * of available devices, and return it.
-+ *
-+ * A reply buffer sized from the session's max_resp_sz is built from
-+ * individually allocated pages mapped contiguously with vmap(), filled by
-+ * nfs4_proc_getdeviceinfo() and then decoded.  The buffer and all pages
-+ * are released before returning.
-+ *
-+ * Returns the device address, or NULL on allocation, RPC or decode failure.
-+ */
-+struct nfs4_file_layout_dsaddr *
-+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
-+{
-+	struct pnfs_device *pdev = NULL;
-+	u32 max_resp_sz;
-+	int max_pages;
-+	struct page **pages = NULL;
-+	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
-+	int rc, i;
-+	struct nfs_server *server = NFS_SERVER(inode);
-+
-+	/*
-+	 * Use the session max response size as the basis for setting
-+	 * GETDEVICEINFO's maxcount
-+	 */
-+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
-+	max_pages = max_resp_sz >> PAGE_SHIFT;
-+	dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
-+		__func__, inode, max_resp_sz, max_pages);
-+
-+	pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
-+	if (pdev == NULL)
-+		return NULL;
-+
-+	/* zeroed, so slots that were never filled stay NULL */
-+	pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
-+	if (pages == NULL) {
-+		kfree(pdev);
-+		return NULL;
-+	}
-+	for (i = 0; i < max_pages; i++) {
-+		pages[i] = alloc_page(GFP_KERNEL);
-+		if (!pages[i])
-+			goto out_free;
-+	}
-+
-+	/* set pdev->area */
-+	pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
-+	if (!pdev->area)
-+		goto out_free;
-+
-+	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
-+	pdev->layout_type = LAYOUT_NFSV4_1_FILES;
-+	pdev->pages = pages;
-+	pdev->pgbase = 0;
-+	pdev->pglen = PAGE_SIZE * max_pages;
-+	pdev->mincount = 0;
-+
-+	rc = nfs4_proc_getdeviceinfo(server, pdev);
-+	dprintk("%s getdevice info returns %d\n", __func__, rc);
-+	if (rc)
-+		goto out_free;
-+
-+	/*
-+	 * Found new device, need to decode it and then add it to the
-+	 * list of known devices for this mountpoint.
-+	 */
-+	dsaddr = decode_and_add_device(inode, pdev);
-+out_free:
-+	if (pdev->area != NULL)
-+		vunmap(pdev->area);
-+	for (i = 0; i < max_pages; i++) {
-+		/*
-+		 * The allocation loop may have stopped early, and
-+		 * __free_page() must not be handed a NULL page.
-+		 */
-+		if (pages[i])
-+			__free_page(pages[i]);
-+	}
-+	kfree(pages);
-+	kfree(pdev);
-+	dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
-+	return dsaddr;
-+}
-+
-+/*
-+ * Look up device id @id in the client's device id cache via
-+ * pnfs_find_get_deviceid().  Returns the enclosing
-+ * nfs4_file_layout_dsaddr, or NULL when the id is not cached.
-+ */
-+struct nfs4_file_layout_dsaddr *
-+nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
-+{
-+	struct pnfs_deviceid_node *node;
-+
-+	node = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
-+	if (node == NULL)
-+		return NULL;
-+
-+	return container_of(node, struct nfs4_file_layout_dsaddr, deviceid);
-+}
-+
-+/*
-+ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
-+ * Then: ((res + fsi) % dsaddr->stripe_count)
-+ *
-+ * (fsi is the segment's first_stripe_index.)  Returns the resulting
-+ * position in the device's stripe index table.
-+ */
-+static u32
-+_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
-+{
-+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
-+	u64 tmp;
-+
-+	tmp = offset - flseg->pattern_offset;
-+	/* do_div() leaves the quotient in tmp ... */
-+	do_div(tmp, flseg->stripe_unit);
-+	tmp += flseg->first_stripe_index;
-+	/* ... and returns the remainder, which is the modulo wanted here */
-+	return do_div(tmp, flseg->dsaddr->stripe_count);
-+}
-+
-+/*
-+ * Map a file offset to a data server index: compute the stripe position
-+ * for @offset and look it up in the device's stripe_indices table.
-+ */
-+u32
-+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset)
-+{
-+	u32 stripe_pos = _nfs4_fl_calc_j_index(lseg, offset);
-+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
-+
-+	return flseg->dsaddr->stripe_indices[stripe_pos];
-+}
-+
-+/*
-+ * Pick the file handle to use for I/O at @offset.
-+ *
-+ * For STRIPE_SPARSE segments: no handles yields NULL, a single handle is
-+ * used for everything, otherwise the handle is indexed by data server
-+ * index.  For any other stripe type the handle is indexed by stripe
-+ * position.
-+ */
-+struct nfs_fh *
-+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset)
-+{
-+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
-+	u32 slot;
-+
-+	if (flseg->stripe_type != STRIPE_SPARSE)
-+		return flseg->fh_array[_nfs4_fl_calc_j_index(lseg, offset)];
-+
-+	switch (flseg->num_fh) {
-+	case 0:
-+		return NULL;
-+	case 1:
-+		slot = 0;
-+		break;
-+	default:
-+		slot = nfs4_fl_calc_ds_index(lseg, offset);
-+		break;
-+	}
-+	return flseg->fh_array[slot];
-+}
-+
-+/*
-+ * Return the data server at @ds_idx of the segment's device, creating its
-+ * nfs_client on first use.  Returns NULL when the slot is empty or the
-+ * client cannot be created.
-+ */
-+struct nfs4_pnfs_ds *
-+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
-+{
-+	struct nfs4_pnfs_ds *ds = FILELAYOUT_LSEG(lseg)->dsaddr->ds_list[ds_idx];
-+	int err;
-+
-+	if (ds == NULL) {
-+		printk(KERN_ERR "%s: No data server for device id!\n",
-+			__func__);
-+		return NULL;
-+	}
-+
-+	/* client already set up by an earlier call */
-+	if (ds->ds_clp)
-+		return ds;
-+
-+	err = nfs4_pnfs_ds_create(NFS_SERVER(lseg->layout->inode), ds);
-+	if (err) {
-+		printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n",
-+		       __func__, err);
-+		return NULL;
-+	}
-+	return ds;
-+}
-diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
-index 089da5b..cc642dd 100644
---- a/fs/nfs/nfs4proc.c
-+++ b/fs/nfs/nfs4proc.c
-@@ -55,6 +55,7 @@
- #include "internal.h"
- #include "iostat.h"
- #include "callback.h"
-+#include "pnfs.h"
- 
- #define NFSDBG_FACILITY		NFSDBG_PROC
- 
-@@ -67,7 +68,7 @@ struct nfs4_opendata;
- static int _nfs4_proc_open(struct nfs4_opendata *data);
- static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
- static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
--static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
-+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *);
- static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
- static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
- static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
-@@ -125,11 +126,12 @@ const u32 nfs4_pathconf_bitmap[2] = {
- 	0
- };
- 
--const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
-+const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
- 			| FATTR4_WORD0_MAXREAD
- 			| FATTR4_WORD0_MAXWRITE
- 			| FATTR4_WORD0_LEASE_TIME,
--			0
-+			FATTR4_WORD1_FS_LAYOUT_TYPES,
-+			FATTR4_WORD2_LAYOUT_BLKSIZE
- };
- 
- const u32 nfs4_fs_locations_bitmap[2] = {
-@@ -562,6 +564,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
- }
- 
- int nfs4_setup_sequence(const struct nfs_server *server,
-+		struct nfs4_session *ds_session,
- 			struct nfs4_sequence_args *args,
- 			struct nfs4_sequence_res *res,
- 			int cache_reply,
-@@ -570,6 +573,8 @@ int nfs4_setup_sequence(const struct nfs_server *server,
- 	struct nfs4_session *session = nfs4_get_session(server);
- 	int ret = 0;
- 
-+	if (ds_session)
-+		session = ds_session;
- 	if (session == NULL) {
- 		args->sa_session = NULL;
- 		res->sr_session = NULL;
-@@ -599,7 +604,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
- 
- 	dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
- 
--	if (nfs4_setup_sequence(data->seq_server, data->seq_args,
-+	if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args,
- 				data->seq_res, data->cache_reply, task))
- 		return;
- 	rpc_call_start(task);
-@@ -1378,7 +1383,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
- 		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
- 	}
- 	data->timestamp = jiffies;
--	if (nfs4_setup_sequence(data->o_arg.server,
-+	if (nfs4_setup_sequence(data->o_arg.server, NULL,
- 				&data->o_arg.seq_args,
- 				&data->o_res.seq_res, 1, task))
- 		return;
-@@ -1553,9 +1558,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
- 	return 0;
- }
- 
--static int nfs4_recover_expired_lease(struct nfs_server *server)
-+int nfs4_recover_expired_lease(struct nfs_client *clp)
- {
--	struct nfs_client *clp = server->nfs_client;
- 	unsigned int loop;
- 	int ret;
- 
-@@ -1571,6 +1575,7 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
- 	}
- 	return ret;
- }
-+EXPORT_SYMBOL(nfs4_recover_expired_lease);
- 
- /*
-  * OPEN_EXPIRED:
-@@ -1660,7 +1665,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
- 		dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
- 		goto out_err;
- 	}
--	status = nfs4_recover_expired_lease(server);
-+	status = nfs4_recover_expired_lease(server->nfs_client);
- 	if (status != 0)
- 		goto err_put_state_owner;
- 	if (path->dentry->d_inode != NULL)
-@@ -1871,7 +1876,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
- 			if (calldata->arg.fmode == 0)
- 				break;
- 		default:
--			if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
-+			if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
- 				rpc_restart_call_prepare(task);
- 	}
- 	nfs_release_seqid(calldata->arg.seqid);
-@@ -1916,7 +1921,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
- 
- 	nfs_fattr_init(calldata->res.fattr);
- 	calldata->timestamp = jiffies;
--	if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
-+	if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL,
- 				&calldata->arg.seq_args, &calldata->res.seq_res,
- 				1, task))
- 		return;
-@@ -1979,8 +1984,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
- 	path_get(path);
- 	calldata->path = *path;
- 
--	msg.rpc_argp = &calldata->arg,
--	msg.rpc_resp = &calldata->res,
-+	msg.rpc_argp = &calldata->arg;
-+	msg.rpc_resp = &calldata->res;
- 	task_setup_data.callback_data = calldata;
- 	task = rpc_run_task(&task_setup_data);
- 	if (IS_ERR(task))
-@@ -2337,6 +2342,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
- 	struct nfs4_state *state = NULL;
- 	int status;
- 
-+	if (pnfs_ld_layoutret_on_setattr(inode))
-+		pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true);
-+
- 	nfs_fattr_init(fattr);
- 	
- 	/* Search for an existing open(O_WRITE) file */
-@@ -2664,7 +2672,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
- 
- 	if (!nfs4_sequence_done(task, &res->seq_res))
- 		return 0;
--	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
-+	if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN)
- 		return 0;
- 	update_changeattr(dir, &res->cinfo);
- 	nfs_post_op_update_inode(dir, res->dir_attr);
-@@ -3105,19 +3113,31 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
- static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
- {
- 	struct nfs_server *server = NFS_SERVER(data->inode);
-+	struct nfs_client *client = server->nfs_client;
- 
- 	dprintk("--> %s\n", __func__);
- 
-+#ifdef CONFIG_NFS_V4_1
-+	if (data->pdata.pnfsflags & PNFS_NO_RPC)
-+		return 0;
-+
-+	/* Is this a DS session */
-+	if (data->fldata.ds_nfs_client) {
-+		dprintk("%s DS read\n", __func__);
-+		client = data->fldata.ds_nfs_client;
-+	}
-+#endif /* CONFIG_NFS_V4_1 */
-+
- 	if (!nfs4_sequence_done(task, &data->res.seq_res))
- 		return -EAGAIN;
- 
--	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
--		nfs_restart_rpc(task, server->nfs_client);
-+	if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
-+		nfs_restart_rpc(task, client);
- 		return -EAGAIN;
- 	}
- 
- 	nfs_invalidate_atime(data->inode);
--	if (task->tk_status > 0)
-+	if (task->tk_status > 0 && client == server->nfs_client)
- 		renew_lease(server, data->timestamp);
- 	return 0;
- }
-@@ -3128,20 +3148,56 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
- 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
- }
- 
-+static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data)
-+{
-+#ifdef CONFIG_NFS_V4_1
-+	pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
-+	pnfs_need_layoutcommit(nfsi, data->args.context);
-+#endif /* CONFIG_NFS_V4_1 */
-+}
-+
- static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
- {
- 	struct inode *inode = data->inode;
--	
-+	struct nfs_server *server = NFS_SERVER(inode);
-+	struct nfs_client *client = server->nfs_client;
-+
- 	if (!nfs4_sequence_done(task, &data->res.seq_res))
- 		return -EAGAIN;
- 
--	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
--		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
-+#ifdef CONFIG_NFS_V4_1
-+	/* restore original count after retry? */
-+	if (data->pdata.orig_count) {
-+		dprintk("%s: restoring original count %u\n", __func__,
-+			data->pdata.orig_count);
-+		data->args.count = data->pdata.orig_count;
-+	}
-+
-+	if (data->pdata.pnfsflags & PNFS_NO_RPC)
-+		return 0;
-+
-+	/* Is this a DS session */
-+	if (data->fldata.ds_nfs_client) {
-+		dprintk("%s DS write\n", __func__);
-+		client = data->fldata.ds_nfs_client;
-+	}
-+#endif /* CONFIG_NFS_V4_1 */
-+
-+	if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
-+		nfs_restart_rpc(task, client);
- 		return -EAGAIN;
- 	}
-+
-+	/*
-+	 * MDS write: renew lease
-+	 * DS write: update lastbyte written, mark for layout commit
-+	 */
- 	if (task->tk_status >= 0) {
--		renew_lease(NFS_SERVER(inode), data->timestamp);
--		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
-+		if (client == server->nfs_client) {
-+			renew_lease(server, data->timestamp);
-+			nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
-+		} else
-+			pnfs4_update_write_done(NFS_I(inode), data);
- 	}
- 	return 0;
- }
-@@ -3154,21 +3210,42 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
- 	data->res.server = server;
- 	data->timestamp   = jiffies;
- 
-+#ifdef CONFIG_NFS_V4_1
-+	/* writes to DS use pnfs vector */
-+	if (data->fldata.ds_nfs_client) {
-+		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE];
-+		return;
-+	}
-+#endif /* CONFIG_NFS_V4_1 */
- 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
- }
- 
- static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
- {
- 	struct inode *inode = data->inode;
--	
-+	struct nfs_server *server = NFS_SERVER(data->inode);
-+	struct nfs_client *client = server->nfs_client;
-+
-+#ifdef CONFIG_NFS_V4_1
-+	if (data->pdata.pnfsflags & PNFS_NO_RPC)
-+		return 0;
-+
-+	/* Is this a DS session */
-+	if (data->fldata.ds_nfs_client) {
-+		dprintk("%s DS commit\n", __func__);
-+		client = data->fldata.ds_nfs_client;
-+	}
-+#endif /* CONFIG_NFS_V4_1 */
-+
- 	if (!nfs4_sequence_done(task, &data->res.seq_res))
- 		return -EAGAIN;
- 
--	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
-+	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) {
- 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
- 		return -EAGAIN;
- 	}
--	nfs_refresh_inode(inode, data->res.fattr);
-+	if (client == server->nfs_client)
-+		nfs_refresh_inode(inode, data->res.fattr);
- 	return 0;
- }
- 
-@@ -3178,6 +3255,12 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
- 	
- 	data->args.bitmask = server->cache_consistency_bitmask;
- 	data->res.server = server;
-+#if defined(CONFIG_NFS_V4_1)
-+	if (data->fldata.ds_nfs_client) {
-+		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT];
-+		return;
-+	}
-+#endif /* CONFIG_NFS_V4_1 */
- 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
- }
- 
-@@ -3475,9 +3558,10 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
- }
- 
- static int
--nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
-+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp)
- {
--	struct nfs_client *clp = server->nfs_client;
-+	if (!clp)
-+		clp = server->nfs_client;
- 
- 	if (task->tk_status >= 0)
- 		return 0;
-@@ -3504,14 +3588,16 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
- 		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- 		case -NFS4ERR_SEQ_FALSE_RETRY:
- 		case -NFS4ERR_SEQ_MISORDERED:
--			dprintk("%s ERROR %d, Reset session\n", __func__,
--				task->tk_status);
-+			dprintk("%s ERROR %d, Reset session. Exchangeid "
-+				"flags 0x%x\n", __func__, task->tk_status,
-+				clp->cl_exchange_flags);
- 			nfs4_schedule_state_recovery(clp);
- 			task->tk_status = 0;
- 			return -EAGAIN;
- #endif /* CONFIG_NFS_V4_1 */
- 		case -NFS4ERR_DELAY:
--			nfs_inc_server_stats(server, NFSIOS_DELAY);
-+			if (server)
-+				nfs_inc_server_stats(server, NFSIOS_DELAY);
- 		case -NFS4ERR_GRACE:
- 		case -EKEYEXPIRED:
- 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
-@@ -3524,6 +3610,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
- 	task->tk_status = nfs4_map_errors(task->tk_status);
- 	return 0;
- do_state_recovery:
-+	if (is_ds_only_client(clp))
-+		return 0;
- 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
- 	nfs4_schedule_state_recovery(clp);
- 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
-@@ -3657,8 +3745,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
- 		renew_lease(data->res.server, data->timestamp);
- 		break;
- 	default:
--		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
--				-EAGAIN) {
-+		if (nfs4_async_handle_error(task, data->res.server, NULL, NULL)
-+				== -EAGAIN) {
- 			nfs_restart_rpc(task, data->res.server->nfs_client);
- 			return;
- 		}
-@@ -3678,7 +3766,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
- 
- 	d_data = (struct nfs4_delegreturndata *)data;
- 
--	if (nfs4_setup_sequence(d_data->res.server,
-+	if (nfs4_setup_sequence(d_data->res.server, NULL,
- 				&d_data->args.seq_args,
- 				&d_data->res.seq_res, 1, task))
- 		return;
-@@ -3913,7 +4001,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
- 		case -NFS4ERR_EXPIRED:
- 			break;
- 		default:
--			if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
-+			if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN)
- 				nfs_restart_rpc(task,
- 						 calldata->server->nfs_client);
- 	}
-@@ -3931,7 +4019,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
- 		return;
- 	}
- 	calldata->timestamp = jiffies;
--	if (nfs4_setup_sequence(calldata->server,
-+	if (nfs4_setup_sequence(calldata->server, NULL,
- 				&calldata->arg.seq_args,
- 				&calldata->res.seq_res, 1, task))
- 		return;
-@@ -3973,8 +4061,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
- 		return ERR_PTR(-ENOMEM);
- 	}
- 
--	msg.rpc_argp = &data->arg,
--	msg.rpc_resp = &data->res,
-+	msg.rpc_argp = &data->arg;
-+	msg.rpc_resp = &data->res;
- 	task_setup_data.callback_data = data;
- 	return rpc_run_task(&task_setup_data);
- }
-@@ -4086,7 +4174,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
- 	} else
- 		data->arg.new_lock_owner = 0;
- 	data->timestamp = jiffies;
--	if (nfs4_setup_sequence(data->server,
-+	if (nfs4_setup_sequence(data->server, NULL,
- 				&data->arg.seq_args,
- 				&data->res.seq_res, 1, task))
- 		return;
-@@ -4211,8 +4299,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
- 			data->arg.reclaim = NFS_LOCK_RECLAIM;
- 		task_setup_data.callback_ops = &nfs4_recover_lock_ops;
- 	}
--	msg.rpc_argp = &data->arg,
--	msg.rpc_resp = &data->res,
-+	msg.rpc_argp = &data->arg;
-+	msg.rpc_resp = &data->res;
- 	task_setup_data.callback_data = data;
- 	task = rpc_run_task(&task_setup_data);
- 	if (IS_ERR(task))
-@@ -4557,7 +4645,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
- 	nfs4_verifier verifier;
- 	struct nfs41_exchange_id_args args = {
- 		.client = clp,
--		.flags = clp->cl_exchange_flags,
-+		.flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R,
- 	};
- 	struct nfs41_exchange_id_res res = {
- 		.client = clp,
-@@ -5081,7 +5169,7 @@ int nfs4_init_session(struct nfs_server *server)
- 	session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
- 	session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
- 
--	ret = nfs4_recover_expired_lease(server);
-+	ret = nfs4_recover_expired_lease(server->nfs_client);
- 	if (!ret)
- 		ret = nfs4_check_client_ready(clp);
- 	return ret;
-@@ -5333,6 +5421,412 @@ out:
- 	dprintk("<-- %s status=%d\n", __func__, status);
- 	return status;
- }
-+
-+static void
-+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
-+{
-+	struct nfs4_layoutget *lgp = calldata;
-+	struct inode *ino = lgp->args.inode;
-+	struct nfs_inode *nfsi = NFS_I(ino);
-+	struct nfs_server *server = NFS_SERVER(ino);
-+	struct pnfs_layout_segment *lseg;
-+
-+	dprintk("--> %s\n", __func__);
-+	spin_lock(&ino->i_lock);
-+	lseg = pnfs_has_layout(nfsi->layout, &lgp->args.range);
-+	if (likely(!lseg)) {
-+		spin_unlock(&ino->i_lock);
-+		dprintk("%s: no lseg found, proceeding\n", __func__);
-+		if (!nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
-+					 &lgp->res.seq_res, 0, task))
-+			rpc_call_start(task);
-+		return;
-+	}
-+	if (!lseg->valid) {
-+		spin_unlock(&ino->i_lock);
-+		dprintk("%s: invalid lseg found, waiting\n", __func__);
-+		rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
-+		return;
-+	}
-+	get_lseg(lseg);
-+	*lgp->lsegpp = lseg;
-+	spin_unlock(&ino->i_lock);
-+	dprintk("%s: valid lseg found, no rpc required\n", __func__);
-+	rpc_exit(task, NFS4_OK);
-+}
-+
-+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
-+{
-+	struct nfs4_layoutget *lgp = calldata;
-+	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
-+
-+	dprintk("--> %s\n", __func__);
-+
-+	if (!nfs4_sequence_done(task, &lgp->res.seq_res))
-+		return;
-+
-+	switch (task->tk_status) {
-+	case 0:
-+		break;
-+	case -NFS4ERR_LAYOUTTRYLATER:
-+	case -NFS4ERR_RECALLCONFLICT:
-+		task->tk_status = -NFS4ERR_DELAY;
-+		/* Fall through */
-+	default:
-+		if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
-+			rpc_restart_call_prepare(task);
-+			return;
-+		}
-+	}
-+	lgp->status = task->tk_status;
-+	dprintk("<-- %s\n", __func__);
-+}
-+
-+static void nfs4_layoutget_release(void *calldata)
-+{
-+	struct nfs4_layoutget *lgp = calldata;
-+
-+	dprintk("--> %s\n", __func__);
-+	put_layout_hdr(lgp->args.inode);
-+	if (lgp->res.layout.buf != NULL)
-+		free_page((unsigned long) lgp->res.layout.buf);
-+	put_nfs_open_context(lgp->args.ctx);
-+	kfree(calldata);
-+	dprintk("<-- %s\n", __func__);
-+}
-+
-+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
-+	.rpc_call_prepare = nfs4_layoutget_prepare,
-+	.rpc_call_done = nfs4_layoutget_done,
-+	.rpc_release = nfs4_layoutget_release,
-+};
-+
-+int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
-+{
-+	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
-+	struct rpc_task *task;
-+	struct rpc_message msg = {
-+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
-+		.rpc_argp = &lgp->args,
-+		.rpc_resp = &lgp->res,
-+	};
-+	struct rpc_task_setup task_setup_data = {
-+		.rpc_client = server->client,
-+		.rpc_message = &msg,
-+		.callback_ops = &nfs4_layoutget_call_ops,
-+		.callback_data = lgp,
-+		.flags = RPC_TASK_ASYNC,
-+	};
-+	int status = 0;
-+
-+	dprintk("--> %s\n", __func__);
-+
-+	lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
-+	if (lgp->res.layout.buf == NULL) {
-+		nfs4_layoutget_release(lgp);
-+		return -ENOMEM;
-+	}
-+
-+	lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-+	task = rpc_run_task(&task_setup_data);
-+	if (IS_ERR(task))
-+		return PTR_ERR(task);
-+	status = nfs4_wait_for_completion_rpc_task(task);
-+	if (status != 0)
-+		goto out;
-+	status = lgp->status;
-+	if (status != 0)
-+		goto out;
-+	status = pnfs_layout_process(lgp);
-+out:
-+	rpc_put_task(task);
-+	dprintk("<-- %s status=%d\n", __func__, status);
-+	return status;
-+}
-+
-+static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data)
-+{
-+	struct nfs4_layoutcommit_data *ldata =
-+		(struct nfs4_layoutcommit_data *)data;
-+	struct nfs_server *server = NFS_SERVER(ldata->args.inode);
-+
-+	if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args,
-+				&ldata->res.seq_res, 1, task))
-+		return;
-+	rpc_call_start(task);
-+}
-+
-+static void
-+nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
-+{
-+	struct nfs4_layoutcommit_data *data =
-+		(struct nfs4_layoutcommit_data *)calldata;
-+	struct nfs_server *server = NFS_SERVER(data->args.inode);
-+
-+	if (!nfs4_sequence_done(task, &data->res.seq_res))
-+		return;
-+
-+	if (RPC_ASSASSINATED(task))
-+		return;
-+
-+	if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
-+		nfs_restart_rpc(task, server->nfs_client);
-+
-+	data->status = task->tk_status;
-+}
-+
-+static void nfs4_layoutcommit_release(void *lcdata)
-+{
-+	struct nfs4_layoutcommit_data *data =
-+		(struct nfs4_layoutcommit_data *)lcdata;
-+
-+	pnfs_cleanup_layoutcommit(lcdata);
-+	/* Matched by get_layout in pnfs_layoutcommit_inode */
-+	put_layout_hdr(data->args.inode);
-+	put_rpccred(data->cred);
-+	kfree(lcdata);
-+}
-+
-+static const struct rpc_call_ops nfs4_layoutcommit_ops = {
-+	.rpc_call_prepare = nfs4_layoutcommit_prepare,
-+	.rpc_call_done = nfs4_layoutcommit_done,
-+	.rpc_release = nfs4_layoutcommit_release,
-+};
-+
-+/* Execute a layoutcommit to the server */
-+int
-+nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
-+{
-+	struct rpc_message msg = {
-+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
-+		.rpc_argp = &data->args,
-+		.rpc_resp = &data->res,
-+		.rpc_cred = data->cred,
-+	};
-+	struct rpc_task_setup task_setup_data = {
-+		.task = &data->task,
-+		.rpc_client = NFS_CLIENT(data->args.inode),
-+		.rpc_message = &msg,
-+		.callback_ops = &nfs4_layoutcommit_ops,
-+		.callback_data = data,
-+		.flags = RPC_TASK_ASYNC,
-+	};
-+	struct rpc_task *task;
-+	int status = 0;
-+
-+	dprintk("NFS: %4d initiating layoutcommit call. %llu@%llu lbw: %llu "
-+		"type: %d issync %d\n",
-+		data->task.tk_pid,
-+		data->args.range.length,
-+		data->args.range.offset,
-+		data->args.lastbytewritten,
-+		data->args.layout_type, issync);
-+
-+	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-+	task = rpc_run_task(&task_setup_data);
-+	if (IS_ERR(task))
-+		return PTR_ERR(task);
-+	if (!issync)
-+		goto out;
-+	status = nfs4_wait_for_completion_rpc_task(task);
-+	if (status != 0)
-+		goto out;
-+	status = data->status;
-+out:
-+	dprintk("%s: status %d\n", __func__, status);
-+	rpc_put_task(task);
-+	return 0;
-+}
-+
-+static void
-+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
-+{
-+	struct nfs4_layoutreturn *lrp = calldata;
-+	struct inode *ino = lrp->args.inode;
-+	struct nfs_inode *nfsi = NFS_I(ino);
-+	struct nfs_server *server = NFS_SERVER(ino);
-+
-+	dprintk("--> %s\n", __func__);
-+	if ((lrp->args.return_type == RETURN_FILE) &&
-+	    pnfs_return_layout_barrier(nfsi, &lrp->args.range)) {
-+		dprintk("%s: waiting on barrier\n", __func__);
-+		rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
-+		return;
-+	}
-+	if (lrp->stateid) {
-+		/* Forget the layout, without sending the return */
-+		rpc_exit(task, 0);
-+		return;
-+	}
-+	if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args,
-+				&lrp->res.seq_res, 0, task))
-+		return;
-+	rpc_call_start(task);
-+}
-+
-+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
-+{
-+	struct nfs4_layoutreturn *lrp = calldata;
-+	struct inode *ino = lrp->args.inode;
-+	struct nfs_server *server = NFS_SERVER(ino);
-+
-+	dprintk("--> %s\n", __func__);
-+
-+	if (!nfs4_sequence_done(task, &lrp->res.seq_res))
-+		return;
-+
-+	if (RPC_ASSASSINATED(task))
-+		return;
-+
-+	if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
-+		nfs_restart_rpc(task, server->nfs_client);
-+
-+	dprintk("<-- %s\n", __func__);
-+}
-+
-+static void nfs4_layoutreturn_release(void *calldata)
-+{
-+	struct nfs4_layoutreturn *lrp = calldata;
-+	struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
-+
-+	dprintk("--> %s return_type %d lo %p\n", __func__,
-+		lrp->args.return_type, lo);
-+
-+	pnfs_layoutreturn_release(lrp);
-+	kfree(calldata);
-+	dprintk("<-- %s\n", __func__);
-+}
-+
-+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
-+	.rpc_call_prepare = nfs4_layoutreturn_prepare,
-+	.rpc_call_done = nfs4_layoutreturn_done,
-+	.rpc_release = nfs4_layoutreturn_release,
-+};
-+
-+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
-+{
-+	struct inode *ino = lrp->args.inode;
-+	struct nfs_server *server = NFS_SERVER(ino);
-+	struct rpc_task *task;
-+	struct rpc_message msg = {
-+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
-+		.rpc_argp = &lrp->args,
-+		.rpc_resp = &lrp->res,
-+	};
-+	struct rpc_task_setup task_setup_data = {
-+		.rpc_client = server->client,
-+		.rpc_message = &msg,
-+		.callback_ops = &nfs4_layoutreturn_call_ops,
-+		.callback_data = lrp,
-+		.flags = RPC_TASK_ASYNC,
-+	};
-+	int status = 0;
-+
-+	dprintk("--> %s\n", __func__);
-+	lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-+	task = rpc_run_task(&task_setup_data);
-+	if (IS_ERR(task))
-+		return PTR_ERR(task);
-+	if (!issync)
-+		goto out;
-+	status = nfs4_wait_for_completion_rpc_task(task);
-+	if (status != 0)
-+		goto out;
-+	status = task->tk_status;
-+out:
-+	dprintk("<-- %s\n", __func__);
-+	rpc_put_task(task);
-+	return status;
-+}
-+
-+/*
-+ * Retrieve the list of Data Server devices from the MDS.
-+ */
-+static int _nfs4_getdevicelist(struct nfs_server *server,
-+				    const struct nfs_fh *fh,
-+				    struct pnfs_devicelist *devlist)
-+{
-+	struct nfs4_getdevicelist_args args = {
-+		.fh = fh,
-+		.layoutclass = server->pnfs_curr_ld->id,
-+	};
-+	struct nfs4_getdevicelist_res res = {
-+		.devlist = devlist,
-+	};
-+	struct rpc_message msg = {
-+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
-+		.rpc_argp = &args,
-+		.rpc_resp = &res,
-+		.rpc_cred = nfs4_get_machine_cred(server->nfs_client),
-+	};
-+	int status;
-+
-+	dprintk("--> %s\n", __func__);
-+	status = nfs4_call_sync(server, &msg, &args, &res, 0);
-+	put_rpccred(msg.rpc_cred);
-+	dprintk("<-- %s status=%d\n", __func__, status);
-+	return status;
-+}
-+
-+int nfs4_proc_getdevicelist(struct nfs_server *server,
-+			    const struct nfs_fh *fh,
-+			    struct pnfs_devicelist *devlist)
-+{
-+	struct nfs4_exception exception = { };
-+	int err;
-+
-+	do {
-+		err = nfs4_handle_exception(server,
-+				_nfs4_getdevicelist(server, fh, devlist),
-+				&exception);
-+	} while (exception.retry);
-+
-+	dprintk("%s: err=%d, num_devs=%u\n", __func__,
-+		err, devlist->num_devs);
-+
-+	return err;
-+}
-+EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
-+
-+static int
-+_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
-+{
-+	struct nfs4_getdeviceinfo_args args = {
-+		.pdev = pdev,
-+	};
-+	struct nfs4_getdeviceinfo_res res = {
-+		.pdev = pdev,
-+	};
-+	struct rpc_message msg = {
-+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
-+		.rpc_argp = &args,
-+		.rpc_resp = &res,
-+		.rpc_cred = nfs4_get_machine_cred(server->nfs_client),
-+	};
-+	int status;
-+
-+	dprintk("--> %s\n", __func__);
-+	status = nfs4_call_sync(server, &msg, &args, &res, 0);
-+	put_rpccred(msg.rpc_cred);
-+	dprintk("<-- %s status=%d\n", __func__, status);
-+
-+	return status;
-+}
-+
-+int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
-+{
-+	struct nfs4_exception exception = { };
-+	int err;
-+
-+	do {
-+		err = nfs4_handle_exception(server,
-+					_nfs4_proc_getdeviceinfo(server, pdev),
-+					&exception);
-+	} while (exception.retry);
-+	return err;
-+}
-+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
-+
- #endif /* CONFIG_NFS_V4_1 */
- 
- struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
-diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
-index 72b6c58..b57f41f 100644
---- a/fs/nfs/nfs4renewd.c
-+++ b/fs/nfs/nfs4renewd.c
-@@ -64,7 +64,7 @@ nfs4_renew_state(struct work_struct *work)
- 	ops = clp->cl_mvops->state_renewal_ops;
- 	dprintk("%s: start\n", __func__);
- 	/* Are there any active superblocks? */
--	if (list_empty(&clp->cl_superblocks))
-+	if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp))
- 		goto out;
- 	spin_lock(&clp->cl_lock);
- 	lease = clp->cl_lease_time;
-diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
-index 3e2f19b..3168d77 100644
---- a/fs/nfs/nfs4state.c
-+++ b/fs/nfs/nfs4state.c
-@@ -53,6 +53,7 @@
- #include "callback.h"
- #include "delegation.h"
- #include "internal.h"
-+#include "pnfs.h"
- 
- #define OPENOWNER_POOL_SIZE	8
- 
-@@ -126,6 +127,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
- 	int status;
- 	struct nfs_fsinfo fsinfo;
- 
-+	if (is_ds_only_client(clp)) {
-+		nfs4_schedule_state_renewal(clp);
-+		return 0;
-+	}
-+
- 	status = nfs4_proc_get_lease_time(clp, &fsinfo);
- 	if (status == 0) {
- 		/* Update lease time and schedule renewal */
-@@ -182,6 +188,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
- int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
- {
- 	int status;
-+	u32 req_exchange_flags = clp->cl_exchange_flags;
- 
- 	nfs4_begin_drain_session(clp);
- 	status = nfs4_proc_exchange_id(clp, cred);
-@@ -190,6 +197,16 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
- 	status = nfs4_proc_create_session(clp);
- 	if (status != 0)
- 		goto out;
-+	if (is_ds_only_session(req_exchange_flags)) {
-+		clp->cl_exchange_flags &=
-+		     ~(EXCHGID4_FLAG_USE_PNFS_MDS | EXCHGID4_FLAG_USE_NON_PNFS);
-+		if (!is_ds_only_session(clp->cl_exchange_flags)) {
-+			nfs4_destroy_session(clp->cl_session);
-+			clp->cl_session = NULL;
-+			status = -ENOTSUPP;
-+			goto out;
-+		}
-+	}
- 	nfs41_setup_state_renewal(clp);
- 	nfs_mark_client_ready(clp, NFS_CS_READY);
- out:
-@@ -583,8 +600,24 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
- 	if (!call_close) {
- 		nfs4_put_open_state(state);
- 		nfs4_put_state_owner(owner);
--	} else
-+	} else {
-+		u32 roc_iomode;
-+		struct nfs_inode *nfsi = NFS_I(state->inode);
-+
-+		if (has_layout(nfsi) &&
-+		    (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) {
-+			struct pnfs_layout_range range = {
-+				.iomode = roc_iomode,
-+				.offset = 0,
-+				.length = NFS4_MAX_UINT64,
-+			};
-+
-+			pnfs_return_layout(state->inode, &range, NULL,
-+					   RETURN_FILE, wait);
-+		}
-+
- 		nfs4_do_close(path, state, gfp_mask, wait);
-+	}
- }
- 
- void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
-@@ -1447,6 +1480,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
- 			}
- 			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
- 			set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
-+			pnfs_destroy_all_layouts(clp);
- 		}
- 
- 		if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
-diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
-index 08ef912..30ce2ed 100644
---- a/fs/nfs/nfs4xdr.c
-+++ b/fs/nfs/nfs4xdr.c
-@@ -52,6 +52,7 @@
- #include <linux/nfs_idmap.h>
- #include "nfs4_fs.h"
- #include "internal.h"
-+#include "pnfs.h"
- 
- #define NFSDBG_FACILITY		NFSDBG_XDR
- 
-@@ -89,7 +90,7 @@ static int nfs4_stat_to_errno(int);
- #define encode_getfh_maxsz      (op_encode_hdr_maxsz)
- #define decode_getfh_maxsz      (op_decode_hdr_maxsz + 1 + \
- 				((3+NFS4_FHSIZE) >> 2))
--#define nfs4_fattr_bitmap_maxsz 3
-+#define nfs4_fattr_bitmap_maxsz 4
- #define encode_getattr_maxsz    (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
- #define nfs4_name_maxsz		(1 + ((3 + NFS4_MAXNAMLEN) >> 2))
- #define nfs4_path_maxsz		(1 + ((3 + NFS4_MAXPATHLEN) >> 2))
-@@ -111,7 +112,11 @@ static int nfs4_stat_to_errno(int);
- #define encode_restorefh_maxsz  (op_encode_hdr_maxsz)
- #define decode_restorefh_maxsz  (op_decode_hdr_maxsz)
- #define encode_fsinfo_maxsz	(encode_getattr_maxsz)
--#define decode_fsinfo_maxsz	(op_decode_hdr_maxsz + 11)
-+/* The 5 accounts for the PNFS attributes, and assumes that at most three
-+ * layout types will be returned.
-+ */
-+#define decode_fsinfo_maxsz	(op_decode_hdr_maxsz + \
-+				 nfs4_fattr_bitmap_maxsz + 8 + 5)
- #define encode_renew_maxsz	(op_encode_hdr_maxsz + 3)
- #define decode_renew_maxsz	(op_decode_hdr_maxsz)
- #define encode_setclientid_maxsz \
-@@ -310,6 +315,41 @@ static int nfs4_stat_to_errno(int);
- 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
- #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
- #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
-+#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
-+				encode_verifier_maxsz)
-+#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
-+				2 /* nfs_cookie4 gdlr_cookie */ + \
-+				decode_verifier_maxsz \
-+				  /* verifier4 gdlr_verifier */ + \
-+				1 /* gdlr_deviceid_list count */ + \
-+				XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
-+					    NFS4_DEVICEID4_SIZE) \
-+				  /* gdlr_deviceid_list */ + \
-+				1 /* bool gdlr_eof */)
-+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
-+				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
-+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
-+				1 /* layout type */ + \
-+				1 /* opaque devaddr4 length */ + \
-+				  /* devaddr4 payload is read into page */ \
-+				1 /* notification bitmap length */ + \
-+				1 /* notification bitmap */)
-+#define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
-+				encode_stateid_maxsz)
-+#define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
-+				decode_stateid_maxsz + \
-+				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
-+#define encode_layoutcommit_maxsz (18 +                           \
-+				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \
-+				op_encode_hdr_maxsz +          \
-+				encode_stateid_maxsz)
-+#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz)
-+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
-+				encode_stateid_maxsz + \
-+				1 /* FIXME: opaque lrf_body always empty at
-+				   *the moment */)
-+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
-+				1 + decode_stateid_maxsz)
- #else /* CONFIG_NFS_V4_1 */
- #define encode_sequence_maxsz	0
- #define decode_sequence_maxsz	0
-@@ -699,6 +739,60 @@ static int nfs4_stat_to_errno(int);
- #define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
- 					 decode_sequence_maxsz + \
- 					 decode_reclaim_complete_maxsz)
-+#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
-+				encode_sequence_maxsz + \
-+				encode_putfh_maxsz + \
-+				encode_getdevicelist_maxsz)
-+#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
-+				decode_sequence_maxsz + \
-+				decode_putfh_maxsz + \
-+				decode_getdevicelist_maxsz)
-+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
-+				encode_sequence_maxsz +\
-+				encode_getdeviceinfo_maxsz)
-+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz +    \
-+				decode_sequence_maxsz + \
-+				decode_getdeviceinfo_maxsz)
-+#define NFS4_enc_layoutget_sz	(compound_encode_hdr_maxsz + \
-+				encode_sequence_maxsz + \
-+				encode_putfh_maxsz +        \
-+				encode_layoutget_maxsz)
-+#define NFS4_dec_layoutget_sz	(compound_decode_hdr_maxsz + \
-+				decode_sequence_maxsz + \
-+				decode_putfh_maxsz +        \
-+				decode_layoutget_maxsz)
-+#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
-+				encode_sequence_maxsz +\
-+				encode_putfh_maxsz + \
-+				encode_layoutcommit_maxsz + \
-+				encode_getattr_maxsz)
-+#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
-+				decode_sequence_maxsz + \
-+				decode_putfh_maxsz + \
-+				decode_layoutcommit_maxsz + \
-+				decode_getattr_maxsz)
-+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
-+				encode_sequence_maxsz + \
-+				encode_putfh_maxsz + \
-+				encode_layoutreturn_maxsz)
-+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
-+				decode_sequence_maxsz + \
-+				decode_putfh_maxsz + \
-+				decode_layoutreturn_maxsz)
-+#define NFS4_enc_dswrite_sz	(compound_encode_hdr_maxsz + \
-+				encode_sequence_maxsz +\
-+				encode_putfh_maxsz + \
-+				encode_write_maxsz)
-+#define NFS4_dec_dswrite_sz	(compound_decode_hdr_maxsz + \
-+				decode_sequence_maxsz + \
-+				decode_putfh_maxsz + \
-+				decode_write_maxsz)
-+#define NFS4_enc_dscommit_sz	(compound_encode_hdr_maxsz + \
-+				encode_putfh_maxsz + \
-+				encode_commit_maxsz)
-+#define NFS4_dec_dscommit_sz	(compound_decode_hdr_maxsz + \
-+				decode_putfh_maxsz + \
-+				decode_commit_maxsz)
- 
- const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
- 				      compound_encode_hdr_maxsz +
-@@ -1003,6 +1097,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
- 	hdr->replen += decode_getattr_maxsz;
- }
- 
-+static void
-+encode_getattr_three(struct xdr_stream *xdr,
-+		     uint32_t bm0, uint32_t bm1, uint32_t bm2,
-+		     struct compound_hdr *hdr)
-+{
-+	__be32 *p;
-+
-+	p = reserve_space(xdr, 4);
-+	*p = cpu_to_be32(OP_GETATTR);
-+	if (bm2) {
-+		p = reserve_space(xdr, 16);
-+		*p++ = cpu_to_be32(3);
-+		*p++ = cpu_to_be32(bm0);
-+		*p++ = cpu_to_be32(bm1);
-+		*p = cpu_to_be32(bm2);
-+	} else if (bm1) {
-+		p = reserve_space(xdr, 12);
-+		*p++ = cpu_to_be32(2);
-+		*p++ = cpu_to_be32(bm0);
-+		*p = cpu_to_be32(bm1);
-+	} else {
-+		p = reserve_space(xdr, 8);
-+		*p++ = cpu_to_be32(1);
-+		*p = cpu_to_be32(bm0);
-+	}
-+	hdr->nops++;
-+	hdr->replen += decode_getattr_maxsz;
-+}
-+
- static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
- {
- 	encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
-@@ -1011,8 +1134,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
- 
- static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
- {
--	encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
--			   bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
-+	encode_getattr_three(xdr,
-+			     bitmask[0] & nfs4_fsinfo_bitmap[0],
-+			     bitmask[1] & nfs4_fsinfo_bitmap[1],
-+			     bitmask[2] & nfs4_fsinfo_bitmap[2],
-+			     hdr);
- }
- 
- static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
-@@ -1726,6 +1852,155 @@ static void encode_sequence(struct xdr_stream *xdr,
- #endif /* CONFIG_NFS_V4_1 */
- }
- 
-+#ifdef CONFIG_NFS_V4_1
-+static void
-+encode_getdevicelist(struct xdr_stream *xdr,
-+		     const struct nfs4_getdevicelist_args *args,
-+		     struct compound_hdr *hdr)
-+{
-+	__be32 *p;
-+	nfs4_verifier dummy = {
-+		.data = "dummmmmy",
-+	};
-+
-+	p = reserve_space(xdr, 20);
-+	*p++ = cpu_to_be32(OP_GETDEVICELIST);
-+	*p++ = cpu_to_be32(args->layoutclass);
-+	*p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
-+	xdr_encode_hyper(p, 0ULL);                          /* cookie */
-+	encode_nfs4_verifier(xdr, &dummy);
-+	hdr->nops++;
-+	hdr->replen += decode_getdevicelist_maxsz;
-+}
-+
-+static void
-+encode_getdeviceinfo(struct xdr_stream *xdr,
-+		     const struct nfs4_getdeviceinfo_args *args,
-+		     struct compound_hdr *hdr)
-+{
-+	__be32 *p;
-+
-+	p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
-+	*p++ = cpu_to_be32(OP_GETDEVICEINFO);
-+	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
-+				    NFS4_DEVICEID4_SIZE);
-+	*p++ = cpu_to_be32(args->pdev->layout_type);
-+	*p++ = cpu_to_be32(args->pdev->pglen);		/* gdia_maxcount */
-+	*p++ = cpu_to_be32(0);				/* bitmap length 0 */
-+	hdr->nops++;
-+	hdr->replen += decode_getdeviceinfo_maxsz;
-+}
-+
-+static void
-+encode_layoutget(struct xdr_stream *xdr,
-+		      const struct nfs4_layoutget_args *args,
-+		      struct compound_hdr *hdr)
-+{
-+	nfs4_stateid stateid;
-+	__be32 *p;
-+
-+	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
-+	*p++ = cpu_to_be32(OP_LAYOUTGET);
-+	*p++ = cpu_to_be32(0);     /* Signal layout available */
-+	*p++ = cpu_to_be32(args->type);
-+	*p++ = cpu_to_be32(args->range.iomode);
-+	p = xdr_encode_hyper(p, args->range.offset);
-+	p = xdr_encode_hyper(p, args->range.length);
-+	p = xdr_encode_hyper(p, args->minlength);
-+	pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
-+				args->ctx->state);
-+	p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
-+	*p = cpu_to_be32(args->maxcount);
-+
-+	dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
-+		__func__,
-+		args->type,
-+		args->range.iomode,
-+		(unsigned long)args->range.offset,
-+		(unsigned long)args->range.length,
-+		args->maxcount);
-+	hdr->nops++;
-+	hdr->replen += decode_layoutget_maxsz;
-+}
-+
-+static int
-+encode_layoutcommit(struct xdr_stream *xdr,
-+		    const struct nfs4_layoutcommit_args *args,
-+		    struct compound_hdr *hdr)
-+{
-+	__be32 *p;
-+
-+	dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__,
-+		args->range.length, args->range.offset, args->lastbytewritten,
-+		args->layout_type);
-+
-+	p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE);
-+	*p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
-+	p = xdr_encode_hyper(p, args->range.offset);
-+	p = xdr_encode_hyper(p, args->range.length);
-+	*p++ = cpu_to_be32(0);     /* reclaim */
-+	p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
-+	*p++ = cpu_to_be32(1);     /* newoffset = TRUE */
-+	p = xdr_encode_hyper(p, args->lastbytewritten);
-+	*p = cpu_to_be32(args->time_modify_changed != 0);
-+	if (args->time_modify_changed) {
-+		p = reserve_space(xdr, 12);
-+		*p++ = cpu_to_be32(0);
-+		*p++ = cpu_to_be32(args->time_modify.tv_sec);
-+		*p = cpu_to_be32(args->time_modify.tv_nsec);
-+	}
-+
-+	p = reserve_space(xdr, 4);
-+	*p = cpu_to_be32(args->layout_type);
-+
-+	if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutcommit) {
-+		NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutcommit(
-+			NFS_I(args->inode)->layout, xdr, args);
-+	} else {
-+		p = reserve_space(xdr, 4);
-+		xdr_encode_opaque(p, NULL, 0);
-+	}
-+
-+	hdr->nops++;
-+	hdr->replen += decode_layoutcommit_maxsz;
-+	return 0;
-+}
-+
-+static void
-+encode_layoutreturn(struct xdr_stream *xdr,
-+		    const struct nfs4_layoutreturn_args *args,
-+		    struct compound_hdr *hdr)
++static ssize_t
++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 +{
-+	nfs4_stateid stateid;
-+	__be32 *p;
++	struct rpc_inode	*rpci	= RPC_I(filp->f_dentry->d_inode);
++	bl_comm_t		*bc	= (bl_comm_t *)rpci->private;
++	bl_comm_msg_t		*im	= &bc->msg;
++	int			ret;
++	bl_comm_res_t		*res;
++	
 +
-+	p = reserve_space(xdr, 20);
-+	*p++ = cpu_to_be32(OP_LAYOUTRETURN);
-+	*p++ = cpu_to_be32(args->reclaim);
-+	*p++ = cpu_to_be32(args->layout_type);
-+	*p++ = cpu_to_be32(args->range.iomode);
-+	*p = cpu_to_be32(args->return_type);
-+	if (args->return_type == RETURN_FILE) {
-+		p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
-+		p = xdr_encode_hyper(p, args->range.offset);
-+		p = xdr_encode_hyper(p, args->range.length);
-+		pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
-+					NULL);
-+		p = xdr_encode_opaque_fixed(p, &stateid.data,
-+					    NFS4_STATEID_SIZE);
-+		if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
-+			NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
-+				NFS_I(args->inode)->layout, xdr, args);
-+		} else {
-+			p = reserve_space(xdr, 4);
-+			*p = cpu_to_be32(0);
-+		}
++	if (mlen == 0) {
++		im->msg_status = PNFS_BLOCK_FAILURE;
++		im->msg_res = NULL;
++		wake_up(&bc->pipe_wq);
++		return -EFAULT;
 +	}
-+	hdr->nops++;
-+	hdr->replen += decode_layoutreturn_maxsz;
-+}
-+#endif /* CONFIG_NFS_V4_1 */
-+
- /*
-  * END OF "GENERIC" ENCODE ROUTINES.
-  */
-@@ -2374,7 +2649,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
- 	struct compound_hdr hdr = {
- 		.nops	= 0,
- 	};
--	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
-+	const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
- 
- 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
- 	encode_compound_hdr(&xdr, req, &hdr);
-@@ -2513,7 +2788,7 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
- 	struct compound_hdr hdr = {
- 		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
- 	};
--	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
-+	const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
- 
- 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
- 	encode_compound_hdr(&xdr, req, &hdr);
-@@ -2543,6 +2818,153 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
- 	return 0;
- }
- 
-+/*
-+ * Encode GETDEVICELIST request
-+ */
-+static int
-+nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p,
-+			   struct nfs4_getdevicelist_args *args)
-+{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr = {
-+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+	};
-+
-+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+	encode_compound_hdr(&xdr, req, &hdr);
-+	encode_sequence(&xdr, &args->seq_args, &hdr);
-+	encode_putfh(&xdr, args->fh, &hdr);
-+	encode_getdevicelist(&xdr, args, &hdr);
-+	encode_nops(&hdr);
-+	return 0;
-+}
-+
-+/*
-+ * Encode GETDEVICEINFO request
-+ */
-+static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
-+				      struct nfs4_getdeviceinfo_args *args)
-+{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr = {
-+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+	};
-+
-+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+	encode_compound_hdr(&xdr, req, &hdr);
-+	encode_sequence(&xdr, &args->seq_args, &hdr);
-+	encode_getdeviceinfo(&xdr, args, &hdr);
-+
-+	/* set up reply kvec. Subtract notification bitmap max size (2)
-+	 * so that notification bitmap is put in xdr_buf tail */
-+	xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
-+			 args->pdev->pages, args->pdev->pgbase,
-+			 args->pdev->pglen);
-+
-+	encode_nops(&hdr);
-+	return 0;
-+}
-+
-+/*
-+ *  Encode LAYOUTGET request
-+ */
-+static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
-+				  struct nfs4_layoutget_args *args)
-+{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr = {
-+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+	};
-+
-+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+	encode_compound_hdr(&xdr, req, &hdr);
-+	encode_sequence(&xdr, &args->seq_args, &hdr);
-+	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
-+	encode_layoutget(&xdr, args, &hdr);
-+	encode_nops(&hdr);
-+	return 0;
-+}
-+
-+/*
-+ *  Encode LAYOUTCOMMIT request
-+ */
-+static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p,
-+				     struct nfs4_layoutcommit_args *args)
-+{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr = {
-+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+	};
-+
-+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+	encode_compound_hdr(&xdr, req, &hdr);
-+	encode_sequence(&xdr, &args->seq_args, &hdr);
-+	encode_putfh(&xdr, args->fh, &hdr);
-+	encode_layoutcommit(&xdr, args, &hdr);
-+	encode_getfattr(&xdr, args->bitmask, &hdr);
-+	encode_nops(&hdr);
-+	return 0;
-+}
-+
-+/*
-+ * Encode LAYOUTRETURN request
-+ */
-+static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p,
-+				     struct nfs4_layoutreturn_args *args)
-+{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr = {
-+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+	};
-+
-+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+	encode_compound_hdr(&xdr, req, &hdr);
-+	encode_sequence(&xdr, &args->seq_args, &hdr);
-+	encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
-+	encode_layoutreturn(&xdr, args, &hdr);
-+	encode_nops(&hdr);
-+	return 0;
-+}
-+
-+/*
-+ * Encode a pNFS File Layout Data Server WRITE request
-+ */
-+static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p,
-+				struct nfs_writeargs *args)
-+{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr = {
-+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+	};
-+
-+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+	encode_compound_hdr(&xdr, req, &hdr);
-+	encode_sequence(&xdr, &args->seq_args, &hdr);
-+	encode_putfh(&xdr, args->fh, &hdr);
-+	encode_write(&xdr, args, &hdr);
-+	encode_nops(&hdr);
-+	return 0;
-+}
-+
-+/*
-+ * Encode a pNFS File Layout Data Server COMMIT request
-+ */
-+static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p,
-+				 struct nfs_writeargs *args)
-+{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr = {
-+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+	};
-+
-+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+	encode_compound_hdr(&xdr, req, &hdr);
-+	encode_sequence(&xdr, &args->seq_args, &hdr);
-+	encode_putfh(&xdr, args->fh, &hdr);
-+	encode_commit(&xdr, args, &hdr);
-+	encode_nops(&hdr);
-+	return 0;
-+}
- #endif /* CONFIG_NFS_V4_1 */
- 
- static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-@@ -2643,14 +3065,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
- 		goto out_overflow;
- 	bmlen = be32_to_cpup(p);
- 
--	bitmap[0] = bitmap[1] = 0;
-+	bitmap[0] = bitmap[1] = bitmap[2] = 0;
- 	p = xdr_inline_decode(xdr, (bmlen << 2));
- 	if (unlikely(!p))
- 		goto out_overflow;
- 	if (bmlen > 0) {
- 		bitmap[0] = be32_to_cpup(p++);
--		if (bmlen > 1)
--			bitmap[1] = be32_to_cpup(p);
-+		if (bmlen > 1) {
-+			bitmap[1] = be32_to_cpup(p++);
-+			if (bmlen > 2)
-+				bitmap[2] = be32_to_cpup(p);
-+		}
- 	}
- 	return 0;
- out_overflow:
-@@ -2679,8 +3104,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
- 		decode_attr_bitmap(xdr, bitmask);
- 		bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
- 	} else
--		bitmask[0] = bitmask[1] = 0;
--	dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]);
-+		bitmask[0] = bitmask[1] = bitmask[2] = 0;
-+	dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
-+		bitmask[0], bitmask[1], bitmask[2]);
- 	return 0;
- }
- 
-@@ -3665,7 +4091,7 @@ out_overflow:
- static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
- {
- 	__be32 *savep;
--	uint32_t attrlen, bitmap[2] = {0};
-+	uint32_t attrlen, bitmap[3] = {0};
- 	int status;
- 
- 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
-@@ -3691,7 +4117,7 @@ xdr_error:
- static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
- {
- 	__be32 *savep;
--	uint32_t attrlen, bitmap[2] = {0};
-+	uint32_t attrlen, bitmap[3] = {0};
- 	int status;
- 
- 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
-@@ -3723,7 +4149,7 @@ xdr_error:
- static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
- {
- 	__be32 *savep;
--	uint32_t attrlen, bitmap[2] = {0};
-+	uint32_t attrlen, bitmap[3] = {0};
- 	int status;
- 
- 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
-@@ -3749,7 +4175,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
- {
- 	__be32 *savep;
- 	uint32_t attrlen,
--		 bitmap[2] = {0},
-+		 bitmap[3] = {0},
- 		 type;
- 	int status;
- 	umode_t fmode = 0;
-@@ -3868,11 +4294,87 @@ xdr_error:
- 	return status;
- }
- 
-+/*
-+ * Decode potentially multiple layout types. Currently we only support
-+ * one layout driver per file system.
-+ */
-+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
-+					 uint32_t *layouttype)
-+{
-+	uint32_t *p;
-+	int num;
-+
-+	p = xdr_inline_decode(xdr, 4);
-+	if (unlikely(!p))
-+		goto out_overflow;
-+	num = be32_to_cpup(p);
++	
++	if ((res = kmalloc(mlen, GFP_KERNEL)) == NULL)
++		return -ENOMEM;
++	
++	if (copy_from_user(res, src, mlen)) {
++		kfree(res);
++		return -EFAULT;
++	}
++	
++	mutex_lock(&bc->pipe_lock);
++	
++	ret		= mlen;
++	im->msg_status	= res->res_status;
++	im->msg_res	= res;
++	
++	wake_up(&bc->pipe_wq);
++	mutex_unlock(&bc->pipe_lock);
++	return ret;
++}
 +
-+	/* pNFS is not supported by the underlying file system */
-+	if (num == 0) {
-+		*layouttype = 0;
-+		return 0;
-+	}
-+	if (num > 1)
-+		printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
-+			"per filesystem not supported\n", __func__);
++static void
++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
++{
++	bl_comm_msg_t	*im = msg->data;
++	bl_comm_t	*bc = container_of(im, struct bl_comm, msg);	/* bl_upcall() points msg->data at bc->msg */
++	/* A non-negative errno needs no action; otherwise record failure and wake the waiter. */
++	if (msg->errno >= 0)
++		return;
 +
-+	/* Decode and set first layout type, move xdr->p past unused types */
-+	p = xdr_inline_decode(xdr, num * 4);
-+	if (unlikely(!p))
-+		goto out_overflow;
-+	*layouttype = be32_to_cpup(p);
-+	return 0;
-+out_overflow:
-+	print_overflow_msg(__func__, xdr);
-+	return -EIO;
++	mutex_lock(&bc->pipe_lock);
++	im->msg_status = PNFS_BLOCK_FAILURE;	/* bl_upcall() reads this status after it is woken */
++	wake_up(&bc->pipe_wq);
++	mutex_unlock(&bc->pipe_lock);
 +}
 +
-+/*
-+ * The type of file system exported.
-+ * Note we must ensure that layouttype is set in any non-error case.
-+ */
-+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
-+				uint32_t *layouttype)
++int	/* 0: success, *res is the daemon's reply (callers kfree() it); non-zero: failure, *res is NULL */
++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res)
 +{
-+	int status = 0;
-+
-+	dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
-+	if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
-+		return -EIO;
-+	if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
-+		status = decode_first_pnfs_layout_type(xdr, layouttype);
-+		bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
++	struct rpc_pipe_msg	msg;
++	DECLARE_WAITQUEUE(wq, current);
++	int			rval	= 1;
++	bl_comm_msg_t		*m;
++	*res = NULL;	/* callers kfree(*res) even after a failed upcall; never leave it uninitialized */
++	if (bc == NULL) {
++		dprintk("%s: No pNFS block daemon available\n", __func__);
++		return 1;
++	}
++	m = &bc->msg;	/* formed only after bc has been checked for NULL */
++	mutex_lock(&bc->lock);
++	mutex_lock(&bc->pipe_lock);
++	/* Stage the request in the shared bc->msg slot and point the pipe message at it. */
++	memcpy(m, upmsg, sizeof (*m));
++	
++	memset(&msg, 0, sizeof (msg));
++	msg.data = m;
++	msg.len = sizeof (*m);
++	/* Join the wait queue first, then hand the message to rpc_pipefs. */
++	add_wait_queue(&bc->pipe_wq, &wq);
++	rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg);
++	if (rval < 0) {
++		remove_wait_queue(&bc->pipe_wq, &wq);
++		goto out;
++	}
++	/* Sleep until bl_pipe_downcall() or bl_pipe_destroy_msg() calls wake_up(); pipe_lock is dropped meanwhile. */
++	set_current_state(TASK_UNINTERRUPTIBLE);
++	mutex_unlock(&bc->pipe_lock);
++	schedule();
++	__set_current_state(TASK_RUNNING);
++	remove_wait_queue(&bc->pipe_wq, &wq);
++	mutex_lock(&bc->pipe_lock);
++	/* msg_status and msg_res were filled in by whichever path woke us. */
++	if (m->msg_status == PNFS_BLOCK_SUCCESS) {
++		*res = m->msg_res;
++		rval = 0;
 +	} else
-+		*layouttype = 0;
-+	return status;
++		rval = 1;
++	
++out:
++	mutex_unlock(&bc->pipe_lock);
++	mutex_unlock(&bc->lock);
++	return rval;
 +}
 +
-+/*
-+ * The prefered block size for layout directed io
-+ */
-+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
-+				      uint32_t *res)
++static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len,
++    loff_t *offset)	/* write handler of the proc ctl node; buf starts with an int PNFS_BLOCK_CTL_* command */
 +{
-+	__be32 *p;
++	int		cmd,
++			rc;
++	bl_comm_t	*bc	= bl_comm_global;
++	bl_comm_msg_t	msg;
++	bl_comm_res_t	*res	= NULL;	/* stays NULL when bl_upcall() fails, so kfree(res) below is safe */
 +
-+	dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
-+	*res = 0;
-+	if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
-+		p = xdr_inline_decode(xdr, 4);
-+		if (unlikely(!p)) {
-+			print_overflow_msg(__func__, xdr);
-+			return -EIO;
++	if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int)))
++		return -EFAULT;
++	switch (cmd) {
++	case PNFS_BLOCK_CTL_STOP:
++		msg.msg_type = PNFS_UPCALL_MSG_STOP;
++		(void) bl_upcall(bc, &msg, &res);	/* best effort: the stop proceeds even if the daemon is unreachable */
++		kfree(res);
++		nfsd_bl_stop();
++		break;
++	/* START: a non-zero result from nfsd_bl_start() is returned to the writer as-is. */
++	case PNFS_BLOCK_CTL_START:
++		rc = nfsd_bl_start();
++		if (rc != 0)
++			return rc;
++		break;
++	/* VERS: send the kernel's upcall version to the daemon; a failed upcall returns 0, not len. */
++	case PNFS_BLOCK_CTL_VERS:
++		msg.msg_type = PNFS_UPCALL_MSG_VERS;
++		msg.u.msg_vers = PNFS_UPCALL_VERS;
++		if (bl_upcall(bc, &msg, &res)) {
++			dprintk("%s: Failed to contact pNFS block daemon\n",
++			    __func__);
++			return 0;
 +		}
-+		*res = be32_to_cpup(p);
-+		bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
++		kfree(res);
++		break;
++	/* Unknown commands are only logged; the write still reports len. */
++	default:
++		dprintk("%s: unknown ctl command %d\n", __func__, cmd);
++		break;
 +	}
-+	return 0;
++	return len;
 +}
- 
- static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
- {
- 	__be32 *savep;
--	uint32_t attrlen, bitmap[2];
-+	uint32_t attrlen, bitmap[3];
- 	int status;
- 
- 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
-@@ -3894,6 +4396,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
- 	if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
- 		goto xdr_error;
- 	fsinfo->wtpref = fsinfo->wtmax;
-+	status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
-+	if (status)
-+		goto xdr_error;
-+	status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
-+	if (status)
-+		goto xdr_error;
- 
- 	status = verify_attr_len(xdr, savep, attrlen);
- xdr_error:
-@@ -4382,7 +4890,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
- {
- 	__be32 *savep;
- 	uint32_t attrlen,
--		 bitmap[2] = {0};
-+		 bitmap[3] = {0};
- 	struct kvec *iov = req->rq_rcv_buf.head;
- 	int status;
- 
-@@ -4731,16 +5239,238 @@ out_overflow:
- #endif /* CONFIG_NFS_V4_1 */
- }
- 
-+#if defined(CONFIG_NFS_V4_1)
- /*
-- * END OF "GENERIC" DECODE ROUTINES.
-- */
--
--/*
-- * Decode OPEN_DOWNGRADE response
-+ * TODO: Need to handle case when EOF != true;
-  */
--static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
-+static int decode_getdevicelist(struct xdr_stream *xdr,
-+				struct pnfs_devicelist *res)
- {
--	struct xdr_stream xdr;
-+	__be32 *p;
-+	int status, i;
-+	struct nfs_writeverf verftemp;
-+
-+	status = decode_op_hdr(xdr, OP_GETDEVICELIST);
-+	if (status)
-+		return status;
-+
-+	p = xdr_inline_decode(xdr, 8 + 8 + 4);
-+	if (unlikely(!p))
-+		goto out_overflow;
 +
-+	/* TODO: Skip cookie for now */
-+	p += 2;
-+
-+	/* Read verifier */
-+	p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
++static const struct file_operations ctl_ops = {
++	.write	= ctl_write,	/* write is the only file operation provided for this node */
++};
 +
-+	res->num_devs = be32_to_cpup(p);
++/*
++ * bl_init_proc -- set up proc interfaces
++ * Returns 0 or -ENOMEM; ctl is created with its file operations attached.
++ * Creating a pnfs_block directory isn't really required at this point
++ * since we've only got a single node in that directory. If the need for
++ * more nodes doesn't present itself shortly this code should revert
++ * to a single top level node. McNeal 11-Aug-2008.
++ */
++int
++bl_init_proc(void)
++{
++	struct proc_dir_entry *e;
 +
-+	dprintk("%s: num_dev %d\n", __func__, res->num_devs);
++	e = proc_mkdir("fs/pnfs_block", NULL);
++	if (!e)
++		return -ENOMEM;
 +
-+	if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM)
-+		return -NFS4ERR_REP_TOO_BIG;
++	if (!proc_create("fs/pnfs_block/ctl", 0, NULL, &ctl_ops)) {
++		remove_proc_entry("fs/pnfs_block", NULL);	/* do not leak the directory made above */
++		return -ENOMEM;
++	}
 +
-+	p = xdr_inline_decode(xdr,
-+			      res->num_devs * NFS4_DEVICEID4_SIZE + 4);
-+	if (unlikely(!p))
-+		goto out_overflow;
-+	for (i = 0; i < res->num_devs; i++)
-+		p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
-+					    NFS4_DEVICEID4_SIZE);
-+	res->eof = be32_to_cpup(p);
 +	return 0;
-+out_overflow:
-+	print_overflow_msg(__func__, xdr);
-+	return -EIO;
 +}
++#endif /* CONFIG_SPNFS_BLOCK */
+diff -up linux-2.6.37.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.37.noarch/fs/nfsd/bl_ops.c
+--- linux-2.6.37.noarch/fs/nfsd/bl_ops.c.orig	2011-01-28 09:43:53.349770555 -0500
++++ linux-2.6.37.noarch/fs/nfsd/bl_ops.c	2011-01-28 09:43:53.349770555 -0500
+@@ -0,0 +1,1672 @@
++/*
++ *  bl_ops.c
++ *  spNFS
++ *
++ *  Created by Rick McNeal on 4/1/08.
++ *  Copyright 2008 __MyCompanyName__. All rights reserved.
++ *
++ */
 +
-+static int decode_getdeviceinfo(struct xdr_stream *xdr,
-+				struct pnfs_device *pdev)
-+{
-+	__be32 *p;
-+	uint32_t len, type;
-+	int status;
-+
-+	status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
-+	if (status) {
-+		if (status == -ETOOSMALL) {
-+			p = xdr_inline_decode(xdr, 4);
-+			if (unlikely(!p))
-+				goto out_overflow;
-+			pdev->mincount = be32_to_cpup(p);
-+			dprintk("%s: Min count too small. mincnt = %u\n",
-+				__func__, pdev->mincount);
-+		}
-+		return status;
-+	}
++/*
++ * Block layout operations.
++ *
++ * These functions, with the exception of pnfs_block_enabled, are assigned to
++ * the super block s_export_op structure.
++ */
++#if defined(CONFIG_SPNFS_BLOCK)
 +
-+	p = xdr_inline_decode(xdr, 8);
-+	if (unlikely(!p))
-+		goto out_overflow;
-+	type = be32_to_cpup(p++);
-+	if (type != pdev->layout_type) {
-+		dprintk("%s: layout mismatch req: %u pdev: %u\n",
-+			__func__, pdev->layout_type, type);
-+		return -EINVAL;
-+	}
-+	/*
-+	 * Get the length of the opaque device_addr4. xdr_read_pages places
-+	 * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
-+	 * and places the remaining xdr data in xdr_buf->tail
-+	 */
-+	pdev->mincount = be32_to_cpup(p);
-+	xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
++#include <linux/module.h>
++#include <linux/genhd.h>
++#include <linux/fs.h>
++#include <linux/exportfs.h>
++#include <linux/nfsd4_spnfs.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
++#include <linux/nfsd/export.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
++#include <linux/nfsd/debug.h>
++#include <linux/spinlock_types.h>
++#include <linux/dm-ioctl.h>
++#include <asm/uaccess.h>
++#include <linux/falloc.h>
++#include <linux/nfsd4_block.h>
 +
-+	/* Parse notification bitmap, verifying that it is zero. */
-+	p = xdr_inline_decode(xdr, 4);
-+	if (unlikely(!p))
-+		goto out_overflow;
-+	len = be32_to_cpup(p);
-+	if (len) {
-+		int i;
++#include "pnfsd.h"
 +
-+		p = xdr_inline_decode(xdr, 4 * len);
-+		if (unlikely(!p))
-+			goto out_overflow;
-+		for (i = 0; i < len; i++, p++) {
-+			if (be32_to_cpup(p)) {
-+				dprintk("%s: notifications not supported\n",
-+					__func__);
-+				return -EIO;
-+			}
-+		}
-+	}
-+	return 0;
-+out_overflow:
-+	print_overflow_msg(__func__, xdr);
-+	return -EIO;
-+}
++#define NFSDDBG_FACILITY	NFSDDBG_PNFS
 +
-+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
-+			    struct nfs4_layoutget_res *res)
-+{
-+	__be32 *p;
-+	int status;
-+	u32 layout_count;
++#define MIN(a, b) ((a) < (b) ? (a) : (b))
 +
-+	status = decode_op_hdr(xdr, OP_LAYOUTGET);
-+	if (status)
-+		return status;
-+	p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
-+	if (unlikely(!p))
-+		goto out_overflow;
-+	res->return_on_close = be32_to_cpup(p++);
-+	p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
-+	layout_count = be32_to_cpup(p);
-+	if (!layout_count) {
-+		dprintk("%s: server responded with empty layout array\n",
-+			__func__);
-+		return -EINVAL;
-+	}
++#define BL_LAYOUT_HASH_BITS	4
++#define BL_LAYOUT_HASH_SIZE	(1 << BL_LAYOUT_HASH_BITS)
++#define BL_LAYOUT_HASH_MASK	(BL_LAYOUT_HASH_SIZE - 1)
++#define BL_LIST_REQ	(sizeof (struct dm_ioctl) + 256)
 +
-+	p = xdr_inline_decode(xdr, 24);
-+	if (unlikely(!p))
-+		goto out_overflow;
-+	p = xdr_decode_hyper(p, &res->range.offset);
-+	p = xdr_decode_hyper(p, &res->range.length);
-+	res->range.iomode = be32_to_cpup(p++);
-+	res->type = be32_to_cpup(p++);
++#define bl_layout_hashval(id) \
++	((id) & BL_LAYOUT_HASH_MASK)
 +
-+	status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
-+	if (unlikely(status))
-+		return status;
++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len)
++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len)
++#define _2SECTS(v) ((v) >> 9)
 +
-+	dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
-+		__func__,
-+		(unsigned long)res->range.offset,
-+		(unsigned long)res->range.length,
-+		res->range.iomode,
-+		res->type,
-+		res->layout.len);
-+
-+	/* nfs4_proc_layoutget allocated a single page */
-+	if (res->layout.len > PAGE_SIZE)
-+		return -ENOMEM;
-+	memcpy(res->layout.buf, p, res->layout.len);
++#ifndef READ32	/* decode helpers: both read through, and advance, a cursor named p in the calling scope */
++#define READ32(x)	(x) = ntohl(*p++)
++#define READ64(x)	do {			\
++(x) = (u64)ntohl(*p++) << 32;	\
++(x) |= ntohl(*p++);		\
++} while (0)
++#endif
 +
-+	if (layout_count > 1) {
-+		/* We only handle a length one array at the moment.  Any
-+		 * further entries are just ignored.  Note that this means
-+		 * the client may see a response that is less than the
-+		 * minimum it requested.
-+		 */
-+		dprintk("%s: server responded with %d layouts, dropping tail\n",
-+			__func__, layout_count);
-+	}
 +
-+	return 0;
-+out_overflow:
-+	print_overflow_msg(__func__, xdr);
-+	return -EIO;
-+}
++typedef enum {True, False} boolean_t;	/* NOTE: True == 0 and False == 1; compare against the enumerators, never test as a C truth value */
++/* ---- block layoutget and commit structure (bl_layout_rec_t) ---- */
++typedef struct bl_layout_rec {
++	struct list_head	blr_hash,	/* presumably the link into layout_hashtbl -- TODO confirm */
++				blr_layouts;	/* list guarded by blr_lock (see below) */
++	dev_t			blr_rdev;
++	struct inode		*blr_inode;
++	int			blr_recalled;	// debug
++	u64			blr_orig_size,
++				blr_commit_size,
++				blr_ext_size;
++	spinlock_t		blr_lock;	// Protects blr_layouts
++} bl_layout_rec_t;
 +
-+static int decode_layoutreturn(struct xdr_stream *xdr,
-+			       struct nfs4_layoutreturn_res *res)
-+{
-+	__be32 *p;
-+	int status;
++static struct list_head layout_hash;	/* initialized in nfsd_bl_init() */
++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE];	/* every bucket is initialized in nfsd_bl_init() */
++static spinlock_t layout_hashtbl_lock;	/* initialized in nfsd_bl_init(); presumably guards layout_hashtbl -- TODO confirm */
 +
-+	status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
-+	if (status)
-+		return status;
-+	p = xdr_inline_decode(xdr, 4);
-+	if (unlikely(!p))
-+		goto out_overflow;
-+	res->valid = true;
-+	res->lrs_present = be32_to_cpup(p);
-+	if (res->lrs_present)
-+		status = decode_stateid(xdr, &res->stateid);
-+	return status;
-+out_overflow:
-+	print_overflow_msg(__func__, xdr);
-+	return -EIO;
-+}
++/* ---- prototypes ---- */
++static boolean_t device_slice(dev_t devid);
++static boolean_t device_dm(dev_t devid);
++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **);
++static bl_layout_rec_t *layout_inode_find(struct inode *i);
++static void layout_inode_del(struct inode *i);
++static char *map_state2name(enum pnfs_block_extent_state4 s);
++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type);
++static void bld_free(pnfs_blocklayout_devinfo_t *bld);
++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head *volumes,
++    dev_t devid, int local_index);
++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes,
++    dev_t devid, int my_loc, int idx);
++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h,
++    struct nfsd4_layout_seg *seg);
++struct list_head *layout_cache_iter(bl_layout_rec_t *r,
++    struct list_head *bl_possible, struct nfsd4_layout_seg *seg);
++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h);
++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h);
++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg);
++static void print_bll(pnfs_blocklayout_layout_t *b, char *);
++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r,
++    struct list_head *h, struct nfsd4_layout_seg *seg);
++static inline void bll_collapse(bl_layout_rec_t *r,
++    pnfs_blocklayout_layout_t *c);
++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len,
++    enum bl_cache_state state, struct list_head *h);
++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b,
++    enum bl_cache_state c, struct list_head *h);
++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode,
++    enum pnfs_block_extent_state4 *s);
++static void extents_setup(struct fiemap_extent_info *fei);
++static void extents_count(struct fiemap_extent_info *fei, struct inode *i,
++    u64 foff, u64 len);
++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i,
++    u64 foff, u64 len);
++static boolean_t extents_process(struct fiemap_extent_info *fei,
++    struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev,
++    pnfs_blocklayout_layout_t *b);
++static void extents_cleanup(struct fiemap_extent_info *fei);
 +
-+static int decode_layoutcommit(struct xdr_stream *xdr,
-+				    struct rpc_rqst *req,
-+				    struct nfs4_layoutcommit_res *res)
++void
++nfsd_bl_init(void)
 +{
-+	__be32 *p;
-+	int status;
-+
-+	status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
-+	if (status)
-+		return status;
-+
-+	p = xdr_inline_decode(xdr, 4);
-+	if (unlikely(!p))
-+		goto out_overflow;
-+	res->sizechanged = be32_to_cpup(p);
++	int	i;
++	dprintk("%s loaded\n", __func__);
 +
-+	if (res->sizechanged) {
-+		p = xdr_inline_decode(xdr, 8);
-+		if (unlikely(!p))
-+			goto out_overflow;
-+		xdr_decode_hyper(p, &res->newsize);
-+	}
-+	return 0;
-+out_overflow:
-+	print_overflow_msg(__func__, xdr);
-+	return -EIO;
++	spin_lock_init(&layout_hashtbl_lock);	/* set up the layout hash lock and list heads before anything uses them */
++	INIT_LIST_HEAD(&layout_hash);
++	for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++)
++		INIT_LIST_HEAD(&layout_hashtbl[i]);
++	bl_init_proc();	/* NOTE(review): the int result (0 or -ENOMEM) is discarded here */
 +}
-+#endif /* CONFIG_NFS_V4_1 */
-+
-+/*
-+ * END OF "GENERIC" DECODE ROUTINES.
-+ */
 +
 +/*
-+ * Decode OPEN_DOWNGRADE response
++ * pnfs_block_enabled -- check to see if this file system should be exported as
++ * block pnfs; returns 1 when enabled, 0 otherwise
 + */
-+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
++int
++pnfs_block_enabled(struct inode *inode, int ex_flags)
 +{
-+	struct xdr_stream xdr;
- 	struct compound_hdr hdr;
- 	int status;
- 
-@@ -5758,6 +6488,186 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
- 		status = decode_reclaim_complete(&xdr, (void *)NULL);
- 	return status;
- }
++	bl_comm_msg_t	msg;
++	bl_comm_res_t	*res	= NULL;
++	static int bl_comm_once	= 0;
++	/* bl_comm_once latches the first successful version handshake so later calls skip the upcall. */
++	dprintk("--> %s\n", __func__);
++	/*
++	 * FIXME: Figure out method to determine if this file system should
++	 * be exported. The following areas need to be checked.
++	 * (1) Validate that this file system was exported as a pNFS
++	 *     block-layout
++	 * (2) Has there been successful communication with the
++	 *     volume daemon?
++	 */
++	/* Check #1 (compiled out until NFSEXP_PNFS_BLOCK is available) */
++#ifdef notyet
++	if (!(ex_flags & NFSEXP_PNFS_BLOCK)) {
++		dprintk("%s: pnfs_block not set in export\n", __func__);
++		return 0;
++	}
++#endif
++	
++	/* Check #2: version handshake with the daemon, performed until it succeeds once */
++	if (!bl_comm_once) {
++		msg.msg_type = PNFS_UPCALL_MSG_VERS;
++		msg.u.msg_vers = PNFS_UPCALL_VERS;
++		if (bl_upcall(bl_comm_global, &msg, &res)) {
++			dprintk("%s: Failed to contact pNFS block daemon\n",
++				__func__);
++			return 0;
++		}
++		if (msg.u.msg_vers != res->u.vers) {
++			dprintk("%s: vers mismatch, kernel != daemon\n",
++				__func__);
++			kfree(res);
++			return 0;
++		}
++	}
++	bl_comm_once = 1;
 +
-+/*
-+ * Decode GETDEVICELIST response
-+ */
-+static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p,
-+				      struct nfs4_getdevicelist_res *res)
-+{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr;
-+	int status;
++	kfree(res);	/* res is still NULL when the handshake was skipped; kfree(NULL) is a no-op */
++	
++	dprintk("<-- %s okay\n", __func__);
++	return 1;
++}
 +
-+	dprintk("encoding getdevicelist!\n");
++int
++bl_layout_type(struct super_block *sb)
++{
++	return LAYOUT_BLOCK_VOLUME;	/* sb is not consulted; the result is always LAYOUT_BLOCK_VOLUME */
++}
 +
-+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+	status = decode_compound_hdr(&xdr, &hdr);
-+	if (status != 0)
-+		goto out;
-+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+	if (status != 0)
-+		goto out;
-+	status = decode_putfh(&xdr);
-+	if (status != 0)
-+		goto out;
-+	status = decode_getdevicelist(&xdr, res->devlist);
-+out:
-+	return status;
++int
++bl_getdeviceiter(struct super_block *sb,
++		 u32 layout_type,	/* not examined by this function */
++		 struct nfsd4_pnfs_dev_iter_res *res)
++{
++	res->gd_eof = 1;	/* set on every call, including the -ENOENT path below */
++	if (res->gd_cookie)	/* a non-zero cookie is treated as a request past the only entry */
++		return -ENOENT;
++	res->gd_devid	= sb->s_dev;	/* the single reported device id is the superblock's dev_t */
++	res->gd_verf	= 1;
++	res->gd_cookie	= 1;	/* a follow-up call carrying this cookie takes the -ENOENT path */
++	return 0;
++}
 +
-+/*
-+ * Decode GETDEVINFO response
-+ */
-+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
-+				      struct nfs4_getdeviceinfo_res *res)
++static int
++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr,
++		       const struct nfsd4_pnfs_deviceid *devid)
 +{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr;
-+	int status;
++	pnfs_blocklayout_devinfo_t	*bld_slice_p,
++					*bld_simple_p,
++					*bld;
++	int				status		= -EIO,
++					location	= 0;
++	struct list_head		volumes;
++	/* Build a SIMPLE then a SLICE volume entry for devid, encode the list into xdr, then free it. */
++	dprintk("--> %s\n", __func__);
++	INIT_LIST_HEAD(&volumes);
 +
-+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+	status = decode_compound_hdr(&xdr, &hdr);
-+	if (status != 0)
++	bld_simple_p = bld_simple(&volumes, devid->devid,
++				  location++);
++	if (!bld_simple_p)
 +		goto out;
-+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+	if (status != 0)
++	bld_slice_p = bld_slice(&volumes, devid->devid, location++,
++	    bld_simple_p->bld_index_loc);
++
++	if (!bld_slice_p)
 +		goto out;
-+	status = decode_getdeviceinfo(&xdr, res->pdev);
++	/* Both helpers succeeded; status now comes from the encoder instead of the -EIO default. */
++	status = blocklayout_encode_devinfo(xdr, &volumes);
++
 +out:
++	while (!list_empty(&volumes)) {	/* NOTE(review): terminates only if bld_free() unlinks bld from volumes -- confirm */
++		bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
++		    bld_list);
++		if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE)
++			kfree(bld->u.simple.bld_sig);	/* bld_sig is freed separately, before bld_free() */
++		bld_free(bld);
++	}
++	
++	dprintk("<-- %s (rval %d)\n", __func__, status);
 +	return status;
 +}
 +
-+/*
-+ * Decode LAYOUTGET response
-+ */
-+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
-+				  struct nfs4_layoutget_res *res)
++static int
++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr,
++		    const struct nfsd4_pnfs_deviceid *devid)
 +{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr;
-+	int status;
-+
-+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+	status = decode_compound_hdr(&xdr, &hdr);
-+	if (status)
++	pnfs_blocklayout_devinfo_t	*bld		= NULL;
++	int				status		= -EIO,	// default to error
++					i,
++					location	= 0;
++	struct list_head		volumes;
++	bl_comm_msg_t			msg;
++	bl_comm_res_t			*res;
++	
++	dprintk("--> %s\n", __func__);
++	INIT_LIST_HEAD(&volumes);
++	
++	msg.msg_type = PNFS_UPCALL_MSG_DMGET;
++	msg.u.msg_dev = devid->devid;
++	if (bl_upcall(bl_comm_global, &msg, &res)) {
++		dprintk("%s: upcall for DMGET failed\n", __func__);
 +		goto out;
-+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+	if (status)
++	}
++		
++	/*
++	 * Don't use bld_alloc() here. If used this will be the first volume
++	 * type added to the list whereas the protocol requires it to be the
++	 * last.
++	 */
++	bld = kmalloc(sizeof (*bld), GFP_KERNEL);
++	if (!bld)
 +		goto out;
-+	status = decode_putfh(&xdr);
-+	if (status)
++	memset(bld, 0, sizeof (*bld));
++	bld->bld_type			= PNFS_BLOCK_VOLUME_STRIPE;
++	bld->u.stripe.bld_stripes	= res->u.stripe.num_stripes;
++	bld->u.stripe.bld_chunk_size	= res->u.stripe.stripe_size * 512LL;
++	dprintk("%s: stripes %d, chunk_size %Lu\n", __func__,
++	    bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL);
++	
++	bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes *
++						  sizeof (int), GFP_KERNEL);
++	if (!bld->u.stripe.bld_stripe_indexs)
 +		goto out;
-+	status = decode_layoutget(&xdr, rqstp, res);
++
++	for (i = 0; i < bld->u.stripe.bld_stripes; i++) {
++		dev_t			dev;
++		pnfs_blocklayout_devinfo_t	*bldp;
++		
++		dev = MKDEV(res->u.stripe.devs[i].major,
++			    res->u.stripe.devs[i].minor);
++		if (dev == 0)
++			goto out;
++		
++		bldp = bld_simple(&volumes, dev, location++);
++		if (!bldp) {
++			dprintk("%s: bld_simple failed\n", __func__);
++			goto out;
++		}
++		bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc);
++
++		if (!bldp) {
++			dprintk("%s: bld_slice failed\n", __func__);
++			goto out;
++		}
++		bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc;
++
++	}
++	list_add_tail(&bld->bld_list, &volumes);
++	status = blocklayout_encode_devinfo(xdr, &volumes);
++	
 +out:
++	while (!list_empty(&volumes)) {
++		bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
++		    bld_list);
++		switch (bld->bld_type) {
++			case PNFS_BLOCK_VOLUME_SLICE:
++			case PNFS_BLOCK_VOLUME_CONCAT:
++				// No memory to release for these
++				break;
++			case PNFS_BLOCK_VOLUME_SIMPLE:
++				kfree(bld->u.simple.bld_sig);
++				break;
++			case PNFS_BLOCK_VOLUME_STRIPE:
++				kfree(bld->u.stripe.bld_stripe_indexs);
++				break;
++		}
++		bld_free(bld);
++	}
++	kfree(res);
++	dprintk("<-- %s (rval %d)\n", __func__, status);
 +	return status;
 +}
 +
 +/*
-+ * Decode LAYOUTRETURN response
++ * bl_getdeviceinfo -- determine device tree for requested devid
 + */
-+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p,
-+				     struct nfs4_layoutreturn_res *res)
++int
++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
++		 u32 layout_type,
++		 const struct nfsd4_pnfs_deviceid *devid)
 +{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr;
-+	int status;
++	if (device_slice(devid->devid) == True)
++		return bl_getdeviceinfo_slice(sb, xdr, devid);
++	else if (device_dm(devid->devid) == True)
++		return bl_getdeviceinfo_dm(sb, xdr, devid);
++	return -EINVAL;
++}
 +
-+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+	status = decode_compound_hdr(&xdr, &hdr);
-+	if (status)
-+		goto out;
-+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+	if (status)
-+		goto out;
-+	status = decode_putfh(&xdr);
-+	if (status)
-+		goto out;
-+	status = decode_layoutreturn(&xdr, res);
-+out:
-+	return status;
++enum nfsstat4
++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr,
++	     const struct nfsd4_pnfs_layoutget_arg *arg,
++	     struct nfsd4_pnfs_layoutget_res *res)
++{
++	pnfs_blocklayout_layout_t	*b;
++	bl_layout_rec_t			*r;
++	struct list_head		bl_possible,
++					*bl_candidates	= NULL;
++	boolean_t			del_on_error	= False;
++	int				adj;
++	enum nfsstat4			nfserr		= NFS4_OK;
++	
++	dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n",
++	    __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset),
++	    _2SECTS(res->lg_seg.length), res->lg_seg.iomode);
++
++	if (res->lg_seg.length == 0) {
++		printk("%s: request length of 0, error condition\n", __func__);
++		return NFS4ERR_BADLAYOUT;
++	}
++	
++	/*
++	 * Adjust the length as required per spec.
++	 * - First case is where the length is set to (u64)-1. Cheap means to
++	 *   define the end of the file.
++	 * - Second case is where the I/O mode is read-only, but the request is
++	 *   past the end of the file so the request needs to be trimmed.
++	 */
++	if ((res->lg_seg.length == NFS4_MAX_UINT64) ||
++	    (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) &&
++	     (res->lg_seg.iomode == IOMODE_READ)))
++		res->lg_seg.length = i->i_size - res->lg_seg.offset;
++	
++	adj = (res->lg_seg.offset & 511) ? res->lg_seg.offset & 511 : 0;
++	res->lg_seg.offset -= adj;
++	res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511;
++	
++	if (res->lg_seg.iomode != IOMODE_READ)
++		if (i->i_fop->fallocate(i, FALLOC_FL_KEEP_SIZE,
++					res->lg_seg.offset, res->lg_seg.length))
++			return NFS4ERR_IO;
++		
++	INIT_LIST_HEAD(&bl_possible);
++	
++	if ((r = layout_inode_find(i)) == NULL) {
++		if (layout_inode_add(i, &r) == False) {
++			printk("%s: layout_inode_add failed\n", __func__);
++			return NFS4ERR_IO;
++		}
++		del_on_error = True;
++	}
++	BUG_ON(!r);
++	
++	spin_lock(&r->blr_lock);
++	
++	if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) {
++		/*
++		 * This will send NFS4ERR_LAYOUTTRYLATER error to the client.
++		 */
++		dprintk("%s: layout_cache_fill_from() failed\n", __func__);
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	
++	res->lg_return_on_close	= 1;
++	res->lg_seg.length	= 0;
++	
++	bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg);
++	if (!bl_candidates) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	
++	layout_cache_merge(r, bl_candidates);
++	if (layout_cache_update(r, bl_candidates)) {
++		/* ---- Failed to allocate memory. ---- */
++		dprintk("%s: layout_cache_update() failed\n", __func__);
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	
++	nfserr = blocklayout_encode_layout(xdr, bl_candidates);
++	if (nfserr)
++		dprintk("%s: layoutget xdr routine failed\n", __func__);
++	
++layoutget_cleanup:
++	if (bl_candidates) {
++		while (!list_empty(bl_candidates)) {
++			b = list_entry(bl_candidates->next,
++			    struct pnfs_blocklayout_layout, bll_list);
++			list_del(&b->bll_list);
++			kfree(b);
++		}
++	}
++
++	spin_unlock(&r->blr_lock);
++	if (unlikely(nfserr)) {
++		if (del_on_error == True)
++			layout_inode_del(i);
++		res->lg_seg.length = 0;
++		res->lg_seg.offset = 0;
++	}
++	
++	dprintk("<-- %s (rval %u)\n", __func__, nfserr);
++	return nfserr;
 +}
 +
 +/*
-+ * Decode LAYOUTCOMMIT response
++ * bl_layoutcommit -- commit changes, especially size, to file system
++ *
++ * Currently this routine isn't called and everything is handled within
++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't
++ * handle a partial return, a set of extents, of the layout. The extents
++ * are decoded here, but nothing is done with them. If this routine is
++ * to be called the interface must change to pass the 'dentry' pointer such
++ * that notify_change() can be called.
 + */
-+static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p,
-+				     struct nfs4_layoutcommit_res *res)
++int
++bl_layoutcommit(struct inode *i,
++		const struct nfsd4_pnfs_layoutcommit_arg *args,
++		struct nfsd4_pnfs_layoutcommit_res *res)
 +{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr;
-+	int status;
++	bl_layout_rec_t			*r;
++	int				status	= 0;
++	u64				lw_plus;
++	
++	dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
++	r = layout_inode_find(i);
++	if (r) {
++		lw_plus = args->lc_last_wr + 1;
++		if (args->lc_newoffset) {
++			dprintk("  lc_last_wr %Lu\n", lw_plus);
++			if (r->blr_orig_size < lw_plus) {
++				r->blr_orig_size	= lw_plus;
++				res->lc_size_chg	= 1;
++				res->lc_newsize		= lw_plus;
++			}
++		}
 +
-+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+	status = decode_compound_hdr(&xdr, &hdr);
-+	if (status)
-+		goto out;
-+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+	if (status)
-+		goto out;
-+	status = decode_putfh(&xdr);
-+	if (status)
-+		goto out;
-+	status = decode_layoutcommit(&xdr, rqstp, res);
-+	if (status)
-+		goto out;
-+	decode_getfattr(&xdr, res->fattr, res->server,
-+			!RPC_IS_ASYNC(rqstp->rq_task));
-+out:
++		if (args->lc_up_len) {
++			int	extents,
++				i;
++			struct pnfs_blocklayout_layout *b;
++			__be32 *p = args->lc_up_layout;
++			
++			/*
++			 * Client is returning a set of extents which
++			 * should/could be used to update the file system.
++			 * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08
++			 */
++			READ32(extents);
++			dprintk("  Client returning %d extents: data size %d\n",
++			    extents, args->lc_up_len);
++			b = kmalloc(sizeof (struct pnfs_blocklayout_layout) *
++				    extents, GFP_KERNEL);
++			if (b) {
++				for (i = 0; i < extents; i++) {
++					READ64(b[i].bll_vol_id.sbid);
++					READ64(b[i].bll_vol_id.devid);
++					READ64(b[i].bll_foff);
++					READ64(b[i].bll_len);
++					READ64(b[i].bll_soff);
++					READ32(b[i].bll_es);
++					dprintk("  %d: foff %Lu, len %Lu, soff %Lu "
++					    "state %s\n",
++					    i, _2SECTS(b[i].bll_foff),
++					    _2SECTS(b[i].bll_len),
++					    _2SECTS(b[i].bll_soff),
++					    map_state2name(b[i].bll_es));
++				}
++				kfree(b);
++			} else {
++				status = -ENOMEM;
++			}
++		}
++	} else
++		dprintk("%s: Unexpected commit to inode %p\n", __func__, i);
++	
++	dprintk("<-- %s (rval %d)\n", __func__, status);
 +	return status;
 +}
 +
-+/*
-+ * Decode pNFS File Layout Data Server WRITE response
-+ */
-+static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p,
-+				struct nfs_writeres *res)
++int
++bl_layoutreturn(struct inode *i,
++		const struct nfsd4_pnfs_layoutreturn_arg *args)
 +{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr;
-+	int status;
-+
-+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+	status = decode_compound_hdr(&xdr, &hdr);
-+	if (status)
-+		goto out;
-+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+	if (status)
-+		goto out;
-+	status = decode_putfh(&xdr);
-+	if (status)
-+		goto out;
-+	status = decode_write(&xdr, res);
-+	if (!status)
-+		return res->count;
-+out:
-+	return status;
-+}
++	int				status	= 0;
++	bl_layout_rec_t			*r;
 +
-+/*
-+ * Decode pNFS File Layout Data Server COMMIT response
-+ */
-+static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p,
-+				 struct nfs_writeres *res)
-+{
-+	struct xdr_stream xdr;
-+	struct compound_hdr hdr;
-+	int status;
++	dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
++	
++	r = layout_inode_find(i);
++	if (r) {
++		spin_lock(&r->blr_lock);
++		layout_cache_del(r, &args->lr_seg);
++		spin_unlock(&r->blr_lock);
++		dprintk("    ext_size %Lu, i_size %Lu, orig_size %Lu\n",
++		    r->blr_ext_size, i->i_size, r->blr_orig_size);
++	}
 +
-+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+	status = decode_compound_hdr(&xdr, &hdr);
-+	if (status)
-+		goto out;
-+	status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+	if (status)
-+		goto out;
-+	status = decode_putfh(&xdr);
-+	if (status)
-+		goto out;
-+	status = decode_commit(&xdr, res);
-+out:
++	layout_inode_del(i);
++	dprintk("<-- %s (rval %d)\n", __func__, status);
 +	return status;
 +}
- #endif /* CONFIG_NFS_V4_1 */
- 
- __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
-@@ -5936,6 +6846,13 @@ struct rpc_procinfo	nfs4_procedures[] = {
-   PROC(SEQUENCE,	enc_sequence,	dec_sequence),
-   PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
-   PROC(RECLAIM_COMPLETE, enc_reclaim_complete,  dec_reclaim_complete),
-+  PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
-+  PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
-+  PROC(LAYOUTGET,  enc_layoutget,     dec_layoutget),
-+  PROC(LAYOUTCOMMIT, enc_layoutcommit,  dec_layoutcommit),
-+  PROC(LAYOUTRETURN, enc_layoutreturn,  dec_layoutreturn),
-+  PROC(PNFS_WRITE, enc_dswrite,  dec_dswrite),
-+  PROC(PNFS_COMMIT, enc_dscommit,  dec_dscommit),
- #endif /* CONFIG_NFS_V4_1 */
- };
- 
-diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
-new file mode 100644
-index 0000000..9addfe8
---- /dev/null
-+++ b/fs/nfs/objlayout/Kbuild
-@@ -0,0 +1,11 @@
-+#
-+# Makefile for the pNFS Objects Layout Driver kernel module
-+#
-+objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o
-+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
-+
-+#
-+# Panasas pNFS Layout Driver kernel module
-+#
-+panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o
-+obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o
-diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
-new file mode 100644
-index 0000000..e945ace
---- /dev/null
-+++ b/fs/nfs/objlayout/objio_osd.c
-@@ -0,0 +1,1060 @@
-+/*
-+ *  objio_osd.c
-+ *
-+ *  pNFS Objects layout implementation over open-osd initiator library
-+ *
-+ *  Copyright (C) 2009 Panasas Inc.
-+ *  All rights reserved.
-+ *
-+ *  Benny Halevy <bharrosh at panasas.com>
-+ *  Boaz Harrosh <bharrosh at panasas.com>
-+ *
-+ *  This program is free software; you can redistribute it and/or modify
-+ *  it under the terms of the GNU General Public License version 2
-+ *  See the file COPYING included with this distribution for more details.
-+ *
-+ *  Redistribution and use in source and binary forms, with or without
-+ *  modification, are permitted provided that the following conditions
-+ *  are met:
-+ *
-+ *  1. Redistributions of source code must retain the above copyright
-+ *     notice, this list of conditions and the following disclaimer.
-+ *  2. Redistributions in binary form must reproduce the above copyright
-+ *     notice, this list of conditions and the following disclaimer in the
-+ *     documentation and/or other materials provided with the distribution.
-+ *  3. Neither the name of the Panasas company nor the names of its
-+ *     contributors may be used to endorse or promote products derived
-+ *     from this software without specific prior written permission.
-+ *
-+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ */
-+
-+#include <linux/module.h>
-+#include <scsi/scsi_device.h>
-+#include <scsi/osd_attributes.h>
-+#include <scsi/osd_initiator.h>
-+#include <scsi/osd_sec.h>
-+#include <scsi/osd_sense.h>
-+
-+#include "objlayout.h"
-+
-+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
-+
-+#define _LLU(x) ((unsigned long long)x)
-+
-+enum { BIO_MAX_PAGES_KMALLOC =
-+		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
-+};
-+
-+/* A per mountpoint struct currently for device cache */
-+struct objio_mount_type {
-+	struct list_head dev_list;
-+	spinlock_t dev_list_lock;
-+};
-+
-+struct _dev_ent {
-+	struct list_head list;
-+	struct nfs4_deviceid d_id;
-+	struct osd_dev *od;
-+};
 +
-+static void _dev_list_remove_all(struct objio_mount_type *omt)
++int
++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
 +{
-+	spin_lock(&omt->dev_list_lock);
++	struct super_block		*sb;
++	struct nfsd4_pnfs_cb_layout	lr;
++	bl_layout_rec_t			*r;
++	pnfs_blocklayout_layout_t	*b;
++	u64				adj;
++	
++	dprintk("--> %s\n", __func__);
++	BUG_ON(!len);
++	switch (type) {
++		case RETURN_FILE:
++			sb = inode->i_sb;
++			dprintk("  recalling layout [0x%x:%lu], %Lu:%Lu\n",
++			    inode->i_sb->s_dev, inode->i_ino,
++				_2SECTS(offset), _2SECTS(len));
++			break;
++		case RETURN_FSID:
++			sb = inode->i_sb;
++			dprintk("%s: recalling layout for fsid x (unimplemented)\n",
++				__func__);
++			return 0;
++		case RETURN_ALL:
++			/*
++			 * XXX figure out how to get a sb since there's no
++			 * inode ptr
++			 */
++			dprintk("%s: recalling all layouts (unimplemented)\n",
++				__func__);
++			return 0;
++		default:
++			return -EINVAL;
++	}
++	
++restart:
++	r = layout_inode_find(inode);
++	if (r && len && !r->blr_recalled) {
++		spin_lock(&r->blr_lock);
++		list_for_each_entry(b, &r->blr_layouts, bll_list) {
++			if (!r->blr_recalled && !b->bll_recalled &&
++			    (offset >= b->bll_foff) && (offset < BLL_F_END(b))) {
++				b->bll_recalled		= 1;
++				lr.cbl_recall_type	= type;
++				lr.cbl_seg.layout_type	= LAYOUT_BLOCK_VOLUME;
++				lr.cbl_seg.clientid	= 0;
++				lr.cbl_seg.offset	= 0;
++				lr.cbl_seg.length	= NFS4_MAX_UINT64;
++				r->blr_recalled		= 1;
++				dprintk("  FULL LAYOUTRECALL\n");
++				lr.cbl_seg.iomode = IOMODE_ANY;
 +
-+	while (!list_empty(&omt->dev_list)) {
-+		struct _dev_ent *de = list_entry(omt->dev_list.next,
-+				 struct _dev_ent, list);
++				/*
++				 * Currently there are only two cases where the
++				 * layout is being returned.
++				 *    (1) Someone is issuing a NFS_WRITE operation
++				 *        to this layout.
++				 *    (2) The file has been truncated which means
++				 *        the layout is immediately made invalid.
++				 * In both cases the client must write any
++				 * uncommitted modifications to the server via
++				 * NFS_WRITE.
++				 */
++				lr.cbl_layoutchanged = 1;
 +
-+		list_del_init(&de->list);
-+		osduld_put_device(de->od);
-+		kfree(de);
++				/*
++				 * Need to drop the lock because we'll get a
++				 * layoutreturn which will block waiting for
++				 * the lock. The request will come in on the
++				 * same thread which will cause a deadlock.
++				 */
++				spin_unlock(&r->blr_lock);
++				nfsd_layout_recall_cb(sb, inode, &lr);
++				adj = MIN(b->bll_len - (offset - b->bll_foff),
++				    len);
++				offset += adj;
++				len -= adj;
++				if (!len) {
++					spin_lock(&r->blr_lock);
++					break;
++				}
++				/*
++				 * Since layoutreturn will have been called we
++				 * can't assume blr_layouts is still valid,
++				 * so restart.
++				 */
++				goto restart;
++			}
++		}
++		spin_unlock(&r->blr_lock);
 +	}
-+
-+	spin_unlock(&omt->dev_list_lock);
++	
++	dprintk("<-- %s\n", __func__);
++	return 0;
 +}
 +
-+static struct osd_dev *___dev_list_find(struct objio_mount_type *omt,
-+	struct nfs4_deviceid *d_id)
-+{
-+	struct list_head *le;
-+
-+	list_for_each(le, &omt->dev_list) {
-+		struct _dev_ent *de = list_entry(le, struct _dev_ent, list);
++/*
++ * []------------------------------------------------------------------[]
++ * | Support functions from here on down.				|
++ * []------------------------------------------------------------------[]
++ */
 +
-+		if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id)))
-+			return de->od;
++/*
++ * bld_simple -- given a dev_t build a simple volume structure
++ *
++ * Simple volume contains the device signature and offset to that data in
++ * the storage volume.
++ */
++static pnfs_blocklayout_devinfo_t *
++bld_simple(struct list_head *volumes, dev_t devid, int local_index)
++{
++	pnfs_blocklayout_devinfo_t	*bld	= NULL;
++	bl_comm_msg_t			msg;
++	bl_comm_res_t			*res	= NULL;
++	
++	msg.msg_type = PNFS_UPCALL_MSG_GETSIG;
++	msg.u.msg_dev = devid;
++	if (bl_upcall(bl_comm_global, &msg, &res)) {
++		dprintk("%s: Failed to get signature information\n", __func__);
++		goto error;
 +	}
-+
++	
++	bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE);
++	if (!bld)
++		return NULL;
++	
++	bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset;
++	bld->u.simple.bld_sig_len = res->u.sig.len;
++	bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL);
++	if (!bld->u.simple.bld_sig)
++		goto error;
++	
++	memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len);
++	kfree(res);
++	return bld;
++	
++error:
++	if (bld)
++		bld_free(bld);
++	if (res)
++		kfree(res);
++	dprintk("%s: error in bld_simple\n", __func__);
 +	return NULL;
 +}
 +
-+static struct osd_dev *_dev_list_find(struct objio_mount_type *omt,
-+	struct nfs4_deviceid *d_id)
-+{
-+	struct osd_dev *od;
-+
-+	spin_lock(&omt->dev_list_lock);
-+	od = ___dev_list_find(omt, d_id);
-+	spin_unlock(&omt->dev_list_lock);
-+	return od;
-+}
-+
-+static int _dev_list_add(struct objio_mount_type *omt,
-+	struct nfs4_deviceid *d_id, struct osd_dev *od)
++/*
++ * bld_slice -- given a dev_t build a slice volume structure
++ *
++ * A slice volume contains the length of the slice/partition and its offset
++ * from the beginning of the storage volume. There's also a reference to
++ * the "simple" volume which contains this slice.
++ */
++static pnfs_blocklayout_devinfo_t *
++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc)
 +{
-+	struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL);
-+
-+	if (!de)
-+		return -ENOMEM;
-+
-+	spin_lock(&omt->dev_list_lock);
-+
-+	if (___dev_list_find(omt, d_id)) {
-+		kfree(de);
-+		goto out;
++	pnfs_blocklayout_devinfo_t	*bld;
++	bl_comm_msg_t			msg;
++	bl_comm_res_t			*res;
++	
++	dprintk("--> %s\n", __func__);
++	bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE);
++	if (!bld)
++		return NULL;
++	
++	msg.msg_type	= PNFS_UPCALL_MSG_GETSLICE;
++	msg.u.msg_dev	= devid;
++	if (bl_upcall(bl_comm_global, &msg, &res)) {
++		dprintk("Upcall to get slice info failed\n");
++		bld_free(bld);
++		return NULL;
 +	}
++	
++	bld->bld_devid.devid = devid;
++	bld->bld_index_loc	= my_loc;
++	bld->u.slice.bld_start	= res->u.slice.start * 512LL;
++	bld->u.slice.bld_len	= res->u.slice.length * 512LL;
++	bld->u.slice.bld_index	= simple_loc;
 +
-+	de->d_id = *d_id;
-+	de->od = od;
-+	list_add(&de->list, &omt->dev_list);
++	dprintk("%s: start %Lu, len %Lu\n", __func__,
++		bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL);
 +
-+out:
-+	spin_unlock(&omt->dev_list_lock);
-+	return 0;
++	kfree(res);
++	dprintk("<-- %s (rval %p)\n", __func__, bld);
++	return bld;
 +}
 +
-+struct objio_segment {
-+	struct pnfs_osd_layout *layout;
-+
-+	unsigned mirrors_p1;
-+	unsigned stripe_unit;
-+	unsigned group_width;	/* Data stripe_units without integrity comps */
-+	u64 group_depth;
-+	unsigned group_count;
-+
-+	unsigned num_comps;
-+	/* variable length */
-+	struct osd_dev	*ods[1];
-+};
-+
-+struct objio_state;
-+typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
-+
-+struct objio_state {
-+	/* Generic layer */
-+	struct objlayout_io_state ol_state;
-+
-+	struct objio_segment *objio_seg;
-+
-+	struct kref kref;
-+	objio_done_fn done;
-+	void *private;
-+
-+	unsigned long length;
-+	unsigned numdevs; /* Actually used devs in this IO */
-+	/* A per-device variable array of size numdevs */
-+	struct _objio_per_comp {
-+		struct bio *bio;
-+		struct osd_request *or;
-+		unsigned long length;
-+		u64 offset;
-+		unsigned dev;
-+	} per_dev[];
-+};
-+
-+/* Send and wait for a get_device_info of devices in the layout,
-+   then look them up with the osd_initiator library */
-+static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay,
-+			       struct objio_segment *objio_seg, unsigned comp)
++static int
++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h,
++    struct nfsd4_layout_seg *seg)
 +{
-+	struct pnfs_osd_layout *layout = objio_seg->layout;
-+	struct pnfs_osd_deviceaddr *deviceaddr;
-+	struct nfs4_deviceid *d_id;
-+	struct osd_dev *od;
-+	struct osd_dev_info odi;
-+	struct objio_mount_type *omt = NFS_SERVER(pnfslay->inode)->pnfs_ld_data;
-+	int err;
-+
-+	d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id;
-+
-+	od = _dev_list_find(omt, d_id);
-+	if (od)
-+		return od;
-+
-+	err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr);
-+	if (unlikely(err)) {
-+		dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err);
-+		return ERR_PTR(err);
++	pnfs_blocklayout_layout_t	*n;
++	
++	dprintk("--> %s\n", __func__);
++	
++	if (!list_empty(&r->blr_layouts))
++		if (layout_cache_fill_from_list(r, h, seg) == False)
++			return -EIO;
++	
++	/*
++	 * This deals with two conditions.
++	 *    (1) When blr_layouts is empty we need to create the first entry
++	 *    (2) When the range requested falls past the end of any current
++	 *        layout the residual must be taken care of.
++	 */	
++	if (seg->length) {
++		n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h);
++		if (!n)
++			return -ENOMEM;
++		dprintk("  remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff),
++			_2SECTS(n->bll_len));
 +	}
++	
++	dprintk("<-- %s\n", __func__);
++	return 0;
++}
 +
-+	odi.systemid_len = deviceaddr->oda_systemid.len;
-+	if (odi.systemid_len > sizeof(odi.systemid)) {
-+		err = -EINVAL;
-+		goto out;
-+	} else if (odi.systemid_len)
-+		memcpy(odi.systemid, deviceaddr->oda_systemid.data,
-+		       odi.systemid_len);
-+	odi.osdname_len	 = deviceaddr->oda_osdname.len;
-+	odi.osdname	 = (u8 *)deviceaddr->oda_osdname.data;
++struct list_head *
++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible,
++    struct nfsd4_layout_seg *seg)
++{
++	pnfs_blocklayout_layout_t	*b,
++					*n		= NULL;
++	struct list_head		*bl_candidates	= NULL;
++	struct fiemap_extent_info	fei;
++	struct inode			*i;
++	dev_t				dev;
++	
++	dev	= r->blr_rdev;
++	i	= r->blr_inode;
++	
++	dprintk("--> %s\n", __func__);
++	bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL);
++	if (!bl_candidates)
++		return NULL;
++	INIT_LIST_HEAD(bl_candidates);
++	extents_setup(&fei);
++	
++	list_for_each_entry(b, bl_possible, bll_list) {
++		if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
++			
++			extents_count(&fei, i, b->bll_foff, b->bll_len);
++			if (fei.fi_extents_mapped) {
++				
++				/*
++				 * Common case here. Got a range which has
++				 * extents. Now get those extents and process
++				 * them into pNFS extents.
++				 */
++				if (extents_get(&fei, i, b->bll_foff,
++				    b->bll_len) == False)
++					goto cleanup;
++				if (extents_process(&fei, bl_candidates,
++				    seg, dev, b) == False)
++					goto cleanup;
++				extents_cleanup(&fei);
++				
++			} else if (seg->iomode == IOMODE_READ) {
++				
++				/*
++				 * Found a hole in a file while reading. No 
++				 * problem, just create a pNFS extent for the
++				 * range and let the client know there's no
++				 * backing store.
++				 */
++				n = bll_alloc(b->bll_foff, b->bll_len,
++				    BLOCK_LAYOUT_NEW, bl_candidates);
++				n->bll_es = PNFS_BLOCK_NONE_DATA;
++				n->bll_vol_id.sbid = 0;
++				n->bll_vol_id.devid = dev;
++				seg->length += b->bll_len;
++			} else {
++				
++				/*
++				 * There's a problem here. Since the iomode
++				 * is read/write fallocate should have allocated
++				 * any necessary storage for the given range.
++				 */
++				dprintk("    Extent count for RW is 0\n");
++				goto cleanup;
++			}
++			
++		} else {
++			n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates);
++			seg->length += n->bll_len;
++		}
 +
-+	if (!odi.osdname_len && !odi.systemid_len) {
-+		dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
-+			__func__);
-+		err = -ENODEV;
-+		goto out;
++		if (r->blr_ext_size < (b->bll_foff + b->bll_len))
++			r->blr_ext_size = b->bll_foff + b->bll_len;
 +	}
-+
-+	od = osduld_info_lookup(&odi);
-+	if (unlikely(IS_ERR(od))) {
-+		err = PTR_ERR(od);
-+		dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
-+		goto out;
++	
++	while (!list_empty(bl_possible)) {
++		b = list_entry(bl_possible->next,
++		    struct pnfs_blocklayout_layout, bll_list);
++		list_del(&b->bll_list);
++		kfree(b);
 +	}
-+
-+	_dev_list_add(omt, d_id, od);
-+
-+out:
-+	dprintk("%s: return=%d\n", __func__, err);
-+	objlayout_put_deviceinfo(deviceaddr);
-+	return err ? ERR_PTR(err) : od;
++		
++	b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout,
++	    bll_list);
++	seg->offset = b->bll_foff;
++	dprintk("<-- %s okay\n", __func__);
++	return bl_candidates;
++	
++cleanup:
++	extents_cleanup(&fei);
++	if (bl_candidates)
++		kfree(bl_candidates);
++	dprintk("<-- %s, error occurred\n", __func__);
++	return NULL;
 +}
 +
-+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
-+	struct objio_segment *objio_seg)
++/*
++ * layout_cache_merge -- collapse layouts which make up a contiguous range.
++ */
++static void
++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h)
 +{
-+	struct pnfs_osd_layout *layout = objio_seg->layout;
-+	unsigned i, num_comps = layout->olo_num_comps;
-+	int err;
++	pnfs_blocklayout_layout_t	*b,
++					*p;
++	
++	dprintk("--> %s\n", __func__);
++restart:
++	p = NULL;
++	list_for_each_entry(b, h, bll_list) {
++		if (p && (BLL_S_END(p) == b->bll_soff) &&
++		    (p->bll_es == b->bll_es) &&
++		    (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
++			/*
++			 * We've got a candidate.
++			 */
++#ifdef too_verbose
++			dprintk("  merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n",
++				_2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++				_2SECTS(b->bll_soff),
++				_2SECTS(p->bll_foff), _2SECTS(p->bll_len),
++				_2SECTS(b->bll_soff));
++#endif
++			
++			if (p->bll_cache_state == BLOCK_LAYOUT_CACHE)
++				p->bll_cache_state = BLOCK_LAYOUT_UPDATE;
++			p->bll_len += b->bll_len;
++			list_del(&b->bll_list);
++			kfree(b);
++			goto restart;
++		} else if (p && (BLL_F_END(p) == b->bll_foff) &&
++			   (p->bll_es == b->bll_es) &&
++			   (b->bll_es == PNFS_BLOCK_NONE_DATA)) {
++			p->bll_len += b->bll_len;
++			list_del(&b->bll_list);
++			kfree(b);
++			goto restart;
++		} else
++			p = b;
++	}
++	dprintk("<-- %s\n", __func__);
++}
 +
-+	/* lookup all devices */
-+	for (i = 0; i < num_comps; i++) {
-+		struct osd_dev *od;
++static int
++layout_cache_update(bl_layout_rec_t *r, struct list_head *h)
++{
++	pnfs_blocklayout_layout_t	*b,
++					*c,
++					*n;
++	boolean_t			status = 0;
++	
++	dprintk("--> %s\n", __func__);
++	if (list_empty(&r->blr_layouts)) {
++		/* ---- Just add entries and return ---- */
++		dprintk("  cache empty for inode 0x%x:%ld\n", r->blr_rdev,
++			r->blr_inode->i_ino);
++		list_for_each_entry(b, h, bll_list) {
++			c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE,
++					  &r->blr_layouts);
++			if (!c) {
++				status = -ENOMEM;
++				break;
++			}
++			dprintk("    adding %Lu(f):%Lu(l):%Lu(s):%d\n",
++				_2SECTS(c->bll_foff), _2SECTS(c->bll_len),
++				_2SECTS(c->bll_soff), c->bll_es);
++		}
++		return status;
++	}
++	
++	list_for_each_entry(b, h, bll_list) {
++		BUG_ON(!b->bll_vol_id.devid);
++		if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) {
++			boolean_t found = False;
++			list_for_each_entry(c, &r->blr_layouts, bll_list) {
++				if ((b->bll_soff >= c->bll_soff) &&
++				    (b->bll_soff < BLL_S_END(c)) &&
++				    (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
++					u64	u;
++					
++					if ((b->bll_foff < c->bll_foff) ||
++					    (b->bll_foff > BLL_F_END(c)))
++						BUG();
++					
++					u = BLL_S_END(b) - BLL_S_END(c);
++					/*
++					 * The updated cache entry has to be
++					 * different than the current.
++					 * Otherwise the cache state for 'b'
++					 * should be BLOCK_LAYOUT_CACHE.
++					 */
++					BUG_ON(BLL_S_END(b) < BLL_S_END(c));
++					
++					dprintk("  "
++						"updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n",
++						_2SECTS(c->bll_foff),
++						_2SECTS(c->bll_len),
++						_2SECTS(c->bll_soff),
++						_2SECTS(c->bll_len + u));
++					c->bll_len += u;
++					bll_collapse(r, c);
++					found = True;
++					break;
++				}
++			}
 +
-+		od = _device_lookup(pnfslay, objio_seg, i);
-+		if (unlikely(IS_ERR(od))) {
-+			err = PTR_ERR(od);
-+			goto out;
++			if (found == False) {
++				dprintk("  ERROR Expected to find"
++				    " %Lu(f):%Lu(l):%Lu(s), but didn't\n",
++				    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++				    _2SECTS(b->bll_soff));
++				list_for_each_entry(c, &r->blr_layouts, bll_list)
++					print_bll(c, "Cached");
++				BUG();
++			}
++		} else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
++			
++			c = list_first_entry(&r->blr_layouts,
++			    struct pnfs_blocklayout_layout, bll_list);
++			if (b->bll_foff < c->bll_foff) {
++				/*
++				 * Special case where new entry is before
++				 * first cached entry.
++				 */
++				c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL);
++				list_add(&c->bll_list, &r->blr_layouts);
++				dprintk("  new entry at head of list at %Lu, "
++					"len %Lu\n",
++					_2SECTS(c->bll_foff), _2SECTS(c->bll_len));
++			} else {
++				list_for_each_entry(c, &r->blr_layouts,
++				    bll_list) {
++					n = list_entry(c->bll_list.next,
++					    struct pnfs_blocklayout_layout,
++					    bll_list);
++					/*
++					 * This is ugly, but can't think of
++					 * another way to examine this case.
++					 * Consider the following. Need to
++					 * add an entry which starts at 40
++					 * and the cache has the following
++					 * entries:
++					 * Start    Length
++					 * 10       5
++					 * 30       5
++					 * 50       5
++					 * So, need to look and see if the new
++					 * entry starts after the current
++					 * cache, but before the next one.
++					 * There's a catch in that the next
++					 * entry might not be valid as it's
++					 * really just a pointer to the list
++					 * head.
++					 */
++					if (((b->bll_foff >=
++					      BLL_F_END(c)) &&
++					     (c->bll_list.next == &r->blr_layouts)) ||
++					    ((b->bll_foff >=
++					      BLL_F_END(c)) &&
++					     (b->bll_foff < n->bll_foff))) {
++						
++						n = bll_alloc_dup(b,
++								  BLOCK_LAYOUT_CACHE, NULL);
++						dprintk("  adding new %Lu:%Lu"
++							" after %Lu:%Lu\n",
++							_2SECTS(n->bll_foff),
++							_2SECTS(n->bll_len),
++							_2SECTS(c->bll_foff),
++							_2SECTS(c->bll_len));
++						list_add(&n->bll_list,
++							 &c->bll_list);
++						break;
++					}
++				}
++			}
 +		}
-+		objio_seg->ods[i] = od;
 +	}
-+	objio_seg->num_comps = num_comps;
-+	err = 0;
-+
-+out:
-+	dprintk("%s: return=%d\n", __func__, err);
-+	return err;
++	dprintk("<-- %s\n", __func__);
++	return status;
 +}
 +
-+static int _verify_data_map(struct pnfs_osd_layout *layout)
++static void
++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in)
 +{
-+	struct pnfs_osd_data_map *data_map = &layout->olo_map;
-+	u64 stripe_length;
-+	u32 group_width;
-+
-+/* FIXME: Only raid0 for now. if not go through MDS */
-+	if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
-+		printk(KERN_ERR "Only RAID_0 for now\n");
-+		return -ENOTSUPP;
-+	}
-+	if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
-+		printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
-+			  data_map->odm_num_comps, data_map->odm_mirror_cnt);
-+		return -EINVAL;
-+	}
-+
-+	if (data_map->odm_group_width)
-+		group_width = data_map->odm_group_width;
-+	else
-+		group_width = data_map->odm_num_comps /
-+						(data_map->odm_mirror_cnt + 1);
-+
-+	stripe_length = (u64)data_map->odm_stripe_unit * group_width;
-+	if (stripe_length >= (1ULL << 32)) {
-+		printk(KERN_ERR "Total Stripe length(0x%llx)"
-+			  " >= 32bit is not supported\n", _LLU(stripe_length));
-+		return -ENOTSUPP;
-+	}
-+
-+	if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
-+		printk(KERN_ERR "Stripe Unit(0x%llx)"
-+			  " must be Multples of PAGE_SIZE(0x%lx)\n",
-+			  _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
-+		return -ENOTSUPP;
++	struct pnfs_blocklayout_layout	*b,
++					*n;
++	u64				len;
++	struct nfsd4_layout_seg		seg = *seg_in;
++	
++	dprintk("--> %s\n", __func__);
++	if (seg.length == NFS4_MAX_UINT64) {
++		r->blr_recalled = 0;
++		dprintk("  Fast return of all layouts\n");
++		while (!list_empty(&r->blr_layouts)) {
++			b = list_entry(r->blr_layouts.next,
++				       struct pnfs_blocklayout_layout, bll_list);
++			dprintk("    foff %Lu, len %Lu, soff %Lu\n",
++				_2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++				_2SECTS(b->bll_soff));
++			list_del(&b->bll_list);
++			kfree(b);
++		}
++		dprintk("<-- %s\n", __func__);
++		return;
 +	}
 +
-+	return 0;
-+}
-+
-+int objio_alloc_lseg(void **outp,
-+	struct pnfs_layout_hdr *pnfslay,
-+	struct pnfs_layout_segment *lseg,
-+	struct pnfs_osd_layout *layout)
-+{
-+	struct objio_segment *objio_seg;
-+	int err;
-+
-+	err = _verify_data_map(layout);
-+	if (unlikely(err))
-+		return err;
-+
-+	objio_seg = kzalloc(sizeof(*objio_seg) +
-+			(layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]),
-+			GFP_KERNEL);
-+	if (!objio_seg)
-+		return -ENOMEM;
-+
-+	objio_seg->layout = layout;
-+	err = objio_devices_lookup(pnfslay, objio_seg);
-+	if (err)
-+		goto free_seg;
-+
-+	objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1;
-+	objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit;
-+	if (layout->olo_map.odm_group_width) {
-+		objio_seg->group_width = layout->olo_map.odm_group_width;
-+		objio_seg->group_depth = layout->olo_map.odm_group_depth;
-+		objio_seg->group_count = layout->olo_map.odm_num_comps /
-+						objio_seg->mirrors_p1 /
-+						objio_seg->group_width;
-+	} else {
-+		objio_seg->group_width = layout->olo_map.odm_num_comps /
-+						objio_seg->mirrors_p1;
-+		objio_seg->group_depth = -1;
-+		objio_seg->group_count = 1;
++restart:
++	list_for_each_entry(b, &r->blr_layouts, bll_list) {
++		if (seg.offset == b->bll_foff) {
++			/*
++			 * This handles the following three cases:
++			 * (1) return layout matches entire cache layout
++			 * (2) return layout matches beginning portion of cache
++			 * (3) return layout matches entire cache layout and
++			 *     into next entry. Varies from #1 in end case.
++			 */
++			dprintk("  match on offsets, %Lu:%Lu\n",
++				_2SECTS(seg.offset), _2SECTS(seg.length));
++			len = MIN(seg.length, b->bll_len);
++			b->bll_foff	+= len;
++			b->bll_soff	+= len;
++			b->bll_len	-= len;
++			seg.length	-= len;
++			seg.offset	+= len;
++			if (!b->bll_len) {
++				list_del(&b->bll_list);
++				kfree(b);
++				dprintk("    removing cache line\n");
++				if (!seg.length) {
++					dprintk("    also finished\n");
++					goto complete;
++				}
++				/*
++				 * Since 'b' was freed we can't continue at the
++				 * next entry which is referenced as
++				 * b->bll_list.next by the list_for_each_entry
++				 * macro. Need to restart the loop.
++				 * TODO: Think about creating a dummy 'b' which
++				 *       would keep list_for_each_entry() happy.
++				 */
++				goto restart;
++			}
++			if (!seg.length) {
++				dprintk("    finished, but cache line not"
++					"empty\n");
++				goto complete;
++			}
++		} else if ((seg.offset >= b->bll_foff) &&
++		    (seg.offset < BLL_F_END(b))) {
++			/*
++			 * layout being returned is within this cache line.
++			 */
++			dprintk("  layout %Lu:%Lu within cache line %Lu:%Lu\n",
++				_2SECTS(seg.offset), _2SECTS(seg.length),
++				_2SECTS(b->bll_foff), _2SECTS(b->bll_len));
++			BUG_ON(!seg.length);
++			if ((seg.offset + seg.length) >= BLL_F_END(b)) {
++				/*
++				 * Layout returned starts in the middle of
++				 * cache entry and just need to trim back
++				 * cache to shorter length.
++				 */
++				dprintk("    trim back cache line\n");
++				len = seg.offset - b->bll_foff;
++				seg.offset += b->bll_len - len;
++				seg.length -= b->bll_len - len;
++				b->bll_len = len;
++				if (!seg.length)
++					return;
++			} else {
++				/*
++				 * Need to split current cache layout because
++				 * chunk is being removed from the middle.
++				 */
++				dprintk("    split cache line\n");
++				len = seg.offset + seg.length;
++				n = bll_alloc(len,
++					      (b->bll_foff + b->bll_len) - len,
++					      BLOCK_LAYOUT_CACHE, NULL);
++				n->bll_soff = b->bll_soff + len;
++				list_add(&n->bll_list, &b->bll_list);
++				b->bll_len = seg.offset - b->bll_foff;
++				return;
++			}
++		}
 +	}
-+
-+	*outp = objio_seg;
-+	return 0;
-+
-+free_seg:
-+	dprintk("%s: Error: return %d\n", __func__, err);
-+	kfree(objio_seg);
-+	*outp = NULL;
-+	return err;
++complete:
++	if (list_empty(&r->blr_layouts))
++		r->blr_recalled = 0;
++	dprintk("<-- %s\n", __func__);
 +}
 +
-+void objio_free_lseg(void *p)
++/*
++ * layout_cache_fill_from_list -- fills from cache list
++ *
++ * NOTE: This routine was only separated out from layout_cache_file_from()
++ * to reduce the indentation level which makes the code easier to read.
++ */
++static inline boolean_t
++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h,
++    struct nfsd4_layout_seg *seg)
 +{
-+	struct objio_segment *objio_seg = p;
-+
-+	kfree(objio_seg);
++	pnfs_blocklayout_layout_t	*b,
++					*n;
++	enum pnfs_block_extent_state4	s;
++	
++	list_for_each_entry(b, &r->blr_layouts, bll_list) {
++		if (seg->offset < b->bll_foff) {
++			n = bll_alloc(seg->offset,
++			    MIN(seg->length, b->bll_foff - seg->offset),
++			    BLOCK_LAYOUT_NEW, NULL);
++			if (!n)
++				return False;
++			
++			list_add(&n->bll_list, h->prev);
++			dprintk("  new: %Lu:%Lu, added before %Lu:%Lu\n",
++			    _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
++			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
++			seg->offset += n->bll_len;
++			seg->length -= n->bll_len;
++			if (!seg->length)
++				break;
++		}
++		
++		if ((seg->offset >= b->bll_foff) &&
++		    (seg->offset < BLL_F_END(b))) {
++			if (layout_conflict(b, seg->iomode, &s) == False) {
++				dprintk("  CONFLICT FOUND: "
++				    "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n",
++				    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++				    _2SECTS(b->bll_soff), b->bll_es,
++				    seg->iomode);
++				return False;
++			}
++			n = bll_alloc(seg->offset,
++			    MIN(seg->length, BLL_F_END(b) - seg->offset),
++			    BLOCK_LAYOUT_CACHE, h);
++			dprintk("  CACHE hit: Found %Lu(f):%Lu(l): "
++			    "in %Lu(f):%Lu(l):%Lu(s):%d\n",
++			    _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
++			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++			    _2SECTS(b->bll_soff), b->bll_es);
++			if (!n)
++				return False;
++			
++			n->bll_soff = b->bll_soff + seg->offset - b->bll_foff;
++			n->bll_vol_id.sbid = 0;
++			n->bll_vol_id.devid = b->bll_vol_id.devid;
++			n->bll_es = s;
++			seg->offset += n->bll_len;
++			seg->length -= n->bll_len;
++			if (!seg->length)
++				break;
++		}
++	}
++	return True;
 +}
 +
-+int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
++static u64
++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length,
++    dev_t dev)
 +{
-+	struct objio_segment *objio_seg = seg;
-+	struct objio_state *ios;
-+	const unsigned first_size = sizeof(*ios) +
-+				objio_seg->num_comps * sizeof(ios->per_dev[0]);
-+	const unsigned sec_size = objio_seg->num_comps *
-+						sizeof(ios->ol_state.ioerrs[0]);
-+
-+	dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps);
-+	ios = kzalloc(first_size + sec_size, GFP_KERNEL);
-+	if (unlikely(!ios))
-+		return -ENOMEM;
-+
-+	ios->objio_seg = objio_seg;
-+	ios->ol_state.ioerrs = ((void *)ios) + first_size;
-+	ios->ol_state.num_comps = objio_seg->num_comps;
-+
-+	*outp = &ios->ol_state;
-+	return 0;
++	pnfs_blocklayout_layout_t	*n;
++	
++	n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates);
++	if (!n)
++		return 0;
++	n->bll_es = PNFS_BLOCK_NONE_DATA;
++	n->bll_vol_id.sbid = 0;
++	n->bll_vol_id.devid = dev;
++	
++	return n->bll_len;
 +}
 +
-+void objio_free_io_state(struct objlayout_io_state *ol_state)
++static void
++extents_setup(struct fiemap_extent_info *fei)
 +{
-+	struct objio_state *ios = container_of(ol_state, struct objio_state,
-+					       ol_state);
-+
-+	kfree(ios);
++	fei->fi_extents_start	= NULL;
 +}
 +
-+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
++/*
++ * extents_count -- Determine the number of extents for a given range.
++ *
++ * No need to call set_fs() here because the function
++ * doesn't use copy_to_user() if it's only counting
++ * the number of extents needed.
++ */
++static void
++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
 +{
-+	switch (oep) {
-+	case OSD_ERR_PRI_NO_ERROR:
-+		return (enum pnfs_osd_errno)0;
-+
-+	case OSD_ERR_PRI_CLEAR_PAGES:
-+		BUG_ON(1);
-+		return 0;
-+
-+	case OSD_ERR_PRI_RESOURCE:
-+		return PNFS_OSD_ERR_RESOURCE;
-+	case OSD_ERR_PRI_BAD_CRED:
-+		return PNFS_OSD_ERR_BAD_CRED;
-+	case OSD_ERR_PRI_NO_ACCESS:
-+		return PNFS_OSD_ERR_NO_ACCESS;
-+	case OSD_ERR_PRI_UNREACHABLE:
-+		return PNFS_OSD_ERR_UNREACHABLE;
-+	case OSD_ERR_PRI_NOT_FOUND:
-+		return PNFS_OSD_ERR_NOT_FOUND;
-+	case OSD_ERR_PRI_NO_SPACE:
-+		return PNFS_OSD_ERR_NO_SPACE;
-+	default:
-+		WARN_ON(1);
-+		/* fallthrough */
-+	case OSD_ERR_PRI_EIO:
-+		return PNFS_OSD_ERR_EIO;
-+	}
++	dprintk("    Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len));
++	fei->fi_flags		= FIEMAP_FLAG_SYNC;
++	fei->fi_extents_max	= 0;
++	fei->fi_extents_start	= NULL;
++	fei->fi_extents_mapped	= 0;
++	i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1);
 +}
 +
-+static void _clear_bio(struct bio *bio)
++/*
++ * extents_get -- Get list of extents for range
++ *
++ * extents_count() must have been called before this routine such that
++ * fi_extents_mapped is known.
++ */
++static boolean_t
++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
 +{
-+	struct bio_vec *bv;
-+	unsigned i;
-+
-+	__bio_for_each_segment(bv, bio, i, 0) {
-+		unsigned this_count = bv->bv_len;
++	int			m_space,
++				rval;
++	struct fiemap_extent	*fe;
++	mm_segment_t		old_fs = get_fs();
++	
++	/*
++	 * Now malloc the correct amount of space
++	 * needed. It's possible for the file to have changed
++	 * between calls which would require more space for
++	 * the extents. If that occurs the last extent will
++	 * not have FIEMAP_EXTENT_LAST set and the error will
++	 * be caught in extents_process().
++	 */
++	m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent);
++	fe = kmalloc(m_space, GFP_KERNEL);
++	if (!fe)
++		return False;
++	memset(fe, 0, m_space);
++	
++	fei->fi_extents_max	= fei->fi_extents_mapped;
++	fei->fi_extents_mapped	= 0;
++	fei->fi_extents_start	= fe;
++	
++	set_fs(KERNEL_DS);
++	rval = i->i_op->fiemap(i, fei, foff, len +
++	    (1 << i->i_sb->s_blocksize_bits) - 1);
++	set_fs(old_fs);
++	
++	if (rval || !fei->fi_extents_mapped) {
++		dprintk("    No extents. Wanted %d, got %d\n",
++			fei->fi_extents_max, fei->fi_extents_mapped);
++		kfree(fe);
++		fei->fi_extents_start = NULL;
++		return False;
++	} else
++		return True;
++}
 +
-+		if (likely(PAGE_SIZE == this_count))
-+			clear_highpage(bv->bv_page);
-+		else
-+			zero_user(bv->bv_page, bv->bv_offset, this_count);
++/*
++ * extents_process -- runs through the extent returned from the file system and
++ *	 creates block layout entries.
++ */
++static boolean_t
++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates,
++    struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b)
++{
++	struct fiemap_extent		*fep,
++					*fep_last	= NULL;
++	int				i;
++	pnfs_blocklayout_layout_t	*n;
++	u64				last_end,
++					rval;
++	
++	dprintk("--> %s\n", __func__);
++	for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped;
++	    i++, fep++) {
++		
++		BUG_ON(!fep->fe_physical);
++		/*
++		 * Deal with corner cases of hole-y files.
++		 */
++		if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) !=
++				 fep->fe_logical)) {
++			
++			/*
++			 * If the last extent doesn't end logically
++			 * at the beginning of the current we've got
++			 * a hole and need to create a pNFS extent.
++			 */
++			dprintk("    Got a hole at %Ld:%Ld \n", 
++			    _2SECTS(fep_last->fe_logical),
++			    _2SECTS(fep_last->fe_length));
++			last_end = fep_last->fe_logical + fep_last->fe_length;
++			rval = bll_alloc_holey(bl_candidates, last_end,
++			    fep->fe_logical - last_end, dev);
++			if (!rval)
++				return False;
++			seg->length += rval;
++		}
++		
++		n = bll_alloc(fep->fe_logical, fep->fe_length,
++		    BLOCK_LAYOUT_NEW, bl_candidates);
++		if (unlikely(n == NULL)) {
++			dprintk("%s: bll_alloc failed\n", __func__);
++			return False;
++		}
++		
++		n->bll_soff = fep->fe_physical;
++		n->bll_es = seg->iomode == IOMODE_READ ?
++		    PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA;
++		n->bll_vol_id.sbid = 0;
++		n->bll_vol_id.devid = dev;
++		seg->length += fep->fe_length;
++		print_bll(n, "New extent");
++		fep_last = fep;
 +	}
++	dprintk("<-- %s (i=%d)\n", __func__, i);
++	
++	return True;
 +}
 +
-+static int _io_check(struct objio_state *ios, bool is_write)
++static void
++extents_cleanup(struct fiemap_extent_info *fei)
 +{
-+	enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
-+	int lin_ret = 0;
-+	int i;
-+
-+	for (i = 0; i <  ios->numdevs; i++) {
-+		struct osd_sense_info osi;
-+		struct osd_request *or = ios->per_dev[i].or;
-+		int ret;
-+
-+		if (!or)
-+			continue;
-+
-+		ret = osd_req_decode_sense(or, &osi);
-+		if (likely(!ret))
-+			continue;
-+
-+		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
-+			/* start read offset passed endof file */
-+			BUG_ON(is_write);
-+			_clear_bio(ios->per_dev[i].bio);
-+			dprintk("%s: start read offset passed end of file "
-+				"offset=0x%llx, length=0x%lx\n", __func__,
-+				_LLU(ios->per_dev[i].offset),
-+				ios->per_dev[i].length);
-+
-+			continue; /* we recovered */
-+		}
-+		objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev,
-+					osd_pri_2_pnfs_err(osi.osd_err_pri),
-+					ios->per_dev[i].offset,
-+					ios->per_dev[i].length,
-+					is_write);
-+
-+		if (osi.osd_err_pri >= oep) {
-+			oep = osi.osd_err_pri;
-+			lin_ret = ret;
-+		}
++	if (fei->fi_extents_start) {
++		kfree(fei->fi_extents_start);
++		fei->fi_extents_start = NULL;
 +	}
-+
-+	return lin_ret;
 +}
 +
 +/*
-+ * Common IO state helpers.
++ * device_slice -- check to see if device is a slice or DM
 + */
-+static void _io_free(struct objio_state *ios)
++static boolean_t
++device_slice(dev_t devid)
 +{
-+	unsigned i;
-+
-+	for (i = 0; i < ios->numdevs; i++) {
-+		struct _objio_per_comp *per_dev = &ios->per_dev[i];
-+
-+		if (per_dev->or) {
-+			osd_end_request(per_dev->or);
-+			per_dev->or = NULL;
-+		}
-+
-+		if (per_dev->bio) {
-+			bio_put(per_dev->bio);
-+			per_dev->bio = NULL;
-+		}
++	struct block_device	*bd	= blkdev_get_by_dev(devid, FMODE_READ, NULL);
++	boolean_t		rval	= False;
++	
++	if (bd) {
++		if (bd->bd_disk->minors > 1)
++			rval = True;
++		blkdev_put(bd, FMODE_READ);
 +	}
++	return rval;
 +}
 +
-+struct osd_dev * _io_od(struct objio_state *ios, unsigned dev)
++/*
++ * device_dm -- check to see if device is a Device Mapper volume.
++ *
++ * Returns True for a DM volume, False if not (or if the upcall fails)
++ */
++static boolean_t
++device_dm(dev_t devid)
 +{
-+	unsigned min_dev = ios->objio_seg->layout->olo_comps_index;
-+	unsigned max_dev = min_dev + ios->ol_state.num_comps;
-+
-+	BUG_ON(dev < min_dev || max_dev <= dev);
-+	return ios->objio_seg->ods[dev - min_dev];
++	boolean_t		rval = False;
++	bl_comm_msg_t		msg;
++	bl_comm_res_t		*res;
++	
++	msg.msg_type	= PNFS_UPCALL_MSG_DMCHK;
++	msg.u.msg_dev	= devid;
++	if (bl_upcall(bl_comm_global, &msg, &res)) {
++		dprintk("Failed upcall to check on DM status\n");
++	} else if (res->u.dm_vol) {
++		rval = True;
++		dprintk("Device is DM volume\n");
++	} else
++		dprintk("Device is not DM volume\n");
++	kfree(res);
++	
++	return rval;
 +}
 +
-+struct _striping_info {
-+	u64 obj_offset;
-+	u64 group_length;
-+	u64 total_group_length;
-+	u64 Major;
-+	unsigned dev;
-+	unsigned unit_off;
-+};
-+
-+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
-+			      struct _striping_info *si)
++static boolean_t
++layout_inode_add(struct inode *i, bl_layout_rec_t **p)
 +{
-+	u32	stripe_unit = ios->objio_seg->stripe_unit;
-+	u32	group_width = ios->objio_seg->group_width;
-+	u64	group_depth = ios->objio_seg->group_depth;
-+	u32	U = stripe_unit * group_width;
-+
-+	u64	T = U * group_depth;
-+	u64	S = T * ios->objio_seg->group_count;
-+	u64	M = div64_u64(file_offset, S);
-+
-+	/*
-+	G = (L - (M * S)) / T
-+	H = (L - (M * S)) % T
-+	*/
-+	u64	LmodU = file_offset - M * S;
-+	u32	G = div64_u64(LmodU, T);
-+	u64	H = LmodU - G * T;
-+
-+	u32	N = div_u64(H, U);
-+
-+	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
-+	si->obj_offset = si->unit_off + (N * stripe_unit) +
-+				  (M * group_depth * stripe_unit);
++	bl_layout_rec_t		*r	= NULL;
 +
-+	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
-+	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
-+	si->dev *= ios->objio_seg->mirrors_p1;
++	if (!i->i_op->fiemap || !i->i_fop->fallocate) {
++		printk("pNFS: file system doesn't support required fiemap or"
++		    "fallocate methods\n");
++		return False;
++	}
++	
++	r = kmalloc(sizeof (*r), GFP_KERNEL);
++	if (!r)
++		goto error;
 +
-+	si->group_length = T - H;
-+	si->total_group_length = T;
-+	si->Major = M;
++	r->blr_rdev	= i->i_sb->s_dev;
++	r->blr_inode	= i;
++	r->blr_orig_size = i->i_size;
++	r->blr_ext_size	= 0;
++	r->blr_recalled	= 0;
++	INIT_LIST_HEAD(&r->blr_layouts);
++	spin_lock_init(&r->blr_lock);
++	spin_lock(&layout_hashtbl_lock);
++	list_add_tail(&r->blr_hash, &layout_hash);
++	spin_unlock(&layout_hashtbl_lock);
++	*p = r;
++	return True;
++	
++error:
++	if (r)
++		kfree(r);
++	return False;
 +}
 +
-+static int _add_stripe_unit(struct objio_state *ios,  unsigned *cur_pg,
-+		unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len)
++static bl_layout_rec_t *
++__layout_inode_find(struct inode *i)
 +{
-+	unsigned pg = *cur_pg;
-+	struct request_queue *q =
-+			osd_request_queue(_io_od(ios, per_dev->dev));
++	bl_layout_rec_t	*r;
++	
++	if (!list_empty(&layout_hash)) {
++		list_for_each_entry(r, &layout_hash, blr_hash) {
++			if ((r->blr_inode->i_ino == i->i_ino) &&
++			    (r->blr_rdev == i->i_sb->s_dev)) {
++				return r;
++			}
++		}
++	}
++	return NULL;
++}
 +
-+	per_dev->length += cur_len;
++static bl_layout_rec_t *
++layout_inode_find(struct inode *i)
++{
++	bl_layout_rec_t	*r;
 +
-+	if (per_dev->bio == NULL) {
-+		unsigned stripes = ios->ol_state.num_comps /
-+						     ios->objio_seg->mirrors_p1;
-+		unsigned pages_in_stripe = stripes *
-+				      (ios->objio_seg->stripe_unit / PAGE_SIZE);
-+		unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
-+				    stripes;
++	spin_lock(&layout_hashtbl_lock);
++	r = __layout_inode_find(i);
++	spin_unlock(&layout_hashtbl_lock);
++	
++	return r;
++}
 +
-+		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
-+		if (unlikely(!per_dev->bio)) {
-+			dprintk("Faild to allocate BIO size=%u\n", bio_size);
-+			return -ENOMEM;
++static void
++layout_inode_del(struct inode *i)
++{
++	bl_layout_rec_t	*r;
++	
++	spin_lock(&layout_hashtbl_lock);
++	r = __layout_inode_find(i);
++	if (r) {
++		spin_lock(&r->blr_lock);
++		if (list_empty(&r->blr_layouts)) {
++			list_del(&r->blr_hash);
++			spin_unlock(&r->blr_lock);
++			kfree(r);
++		} else {
++			spin_unlock(&r->blr_lock);
 +		}
++	} else {
++		dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n",
++			__func__, i->i_sb->s_dev, i->i_ino);
 +	}
++	spin_unlock(&layout_hashtbl_lock);
++}
 +
-+	while (cur_len > 0) {
-+		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
-+		unsigned added_len;
++/*
++ * map_state2name -- converts a state into an ASCII string.
++ *
++ * Used for debug messages only.
++ */
++static char *
++map_state2name(enum pnfs_block_extent_state4 s)
++{
++	switch (s) {
++	case PNFS_BLOCK_READWRITE_DATA:	return "     RW";
++	case PNFS_BLOCK_READ_DATA:	return "     RO";
++	case PNFS_BLOCK_INVALID_DATA:	return "INVALID";
++	case PNFS_BLOCK_NONE_DATA:	return "   NONE";
++	default:
++		BUG();
++	}
++}
 +
-+		BUG_ON(ios->ol_state.nr_pages <= pg);
-+		cur_len -= pglen;
++static pnfs_blocklayout_devinfo_t *
++bld_alloc(struct list_head *volumes, int type)
++{
++	pnfs_blocklayout_devinfo_t *bld;
++	
++	bld = kmalloc(sizeof (*bld), GFP_KERNEL);
++	if (!bld)
++		return NULL;
 +
-+		added_len = bio_add_pc_page(q, per_dev->bio,
-+					ios->ol_state.pages[pg], pglen, pgbase);
-+		if (unlikely(pglen != added_len))
-+			return -ENOMEM;
-+		pgbase = 0;
-+		++pg;
-+	}
-+	BUG_ON(cur_len);
++	memset(bld, 0, sizeof (*bld));
++	bld->bld_type = type;
++	list_add_tail(&bld->bld_list, volumes);
 +
-+	*cur_pg = pg;
-+	return 0;
++	return bld;
 +}
 +
-+static int _prepare_one_group(struct objio_state *ios, u64 length,
-+			      struct _striping_info *si, unsigned first_comp,
-+			      unsigned *last_pg)
++static void
++bld_free(pnfs_blocklayout_devinfo_t *bld)
 +{
-+	unsigned stripe_unit = ios->objio_seg->stripe_unit;
-+	unsigned mirrors_p1 = ios->objio_seg->mirrors_p1;
-+	unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1;
-+	unsigned dev = si->dev;
-+	unsigned first_dev = dev - (dev % devs_in_group);
-+	unsigned comp = first_comp + (dev - first_dev);
-+	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
-+	unsigned cur_pg = *last_pg;
-+	int ret = 0;
++	list_del(&bld->bld_list);
++	kfree(bld);
++}
 +
-+	while (length) {
-+		struct _objio_per_comp *per_dev = &ios->per_dev[comp];
-+		unsigned cur_len, page_off = 0;
++static void
++print_bll(pnfs_blocklayout_layout_t *b, char *text)
++{
++	dprintk("    BLL: %s\n", text);
++	dprintk("    foff %Lu, soff %Lu, len %Lu, state %s\n",
++	    _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len),
++	    map_state2name(b->bll_es));
++}
 +
-+		if (!per_dev->length) {
-+			per_dev->dev = dev;
-+			if (dev < si->dev) {
-+				per_dev->offset = si->obj_offset + stripe_unit -
-+								   si->unit_off;
-+				cur_len = stripe_unit;
-+			} else if (dev == si->dev) {
-+				per_dev->offset = si->obj_offset;
-+				cur_len = stripe_unit - si->unit_off;
-+				page_off = si->unit_off & ~PAGE_MASK;
-+				BUG_ON(page_off &&
-+				      (page_off != ios->ol_state.pgbase));
-+			} else { /* dev > si->dev */
-+				per_dev->offset = si->obj_offset - si->unit_off;
-+				cur_len = stripe_unit;
++static inline void
++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c)
++{
++	pnfs_blocklayout_layout_t	*n;
++	int				dbg_count	= 0;
++	u64				endpoint;
++	
++	BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA);
++	while (c->bll_list.next != &r->blr_layouts) {
++		n = list_entry(c->bll_list.next,
++			       struct pnfs_blocklayout_layout, bll_list);
++		endpoint = BLL_S_END(c);
++		if ((n->bll_soff >= c->bll_soff) &&
++		    (n->bll_soff < endpoint)) {
++			if (endpoint < BLL_S_END(n)) {
++				/*
++				 * The following is possible.
++				 *
++				 * 
++				 * Existing: +---+                 +---+
++				 *      New: +-----------------------+
++				 * The client request merge entries together
++				 * but didn't require picking up all of the
++				 * last entry. So, we still need to delete
++				 * the last entry and add the remaining space
++				 * to the new entry.
++				 */
++				c->bll_len += BLL_S_END(n) - endpoint;
 +			}
-+
-+			if (max_comp < comp)
-+				max_comp = comp;
-+
-+			dev += mirrors_p1;
-+			dev = (dev % devs_in_group) + first_dev;
++			dbg_count++;
++			list_del(&n->bll_list);
++			kfree(n);
 +		} else {
-+			cur_len = stripe_unit;
++			break;
 +		}
-+		if (cur_len >= length)
-+			cur_len = length;
-+
-+		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
-+				       cur_len);
-+		if (unlikely(ret))
-+			goto out;
++	}
++	/* ---- Debug only, remove before integration ---- */
++	if (dbg_count)
++		dprintk("  Collapsed %d cache entries between %Lu(s) and %Lu(s)\n",
++			dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c)));
++}
 +
-+		comp += mirrors_p1;
-+		comp = (comp % devs_in_group) + first_comp;
++static pnfs_blocklayout_layout_t *
++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h)
++{
++	pnfs_blocklayout_layout_t	*n	= NULL;
++	
++	n = kmalloc(sizeof (*n), GFP_KERNEL);
++	if (n) {
++		memset(n, 0, sizeof (*n));
++		n->bll_foff		= offset;
++		n->bll_len		= len;
++		n->bll_cache_state	= state;
++		if (h)
++			list_add_tail(&n->bll_list, h);
++	}
++	return n;
++}
 +
-+		length -= cur_len;
-+		ios->length += cur_len;
++static pnfs_blocklayout_layout_t *
++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c,
++	      struct list_head *h)
++{
++	pnfs_blocklayout_layout_t	*n	= NULL;
++	
++	n = bll_alloc(b->bll_foff, b->bll_len, c, h);
++	if (n) {
++		n->bll_es			= b->bll_es;
++		n->bll_soff			= b->bll_soff;
++		n->bll_vol_id.devid		= b->bll_vol_id.devid;
 +	}
-+out:
-+	ios->numdevs = max_comp + mirrors_p1;
-+	*last_pg = cur_pg;
-+	return ret;
++	return n;
 +}
 +
-+static int _io_rw_pagelist(struct objio_state *ios)
++static inline boolean_t
++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode,
++		enum pnfs_block_extent_state4 *s)
 +{
-+	u64 length = ios->ol_state.count;
-+	struct _striping_info si;
-+	unsigned devs_in_group = ios->objio_seg->group_width *
-+				 ios->objio_seg->mirrors_p1;
-+	unsigned first_comp = 0;
-+	unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps;
-+	unsigned last_pg = 0;
-+	int ret = 0;
++	/* ---- Normal case ---- */
++	*s = b->bll_es;
++	
++	switch (b->bll_es) {
++	case PNFS_BLOCK_READWRITE_DATA:
++		if (iomode == IOMODE_READ)
++			*s = PNFS_BLOCK_READ_DATA;
++		/* ---- Any use is permitted. ---- */
++		break;
++	case PNFS_BLOCK_READ_DATA:
++		/* ---- Committed as read only data. ---- */
++		if (iomode == IOMODE_RW)
++			return False;
++		break;
++	case PNFS_BLOCK_INVALID_DATA:
++		/* ---- Blocks have been allocated, but not initialized ---- */
++		if (iomode == IOMODE_READ)
++			*s = PNFS_BLOCK_NONE_DATA;
++		break;
++	case PNFS_BLOCK_NONE_DATA:
++		/* ---- Hole-y file. No backing store avail. ---- */
++		if (iomode != IOMODE_READ)
++			return False;
++		break;
++	default:
++		BUG();
++	}
++	return True;
++}
 +
-+	_calc_stripe_info(ios, ios->ol_state.offset, &si);
-+	while (length) {
-+		if (length < si.group_length)
-+			si.group_length = length;
++#endif /* CONFIG_SPNFS_BLOCK */
+diff -up linux-2.6.37.noarch/fs/nfsd/export.c.orig linux-2.6.37.noarch/fs/nfsd/export.c
+--- linux-2.6.37.noarch/fs/nfsd/export.c.orig	2011-01-28 09:37:32.554979531 -0500
++++ linux-2.6.37.noarch/fs/nfsd/export.c	2011-01-28 09:43:53.350770434 -0500
+@@ -16,11 +16,19 @@
+ #include <linux/module.h>
+ #include <linux/exportfs.h>
+ 
++#include <linux/nfsd/nfsd4_pnfs.h>
++#if defined(CONFIG_SPNFS)
++#include <linux/nfsd4_spnfs.h>
++#if defined(CONFIG_SPNFS_BLOCK)
++#include <linux/nfsd4_block.h>
++#endif
++#endif
+ #include <linux/nfsd/syscall.h>
+ #include <net/ipv6.h>
+ 
+ #include "nfsd.h"
+ #include "nfsfh.h"
++#include "pnfsd.h"
+ 
+ #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
+ 
+@@ -348,10 +356,84 @@ static int svc_export_upcall(struct cach
+ 	return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
+ }
+ 
++#if defined(CONFIG_PNFSD)
++static struct pnfsd_cb_operations pnfsd_cb_op = {
++	.cb_layout_recall = nfsd_layout_recall_cb,
++	.cb_device_notify = nfsd_device_notify_cb,
 +
-+		ret = _prepare_one_group(ios, si.group_length, &si, first_comp,
-+					 &last_pg);
-+		if (unlikely(ret))
-+			goto out;
++	.cb_get_state = nfs4_pnfs_cb_get_state,
++	.cb_change_state = nfs4_pnfs_cb_change_state,
++};
++
++#if defined(CONFIG_SPNFS)
++static struct pnfs_export_operations spnfs_export_ops = {
++	.layout_type = spnfs_layout_type,
++	.get_device_info = spnfs_getdeviceinfo,
++	.get_device_iter = spnfs_getdeviceiter,
++	.layout_get = spnfs_layoutget,
++	.layout_return = spnfs_layoutreturn,
++};
++
++static struct pnfs_export_operations spnfs_ds_export_ops = {
++	.get_state = spnfs_get_state,
++};
++
++#if defined(CONFIG_SPNFS_BLOCK)
++static struct pnfs_export_operations bl_export_ops = {
++	.layout_type = bl_layout_type,
++	.get_device_info = bl_getdeviceinfo,
++	.get_device_iter = bl_getdeviceiter,
++	.layout_get = bl_layoutget,
++	.layout_return = bl_layoutreturn,
++};
++#endif /* CONFIG_SPNFS_BLOCK */
++#endif /* CONFIG_SPNFS */
++#endif /* CONFIG_PNFSD */
 +
-+		length -= si.group_length;
+ static struct svc_export *svc_export_update(struct svc_export *new,
+ 					    struct svc_export *old);
+ static struct svc_export *svc_export_lookup(struct svc_export *);
+ 
++static int pnfsd_check_export(struct inode *inode, int *flags)
++{
++#if defined(CONFIG_PNFSD)
 +
-+		si.group_length = si.total_group_length;
-+		si.unit_off = 0;
-+		++si.Major;
-+		si.obj_offset = si.Major * ios->objio_seg->stripe_unit *
-+						ios->objio_seg->group_depth;
++#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
++	if (!inode->i_sb->s_pnfs_op)
++		pnfsd_lexp_init(inode);
++	return 0;
++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
 +
-+		si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
-+		si.dev %= num_comps;
++#if defined(CONFIG_SPNFS)
++#if defined(CONFIG_SPNFS_BLOCK)
++	if (pnfs_block_enabled(inode, *flags)) {
++		dprintk("set pnfs block export structure... \n");
++		inode->i_sb->s_pnfs_op = &bl_export_ops;
++	} else
++#endif /* CONFIG_SPNFS_BLOCK */
++	/*
++	 * spnfs_enabled() indicates we're an MDS.
++	 * XXX Better to check an export time option as well.
++	 */
++	if (spnfs_enabled()) {
++		dprintk("set spnfs export structure...\n");
++		inode->i_sb->s_pnfs_op = &spnfs_export_ops;
++	} else {
++		dprintk("%s spnfs not in use\n", __func__);
 +
-+		first_comp += devs_in_group;
-+		first_comp %= num_comps;
++		/*
++		 * get_state is needed if we're a DS using spnfs.
++		 * XXX Better to check an export time option instead.
++		 */
++		inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops;
 +	}
++#endif /* CONFIG_SPNFS */
 +
-+out:
-+	if (!ios->length)
-+		return ret;
++#endif /* CONFIG_PNFSD */
 +
 +	return 0;
 +}
 +
-+static ssize_t _sync_done(struct objio_state *ios)
+ static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
+ {
+ 
+@@ -391,8 +473,17 @@ static int check_export(struct inode *in
+ 		return -EINVAL;
+ 	}
+ 
+-	return 0;
++#if !defined(CONFIG_SPNFS)
++	if (inode->i_sb->s_pnfs_op &&
++	    (!inode->i_sb->s_pnfs_op->layout_type ||
++	     !inode->i_sb->s_pnfs_op->get_device_info ||
++	     !inode->i_sb->s_pnfs_op->layout_get)) {
++		dprintk("exp_export: export of invalid fs pnfs export ops.\n");
++		return -EINVAL;
++	}
++#endif /* !CONFIG_SPNFS */
+ 
++	return pnfsd_check_export(inode, flags);
+ }
+ 
+ #ifdef CONFIG_NFSD_V4
+@@ -582,6 +673,8 @@ static int svc_export_parse(struct cache
+ 					if (exp.ex_uuid == NULL)
+ 						err = -ENOMEM;
+ 				}
++			} else if (strcmp(buf, "pnfs") == 0) {
++				exp.ex_pnfs = 1;
+ 			} else if (strcmp(buf, "secinfo") == 0)
+ 				err = secinfo_parse(&mesg, buf, &exp);
+ 			else
+@@ -656,6 +749,8 @@ static int svc_export_show(struct seq_fi
+ 				seq_printf(m, "%02x", exp->ex_uuid[i]);
+ 			}
+ 		}
++		if (exp->ex_pnfs)
++			seq_puts(m, ",pnfs");
+ 		show_secinfo(m, exp);
+ 	}
+ 	seq_puts(m, ")\n");
+@@ -683,6 +778,7 @@ static void svc_export_init(struct cache
+ 	new->ex_fslocs.locations = NULL;
+ 	new->ex_fslocs.locations_count = 0;
+ 	new->ex_fslocs.migrated = 0;
++	new->ex_pnfs = 0;
+ }
+ 
+ static void export_update(struct cache_head *cnew, struct cache_head *citem)
+@@ -695,6 +791,7 @@ static void export_update(struct cache_h
+ 	new->ex_anon_uid = item->ex_anon_uid;
+ 	new->ex_anon_gid = item->ex_anon_gid;
+ 	new->ex_fsid = item->ex_fsid;
++	new->ex_pnfs = item->ex_pnfs;
+ 	new->ex_uuid = item->ex_uuid;
+ 	item->ex_uuid = NULL;
+ 	new->ex_pathname = item->ex_pathname;
+@@ -1662,8 +1759,17 @@ nfsd_export_init(void)
+ 	if (rv)
+ 		return rv;
+ 	rv = cache_register(&svc_expkey_cache);
+-	if (rv)
++	if (rv) {
+ 		cache_unregister(&svc_export_cache);
++		goto out;
++	}
++#if defined(CONFIG_PNFSD)
++	spin_lock(&pnfsd_cb_ctl.lock);
++	pnfsd_cb_ctl.module = THIS_MODULE;
++	pnfsd_cb_ctl.cb_op = &pnfsd_cb_op;
++	spin_unlock(&pnfsd_cb_ctl.lock);
++#endif /* CONFIG_PNFSD */
++out:
+ 	return rv;
+ 
+ }
+@@ -1691,6 +1797,12 @@ nfsd_export_shutdown(void)
+ 
+ 	exp_writelock();
+ 
++#if defined(CONFIG_PNFSD)
++	spin_lock(&pnfsd_cb_ctl.lock);
++	pnfsd_cb_ctl.module = NULL;
++	pnfsd_cb_ctl.cb_op = NULL;
++	spin_unlock(&pnfsd_cb_ctl.lock);
++#endif /* CONFIG_PNFSD */
+ 	cache_unregister(&svc_expkey_cache);
+ 	cache_unregister(&svc_export_cache);
+ 	svcauth_unix_purge();
+diff -up linux-2.6.37.noarch/fs/nfs/direct.c.orig linux-2.6.37.noarch/fs/nfs/direct.c
+--- linux-2.6.37.noarch/fs/nfs/direct.c.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfs/direct.c	2011-01-28 09:43:53.315775694 -0500
+@@ -271,6 +271,38 @@ static const struct rpc_call_ops nfs_rea
+ 	.rpc_release = nfs_direct_read_release,
+ };
+ 
++static long nfs_direct_read_execute(struct nfs_read_data *data,
++				    struct rpc_task_setup *task_setup_data,
++				    struct rpc_message *msg)
 +{
-+	struct completion *waiting = ios->private;
++	struct inode *inode = data->inode;
++	struct rpc_task *task;
 +
-+	complete(waiting);
-+	return 0;
-+}
++	nfs_fattr_init(&data->fattr);
++	msg->rpc_argp = &data->args;
++	msg->rpc_resp = &data->res;
 +
-+static void _last_io(struct kref *kref)
-+{
-+	struct objio_state *ios = container_of(kref, struct objio_state, kref);
++	task_setup_data->task = &data->task;
++	task_setup_data->callback_data = data;
++	NFS_PROTO(inode)->read_setup(data, msg);
 +
-+	ios->done(ios);
++	task = rpc_run_task(task_setup_data);
++	if (IS_ERR(task))
++		return PTR_ERR(task);
++
++	rpc_put_task(task);
++
++	dprintk("NFS: %5u initiated direct read call "
++		"(req %s/%lld, %u bytes @ offset %llu)\n",
++		data->task.tk_pid,
++		inode->i_sb->s_id,
++		(long long)NFS_FILEID(inode),
++		data->args.count,
++		(unsigned long long)data->args.offset);
++
++	return 0;
 +}
 +
-+static void _done_io(struct osd_request *or, void *p)
+ /*
+  * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
+  * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
+@@ -287,7 +319,6 @@ static ssize_t nfs_direct_read_schedule_
+ 	unsigned long user_addr = (unsigned long)iov->iov_base;
+ 	size_t count = iov->iov_len;
+ 	size_t rsize = NFS_SERVER(inode)->rsize;
+-	struct rpc_task *task;
+ 	struct rpc_message msg = {
+ 		.rpc_cred = ctx->cred,
+ 	};
+@@ -348,26 +379,9 @@ static ssize_t nfs_direct_read_schedule_
+ 		data->res.fattr = &data->fattr;
+ 		data->res.eof = 0;
+ 		data->res.count = bytes;
+-		nfs_fattr_init(&data->fattr);
+-		msg.rpc_argp = &data->args;
+-		msg.rpc_resp = &data->res;
+ 
+-		task_setup_data.task = &data->task;
+-		task_setup_data.callback_data = data;
+-		NFS_PROTO(inode)->read_setup(data, &msg);
+-
+-		task = rpc_run_task(&task_setup_data);
+-		if (IS_ERR(task))
+-			break;
+-		rpc_put_task(task);
+-
+-		dprintk("NFS: %5u initiated direct read call "
+-			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
+-				data->task.tk_pid,
+-				inode->i_sb->s_id,
+-				(long long)NFS_FILEID(inode),
+-				bytes,
+-				(unsigned long long)data->args.offset);
++		if (nfs_direct_read_execute(data, &task_setup_data, &msg))
++			break;
+ 
+ 		started += bytes;
+ 		user_addr += bytes;
+@@ -457,12 +471,15 @@ static void nfs_direct_free_writedata(st
+ }
+ 
+ #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
++static long nfs_direct_write_execute(struct nfs_write_data *data,
++				     struct rpc_task_setup *task_setup_data,
++				     struct rpc_message *msg);
++
+ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
+ {
+ 	struct inode *inode = dreq->inode;
+ 	struct list_head *p;
+ 	struct nfs_write_data *data;
+-	struct rpc_task *task;
+ 	struct rpc_message msg = {
+ 		.rpc_cred = dreq->ctx->cred,
+ 	};
+@@ -496,25 +513,7 @@ static void nfs_direct_write_reschedule(
+ 		 * Reuse data->task; data->args should not have changed
+ 		 * since the original request was sent.
+ 		 */
+-		task_setup_data.task = &data->task;
+-		task_setup_data.callback_data = data;
+-		msg.rpc_argp = &data->args;
+-		msg.rpc_resp = &data->res;
+-		NFS_PROTO(inode)->write_setup(data, &msg);
+-
+-		/*
+-		 * We're called via an RPC callback, so BKL is already held.
+-		 */
+-		task = rpc_run_task(&task_setup_data);
+-		if (!IS_ERR(task))
+-			rpc_put_task(task);
+-
+-		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+-				data->task.tk_pid,
+-				inode->i_sb->s_id,
+-				(long long)NFS_FILEID(inode),
+-				data->args.count,
+-				(unsigned long long)data->args.offset);
++		nfs_direct_write_execute(data, &task_setup_data, &msg);
+ 	}
+ 
+ 	if (put_dreq(dreq))
+@@ -557,10 +556,31 @@ static const struct rpc_call_ops nfs_com
+ 	.rpc_release = nfs_direct_commit_release,
+ };
+ 
++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq,
++				      struct nfs_write_data *data,
++				      struct rpc_task_setup *task_setup_data,
++				      struct rpc_message *msg)
 +{
-+	struct objio_state *ios = p;
-+
-+	kref_put(&ios->kref, _last_io);
-+}
++	struct rpc_task *task;
 +
-+static ssize_t _io_exec(struct objio_state *ios)
-+{
-+	DECLARE_COMPLETION_ONSTACK(wait);
-+	ssize_t status = 0; /* sync status */
-+	unsigned i;
-+	objio_done_fn saved_done_fn = ios->done;
-+	bool sync = ios->ol_state.sync;
++	NFS_PROTO(data->inode)->commit_setup(data, msg);
 +
-+	if (sync) {
-+		ios->done = _sync_done;
-+		ios->private = &wait;
-+	}
++	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
++	dreq->commit_data = NULL;
 +
-+	kref_init(&ios->kref);
++	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 +
-+	for (i = 0; i < ios->numdevs; i++) {
-+		struct osd_request *or = ios->per_dev[i].or;
++	task = rpc_run_task(task_setup_data);
++	if (IS_ERR(task))
++		return PTR_ERR(task);
 +
-+		if (!or)
-+			continue;
++	rpc_put_task(task);
++	return 0;
++}
 +
-+		kref_get(&ios->kref);
-+		osd_execute_request_async(or, _done_io, ios);
-+	}
+ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
+ {
+ 	struct nfs_write_data *data = dreq->commit_data;
+-	struct rpc_task *task;
+ 	struct rpc_message msg = {
+ 		.rpc_argp = &data->args,
+ 		.rpc_resp = &data->res,
+@@ -589,16 +609,7 @@ static void nfs_direct_commit_schedule(s
+ 	data->res.verf = &data->verf;
+ 	nfs_fattr_init(&data->fattr);
+ 
+-	NFS_PROTO(data->inode)->commit_setup(data, &msg);
+-
+-	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
+-	dreq->commit_data = NULL;
+-
+-	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+-
+-	task = rpc_run_task(&task_setup_data);
+-	if (!IS_ERR(task))
+-		rpc_put_task(task);
++	nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg);
+ }
+ 
+ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+@@ -700,6 +711,36 @@ static const struct rpc_call_ops nfs_wri
+ 	.rpc_release = nfs_direct_write_release,
+ };
+ 
++static long nfs_direct_write_execute(struct nfs_write_data *data,
++				     struct rpc_task_setup *task_setup_data,
++				     struct rpc_message *msg)
++{
++	struct inode *inode = data->inode;
++	struct rpc_task *task;
 +
-+	kref_put(&ios->kref, _last_io);
++	task_setup_data->task = &data->task;
++	task_setup_data->callback_data = data;
++	msg->rpc_argp = &data->args;
++	msg->rpc_resp = &data->res;
++	NFS_PROTO(inode)->write_setup(data, msg);
 +
-+	if (sync) {
-+		wait_for_completion(&wait);
-+		status = saved_done_fn(ios);
-+	}
++	task = rpc_run_task(task_setup_data);
++	if (IS_ERR(task))
++		return PTR_ERR(task);
 +
-+	return status;
-+}
++	rpc_put_task(task);
 +
-+/*
-+ * read
-+ */
-+static ssize_t _read_done(struct objio_state *ios)
-+{
-+	ssize_t status;
-+	int ret = _io_check(ios, false);
++	dprintk("NFS: %5u initiated direct write call "
++		"(req %s/%lld, %u bytes @ offset %llu)\n",
++		data->task.tk_pid,
++		inode->i_sb->s_id,
++		(long long)NFS_FILEID(inode),
++		data->args.count,
++		(unsigned long long)data->args.offset);
 +
-+	_io_free(ios);
++	return 0;
++}
 +
-+	if (likely(!ret))
-+		status = ios->length;
-+	else
-+		status = ret;
+ /*
+  * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+  * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
+@@ -715,7 +756,6 @@ static ssize_t nfs_direct_write_schedule
+ 	struct inode *inode = ctx->path.dentry->d_inode;
+ 	unsigned long user_addr = (unsigned long)iov->iov_base;
+ 	size_t count = iov->iov_len;
+-	struct rpc_task *task;
+ 	struct rpc_message msg = {
+ 		.rpc_cred = ctx->cred,
+ 	};
+@@ -782,24 +822,8 @@ static ssize_t nfs_direct_write_schedule
+ 		data->res.verf = &data->verf;
+ 		nfs_fattr_init(&data->fattr);
+ 
+-		task_setup_data.task = &data->task;
+-		task_setup_data.callback_data = data;
+-		msg.rpc_argp = &data->args;
+-		msg.rpc_resp = &data->res;
+-		NFS_PROTO(inode)->write_setup(data, &msg);
+-
+-		task = rpc_run_task(&task_setup_data);
+-		if (IS_ERR(task))
+-			break;
+-		rpc_put_task(task);
+-
+-		dprintk("NFS: %5u initiated direct write call "
+-			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
+-				data->task.tk_pid,
+-				inode->i_sb->s_id,
+-				(long long)NFS_FILEID(inode),
+-				bytes,
+-				(unsigned long long)data->args.offset);
++		if (nfs_direct_write_execute(data, &task_setup_data, &msg))
++			break;
+ 
+ 		started += bytes;
+ 		user_addr += bytes;
+diff -up linux-2.6.37.noarch/fs/nfsd/Kconfig.orig linux-2.6.37.noarch/fs/nfsd/Kconfig
+--- linux-2.6.37.noarch/fs/nfsd/Kconfig.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfsd/Kconfig	2011-01-28 09:43:53.346770928 -0500
+@@ -91,3 +91,52 @@ config NFSD_V4
+ 	  available from http://linux-nfs.org/.
+ 
+ 	  If unsure, say N.
 +
-+	objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
-+	return status;
-+}
++config PNFSD
++	bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)"
++	depends on NFSD_V4 && EXPERIMENTAL
++	select EXPORTFS_FILE_LAYOUT
++	help
++	  This option enables support for the parallel NFS features of the
++	  minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1)
++	  in the kernel's NFS server.
 +
-+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
-+{
-+	struct osd_request *or = NULL;
-+	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
-+	unsigned dev = per_dev->dev;
-+	struct pnfs_osd_object_cred *cred =
-+			&ios->objio_seg->layout->olo_comps[dev];
-+	struct osd_obj_id obj = {
-+		.partition = cred->oc_object_id.oid_partition_id,
-+		.id = cred->oc_object_id.oid_object_id,
-+	};
-+	int ret;
++	  Unless you're an NFS developer, say N.
 +
-+	or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
-+	if (unlikely(!or)) {
-+		ret = -ENOMEM;
-+		goto err;
-+	}
-+	per_dev->or = or;
++config PNFSD_LOCAL_EXPORT
++	bool "Enable pNFS support for exporting local filesystems for debugging purposes"
++	depends on PNFSD
++	help
++	  Say Y here if you want your pNFS server to export local file systems
++	  over the files layout type.  With this option the MDS (metadata
++	  server) functions also as a single DS (data server).  This is mostly
++	  useful for development and debugging purposes.
 +
-+	osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
++	  If unsure, say N.
 +
-+	ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
-+	if (ret) {
-+		dprintk("%s: Faild to osd_finalize_request() => %d\n",
-+			__func__, ret);
-+		goto err;
-+	}
++config SPNFS
++	bool "Provide spNFS server support (EXPERIMENTAL)"
++	depends on PNFSD
++	select RPCSEC_GSS_KRB5
++	help
++	  Say Y here if you want spNFS server support.
 +
-+	dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
-+		__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
-+		per_dev->length);
++	  If unsure, say N.
 +
-+err:
-+	return ret;
-+}
++config SPNFS_LAYOUTSEGMENTS
++	bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)"
++	depends on SPNFS
++	select RPCSEC_GSS_KRB5
++	help
++	  Say Y here if you want spNFS to be able to return layout segments.
 +
-+static ssize_t _read_exec(struct objio_state *ios)
-+{
-+	unsigned i;
-+	int ret;
++	  If unsure, say N.
 +
-+	for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
-+		if (!ios->per_dev[i].length)
-+			continue;
-+		ret = _read_mirrors(ios, i);
-+		if (unlikely(ret))
-+			goto err;
-+	}
++config SPNFS_BLOCK
++	bool "Provide Block Layout server support (EXPERIMENTAL)"
++	depends on SPNFS
++	select EXPORTFS_BLOCK_LAYOUT
++	help
++	  Say Y here if you want spNFS block layout support
 +
-+	ios->done = _read_done;
-+	return _io_exec(ios); /* In sync mode exec returns the io status */
++	  If unsure, say N.
+diff -up linux-2.6.37.noarch/fs/nfsd/Makefile.orig linux-2.6.37.noarch/fs/nfsd/Makefile
+--- linux-2.6.37.noarch/fs/nfsd/Makefile.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfsd/Makefile	2011-01-28 09:43:53.347770803 -0500
+@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs
+ nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
+ nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
+ 			   nfs4acl.o nfs4callback.o nfs4recover.o
++nfsd-$(CONFIG_PNFSD)	+= nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o
++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o
++nfsd-$(CONFIG_SPNFS)	+= spnfs_com.o spnfs_ops.o
++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o
+diff -up linux-2.6.37.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4callback.c
+--- linux-2.6.37.noarch/fs/nfsd/nfs4callback.c.orig	2011-01-28 09:37:32.557979427 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfs4callback.c	2011-01-28 09:43:53.352770195 -0500
+@@ -48,6 +48,8 @@ enum {
+ 	NFSPROC4_CLNT_CB_NULL = 0,
+ 	NFSPROC4_CLNT_CB_RECALL,
+ 	NFSPROC4_CLNT_CB_SEQUENCE,
++	NFSPROC4_CLNT_CB_LAYOUT,
++	NFSPROC4_CLNT_CB_DEVICE,
+ };
+ 
+ #define NFS4_MAXTAGLEN		20
+@@ -73,6 +75,19 @@ enum {
+ #define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
+ 					cb_sequence_dec_sz +            \
+ 					op_dec_sz)
++#define NFS4_enc_cb_layout_sz		(cb_compound_enc_hdr_sz +       \
++					cb_sequence_enc_sz +            \
++					1 + 3 +                         \
++					enc_nfs4_fh_sz + 4)
++#define NFS4_dec_cb_layout_sz		(cb_compound_dec_hdr_sz  +      \
++					cb_sequence_dec_sz +            \
++					op_dec_sz)
++#define NFS4_enc_cb_device_sz		(cb_compound_enc_hdr_sz +       \
++					cb_sequence_enc_sz +            \
++					1 + 6)
++#define NFS4_dec_cb_device_sz		(cb_compound_dec_hdr_sz  +      \
++					cb_sequence_dec_sz +            \
++					op_dec_sz)
+ 
+ struct nfs4_cb_compound_hdr {
+ 	/* args */
+@@ -361,6 +376,151 @@ static void encode_cb_recall4args(struct
+ 	hdr->nops++;
+ }
+ 
++#if defined(CONFIG_PNFSD)
 +
-+err:
-+	_io_free(ios);
-+	return ret;
-+}
++#include "pnfsd.h"
 +
-+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
++/*
++ * CB_LAYOUTRECALL4args
++ *
++ *	struct layoutrecall_file4 {
++ *		nfs_fh4         lor_fh;
++ *		offset4         lor_offset;
++ *		length4         lor_length;
++ *		stateid4        lor_stateid;
++ *	};
++ *
++ *	union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
++ *	case LAYOUTRECALL4_FILE:
++ *		layoutrecall_file4 lor_layout;
++ *	case LAYOUTRECALL4_FSID:
++ *		fsid4              lor_fsid;
++ *	case LAYOUTRECALL4_ALL:
++ *		void;
++ *	};
++ *
++ *	struct CB_LAYOUTRECALL4args {
++ *		layouttype4             clora_type;
++ *		layoutiomode4           clora_iomode;
++ *		bool                    clora_changed;
++ *		layoutrecall4           clora_recall;
++ *	};
++ */
++static void encode_cb_layout4args(struct xdr_stream *xdr,
++				  const struct nfs4_layoutrecall *clr,
++				  struct nfs4_cb_compound_hdr *hdr)
 +{
-+	struct objio_state *ios = container_of(ol_state, struct objio_state,
-+					       ol_state);
-+	int ret;
-+
-+	ret = _io_rw_pagelist(ios);
-+	if (unlikely(ret))
-+		return ret;
++	u32 *p;
 +
-+	return _read_exec(ios);
-+}
++	BUG_ON(hdr->minorversion == 0);
 +
-+/*
-+ * write
-+ */
-+static ssize_t _write_done(struct objio_state *ios)
-+{
-+	ssize_t status;
-+	int ret = _io_check(ios, true);
++	p = xdr_reserve_space(xdr, 5 * 4);
++	*p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
++	*p++ = cpu_to_be32(clr->cb.cbl_seg.layout_type);
++	*p++ = cpu_to_be32(clr->cb.cbl_seg.iomode);
++	*p++ = cpu_to_be32(clr->cb.cbl_layoutchanged);
++	*p = cpu_to_be32(clr->cb.cbl_recall_type);
++	if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) {
++		struct nfs4_fsid fsid = clr->cb.cbl_fsid;
 +
-+	_io_free(ios);
++		p = xdr_reserve_space(xdr, 2 * 8);
++		p = xdr_encode_hyper(p, fsid.major);
++		xdr_encode_hyper(p, fsid.minor);
++		dprintk("%s: type %x iomode %d changed %d recall_type %d "
++			"fsid 0x%llx-0x%llx\n",
++			__func__, clr->cb.cbl_seg.layout_type,
++			clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
++			clr->cb.cbl_recall_type, fsid.major, fsid.minor);
++	} else if (clr->cb.cbl_recall_type == RETURN_FILE) {
++		int len = clr->clr_file->fi_fhlen;
++		stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid;
 +
-+	if (likely(!ret)) {
-+		/* FIXME: should be based on the OSD's persistence model
-+		 * See OSD2r05 Section 4.13 Data persistence model */
-+		ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC;
-+		status = ios->length;
++		p = xdr_reserve_space(xdr, 4 + len + 2 * 8);
++		*p++ = cpu_to_be32(len);
++		xdr_encode_opaque_fixed(p, clr->clr_file->fi_fhval, len);
++		p += XDR_QUADLEN(len);
++		p = xdr_encode_hyper(p, clr->cb.cbl_seg.offset);
++		xdr_encode_hyper(p, clr->cb.cbl_seg.length);
++		encode_stateid4(xdr, cbl_sid);
++		dprintk("%s: type %x iomode %d changed %d recall_type %d "
++			"offset %lld length %lld stateid " STATEID_FMT "\n",
++			__func__, clr->cb.cbl_seg.layout_type,
++			clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
++			clr->cb.cbl_recall_type,
++			clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length,
++			STATEID_VAL(cbl_sid));
 +	} else {
-+		status = ret;
++		dprintk("%s: type %x iomode %d changed %d recall_type %d\n",
++			__func__, clr->cb.cbl_seg.layout_type,
++			clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
++			clr->cb.cbl_recall_type);
 +	}
-+
-+	objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
-+	return status;
++	hdr->nops++;
 +}
 +
-+static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
++/*
++ * CB_NOTIFY_DEVICEID4args
++ *
++ *     typedef opaque notifylist4<>;
++ *
++ *     struct notify4 {
++ *             bitmap4         notify_mask;
++ *             notifylist4     notify_vals;
++ *     };
++ *
++ *     struct CB_NOTIFY_DEVICEID4args {
++ *             notify4 cnda_changes<>;
++ *     };
++ */
++static void encode_cb_device4args(struct xdr_stream *xdr,
++				  const struct nfs4_notify_device *nd,
++				  struct nfs4_cb_compound_hdr *hdr)
 +{
-+	struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
-+	unsigned dev = ios->per_dev[cur_comp].dev;
-+	unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1;
-+	int ret;
-+
-+	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
-+		struct osd_request *or = NULL;
-+		struct pnfs_osd_object_cred *cred =
-+					&ios->objio_seg->layout->olo_comps[dev];
-+		struct osd_obj_id obj = {
-+			.partition = cred->oc_object_id.oid_partition_id,
-+			.id = cred->oc_object_id.oid_object_id,
-+		};
-+		struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
-+		struct bio *bio;
++	u32 *p;
++	int i;
++	int len					= nd->nd_list->cbd_len;
++	struct nfsd4_pnfs_cb_dev_item *cbd	= nd->nd_list->cbd_list;
 +
-+		or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
-+		if (unlikely(!or)) {
-+			ret = -ENOMEM;
-+			goto err;
-+		}
-+		per_dev->or = or;
++	dprintk("NFSD %s: --> num %d\n", __func__, len);
 +
-+		if (per_dev != master_dev) {
-+			bio = bio_kmalloc(GFP_KERNEL,
-+					  master_dev->bio->bi_max_vecs);
-+			if (unlikely(!bio)) {
-+				dprintk("Faild to allocate BIO size=%u\n",
-+					master_dev->bio->bi_max_vecs);
-+				ret = -ENOMEM;
-+				goto err;
-+			}
++	BUG_ON(hdr->minorversion == 0);
 +
-+			__bio_clone(bio, master_dev->bio);
-+			bio->bi_bdev = NULL;
-+			bio->bi_next = NULL;
-+			per_dev->bio = bio;
-+			per_dev->dev = dev;
-+			per_dev->length = master_dev->length;
-+			per_dev->offset =  master_dev->offset;
-+		} else {
-+			bio = master_dev->bio;
-+			/* FIXME: bio_set_dir() */
-+			bio->bi_rw |= REQ_WRITE;
-+		}
++	p = xdr_reserve_space(xdr, 2 * 4);
++	*p++ = cpu_to_be32(OP_CB_NOTIFY_DEVICEID);
++	/* notify4 cnda_changes<>; */
++	*p = cpu_to_be32(len);
++	for (i = 0; i < len; i++) {
++		dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n",
++			__func__, cbd[i].cbd_notify_type,
++			cbd[i].cbd_layout_type,
++			cbd[i].cbd_devid.sbid,
++			cbd[i].cbd_devid.devid,
++			cbd[i].cbd_immediate, i);
 +
-+		osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
++		BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
++		       cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE);
++		p = xdr_reserve_space(xdr, 4 * 4 + 2 * 8);
++		/* bitmap4         notify_mask; */
++		*p++ = cpu_to_be32(1);
++		*p++ = cpu_to_be32(cbd[i].cbd_notify_type);
++		/* opaque     notify_vals<>; */
++		if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
++			*p++ = cpu_to_be32(24);
++		else
++			*p++ = cpu_to_be32(20);
++		*p++ = cpu_to_be32(cbd[i].cbd_layout_type);
++		p = xdr_encode_hyper(p, cbd[i].cbd_devid.sbid);
++		xdr_encode_hyper(p, cbd[i].cbd_devid.devid);
 +
-+		ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
-+		if (ret) {
-+			dprintk("%s: Faild to osd_finalize_request() => %d\n",
-+				__func__, ret);
-+			goto err;
++		if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
++			p = xdr_reserve_space(xdr, 4);
++			*p = cpu_to_be32(cbd[i].cbd_immediate);
 +		}
-+
-+		dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
-+			__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
-+			per_dev->length);
 +	}
-+
-+err:
-+	return ret;
++	hdr->nops++;
 +}
++#endif /* CONFIG_PNFSD */
 +
-+static ssize_t _write_exec(struct objio_state *ios)
+ /*
+  * CB_SEQUENCE4args
+  *
+@@ -484,7 +644,7 @@ static int decode_cb_sequence4res(struct
+ out:
+ 	return status;
+ out_default:
+-	return nfs_cb_stat_to_errno(status);
++	return nfs_cb_stat_to_errno(nfserr);
+ }
+ 
+ /*
+@@ -523,6 +683,39 @@ static void nfs4_xdr_enc_cb_recall(struc
+ 	encode_cb_nops(&hdr);
+ }
+ 
++#if defined(CONFIG_PNFSD)
++static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
++				   struct xdr_stream *xdr,
++				   const struct nfsd4_callback *cb)
 +{
-+	unsigned i;
-+	int ret;
++	const struct nfs4_layoutrecall *args = cb->cb_op;
++	struct nfs4_cb_compound_hdr hdr = {
++		.ident = 0,
++		.minorversion = cb->cb_minorversion,
++	};
 +
-+	for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
-+		if (!ios->per_dev[i].length)
-+			continue;
-+		ret = _write_mirrors(ios, i);
-+		if (unlikely(ret))
-+			goto err;
-+	}
++	encode_cb_compound4args(xdr, &hdr);
++	encode_cb_sequence4args(xdr, cb, &hdr);
++	encode_cb_layout4args(xdr, args, &hdr);
++	encode_cb_nops(&hdr);
++}
 +
-+	ios->done = _write_done;
-+	return _io_exec(ios); /* In sync mode exec returns the io->status */
++static void nfs4_xdr_enc_cb_device(struct rpc_rqst *req,
++				   struct xdr_stream *xdr,
++				   const struct nfsd4_callback *cb)
++{
++	struct nfs4_notify_device *args =  cb->cb_op;
++	struct nfs4_cb_compound_hdr hdr = {
++		.ident = 0,
++		.minorversion = cb->cb_minorversion,
++	};
 +
-+err:
-+	_io_free(ios);
-+	return ret;
++	encode_cb_compound4args(xdr, &hdr);
++	encode_cb_sequence4args(xdr, cb, &hdr);
++	encode_cb_device4args(xdr, args, &hdr);
++	encode_cb_nops(&hdr);
++}
++#endif /* CONFIG_PNFSD */
+ 
+ /*
+  * NFSv4.0 and NFSv4.1 XDR decode functions
+@@ -564,13 +757,63 @@ static int nfs4_xdr_dec_cb_recall(struct
+ 	if (unlikely(status))
+ 		goto out;
+ 	if (unlikely(nfserr != NFS4_OK))
+-		goto out_default;
++		status = nfs_cb_stat_to_errno(nfserr);
++out:
++	return status;
 +}
 +
-+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
++#if defined(CONFIG_PNFSD)
++static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
++				  struct xdr_stream *xdr,
++				  struct nfsd4_callback *cb)
 +{
-+	struct objio_state *ios = container_of(ol_state, struct objio_state,
-+					       ol_state);
-+	int ret;
++	struct nfs4_cb_compound_hdr hdr;
++	enum nfsstat4 nfserr;
++	int status;
 +
-+	/* TODO: ios->stable = stable; */
-+	ret = _io_rw_pagelist(ios);
-+	if (unlikely(ret))
-+		return ret;
++	status = decode_cb_compound4res(xdr, &hdr);
++	if (unlikely(status))
++		goto out;
++	if (cb) {
++		status = decode_cb_sequence4res(xdr, cb);
++		if (unlikely(status))
++			goto out;
++	}
++	status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
++	if (unlikely(status))
++		goto out;
++	if (unlikely(nfserr != NFS4_OK))
++		status = nfs_cb_stat_to_errno(nfserr);
+ out:
+ 	return status;
+-out_default:
+-	return nfs_cb_stat_to_errno(status);
+ }
+ 
++static int nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp,
++				  struct xdr_stream *xdr,
++				  struct nfsd4_callback *cb)
++{
++	struct nfs4_cb_compound_hdr hdr;
++	enum nfsstat4 nfserr;
++	int status;
 +
-+	return _write_exec(ios);
++	status = decode_cb_compound4res(xdr, &hdr);
++	if (unlikely(status))
++		goto out;
++	if (cb) {
++		status = decode_cb_sequence4res(xdr, cb);
++		if (unlikely(status))
++			goto out;
++	}
++	status = decode_cb_op_status(xdr, OP_CB_NOTIFY_DEVICEID, &nfserr);
++	if (unlikely(status))
++		goto out;
++	if (unlikely(nfserr != NFS4_OK))
++		status = nfs_cb_stat_to_errno(nfserr);
++out:
++	return status;
++}
++#endif /* CONFIG_PNFSD */
++
+ /*
+  * RPC procedure tables
+  */
+@@ -588,6 +831,10 @@ out_default:
+ static struct rpc_procinfo nfs4_cb_procedures[] = {
+ 	PROC(CB_NULL,	NULL,		cb_null,	cb_null),
+ 	PROC(CB_RECALL,	COMPOUND,	cb_recall,	cb_recall),
++#if defined(CONFIG_PNFSD)
++	PROC(CB_LAYOUT,	COMPOUND,	cb_layout,	cb_layout),
++	PROC(CB_DEVICE,	COMPOUND,	cb_device,	cb_device),
++#endif
+ };
+ 
+ static struct rpc_version nfs_cb_version4 = {
+@@ -785,11 +1032,10 @@ static bool nfsd41_cb_get_slot(struct nf
+  * TODO: cb_sequence should support referring call lists, cachethis, multiple
+  * slots, and mark callback channel down on communication errors.
+  */
+-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
++static void nfsd4_cb_prepare_sequence(struct rpc_task *task,
++				      struct nfsd4_callback *cb,
++				      struct nfs4_client *clp)
+ {
+-	struct nfsd4_callback *cb = calldata;
+-	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
+-	struct nfs4_client *clp = dp->dl_client;
+ 	u32 minorversion = clp->cl_minorversion;
+ 
+ 	cb->cb_minorversion = minorversion;
+@@ -807,12 +1053,17 @@ static void nfsd4_cb_prepare(struct rpc_
+ 	rpc_call_start(task);
+ }
+ 
+-static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata)
+ {
+ 	struct nfsd4_callback *cb = calldata;
+ 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
+-	struct nfs4_client *clp = dp->dl_client;
+ 
++	nfsd4_cb_prepare_sequence(task, cb, dp->dl_client);
 +}
 +
-+/*
-+ * Policy Operations
-+ */
-+
-+/*
-+ * Get the max [rw]size
-+ */
-+static ssize_t
-+objlayout_get_blocksize(void)
++static void nfsd4_cb_done_sequence(struct rpc_task *task,
++				   struct nfs4_client *clp)
 +{
-+	ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE;
+ 	dprintk("%s: minorversion=%d\n", __func__,
+ 		clp->cl_minorversion);
+ 
+@@ -837,7 +1088,7 @@ static void nfsd4_cb_recall_done(struct 
+ 	struct nfs4_client *clp = dp->dl_client;
+ 	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
+ 
+-	nfsd4_cb_done(task, calldata);
++	nfsd4_cb_done_sequence(task, clp);
+ 
+ 	if (current_rpc_client != task->tk_client) {
+ 		/* We're shutting down or changing cl_cb_client; leave
+@@ -886,7 +1137,7 @@ static void nfsd4_cb_recall_release(void
+ }
+ 
+ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
+-	.rpc_call_prepare = nfsd4_cb_prepare,
++	.rpc_call_prepare = nfsd4_cb_recall_prepare,
+ 	.rpc_call_done = nfsd4_cb_recall_done,
+ 	.rpc_release = nfsd4_cb_recall_release,
+ };
+@@ -1026,3 +1277,188 @@ void nfsd4_cb_recall(struct nfs4_delegat
+ 
+ 	run_nfsd4_cb(&dp->dl_recall);
+ }
 +
-+	return sz;
++#if defined(CONFIG_PNFSD)
++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata)
++{
++	struct nfsd4_callback *cb = calldata;
++	struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall);
++	
++	nfsd4_cb_prepare_sequence(task, cb, clr->clr_client);
 +}
 +
-+/*
-+ * Don't gather across stripes, but rather gather (coalesce) up to
-+ * the stripe size.
-+ *
-+ * FIXME: change interface to use merge_align, merge_count
-+ */
-+static struct pnfs_layoutdriver_type objlayout_type = {
-+	.id = LAYOUT_OSD2_OBJECTS,
-+	.name = "LAYOUT_OSD2_OBJECTS",
-+	.flags                   = PNFS_LAYOUTRET_ON_SETATTR,
++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata)
++{
++	struct nfsd4_callback *cb = calldata;
++	struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall);
++	struct nfs4_client *clp = clr->clr_client;
++	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
 +
-+	.initialize_mountpoint   = objlayout_initialize_mountpoint,
-+	.uninitialize_mountpoint = objlayout_uninitialize_mountpoint,
++	nfsd4_cb_done_sequence(task, clp);
 +
-+	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
-+	.free_layout_hdr         = objlayout_free_layout_hdr,
++	if (current_rpc_client != task->tk_client) {
++		/* We're shutting down or changing cl_cb_client; leave
++		 * it to nfsd4_process_cb_update to restart the call if
++		 * necessary. */
++		return;
++	}
 +
-+	.alloc_lseg              = objlayout_alloc_lseg,
-+	.free_lseg               = objlayout_free_lseg,
++	if (cb->cb_done)
++		return;
 +
-+	.get_blocksize           = objlayout_get_blocksize,
++	if (task->tk_status)
++		printk("%s: clp %p cb_client %p fp %p failed with status %d\n",
++		       __func__,
++		       clp,
++		       clp->cl_cb_client,
++		       clr->clr_file,
++		       task->tk_status);
 +
-+	.read_pagelist           = objlayout_read_pagelist,
-+	.write_pagelist          = objlayout_write_pagelist,
-+	.commit                  = objlayout_commit,
++	switch (task->tk_status) {
++	case 0:
++		goto done;
 +
-+	.encode_layoutcommit	 = objlayout_encode_layoutcommit,
-+	.encode_layoutreturn     = objlayout_encode_layoutreturn,
-+};
++	case -NFS4ERR_NOMATCHING_LAYOUT:
++		task->tk_status = 0;
++		nomatching_layout(clr);
++		goto done;
 +
-+void *objio_init_mt(void)
-+{
-+	struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL);
++	case -NFS4ERR_DELAY:
++		/* Poll the client until it's done with the layout */
++		/* FIXME: cap number of retries.
++		 * The pnfs standard states that we need to only expire
++		 * the client after at least "lease time", e.g. lease-time * 2,
++		 * when failing to communicate a recall
++		 */
++		rpc_delay(task, HZ/100); /* 10 mili-seconds */
++		task->tk_status = 0;
++		rpc_restart_call_prepare(task);
++		return;
 +
-+	if (!omt)
-+		return ERR_PTR(-ENOMEM);
++	case -NFS4ERR_BADHANDLE:
++		/* FIXME: handle more gracefully */
++		goto done;
++
++	case -NFS4ERR_BAD_STATEID:
++	case -NFS4ERR_BADIOMODE:
++	case -NFS4ERR_BADXDR:
++	case -NFS4ERR_INVAL:
++	case -NFS4ERR_NOTSUPP:
++	case -NFS4ERR_OP_NOT_IN_SESSION:
++	case -NFS4ERR_REP_TOO_BIG:
++	case -NFS4ERR_REP_TOO_BIG_TO_CACHE:
++	case -NFS4ERR_REQ_TOO_BIG:
++	case -NFS4ERR_RETRY_UNCACHED_REP:
++	case -NFS4ERR_TOO_MANY_OPS:
++	case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
++	case -NFS4ERR_WRONG_TYPE:
++		/* We should never get these, yet it could be a result of a
++		 * buggy client, therefore no BUG here.
++		 */
++		goto done;
 +
-+	INIT_LIST_HEAD(&omt->dev_list);
-+	spin_lock_init(&omt->dev_list_lock);
-+	return omt;
-+}
++	default:
++		break;
++	}
 +
-+void objio_fini_mt(void *mountid)
-+{
-+	_dev_list_remove_all(mountid);
-+	kfree(mountid);
++	/* Network partition? */
++	nfsd4_mark_cb_down(clp, task->tk_status);
++done:
++	cb->cb_done = true;
 +}
 +
-+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
-+MODULE_AUTHOR("Benny Halevy <bhalevy at panasas.com>");
-+MODULE_LICENSE("GPL");
-+
-+static int __init
-+objlayout_init(void)
++static void nfsd4_cb_layout_release(void *calldata)
 +{
-+	int ret = pnfs_register_layoutdriver(&objlayout_type);
++	struct nfsd4_callback *cb = calldata;
++	struct nfs4_layoutrecall *clr = container_of(cb, struct nfs4_layoutrecall, clr_recall);
 +
-+	if (ret)
-+		printk(KERN_INFO
-+			"%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
-+			__func__, ret);
-+	else
-+		printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
-+			__func__);
-+	return ret;
++	put_layoutrecall(clr);
 +}
 +
-+static void __exit
-+objlayout_exit(void)
-+{
-+	pnfs_unregister_layoutdriver(&objlayout_type);
-+	printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
-+	       __func__);
-+}
++static const struct rpc_call_ops nfsd4_cb_layout_ops = {
++	.rpc_call_prepare = nfsd4_cb_layout_prepare,
++	.rpc_call_done = nfsd4_cb_layout_done,
++	.rpc_release = nfsd4_cb_layout_release,
++};
 +
-+module_init(objlayout_init);
-+module_exit(objlayout_exit);
-diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
-new file mode 100644
-index 0000000..b647577
---- /dev/null
-+++ b/fs/nfs/objlayout/objlayout.c
-@@ -0,0 +1,773 @@
 +/*
-+ *  objlayout.c
-+ *
-+ *  pNFS layout driver for Panasas OSDs
-+ *
-+ *  Copyright (C) 2007-2009 Panasas Inc.
-+ *  All rights reserved.
-+ *
-+ *  Benny Halevy <bhalevy at panasas.com>
-+ *  Boaz Harrosh <bharrosh at panasas.com>
-+ *
-+ *  This program is free software; you can redistribute it and/or modify
-+ *  it under the terms of the GNU General Public License version 2
-+ *  See the file COPYING included with this distribution for more details.
-+ *
-+ *  Redistribution and use in source and binary forms, with or without
-+ *  modification, are permitted provided that the following conditions
-+ *  are met:
-+ *
-+ *  1. Redistributions of source code must retain the above copyright
-+ *     notice, this list of conditions and the following disclaimer.
-+ *  2. Redistributions in binary form must reproduce the above copyright
-+ *     notice, this list of conditions and the following disclaimer in the
-+ *     documentation and/or other materials provided with the distribution.
-+ *  3. Neither the name of the Panasas company nor the names of its
-+ *     contributors may be used to endorse or promote products derived
-+ *     from this software without specific prior written permission.
-+ *
-+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ * Called with state lock.
 + */
++void
++nfsd4_cb_layout(struct nfs4_layoutrecall *clr)
++{
++	struct nfsd4_callback *cb = &clr->clr_recall;
 +
-+#include <scsi/osd_initiator.h>
-+#include "objlayout.h"
++	cb->cb_op = clr;
++	cb->cb_clp = clr->clr_client;
++	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT];
++	cb->cb_msg.rpc_argp = cb;
++	cb->cb_msg.rpc_resp = cb;
++	cb->cb_msg.rpc_cred = callback_cred;
 +
-+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
++	cb->cb_ops = &nfsd4_cb_layout_ops;
++	run_nfsd4_cb(cb);
++}
 +
-+struct pnfs_client_operations *pnfs_client_ops;
++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata)
++{
++	struct nfsd4_callback *cb = calldata;
++	struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall);
 +
-+/*
-+ * Create a objlayout layout structure for the given inode and return it.
-+ */
-+struct pnfs_layout_hdr *
-+objlayout_alloc_layout_hdr(struct inode *inode)
++	nfsd4_cb_prepare_sequence(task, cb, cbnd->nd_client);
++}
++
++static void nfsd4_cb_device_done(struct rpc_task *task, void *calldata)
 +{
-+	struct objlayout *objlay;
++	struct nfsd4_callback *cb = calldata;
++	struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall);
++	struct nfs4_client *clp = cbnd->nd_client;
 +
-+	objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL);
-+	if (objlay) {
-+		spin_lock_init(&objlay->lock);
-+		INIT_LIST_HEAD(&objlay->err_list);
++	nfsd4_cb_done_sequence(task, clp);
++
++	dprintk("%s: clp %p cb_client %p: status %d\n",
++	       __func__,
++	       clp,
++	       clp->cl_cb_client,
++	       task->tk_status);
++
++	if (task->tk_status == -EIO) {
++		/* Network partition? */
++		nfsd4_mark_cb_down(clp, task->tk_status);
 +	}
-+	dprintk("%s: Return %p\n", __func__, objlay);
-+	return &objlay->pnfs_layout;
++	cb->cb_done = true;
 +}
 +
-+/*
-+ * Free an objlayout layout structure
-+ */
-+void
-+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
++static void nfsd4_cb_device_release(void *calldata)
 +{
-+	struct objlayout *objlay = OBJLAYOUT(lo);
-+
-+	dprintk("%s: objlay %p\n", __func__, objlay);
++	struct nfsd4_callback *cb = calldata;
++	struct nfs4_notify_device *cbnd = container_of(cb, struct nfs4_notify_device, nd_recall);
 +
-+	WARN_ON(!list_empty(&objlay->err_list));
-+	kfree(objlay);
++	kfree(cbnd);
 +}
 +
++static const struct rpc_call_ops nfsd4_cb_device_ops = {
++	.rpc_call_prepare = nfsd4_cb_device_prepare,
++	.rpc_call_done = nfsd4_cb_device_done,
++	.rpc_release = nfsd4_cb_device_release,
++};
++
 +/*
-+ * Unmarshall layout and store it in pnfslay.
++ * Called with state lock.
 + */
-+struct pnfs_layout_segment *
-+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
-+		     struct nfs4_layoutget_res *lgr)
++void
++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd)
 +{
-+	int status;
-+	void *layout = lgr->layout.buf;
-+	struct objlayout_segment *objlseg;
-+	struct pnfs_osd_layout *pnfs_osd_layout;
++	struct nfsd4_callback *cb = &cbnd->nd_recall;
 +
-+	dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout);
++	cb->cb_op = cbnd;
++	cb->cb_clp = cbnd->nd_client;
++	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE];
++	cb->cb_msg.rpc_argp = cb;
++	cb->cb_msg.rpc_resp = cb;
++	cb->cb_msg.rpc_cred = callback_cred;
 +
-+	BUG_ON(!layout);
++	cb->cb_ops = &nfsd4_cb_device_ops;
++	run_nfsd4_cb(cb);
++}
++#endif /* CONFIG_PNFSD */
+diff -up linux-2.6.37.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4pnfsd.c
+--- linux-2.6.37.noarch/fs/nfsd/nfs4pnfsd.c.orig	2011-01-28 09:43:53.353770077 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfs4pnfsd.c	2011-01-28 09:43:53.353770077 -0500
+@@ -0,0 +1,1688 @@
++/******************************************************************************
++ *
++ * (c) 2007 Network Appliance, Inc.  All Rights Reserved.
++ * (c) 2009 NetApp.  All Rights Reserved.
++ *
++ * NetApp provides this source code under the GPL v2 License.
++ * The GPL v2 license is available at
++ * http://opensource.org/licenses/gpl-license.php.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ *****************************************************************************/
 +
-+	status = -ENOMEM;
-+	objlseg = kzalloc(sizeof(*objlseg) +
-+			  pnfs_osd_layout_incore_sz(layout), GFP_KERNEL);
-+	if (!objlseg)
-+		goto err;
++#include "pnfsd.h"
 +
-+	pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout;
-+	pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout);
++#define NFSDDBG_FACILITY                NFSDDBG_PROC
 +
-+	status = objio_alloc_lseg(&objlseg->internal, pnfslay, &objlseg->lseg,
-+				  pnfs_osd_layout);
-+	if (status)
-+		goto err;
++/* Globals */
++static u32 current_layoutid = 1;
 +
-+	dprintk("%s: Return %p\n", __func__, &objlseg->lseg);
-+	return &objlseg->lseg;
++/*
++ * Currently used for manipulating the layout state.
++ */
++static DEFINE_SPINLOCK(layout_lock);
 +
-+ err:
-+	kfree(objlseg);
-+	return ERR_PTR(status);
-+}
++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP)
++#  define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock))
++#else
++#  define BUG_ON_UNLOCKED_LAYOUT()
++#endif
 +
 +/*
-+ * Free a layout segement
++ * Layout state - NFSv4.1 pNFS
 + */
-+void
-+objlayout_free_lseg(struct pnfs_layout_segment *lseg)
-+{
-+	struct objlayout_segment *objlseg;
++static struct kmem_cache *pnfs_layout_slab;
++static struct kmem_cache *pnfs_layoutrecall_slab;
 +
-+	dprintk("%s: freeing layout segment %p\n", __func__, lseg);
++/* hash table for nfsd4_pnfs_deviceid.sbid */
++#define SBID_HASH_BITS	8
++#define SBID_HASH_SIZE	(1 << SBID_HASH_BITS)
++#define SBID_HASH_MASK	(SBID_HASH_SIZE - 1)
 +
-+	if (unlikely(!lseg))
-+		return;
++struct sbid_tracker {
++	u64 id;
++	struct super_block *sb;
++	struct list_head hash;
++};
 +
-+	objlseg = container_of(lseg, struct objlayout_segment, lseg);
-+	objio_free_lseg(objlseg->internal);
-+	kfree(objlseg);
-+}
++static u64 current_sbid;
++static struct list_head sbid_hashtbl[SBID_HASH_SIZE];
 +
-+/*
-+ * I/O Operations
-+ */
-+static inline u64
-+end_offset(u64 start, u64 len)
++static inline unsigned long
++sbid_hashval(struct super_block *sb)
 +{
-+	u64 end;
-+
-+	end = start + len;
-+	return end >= start ? end : NFS4_MAX_UINT64;
++	return hash_ptr(sb, SBID_HASH_BITS);
 +}
 +
-+/* last octet in a range */
-+static inline u64
-+last_byte_offset(u64 start, u64 len)
++static inline struct sbid_tracker *
++alloc_sbid(void)
 +{
-+	u64 end;
++	return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL);
++}
 +
-+	BUG_ON(!len);
-+	end = start + len;
-+	return end > start ? end - 1 : NFS4_MAX_UINT64;
++static void
++destroy_sbid(struct sbid_tracker *sbid)
++{
++	spin_lock(&layout_lock);
++	list_del(&sbid->hash);
++	spin_unlock(&layout_lock);
++	kfree(sbid);
 +}
 +
-+static struct objlayout_io_state *
-+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
-+			struct page **pages,
-+			unsigned pgbase,
-+			unsigned nr_pages,
-+			loff_t offset,
-+			size_t count,
-+			struct pnfs_layout_segment *lseg,
-+			void *rpcdata)
++void
++nfsd4_free_pnfs_slabs(void)
 +{
-+	struct objlayout_segment *objlseg =
-+		container_of(lseg, struct objlayout_segment, lseg);
-+	struct objlayout_io_state *state;
-+	u64 lseg_end_offset;
-+	size_t size_nr_pages;
++	int i;
++	struct sbid_tracker *sbid;
 +
-+	dprintk("%s: allocating io_state\n", __func__);
-+	if (objio_alloc_io_state(objlseg->internal, &state))
-+		return NULL;
++	nfsd4_free_slab(&pnfs_layout_slab);
++	nfsd4_free_slab(&pnfs_layoutrecall_slab);
 +
-+	BUG_ON(offset < lseg->range.offset);
-+	lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length);
-+	BUG_ON(offset >= lseg_end_offset);
-+	if (offset + count > lseg_end_offset) {
-+		count = lseg->range.length - (offset - lseg->range.offset);
-+		dprintk("%s: truncated count %Zd\n", __func__, count);
++	for (i = 0; i < SBID_HASH_SIZE; i++) {
++		while (!list_empty(&sbid_hashtbl[i])) {
++			sbid = list_first_entry(&sbid_hashtbl[i],
++						struct sbid_tracker,
++						hash);
++			destroy_sbid(sbid);
++		}
 +	}
++}
 +
-+	if (pgbase > PAGE_SIZE) {
-+		unsigned n = pgbase >> PAGE_SHIFT;
-+
-+		pgbase &= ~PAGE_MASK;
-+		pages += n;
-+		nr_pages -= n;
-+	}
++int
++nfsd4_init_pnfs_slabs(void)
++{
++	int i;
 +
-+	size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
-+	BUG_ON(nr_pages < size_nr_pages);
-+	if (nr_pages > size_nr_pages)
-+		nr_pages = size_nr_pages;
++	pnfs_layout_slab = kmem_cache_create("pnfs_layouts",
++			sizeof(struct nfs4_layout), 0, 0, NULL);
++	if (pnfs_layout_slab == NULL)
++		return -ENOMEM;
++	pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls",
++			sizeof(struct nfs4_layoutrecall), 0, 0, NULL);
++	if (pnfs_layoutrecall_slab == NULL)
++		return -ENOMEM;
 +
-+	INIT_LIST_HEAD(&state->err_list);
-+	state->objlseg = objlseg;
-+	state->rpcdata = rpcdata;
-+	state->pages = pages;
-+	state->pgbase = pgbase;
-+	state->nr_pages = nr_pages;
-+	state->offset = offset;
-+	state->count = count;
-+	state->sync = 0;
++	for (i = 0; i < SBID_HASH_SIZE; i++) {
++		INIT_LIST_HEAD(&sbid_hashtbl[i]);
++	}
 +
-+	return state;
++	return 0;
 +}
 +
-+static void
-+objlayout_free_io_state(struct objlayout_io_state *state)
++/* XXX: Need to implement the notify types and track which
++ * clients have which devices. */
++void pnfs_set_device_notify(clientid_t *clid, unsigned int types)
 +{
-+	dprintk("%s: freeing io_state\n", __func__);
-+	if (unlikely(!state))
-+		return;
++	struct nfs4_client *clp;
++	dprintk("%s: -->\n", __func__);
 +
-+	objio_free_io_state(state);
++	nfs4_lock_state();
++	/* Indicate that the client has a device so that we notify
++	 * only the correct clients */
++	clp = find_confirmed_client(clid);
++	if (clp) {
++		atomic_inc(&clp->cl_deviceref);
++		dprintk("%s: Incr device count (clnt %p) to %d\n",
++			__func__, clp, atomic_read(&clp->cl_deviceref));
++	}
++	nfs4_unlock_state();
 +}
 +
-+/*
-+ * I/O done common code
-+ */
-+static void
-+objlayout_iodone(struct objlayout_io_state *state)
++/* Clear notifications for this client
++ * XXX: Do we need to loop through and clean up all
++ *      krefs when nfsd cleans up the client? */
++void pnfs_clear_device_notify(struct nfs4_client *clp)
 +{
-+	dprintk("%s: state %p status\n", __func__, state);
-+
-+	if (likely(state->status >= 0)) {
-+		objlayout_free_io_state(state);
-+	} else {
-+		struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.layout);
-+
-+		spin_lock(&objlay->lock);
-+		objlay->delta_space_valid = OBJ_DSU_INVALID;
-+		list_add(&objlay->err_list, &state->err_list);
-+		spin_unlock(&objlay->lock);
-+	}
++	atomic_dec(&clp->cl_deviceref);
++	dprintk("%s: Decr device count (clnt %p) to %d\n",
++		__func__, clp, atomic_read(&clp->cl_deviceref));
 +}
 +
-+/*
-+ * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
-+ *
-+ * The @index component IO failed (error returned from target). Register
-+ * the error for later reporting at layout-return.
-+ */
-+void
-+objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
-+			int osd_error, u64 offset, u64 length, bool is_write)
++static struct nfs4_layout_state *
++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp,
++			stateid_t *stateid)
 +{
-+	struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
-+
-+	BUG_ON(index >= state->num_comps);
-+	if (osd_error) {
-+		struct pnfs_osd_layout *layout =
-+			(typeof(layout))state->objlseg->pnfs_osd_layout;
-+
-+		ioerr->oer_component = layout->olo_comps[index].oc_object_id;
-+		ioerr->oer_comp_offset = offset;
-+		ioerr->oer_comp_length = length;
-+		ioerr->oer_iswrite = is_write;
-+		ioerr->oer_errno = osd_error;
++	struct nfs4_layout_state *new;
 +
-+		dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
-+			"par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
-+			__func__, index, ioerr->oer_errno,
-+			ioerr->oer_iswrite,
-+			_DEVID_LO(&ioerr->oer_component.oid_device_id),
-+			_DEVID_HI(&ioerr->oer_component.oid_device_id),
-+			ioerr->oer_component.oid_partition_id,
-+			ioerr->oer_component.oid_object_id,
-+			ioerr->oer_comp_offset,
-+			ioerr->oer_comp_length);
-+	} else {
-+		/* User need not call if no error is reported */
-+		ioerr->oer_errno = 0;
-+	}
++	/* FIXME: use a kmem_cache */
++	new = kzalloc(sizeof(*new), GFP_KERNEL);
++	if (!new)
++		return new;
++	get_nfs4_file(fp);
++	INIT_LIST_HEAD(&new->ls_perfile);
++	INIT_LIST_HEAD(&new->ls_layouts);
++	kref_init(&new->ls_ref);
++	new->ls_client = clp;
++	new->ls_file = fp;
++	new->ls_stateid.si_boot = stateid->si_boot;
++	new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */
++	new->ls_stateid.si_generation = 1;
++	spin_lock(&layout_lock);
++	new->ls_stateid.si_fileid = current_layoutid++;
++	list_add(&new->ls_perfile, &fp->fi_layout_states);
++	spin_unlock(&layout_lock);
++	return new;
 +}
 +
-+static void _rpc_commit_complete(struct work_struct *work)
++static inline void
++get_layout_state(struct nfs4_layout_state *ls)
 +{
-+	struct rpc_task *task;
-+	struct nfs_write_data *wdata;
++	kref_get(&ls->ls_ref);
++}
 +
-+	dprintk("%s enter\n", __func__);
-+	task = container_of(work, struct rpc_task, u.tk_work);
-+	wdata = container_of(task, struct nfs_write_data, task);
++static void
++destroy_layout_state_common(struct nfs4_layout_state *ls)
++{
++	struct nfs4_file *fp = ls->ls_file;
 +
-+	pnfs_commit_done(wdata);
++	dprintk("pNFS %s: ls %p fp %p clp %p\n", __func__, ls, fp,
++		ls->ls_client);
++	BUG_ON(!list_empty(&ls->ls_layouts));
++	kfree(ls);
++	put_nfs4_file(fp);
 +}
 +
-+/*
-+ * Commit data remotely on OSDs
-+ */
-+enum pnfs_try_status
-+objlayout_commit(struct nfs_write_data *wdata, int how)
++static void
++destroy_layout_state(struct kref *kref)
 +{
-+	int status = PNFS_ATTEMPTED;
++	struct nfs4_layout_state *ls =
++			container_of(kref, struct nfs4_layout_state, ls_ref);
 +
-+	INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete);
-+	schedule_work(&wdata->task.u.tk_work);
-+	dprintk("%s: Return %d\n", __func__, status);
-+	return status;
++	spin_lock(&layout_lock);
++	list_del(&ls->ls_perfile);
++	spin_unlock(&layout_lock);
++	destroy_layout_state_common(ls);
 +}
 +
-+/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
-+ * This is because the osd completion is called with ints-off from
-+ * the block layer
-+ */
-+static void _rpc_read_complete(struct work_struct *work)
++static void
++destroy_layout_state_locked(struct kref *kref)
 +{
-+	struct rpc_task *task;
-+	struct nfs_read_data *rdata;
++	struct nfs4_layout_state *ls =
++			container_of(kref, struct nfs4_layout_state, ls_ref);
 +
-+	dprintk("%s enter\n", __func__);
-+	task = container_of(work, struct rpc_task, u.tk_work);
-+	rdata = container_of(task, struct nfs_read_data, task);
++	list_del(&ls->ls_perfile);
++	destroy_layout_state_common(ls);
++}
 +
-+	pnfs_read_done(rdata);
++static inline void
++put_layout_state(struct nfs4_layout_state *ls)
++{
++	dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
++		atomic_read(&ls->ls_ref.refcount));
++	kref_put(&ls->ls_ref, destroy_layout_state);
 +}
 +
-+void
-+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
++static inline void
++put_layout_state_locked(struct nfs4_layout_state *ls)
 +{
-+	int eof = state->eof;
-+	struct nfs_read_data *rdata;
++	dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
++		atomic_read(&ls->ls_ref.refcount));
++	kref_put(&ls->ls_ref, destroy_layout_state_locked);
++}
 +
-+	state->status = status;
-+	dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
-+	rdata = state->rpcdata;
-+	rdata->task.tk_status = status;
-+	if (status >= 0) {
-+		rdata->res.count = status;
-+		rdata->res.eof = eof;
-+	}
-+	objlayout_iodone(state);
-+	/* must not use state after this point */
++/*
++ * Search the fp->fi_layout_states list for a layout state of the given client.
++ * If not found, then this is a 'first open/delegation/lock stateid' from
++ * the client for this file.
++ * Called under the layout_lock.
++ */
++static struct nfs4_layout_state *
++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp)
++{
++	struct nfs4_layout_state *ls;
 +
-+	if (sync)
-+		pnfs_read_done(rdata);
-+	else {
-+		INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
-+		schedule_work(&rdata->task.u.tk_work);
++	BUG_ON_UNLOCKED_LAYOUT();
++	list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) {
++		if (ls->ls_client == clp) {
++			dprintk("pNFS %s: before GET ls %p ls_ref %d\n",
++				__func__, ls,
++				atomic_read(&ls->ls_ref.refcount));
++			get_layout_state(ls);
++			return ls;
++		}
 +	}
++	return NULL;
++}
++
++static __be32
++verify_stateid(struct nfs4_file *fp, stateid_t *stateid)
++{
++	struct nfs4_stateid *local = NULL;
++	struct nfs4_delegation *temp = NULL;
++
++	/* check if open or lock stateid */
++	local = find_stateid(stateid, RD_STATE);
++	if (local)
++		return 0;
++	temp = find_delegation_stateid(fp->fi_inode, stateid);
++	if (temp)
++		return 0;
++	return nfserr_bad_stateid;
 +}
 +
 +/*
-+ * Perform sync or async reads.
++ * nfs4_process_layout_stateid()
++ *
++ * We have looked up the nfs4_file corresponding to the current_fh, and
++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op()
++ * that make sense with a layout stateid.
++ *
++ * Called with the state_lock held
++ * Returns zero and stateid is updated, or error.
++ *
++ * Note: the struct nfs4_layout_state pointer is only set by layoutget.
 + */
-+enum pnfs_try_status
-+objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages)
++static __be32
++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp,
++			    stateid_t *stateid, struct nfs4_layout_state **lsp)
 +{
-+	loff_t offset = rdata->args.offset;
-+	size_t count = rdata->args.count;
-+	struct objlayout_io_state *state;
-+	ssize_t status = 0;
-+	loff_t eof;
++	struct nfs4_layout_state *ls = NULL;
++	__be32 status = 0;
 +
-+	dprintk("%s: Begin inode %p offset %llu count %d\n",
-+		__func__, rdata->inode, offset, (int)count);
++	dprintk("--> %s clp %p fp %p \n", __func__, clp, fp);
 +
-+	eof = i_size_read(rdata->inode);
-+	if (unlikely(offset + count > eof)) {
-+		if (offset >= eof) {
-+			status = 0;
-+			rdata->res.count = 0;
-+			rdata->res.eof = 1;
++	dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__,
++		STATEID_VAL(stateid));
++
++	status = nfs4_check_stateid(stateid);
++	if (status)
++		goto out;
++
++	/* Is this the first use of this layout ? */
++	spin_lock(&layout_lock);
++	ls = find_get_layout_state(clp, fp);
++	spin_unlock(&layout_lock);
++	if (!ls) {
++		/* Only alloc layout state on layoutget (which sets lsp). */
++		if (!lsp) {
++			dprintk("%s ERROR: Not layoutget & no layout stateid\n",
++				__func__);
++			status = nfserr_bad_stateid;
 +			goto out;
 +		}
-+		count = eof - offset;
++		dprintk("%s Initial stateid for layout: file %p client %p\n",
++			__func__, fp, clp);
++
++		/* verify input stateid */
++		status = verify_stateid(fp, stateid);
++		if (status) {
++			dprintk("%s ERROR: invalid open/deleg/lock stateid\n",
++				__func__);
++			goto out;
++		}
++		ls = alloc_init_layout_state(clp, fp, stateid);
++		if (!ls) {
++			dprintk("%s pNFS ERROR: no memory for layout state\n",
++				__func__);
++			status = nfserr_resource;
++			goto out;
++		}
++	} else {
++		dprintk("%s Not initial stateid. Layout state %p file %p\n",
++			__func__, ls, fp);
++
++		/* BAD STATEID */
++		status = nfserr_bad_stateid;
++		if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque,
++			sizeof(stateid_opaque_t)) != 0) {
++
++			/* if a LAYOUTGET operation and stateid is a valid
++			 * open/deleg/lock stateid, accept it as a parallel
++			 * initial layout stateid
++			 */
++			if (lsp && ((verify_stateid(fp, stateid)) == 0)) {
++				dprintk("%s parallel initial layout state\n",
++					__func__);
++				goto verified;
++			}
++
++			dprintk("%s ERROR bad opaque in stateid 1\n", __func__);
++			goto out_put;
++		}
++
++		/* stateid is a valid layout stateid for this file. */
++		if (stateid->si_generation > ls->ls_stateid.si_generation) {
++			dprintk("%s bad stateid 1\n", __func__);
++			goto out_put;
++		}
 +	}
++verified:
++	status = 0;
 +
-+	state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
-+					 rdata->args.pages, rdata->args.pgbase,
-+					 nr_pages, offset, count,
-+					 rdata->pdata.lseg, rdata);
-+	if (unlikely(!state)) {
-+		status = -ENOMEM;
-+		goto out;
++	/* Return the layout state if requested */
++	if (lsp) {
++		get_layout_state(ls);
++		*lsp = ls;
 +	}
++	dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__,
++		STATEID_VAL(&ls->ls_stateid));
++out_put:
++	dprintk("%s PUT LO STATE:\n", __func__);
++	put_layout_state(ls);
++out:
++	dprintk("<-- %s status %d\n", __func__, htonl(status));
 +
-+	state->eof = state->offset + state->count >= eof;
++	return status;
++}
 +
-+	status = objio_read_pagelist(state);
-+ out:
-+	dprintk("%s: Return status %Zd\n", __func__, status);
-+	rdata->pdata.pnfs_error = status;
-+	return PNFS_ATTEMPTED;
++static inline struct nfs4_layout *
++alloc_layout(void)
++{
++	return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL);
 +}
 +
-+/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
-+ * This is because the osd completion is called with ints-off from
-+ * the block layer
-+ */
-+static void _rpc_write_complete(struct work_struct *work)
++static inline void
++free_layout(struct nfs4_layout *lp)
 +{
-+	struct rpc_task *task;
-+	struct nfs_write_data *wdata;
-+
-+	dprintk("%s enter\n", __func__);
-+	task = container_of(work, struct rpc_task, u.tk_work);
-+	wdata = container_of(task, struct nfs_write_data, task);
++	kmem_cache_free(pnfs_layout_slab, lp);
++}
 +
-+	pnfs_writeback_done(wdata);
++#define update_layout_stateid(ls, sid) { \
++	update_stateid(&(ls)->ls_stateid); \
++	dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", \
++		__func__, (ls)->ls_stateid.si_generation, (ls)); \
++	memcpy((sid), &(ls)->ls_stateid, sizeof(stateid_t)); \
 +}
 +
-+void
-+objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
-+		     bool sync)
++static void
++init_layout(struct nfs4_layout_state *ls,
++	    struct nfs4_layout *lp,
++	    struct nfs4_file *fp,
++	    struct nfs4_client *clp,
++	    struct svc_fh *current_fh,
++	    struct nfsd4_layout_seg *seg,
++	    stateid_t *stateid)
 +{
-+	struct nfs_write_data *wdata;
-+
-+	dprintk("%s: Begin\n", __func__);
-+	wdata = state->rpcdata;
-+	state->status = status;
-+	wdata->task.tk_status = status;
-+	if (status >= 0) {
-+		wdata->res.count = status;
-+		wdata->verf.committed = state->committed;
-+		dprintk("%s: Return status %d committed %d\n",
-+			__func__, wdata->task.tk_status,
-+			wdata->verf.committed);
-+	} else
-+		dprintk("%s: Return status %d\n",
-+			__func__, wdata->task.tk_status);
-+	objlayout_iodone(state);
-+	/* must not use state after this point */
++	dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__,
++		ls, lp, clp, fp, fp->fi_inode);
 +
-+	if (sync)
-+		pnfs_writeback_done(wdata);
-+	else {
-+		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
-+		schedule_work(&wdata->task.u.tk_work);
-+	}
++	get_nfs4_file(fp);
++	lp->lo_client = clp;
++	lp->lo_file = fp;
++	get_layout_state(ls);
++	lp->lo_state = ls;
++	memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg));
++	spin_lock(&layout_lock);
++	update_layout_stateid(ls, stateid);
++	list_add_tail(&lp->lo_perstate, &ls->ls_layouts);
++	list_add_tail(&lp->lo_perclnt, &clp->cl_layouts);
++	list_add_tail(&lp->lo_perfile, &fp->fi_layouts);
++	spin_unlock(&layout_lock);
++	dprintk("pNFS %s end\n", __func__);
 +}
 +
-+/*
-+ * Perform sync or async writes.
-+ */
-+enum pnfs_try_status
-+objlayout_write_pagelist(struct nfs_write_data *wdata,
-+			 unsigned nr_pages,
-+			 int how)
++static void
++dequeue_layout(struct nfs4_layout *lp)
 +{
-+	struct objlayout_io_state *state;
-+	ssize_t status;
-+
-+	dprintk("%s: Begin inode %p offset %llu count %u\n",
-+		__func__, wdata->inode, wdata->args.offset, wdata->args.count);
++	BUG_ON_UNLOCKED_LAYOUT();
++	list_del(&lp->lo_perclnt);
++	list_del(&lp->lo_perfile);
++	list_del(&lp->lo_perstate);
++}
 +
-+	state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
-+					 wdata->args.pages,
-+					 wdata->args.pgbase,
-+					 nr_pages,
-+					 wdata->args.offset,
-+					 wdata->args.count,
-+					 wdata->pdata.lseg, wdata);
-+	if (unlikely(!state)) {
-+		status = -ENOMEM;
-+		goto out;
-+	}
++static void
++destroy_layout(struct nfs4_layout *lp)
++{
++	struct nfs4_client *clp;
++	struct nfs4_file *fp;
++	struct nfs4_layout_state *ls;
 +
-+	state->sync = how & FLUSH_SYNC;
++	BUG_ON_UNLOCKED_LAYOUT();
++	clp = lp->lo_client;
++	fp = lp->lo_file;
++	ls = lp->lo_state;
++	dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n",
++		__func__, lp, clp, fp, fp->fi_inode,
++		list_empty(&ls->ls_layouts));
 +
-+	status = objio_write_pagelist(state, how & FLUSH_STABLE);
-+ out:
-+	dprintk("%s: Return status %Zd\n", __func__, status);
-+	wdata->pdata.pnfs_error = status;
-+	return PNFS_ATTEMPTED;
++	kmem_cache_free(pnfs_layout_slab, lp);
++	/* release references taken by init_layout */
++	put_layout_state_locked(ls);
++	put_nfs4_file(fp);
 +}
 +
-+void
-+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
-+			      struct xdr_stream *xdr,
-+			      const struct nfs4_layoutcommit_args *args)
++void fs_layout_return(struct super_block *sb, struct inode *ino,
++		      struct nfsd4_pnfs_layoutreturn *lrp, int flags,
++		      void *recall_cookie)
 +{
-+	struct objlayout *objlay = OBJLAYOUT(pnfslay);
-+	struct pnfs_osd_layoutupdate lou;
-+	__be32 *start;
-+
-+	dprintk("%s: Begin\n", __func__);
-+
-+	spin_lock(&objlay->lock);
-+	lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
-+	lou.dsu_delta = objlay->delta_space_used;
-+	objlay->delta_space_used = 0;
-+	objlay->delta_space_valid = OBJ_DSU_INIT;
-+	lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
-+	spin_unlock(&objlay->lock);
++	int ret;
 +
-+	start = xdr_reserve_space(xdr, 4);
++	if (unlikely(!sb->s_pnfs_op->layout_return))
++		return;
 +
-+	BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
++	lrp->lr_flags = flags;
++	lrp->args.lr_cookie = recall_cookie;
 +
-+	*start = cpu_to_be32((xdr->p - start - 1) * 4);
++	if (!ino) /* FSID or ALL */
++		ino = sb->s_root->d_inode;
 +
-+	dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
-+		lou.dsu_delta, lou.olu_ioerr_flag);
++	ret = sb->s_pnfs_op->layout_return(ino, &lrp->args);
++	dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx "
++		"cookie = %p flags 0x%x status=%d\n",
++		__func__, ino->i_ino, lrp->args.lr_seg.iomode,
++		lrp->args.lr_seg.offset, lrp->args.lr_seg.length,
++		recall_cookie, flags, ret);
 +}
 +
-+static int
-+err_prio(u32 oer_errno)
++static u64
++alloc_init_sbid(struct super_block *sb)
 +{
-+	switch (oer_errno) {
-+	case 0:
-+		return 0;
++	struct sbid_tracker *sbid;
++	struct sbid_tracker *new = alloc_sbid();
++	unsigned long hash_idx = sbid_hashval(sb);
++	u64 id = 0;
 +
-+	case PNFS_OSD_ERR_RESOURCE:
-+		return OSD_ERR_PRI_RESOURCE;
-+	case PNFS_OSD_ERR_BAD_CRED:
-+		return OSD_ERR_PRI_BAD_CRED;
-+	case PNFS_OSD_ERR_NO_ACCESS:
-+		return OSD_ERR_PRI_NO_ACCESS;
-+	case PNFS_OSD_ERR_UNREACHABLE:
-+		return OSD_ERR_PRI_UNREACHABLE;
-+	case PNFS_OSD_ERR_NOT_FOUND:
-+		return OSD_ERR_PRI_NOT_FOUND;
-+	case PNFS_OSD_ERR_NO_SPACE:
-+		return OSD_ERR_PRI_NO_SPACE;
-+	default:
-+		WARN_ON(1);
-+		/* fallthrough */
-+	case PNFS_OSD_ERR_EIO:
-+		return OSD_ERR_PRI_EIO;
++	if (likely(new)) {
++		spin_lock(&layout_lock);
++		id = ++current_sbid;
++		new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK);
++		id = new->id;
++		BUG_ON(id == 0);
++		new->sb = sb;
++
++		list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash)
++			if (sbid->sb == sb) {
++				kfree(new);
++				id = sbid->id;
++				spin_unlock(&layout_lock);
++				return id;
++			}
++		list_add(&new->hash, &sbid_hashtbl[hash_idx]);
++		spin_unlock(&layout_lock);
 +	}
++	return id;
 +}
 +
-+static void
-+merge_ioerr(struct pnfs_osd_ioerr *dest_err,
-+	    const struct pnfs_osd_ioerr *src_err)
++struct super_block *
++find_sbid_id(u64 id)
 +{
-+	u64 dest_end, src_end;
-+
-+	if (!dest_err->oer_errno) {
-+		*dest_err = *src_err;
-+		/* accumulated device must be blank */
-+		memset(&dest_err->oer_component.oid_device_id, 0,
-+			sizeof(dest_err->oer_component.oid_device_id));
++	struct sbid_tracker *sbid;
++	struct super_block *sb = NULL;
++	unsigned long hash_idx = id & SBID_HASH_MASK;
++	int pos = 0;
 +
-+		return;
++	spin_lock(&layout_lock);
++	list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
++		pos++;
++		if (sbid->id != id)
++			continue;
++		if (pos > 1)
++			list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
++		sb = sbid->sb;
++		break;
 +	}
++	spin_unlock(&layout_lock);
++	return sb;
++}
 +
-+	if (dest_err->oer_component.oid_partition_id !=
-+				src_err->oer_component.oid_partition_id)
-+		dest_err->oer_component.oid_partition_id = 0;
-+
-+	if (dest_err->oer_component.oid_object_id !=
-+				src_err->oer_component.oid_object_id)
-+		dest_err->oer_component.oid_object_id = 0;
-+
-+	if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
-+		dest_err->oer_comp_offset = src_err->oer_comp_offset;
-+
-+	dest_end = end_offset(dest_err->oer_comp_offset,
-+			      dest_err->oer_comp_length);
-+	src_end =  end_offset(src_err->oer_comp_offset,
-+			      src_err->oer_comp_length);
-+	if (dest_end < src_end)
-+		dest_end = src_end;
-+
-+	dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
++u64
++find_create_sbid(struct super_block *sb)
++{
++	struct sbid_tracker *sbid;
++	unsigned long hash_idx = sbid_hashval(sb);
++	int pos = 0;
++	u64 id = 0;
 +
-+	if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
-+	    (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
-+			dest_err->oer_errno = src_err->oer_errno;
-+	} else if (src_err->oer_iswrite) {
-+		dest_err->oer_iswrite = true;
-+		dest_err->oer_errno = src_err->oer_errno;
++	spin_lock(&layout_lock);
++	list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
++		pos++;
++		if (sbid->sb != sb)
++			continue;
++		if (pos > 1)
++			list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
++		id = sbid->id;
++		break;
 +	}
++	spin_unlock(&layout_lock);
++
++	if (!id)
++		id = alloc_init_sbid(sb);
++
++	return id;
 +}
 +
-+static void
-+encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr)
++/*
++ * Create a layoutrecall structure
++ * An optional layoutrecall can be cloned (except for the layoutrecall lists)
++ */
++static struct nfs4_layoutrecall *
++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl,
++			struct nfs4_client *clp,
++			struct nfs4_file *lrfile)
 +{
-+	struct objlayout_io_state *state, *tmp;
-+	struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
-+
-+	list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
-+		unsigned i;
++	struct nfs4_layoutrecall *clr;
 +
-+		for (i = 0; i < state->num_comps; i++) {
-+			struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
++	dprintk("NFSD %s\n", __func__);
++	clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL);
++	if (clr == NULL)
++		return clr;
 +
-+			if (!ioerr->oer_errno)
-+				continue;
++	dprintk("NFSD %s -->\n", __func__);
 +
-+			printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
-+				"dev(%llx:%llx) par=0x%llx obj=0x%llx "
-+				"offset=0x%llx length=0x%llx\n",
-+				__func__, i, ioerr->oer_errno,
-+				ioerr->oer_iswrite,
-+				_DEVID_LO(&ioerr->oer_component.oid_device_id),
-+				_DEVID_HI(&ioerr->oer_component.oid_device_id),
-+				ioerr->oer_component.oid_partition_id,
-+				ioerr->oer_component.oid_object_id,
-+				ioerr->oer_comp_offset,
-+				ioerr->oer_comp_length);
++	memset(clr, 0, sizeof(*clr));
++	if (lrfile)
++		get_nfs4_file(lrfile);
++	clr->clr_client = clp;
++	clr->clr_file = lrfile;
++	clr->cb = *cbl;
 +
-+			merge_ioerr(&accumulated_err, ioerr);
-+		}
-+		list_del(&state->err_list);
-+		objlayout_free_io_state(state);
-+	}
++	kref_init(&clr->clr_ref);
++	INIT_LIST_HEAD(&clr->clr_perclnt);
++	INIT_WORK(&clr->clr_recall.cb_work, nfsd4_do_callback_rpc);
 +
-+	BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err));
++	dprintk("NFSD %s return %p\n", __func__, clr);
++	return clr;
 +}
 +
-+void
-+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
-+			      struct xdr_stream *xdr,
-+			      const struct nfs4_layoutreturn_args *args)
++static void
++get_layoutrecall(struct nfs4_layoutrecall *clr)
 +{
-+	struct objlayout *objlay = OBJLAYOUT(pnfslay);
-+	struct objlayout_io_state *state, *tmp;
-+	__be32 *start, *uninitialized_var(last_xdr);
++	dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
++		atomic_read(&clr->clr_ref.refcount));
++	kref_get(&clr->clr_ref);
++}
 +
-+	dprintk("%s: Begin\n", __func__);
-+	start = xdr_reserve_space(xdr, 4);
-+	BUG_ON(!start);
++static void
++destroy_layoutrecall(struct kref *kref)
++{
++	struct nfs4_layoutrecall *clr =
++			container_of(kref, struct nfs4_layoutrecall, clr_ref);
++	dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr,
++		clr->clr_file, clr->clr_client);
++	BUG_ON(!list_empty(&clr->clr_perclnt));
++	if (clr->clr_file)
++		put_nfs4_file(clr->clr_file);
++	kmem_cache_free(pnfs_layoutrecall_slab, clr);
++}
 +
-+	spin_lock(&objlay->lock);
++int
++put_layoutrecall(struct nfs4_layoutrecall *clr)
++{
++	dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
++		atomic_read(&clr->clr_ref.refcount));
++	return kref_put(&clr->clr_ref, destroy_layoutrecall);
++}
 +
-+	list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
-+		unsigned i;
-+		int res = 0;
++void *
++layoutrecall_done(struct nfs4_layoutrecall *clr)
++{
++	void *recall_cookie = clr->cb.cbl_cookie;
++	struct nfs4_layoutrecall *parent = clr->parent;
 +
-+		for (i = 0; i < state->num_comps && !res; i++) {
-+			struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
++	dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
++		atomic_read(&clr->clr_ref.refcount));
++	BUG_ON_UNLOCKED_LAYOUT();
++	list_del_init(&clr->clr_perclnt);
++	put_layoutrecall(clr);
 +
-+			if (!ioerr->oer_errno)
-+				continue;
++	if (parent && !put_layoutrecall(parent))
++		recall_cookie = NULL;
 +
-+			dprintk("%s: err[%d]: errno=%d is_write=%d "
-+				"dev(%llx:%llx) par=0x%llx obj=0x%llx "
-+				"offset=0x%llx length=0x%llx\n",
-+				__func__, i, ioerr->oer_errno,
-+				ioerr->oer_iswrite,
-+				_DEVID_LO(&ioerr->oer_component.oid_device_id),
-+				_DEVID_HI(&ioerr->oer_component.oid_device_id),
-+				ioerr->oer_component.oid_partition_id,
-+				ioerr->oer_component.oid_object_id,
-+				ioerr->oer_comp_offset,
-+				ioerr->oer_comp_length);
++	return recall_cookie;
++}
 +
-+			last_xdr = xdr->p;
-+			res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]);
-+		}
-+		if (unlikely(res)) {
-+			/* no space for even one error descriptor */
-+			BUG_ON(last_xdr == start + 1);
++/*
++ * get_state() and cb_get_state() are
++ */
++void
++release_pnfs_ds_dev_list(struct nfs4_stateid *stp)
++{
++	struct pnfs_ds_dev_entry *ddp;
 +
-+			/* we've encountered a situation with lots and lots of
-+			 * errors and no space to encode them all. Use the last
-+			 * available slot to report the union of all the
-+			 * remaining errors.
-+			 */
-+			xdr_rewind_stream(xdr, last_xdr -
-+					       pnfs_osd_ioerr_xdr_sz() / 4);
-+			encode_accumulated_error(objlay, xdr);
-+			goto loop_done;
-+		}
-+		list_del(&state->err_list);
-+		objlayout_free_io_state(state);
++	while (!list_empty(&stp->st_pnfs_ds_id)) {
++		ddp = list_entry(stp->st_pnfs_ds_id.next,
++				 struct pnfs_ds_dev_entry, dd_dev_entry);
++		list_del(&ddp->dd_dev_entry);
++		kfree(ddp);
 +	}
-+loop_done:
-+	spin_unlock(&objlay->lock);
-+
-+	*start = cpu_to_be32((xdr->p - start - 1) * 4);
-+	dprintk("%s: Return\n", __func__);
 +}
 +
-+struct objlayout_deviceinfo {
-+	struct page *page;
-+	struct pnfs_osd_deviceaddr da; /* This must be last */
-+};
-+
-+/* Initialize and call nfs_getdeviceinfo, then decode and return a
-+ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
-+ * should be called.
-+ */
-+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
-+	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr)
++static int
++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid)
 +{
-+	struct objlayout_deviceinfo *odi;
-+	struct pnfs_device pd;
-+	struct super_block *sb;
-+	struct page *page;
-+	size_t sz;
-+	u32 *p;
-+	int err;
++	struct pnfs_ds_dev_entry *ddp;
 +
-+	page = alloc_page(GFP_KERNEL);
-+	if (!page)
++	ddp = kmalloc(sizeof(*ddp), GFP_KERNEL);
++	if (!ddp)
 +		return -ENOMEM;
 +
-+	pd.area = page_address(page);
-+
-+	memcpy(&pd.dev_id, d_id, sizeof(*d_id));
-+	pd.layout_type = LAYOUT_OSD2_OBJECTS;
-+	pd.pages = &page;
-+	pd.pgbase = 0;
-+	pd.pglen = PAGE_SIZE;
-+	pd.mincount = 0;
-+
-+	sb = pnfslay->inode->i_sb;
-+	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->inode), &pd);
-+	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
-+	if (err)
-+		goto err_out;
-+
-+	p = pd.area;
-+	sz = pnfs_osd_xdr_deviceaddr_incore_sz(p);
-+	odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL);
-+	if (!odi) {
-+		err = -ENOMEM;
-+		goto err_out;
-+	}
-+	pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
-+	odi->page = page;
-+	*deviceaddr = &odi->da;
++	INIT_LIST_HEAD(&ddp->dd_dev_entry);
++	list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id);
++	ddp->dd_dsid = dsid;
 +	return 0;
++}
 +
-+err_out:
-+	__free_page(page);
-+	return err;
++/*
++ * are two octet ranges overlapping?
++ * start1            last1
++ *   |-----------------|
++ *                start2            last2
++ *                  |----------------|
++ */
++static inline int
++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
++{
++	u64 start1 = l1->offset;
++	u64 last1 = last_byte_offset(start1, l1->length);
++	u64 start2 = l2->offset;
++	u64 last2 = last_byte_offset(start2, l2->length);
++	int ret;
++
++	/* if last1 == start2 there's a single byte overlap */
++	ret = (last2 >= start1) && (last1 >= start2);
++	dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__,
++		l1->offset, l1->length, l2->offset, l2->length, ret);
++	return ret;
 +}
 +
-+void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
++static inline int
++same_fsid_major(struct nfs4_fsid *fsid, u64 major)
 +{
-+	struct objlayout_deviceinfo *odi = container_of(deviceaddr,
-+						struct objlayout_deviceinfo,
-+						da);
++	return fsid->major == major;
++}
 +
-+	__free_page(odi->page);
-+	kfree(odi);
++static inline int
++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh)
++{
++	return same_fsid_major(fsid, current_fh->fh_export->ex_fsid);
 +}
 +
 +/*
-+ * Initialize a mountpoint by retrieving the list of
-+ * available devices for it.
-+ * Return the pnfs_mount_type structure so the
-+ * pNFS_client can refer to the mount point later on.
++ * find a layout recall conflicting with the specified layoutget
 + */
-+int
-+objlayout_initialize_mountpoint(struct nfs_server *server,
-+				const struct nfs_fh *mntfh)
++static int
++is_layout_recalled(struct nfs4_client *clp,
++		   struct svc_fh *current_fh,
++		   struct nfsd4_layout_seg *seg)
 +{
-+	void *data;
++	struct nfs4_layoutrecall *clr;
 +
-+	data = objio_init_mt();
-+	if (IS_ERR(data)) {
-+		printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n",
-+		       __func__, PTR_ERR(data));
-+		return PTR_ERR(data);
++	spin_lock(&layout_lock);
++	list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) {
++		if (clr->cb.cbl_seg.layout_type != seg->layout_type)
++			continue;
++		if (clr->cb.cbl_recall_type == RETURN_ALL)
++			goto found;
++		if (clr->cb.cbl_recall_type == RETURN_FSID) {
++			if (same_fsid(&clr->cb.cbl_fsid, current_fh))
++				goto found;
++			else
++				continue;
++		}
++		BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE);
++		if (clr->cb.cbl_seg.clientid == seg->clientid &&
++		    lo_seg_overlapping(&clr->cb.cbl_seg, seg))
++			goto found;
 +	}
-+	server->pnfs_ld_data = data;
-+
-+	dprintk("%s: Return data=%p\n", __func__, data);
++	spin_unlock(&layout_lock);
 +	return 0;
++found:
++	spin_unlock(&layout_lock);
++	return 1;
 +}
 +
 +/*
-+ * Uninitialize a mountpoint
++ * are two octet ranges overlapping or adjacent?
 + */
-+int
-+objlayout_uninitialize_mountpoint(struct nfs_server *server)
++static inline int
++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
 +{
-+	dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data);
-+	objio_fini_mt(server->pnfs_ld_data);
-+	return 0;
-+}
-diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
-new file mode 100644
-index 0000000..cad24a4
---- /dev/null
-+++ b/fs/nfs/objlayout/objlayout.h
-@@ -0,0 +1,206 @@
-+/*
-+ *  objlayout.h
-+ *
-+ *  Data types and function declerations for interfacing with the
-+ *  pNFS standard object layout driver.
-+ *
-+ *  Copyright (C) 2007-2009 Panasas Inc.
-+ *  All rights reserved.
-+ *
-+ *  Benny Halevy <bhalevy at panasas.com>
-+ *  Boaz Harrosh <bharrosh at panasas.com>
-+ *
-+ *  This program is free software; you can redistribute it and/or modify
-+ *  it under the terms of the GNU General Public License version 2
-+ *  See the file COPYING included with this distribution for more details.
-+ *
-+ *  Redistribution and use in source and binary forms, with or without
-+ *  modification, are permitted provided that the following conditions
-+ *  are met:
-+ *
-+ *  1. Redistributions of source code must retain the above copyright
-+ *     notice, this list of conditions and the following disclaimer.
-+ *  2. Redistributions in binary form must reproduce the above copyright
-+ *     notice, this list of conditions and the following disclaimer in the
-+ *     documentation and/or other materials provided with the distribution.
-+ *  3. Neither the name of the Panasas company nor the names of its
-+ *     contributors may be used to endorse or promote products derived
-+ *     from this software without specific prior written permission.
-+ *
-+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ */
-+
-+#ifndef _OBJLAYOUT_H
-+#define _OBJLAYOUT_H
-+
-+#include <linux/nfs_fs.h>
-+#include <linux/pnfs_osd_xdr.h>
-+#include "../pnfs.h"
-+
-+/*
-+ * in-core layout segment
-+ */
-+struct objlayout_segment {
-+	struct pnfs_layout_segment lseg;
-+	void *internal;    /* for provider internal use */
-+	u8 pnfs_osd_layout[];
-+};
-+
-+/*
-+ * per-inode layout
-+ */
-+struct objlayout {
-+	struct pnfs_layout_hdr pnfs_layout;
-+
-+	 /* for layout_commit */
-+	enum osd_delta_space_valid_enum {
-+		OBJ_DSU_INIT = 0,
-+		OBJ_DSU_VALID,
-+		OBJ_DSU_INVALID,
-+	} delta_space_valid;
-+	s64 delta_space_used;  /* consumed by write ops */
-+
-+	 /* for layout_return */
-+	spinlock_t lock;
-+	struct list_head err_list;
-+};
++	u64 start1 = l1->offset;
++	u64 end1 = end_offset(start1, l1->length);
++	u64 start2 = l2->offset;
++	u64 end2 = end_offset(start2, l2->length);
 +
-+static inline struct objlayout *
-+OBJLAYOUT(struct pnfs_layout_hdr *lo)
-+{
-+	return container_of(lo, struct objlayout, pnfs_layout);
++	/* is end1 == start2 ranges are adjacent */
++	return (end2 >= start1) && (end1 >= start2);
 +}
 +
-+/*
-+ * per-I/O operation state
-+ * embedded in objects provider io_state data structure
-+ */
-+struct objlayout_io_state {
-+	struct objlayout_segment *objlseg;
++static void
++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg)
++{
++	u64 lo_start = lo->offset;
++	u64 lo_end = end_offset(lo_start, lo->length);
++	u64 lg_start = lg->offset;
++	u64 lg_end = end_offset(lg_start, lg->length);
 +
-+	struct page **pages;
-+	unsigned pgbase;
-+	unsigned nr_pages;
-+	unsigned long count;
-+	loff_t offset;
-+	bool sync;
++	/* lo already covers lg? */
++	if (lo_start <= lg_start && lg_end <= lo_end)
++		return;
 +
-+	void *rpcdata;
-+	int status;             /* res */
-+	int eof;                /* res */
-+	int committed;          /* res */
++	/* extend start offset */
++	if (lo_start > lg_start)
++		lo_start = lg_start;
 +
-+	/* Error reporting (layout_return) */
-+	struct list_head err_list;
-+	unsigned num_comps;
-+	/* Pointer to array of error descriptors of size num_comps.
-+	 * It should contain as many entries as devices in the osd_layout
-+	 * that participate in the I/O. It is up to the io_engine to allocate
-+	 * needed space and set num_comps.
-+	 */
-+	struct pnfs_osd_ioerr *ioerrs;
-+};
++	/* extend end offset */
++	if (lo_end < lg_end)
++		lo_end = lg_end;
 +
-+/*
-+ * Raid engine I/O API
-+ */
-+extern void *objio_init_mt(void);
-+extern void objio_fini_mt(void *mt);
++	lo->offset = lo_start;
++	lo->length = (lo_end == NFS4_MAX_UINT64) ?
++		      lo_end : lo_end - lo_start;
++}
 +
-+extern int objio_alloc_lseg(void **outp,
-+	struct pnfs_layout_hdr *pnfslay,
-+	struct pnfs_layout_segment *lseg,
-+	struct pnfs_osd_layout *layout);
-+extern void objio_free_lseg(void *p);
++static struct nfs4_layout *
++merge_layout(struct nfs4_file *fp,
++	     struct nfs4_client *clp,
++	     struct nfsd4_layout_seg *seg)
++{
++	struct nfs4_layout *lp = NULL;
 +
-+extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp);
-+extern void objio_free_io_state(struct objlayout_io_state *state);
++	spin_lock(&layout_lock);
++	list_for_each_entry (lp, &fp->fi_layouts, lo_perfile)
++		if (lp->lo_seg.layout_type == seg->layout_type &&
++		    lp->lo_seg.clientid == seg->clientid &&
++		    lp->lo_seg.iomode == seg->iomode &&
++		    lo_seg_mergeable(&lp->lo_seg, seg)) {
++			extend_layout(&lp->lo_seg, seg);
++			break;
++		}
++	spin_unlock(&layout_lock);
 +
-+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
-+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
-+				    bool stable);
++	return lp;
++}
 +
-+/*
-+ * callback API
-+ */
-+extern void objlayout_io_set_result(struct objlayout_io_state *state,
-+				    unsigned index, int osd_error,
-+				    u64 offset, u64 length, bool is_write);
++__be32
++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp,
++		     struct exp_xdr_stream *xdr)
++{
++	u32 status;
++	__be32 nfserr;
++	struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode;
++	struct super_block *sb = ino->i_sb;
++	int can_merge;
++	struct nfs4_file *fp;
++	struct nfs4_client *clp;
++	struct nfs4_layout *lp = NULL;
++	struct nfs4_layout_state *ls = NULL;
++	struct nfsd4_pnfs_layoutget_arg args = {
++		.lg_minlength = lgp->lg_minlength,
++		.lg_fh = &lgp->lg_fhp->fh_handle,
++	};
++	struct nfsd4_pnfs_layoutget_res res = {
++		.lg_seg = lgp->lg_seg,
++	};
 +
-+static inline void
-+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
-+{
-+	struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.layout);
++	dprintk("NFSD: %s Begin\n", __func__);
 +
-+	/* If one of the I/Os errored out and the delta_space_used was
-+	 * invalid we render the complete report as invalid. Protocol mandate
-+	 * the DSU be accurate or not reported.
-+	 */
-+	spin_lock(&objlay->lock);
-+	if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
-+		objlay->delta_space_valid = OBJ_DSU_VALID;
-+		objlay->delta_space_used += space_used;
++	args.lg_sbid = find_create_sbid(sb);
++	if (!args.lg_sbid) {
++		nfserr = nfserr_layouttrylater;
++		goto out;
 +	}
-+	spin_unlock(&objlay->lock);
-+}
-+
-+extern void objlayout_read_done(struct objlayout_io_state *state,
-+				ssize_t status, bool sync);
-+extern void objlayout_write_done(struct objlayout_io_state *state,
-+				 ssize_t status, bool sync);
-+
-+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
-+	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr);
-+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
 +
-+/*
-+ * exported generic objects function vectors
-+ */
++	can_merge = sb->s_pnfs_op->can_merge_layouts != NULL &&
++		    sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type);
 +
-+extern int objlayout_initialize_mountpoint(
-+	struct nfs_server *,
-+	const struct nfs_fh *);
-+extern int objlayout_uninitialize_mountpoint(struct nfs_server *);
++	nfs4_lock_state();
++	fp = find_alloc_file(ino, lgp->lg_fhp);
++	clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid);
++	dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp);
++	if (!fp || !clp) {
++		nfserr = nfserr_inval;
++		goto out_unlock;
++	}
 +
-+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *);
-+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
++	/* Check decoded layout stateid */
++	nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls);
++	if (nfserr)
++		goto out_unlock;
 +
-+extern struct pnfs_layout_segment *objlayout_alloc_lseg(
-+	struct pnfs_layout_hdr *,
-+	struct nfs4_layoutget_res *);
-+extern void objlayout_free_lseg(struct pnfs_layout_segment *);
++	if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) {
++		nfserr = nfserr_recallconflict;
++		goto out;
++	}
 +
-+extern enum pnfs_try_status objlayout_read_pagelist(
-+	struct nfs_read_data *,
-+	unsigned nr_pages);
++	/* pre-alloc layout in case we can't merge after we call
++	 * the file system
++	 */
++	lp = alloc_layout();
++	if (!lp) {
++		nfserr = nfserr_layouttrylater;
++		goto out_unlock;
++	}
 +
-+extern enum pnfs_try_status objlayout_write_pagelist(
-+	struct nfs_write_data *,
-+	unsigned nr_pages,
-+	int how);
++	dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd "
++		"iomode %u offset %llu length %llu\n",
++		__func__, lgp->lg_seg.layout_type,
++		exp_xdr_qbytes(xdr->end - xdr->p),
++		lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length);
 +
-+extern enum pnfs_try_status objlayout_commit(
-+	struct nfs_write_data *,
-+	int how);
++	/* FIXME: need to eliminate the use of the state lock */
++	nfs4_unlock_state();
++	status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res);
++	nfs4_lock_state();
 +
-+extern void objlayout_encode_layoutcommit(
-+	struct pnfs_layout_hdr *,
-+	struct xdr_stream *,
-+	const struct nfs4_layoutcommit_args *);
++	dprintk("pNFS %s: post-export status %u "
++		"iomode %u offset %llu length %llu\n",
++		__func__, status, res.lg_seg.iomode,
++		res.lg_seg.offset, res.lg_seg.length);
 +
-+extern void objlayout_encode_layoutreturn(
-+	struct pnfs_layout_hdr *,
-+	struct xdr_stream *,
-+	const struct nfs4_layoutreturn_args *);
++	/*
++	 * The allowable error codes for the layout_get pNFS export
++	 * operations vector function (from the file system) can be
++	 * expanded as needed to include other errors defined for
++	 * the RFC 5561 LAYOUTGET operation.
++	 */
++	switch (status) {
++	case 0:
++		nfserr = NFS4_OK;
++		break;
++	case NFS4ERR_ACCESS:
++	case NFS4ERR_BADIOMODE:
++		/* No support for LAYOUTIOMODE4_RW layouts */
++	case NFS4ERR_BADLAYOUT:
++		/* No layout matching loga_minlength rules */
++	case NFS4ERR_INVAL:
++	case NFS4ERR_IO:
++	case NFS4ERR_LAYOUTTRYLATER:
++	case NFS4ERR_LAYOUTUNAVAILABLE:
++	case NFS4ERR_LOCKED:
++	case NFS4ERR_NOSPC:
++	case NFS4ERR_RECALLCONFLICT:
++	case NFS4ERR_SERVERFAULT:
++	case NFS4ERR_TOOSMALL:
++		/* Requested layout too big for loga_maxcount */
++	case NFS4ERR_WRONG_TYPE:
++		/* Not a regular file */
++		nfserr = cpu_to_be32(status);
++		goto out_freelayout;
++	default:
++		BUG();
++		nfserr = nfserr_serverfault;
++	}
 +
-+#endif /* _OBJLAYOUT_H */
-diff --git a/fs/nfs/objlayout/panfs_shim.c b/fs/nfs/objlayout/panfs_shim.c
-new file mode 100644
-index 0000000..4d31856
---- /dev/null
-+++ b/fs/nfs/objlayout/panfs_shim.c
-@@ -0,0 +1,702 @@
-+/*
-+ *  panfs_shim.c
-+ *
-+ *  Shim layer for interfacing with the Panasas DirectFlow module I/O stack
-+ *
-+ *  Copyright (C) 2007-2009 Panasas Inc.
-+ *  All rights reserved.
-+ *
-+ *  Benny Halevy <bhalevy at panasas.com>
-+ *
-+ *  Redistribution and use in source and binary forms, with or without
-+ *  modification, are permitted provided that the following conditions
-+ *  are met:
-+ *
-+ *  1. Redistributions of source code must retain the above copyright
-+ *     notice, this list of conditions and the following disclaimer.
-+ *  2. Redistributions in binary form must reproduce the above copyright
-+ *     notice, this list of conditions and the following disclaimer in the
-+ *     documentation and/or other materials provided with the distribution.
-+ *  3. Neither the name of the Panasas company nor the names of its
-+ *     contributors may be used to endorse or promote products derived
-+ *     from this software without specific prior written permission.
-+ *
-+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
-+ * See the file COPYING included with this distribution for more details.
-+ *
-+ */
++	lgp->lg_seg = res.lg_seg;
++	lgp->lg_roc = res.lg_return_on_close;
 +
-+#include <linux/module.h>
-+#include <linux/slab.h>
-+#include <asm/byteorder.h>
++	/* SUCCESS!
++	 * Can the new layout be merged into an existing one?
++	 * If so, free unused layout struct
++	 */
++	if (can_merge && merge_layout(fp, clp, &res.lg_seg))
++		goto out_freelayout;
 +
-+#include "objlayout.h"
-+#include "panfs_shim.h"
++	/* Can't merge, so let's initialize this new layout */
++	init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg, &lgp->lg_sid);
++out_unlock:
++	if (ls)
++		put_layout_state(ls);
++	if (fp)
++		put_nfs4_file(fp);
++	nfs4_unlock_state();
++out:
++	dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp,
++		be32_to_cpu(nfserr));
++	return nfserr;
++out_freelayout:
++	free_layout(lp);
++	goto out_unlock;
++}
 +
-+#include <linux/panfs_shim_api.h>
++static void
++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr)
++{
++	u64 lo_start = lo->offset;
++	u64 lo_end = end_offset(lo_start, lo->length);
++	u64 lr_start = lr->offset;
++	u64 lr_end = end_offset(lr_start, lr->length);
 +
-+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
++	dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__,
++		lo->offset, lo->length, lr->offset, lr->length);
 +
-+struct panfs_export_operations *panfs_export_ops;
++	/* lr fully covers lo? */
++	if (lr_start <= lo_start && lo_end <= lr_end) {
++		lo->length = 0;
++		goto out;
++	}
 +
-+void *
-+objio_init_mt(void)
-+{
-+	return panfs_export_ops == NULL ? ERR_PTR(-EAGAIN) : NULL;
-+}
++	/*
++	 * split not supported yet. retain layout segment.
++	 * remains must be returned by the client
++	 * on the final layout return.
++	 */
++	if (lo_start < lr_start && lr_end < lo_end) {
++		dprintk("%s: split not supported\n", __func__);
++		goto out;
++	}
 +
-+void objio_fini_mt(void *mountid)
-+{
-+}
++	if (lo_start < lr_start)
++		lo_end = lr_start - 1;
++	else /* lr_end < lo_end */
++		lo_start = lr_end + 1;
 +
-+static int
-+panfs_shim_conv_raid01(struct pnfs_osd_layout *layout,
-+		       struct pnfs_osd_data_map *lo_map,
-+		       pan_agg_layout_hdr_t *hdr)
-+{
-+	if (lo_map->odm_mirror_cnt) {
-+		hdr->type = PAN_AGG_RAID1;
-+		hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1;
-+	} else if (layout->olo_num_comps > 1) {
-+		hdr->type = PAN_AGG_RAID0;
-+		hdr->hdr.raid0.num_comps = layout->olo_num_comps;
-+		hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit;
-+	} else
-+		hdr->type = PAN_AGG_SIMPLE;
-+	return 0;
++	lo->offset = lo_start;
++	lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start;
++out:
++	dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length);
 +}
 +
 +static int
-+panfs_shim_conv_raid5(struct pnfs_osd_layout *layout,
-+		      struct pnfs_osd_data_map *lo_map,
-+		      pan_agg_layout_hdr_t *hdr)
++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp,
++			 struct nfsd4_pnfs_layoutreturn *lrp,
++			 struct nfs4_layout_state *ls)
 +{
-+	if (lo_map->odm_mirror_cnt)
-+		goto err;
-+
-+	if (lo_map->odm_group_width || lo_map->odm_group_depth) {
-+		if (!lo_map->odm_group_width || !lo_map->odm_group_depth)
-+			goto err;
++	int layouts_found = 0;
++	struct nfs4_layout *lp, *nextlp;
 +
-+		hdr->type = PAN_AGG_GRP_RAID5_LEFT;
-+		hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps;
-+		if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps)
-+			goto err;
-+		hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit;
-+		hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width;
-+		hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth;
-+		/* this is a guess, panasas server is not supposed to
-+		   hand out layotu otherwise */
-+		hdr->hdr.grp_raid5_left.group_layout_policy =
-+			PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN;
-+	} else {
-+		hdr->type = PAN_AGG_RAID5_LEFT;
-+		hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps;
-+		if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps)
-+			goto err;
-+		hdr->hdr.raid5_left.stripe_unit2 =
-+		hdr->hdr.raid5_left.stripe_unit1 =
-+		hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit;
++	dprintk("%s: clp %p fp %p\n", __func__, clp, fp);
++	spin_lock(&layout_lock);
++	list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) {
++		dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n",
++			__func__, lp,
++			lp->lo_client, clp,
++			lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type,
++			lp->lo_seg.iomode, lrp->args.lr_seg.iomode);
++		if (lp->lo_client != clp ||
++		    lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type ||
++		    (lp->lo_seg.iomode != lrp->args.lr_seg.iomode &&
++		     lrp->args.lr_seg.iomode != IOMODE_ANY) ||
++		     !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg))
++			continue;
++		layouts_found++;
++		trim_layout(&lp->lo_seg, &lrp->args.lr_seg);
++		if (!lp->lo_seg.length) {
++			lrp->lrs_present = 0;
++			dequeue_layout(lp);
++			destroy_layout(lp);
++		}
 +	}
++	if (ls && layouts_found && lrp->lrs_present)
++		update_layout_stateid(ls, &lrp->lr_sid);
++	spin_unlock(&layout_lock);
 +
-+	return 0;
-+err:
-+	return -EINVAL;
++	return layouts_found;
 +}
 +
-+/*
-+ * Convert a pnfs_osd data map into Panasas aggregation layout header
-+ */
 +static int
-+panfs_shim_conv_pnfs_osd_data_map(
-+	struct pnfs_osd_layout *layout,
-+	pan_agg_layout_hdr_t *hdr)
++pnfs_return_client_layouts(struct nfs4_client *clp,
++			   struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid)
 +{
-+	int status = -EINVAL;
-+	struct pnfs_osd_data_map *lo_map = &layout->olo_map;
-+
-+	if (!layout->olo_num_comps) {
-+		dprintk("%s: !!layout.n_comps(%u)\n", __func__,
-+			layout->olo_num_comps);
-+		goto err;
-+	}
++	int layouts_found = 0;
++	struct nfs4_layout *lp, *nextlp;
 +
-+	switch (lo_map->odm_raid_algorithm) {
-+	case PNFS_OSD_RAID_0:
-+		if (layout->olo_num_comps != lo_map->odm_num_comps ||
-+		    layout->olo_comps_index) {
-+			dprintk("%s: !!PNFS_OSD_RAID_0 "
-+				"layout.n_comps(%u) map.n_comps(%u) "
-+				"comps_index(%u)\n", __func__,
-+				layout->olo_num_comps,
-+				lo_map->odm_num_comps,
-+				layout->olo_comps_index);
-+			goto err;
-+		}
-+		status = panfs_shim_conv_raid01(layout, lo_map, hdr);
-+		break;
++	spin_lock(&layout_lock);
++	list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) {
++		if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type ||
++		   (lrp->args.lr_seg.iomode != lp->lo_seg.iomode &&
++		    lrp->args.lr_seg.iomode != IOMODE_ANY))
++			continue;
 +
-+	case PNFS_OSD_RAID_5:
-+		if (!lo_map->odm_group_width) {
-+			if (layout->olo_num_comps != lo_map->odm_num_comps ||
-+			    layout->olo_comps_index) {
-+				dprintk("%s: !!PNFS_OSD_RAID_5 !group_width "
-+					"layout.n_comps(%u)!=map.n_comps(%u) "
-+					"|| comps_index(%u)\n", __func__,
-+					layout->olo_num_comps,
-+					lo_map->odm_num_comps,
-+					layout->olo_comps_index);
-+				goto err;
-+			}
-+		} else if ((layout->olo_num_comps != lo_map->odm_num_comps &&
-+			    layout->olo_num_comps > lo_map->odm_group_width) ||
-+			   (layout->olo_comps_index % lo_map->odm_group_width)){
-+				dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) "
-+					"layout.n_comps(%u) map.n_comps(%u) "
-+					"comps_index(%u)\n", __func__,
-+					lo_map->odm_group_width,
-+					layout->olo_num_comps,
-+					lo_map->odm_num_comps,
-+					layout->olo_comps_index);
-+				goto err;
-+			}
-+		status = panfs_shim_conv_raid5(layout, lo_map, hdr);
-+		break;
++		if (lrp->args.lr_return_type == RETURN_FSID &&
++		    !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid))
++			continue;
 +
-+	case PNFS_OSD_RAID_4:
-+	case PNFS_OSD_RAID_PQ:
-+	default:
-+		dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__,
-+			lo_map->odm_raid_algorithm);
-+		goto err;
++		layouts_found++;
++		dequeue_layout(lp);
++		destroy_layout(lp);
 +	}
++	spin_unlock(&layout_lock);
 +
-+	return 0;
-+
-+err:
-+	return status;
++	return layouts_found;
 +}
 +
-+/*
-+ * Convert pnfs_osd layout into Panasas map and caps type
-+ */
-+int
-+objio_alloc_lseg(void **outp,
-+	struct pnfs_layout_hdr *pnfslay,
-+	struct pnfs_layout_segment *lseg,
-+	struct pnfs_osd_layout *layout)
++static int
++recall_return_perfect_match(struct nfs4_layoutrecall *clr,
++			    struct nfsd4_pnfs_layoutreturn *lrp,
++			    struct nfs4_file *fp,
++			    struct svc_fh *current_fh)
 +{
-+	int i, total_comps;
-+	int status;
-+	struct pnfs_osd_object_cred *lo_comp;
-+	pan_size_t alloc_sz, local_sz;
-+	pan_sm_map_cap_t *mcs = NULL;
-+	u8 *buf;
-+	pan_agg_comp_obj_t *pan_comp;
-+	pan_sm_sec_t *pan_sec;
++	if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode ||
++	    clr->cb.cbl_recall_type != lrp->args.lr_return_type)
++		return 0;
 +
-+	status = -EINVAL;
-+	if (layout->olo_num_comps < layout->olo_map.odm_group_width) {
-+		total_comps = layout->olo_comps_index + layout->olo_num_comps;
-+	} else {
-+		/* allocate full map, otherwise SAM gets confused */
-+		total_comps = layout->olo_map.odm_num_comps;
-+	}
-+	alloc_sz = total_comps *
-+		   (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t));
-+	for (i = 0; i < layout->olo_num_comps; i++) {
-+		void *p = layout->olo_comps[i].oc_cap.cred;
-+		if (panfs_export_ops->sm_sec_t_get_size_otw(
-+			(pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL))
-+			goto err;
-+		alloc_sz += local_sz;
-+	}
++	return (clr->cb.cbl_recall_type == RETURN_FILE &&
++		clr->clr_file == fp &&
++		clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset &&
++		clr->cb.cbl_seg.length == lrp->args.lr_seg.length) ||
 +
-+	status = -ENOMEM;
-+	mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL);
-+	if (!mcs)
-+		goto err;
-+	buf = (u8 *)&mcs[1];
++		(clr->cb.cbl_recall_type == RETURN_FSID &&
++		 same_fsid(&clr->cb.cbl_fsid, current_fh)) ||
 +
-+	mcs->offset = lseg->range.offset;
-+	mcs->length = lseg->range.length;
-+#if 0
-+	/* FIXME: for now */
-+	mcs->expiration_time.ts_sec  = 0;
-+	mcs->expiration_time.ts_nsec = 0;
-+#endif
-+	mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL;
-+	status = panfs_shim_conv_pnfs_osd_data_map(layout,
-+						   &mcs->full_map.layout_hdr);
-+	if (status)
-+		goto err;
++		clr->cb.cbl_recall_type == RETURN_ALL;
++}
 +
-+	mcs->full_map.components.size = total_comps;
-+	mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf;
-+	buf += total_comps * sizeof(pan_agg_comp_obj_t);
++static int
++recall_return_partial_match(struct nfs4_layoutrecall *clr,
++			    struct nfsd4_pnfs_layoutreturn *lrp,
++			    struct nfs4_file *fp,
++			    struct svc_fh *current_fh)
++{
++	/* iomode matching? */
++	if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode &&
++	    clr->cb.cbl_seg.iomode != IOMODE_ANY &&
++	    lrp->args.lr_seg.iomode != IOMODE_ANY)
++		return 0;
 +
-+	mcs->secs.size = total_comps;
-+	mcs->secs.data = (pan_sm_sec_t *)buf;
-+	buf += total_comps * sizeof(pan_sm_sec_t);
++	if (clr->cb.cbl_recall_type == RETURN_ALL ||
++	    lrp->args.lr_return_type == RETURN_ALL)
++		return 1;
 +
-+	lo_comp = layout->olo_comps;
-+	pan_comp = mcs->full_map.components.data + layout->olo_comps_index;
-+	pan_sec = mcs->secs.data + layout->olo_comps_index;
-+	for (i = 0; i < layout->olo_num_comps; i++) {
-+		void *p;
-+		pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id;
-+		struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id;
-+		u64 dev_id = __be64_to_cpup(
-+			(__be64 *)oc_obj_id->oid_device_id.data + 1);
++	/* fsid matches? */
++	if (clr->cb.cbl_recall_type == RETURN_FSID ||
++	    lrp->args.lr_return_type == RETURN_FSID)
++		return same_fsid(&clr->cb.cbl_fsid, current_fh);
++
++	/* file matches, range overlapping? */
++	return clr->clr_file == fp &&
++	       lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg);
++}
++
++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh,
++			    struct nfsd4_pnfs_layoutreturn *lrp)
++{
++	int status = 0;
++	int layouts_found = 0;
++	struct inode *ino = current_fh->fh_dentry->d_inode;
++	struct nfs4_file *fp = NULL;
++	struct nfs4_client *clp;
++	struct nfs4_layout_state *ls = NULL;
++	struct nfs4_layoutrecall *clr, *nextclr;
++	u64 ex_fsid = current_fh->fh_export->ex_fsid;
++	void *recall_cookie = NULL;
 +
-+		dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n",
-+			__func__, i,
-+			__be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data),
-+			__be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1),
-+			oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id);
++	dprintk("NFSD: %s\n", __func__);
 +
-+		if (i == 0) {
-+			/* make up mgr_id to calm sam down */
-+			pan_mgr_id_construct_artificial(PAN_MGR_SM, 0,
-+							&obj_id->dev_id);
-+			obj_id->grp_id = oc_obj_id->oid_partition_id;
-+			obj_id->obj_id = oc_obj_id->oid_object_id;
-+		}
++	nfs4_lock_state();
++	clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid);
++	if (!clp)
++		goto out;
 +
-+		if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) {
-+			dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n",
-+				__func__, i, (u64)obj_id->grp_id,
-+				lo_comp->oc_object_id.oid_partition_id);
-+			status = -EINVAL;
-+			goto err;
++	if (lrp->args.lr_return_type == RETURN_FILE) {
++		fp = find_file(ino);
++		if (!fp) {
++			printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for "
++				"ino %p:%lu\n",
++				__func__, ino, ino ? ino->i_ino : 0L);
++			goto out;
 +		}
 +
-+		if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) {
-+			dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n",
-+				__func__, i, obj_id->obj_id,
-+				lo_comp->oc_object_id.oid_object_id);
-+			status = -EINVAL;
-+			goto err;
-+		}
++		/* Check the stateid */
++		dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino);
++		status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, &ls);
++		if (status)
++			goto out_put_file;
 +
-+		pan_comp->dev_id = dev_id;
-+		if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) {
-+			dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n",
-+				__func__, i, obj_id->dev_id);
-+			status = -EINVAL;
-+			goto err;
-+		}
-+		if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) {
-+			dprintk("%s: degraded maps not supported yet\n",
-+				__func__);
-+			status = -ENOTSUPP;
-+			goto err;
-+		}
-+		pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL;
-+		if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) {
-+			dprintk("%s: cap key security not supported yet\n",
-+				__func__);
-+			status = -ENOTSUPP;
-+			goto err;
-+		}
++		/* update layouts */
++		layouts_found = pnfs_return_file_layouts(clp, fp, lrp, ls);
++		/* optimize for the all-empty case */
++		if (list_empty(&fp->fi_layouts))
++			recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
++	} else {
++		layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid);
++	}
 +
-+		p = lo_comp->oc_cap.cred;
-+		panfs_export_ops->sm_sec_t_unmarshall(
-+			(pan_sm_sec_otw_t *)&p,
-+			pan_sec,
-+			buf,
-+			alloc_sz,
-+			NULL,
-+			&local_sz);
-+		buf += local_sz;
-+		alloc_sz -= local_sz;
++	dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d "
++		"return_type %d fsid 0x%llx offset %llu length %llu: "
++		"layouts_found %d\n",
++		__func__, clp, fp, lrp->args.lr_seg.layout_type,
++		lrp->args.lr_seg.iomode, lrp->args.lr_return_type,
++		ex_fsid,
++		lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found);
 +
-+		lo_comp++;
-+		pan_comp++;
-+		pan_sec++;
++	/* update layoutrecalls
++	 * note: for RETURN_{FSID,ALL}, fp may be NULL
++	 */
++	spin_lock(&layout_lock);
++	list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls,
++				  clr_perclnt) {
++		if (clr->cb.cbl_seg.layout_type != lrp->args.lr_seg.layout_type)
++			continue;
++
++		if (recall_return_perfect_match(clr, lrp, fp, current_fh))
++			recall_cookie = layoutrecall_done(clr);
++		else if (layouts_found &&
++			 recall_return_partial_match(clr, lrp, fp, current_fh))
++			clr->clr_time = CURRENT_TIME;
 +	}
++	spin_unlock(&layout_lock);
 +
-+	*outp = mcs;
-+	dprintk("%s:Return mcs=%p\n", __func__, mcs);
-+	return 0;
++out_put_file:
++	if (fp)
++		put_nfs4_file(fp);
++	if (ls)
++		put_layout_state(ls);
++out:
++	nfs4_unlock_state();
 +
-+err:
-+	objio_free_lseg(mcs);
-+	dprintk("%s:Error %d\n", __func__, status);
-+	return status;
-+}
++	/* call exported filesystem layout_return (ignore return-code) */
++	fs_layout_return(sb, ino, lrp, 0, recall_cookie);
 +
-+/*
-+ * Free a Panasas map and caps type
-+ */
-+void
-+objio_free_lseg(void *p)
-+{
-+	kfree(p);
++	dprintk("pNFS %s: exit status %d \n", __func__, status);
++	return status;
 +}
 +
 +/*
-+ * I/O routines
++ * PNFS Metadata server export operations callback for get_state
++ *
++ * called by the cluster fs when it receives a get_state() from a data
++ * server.
++ * returns status, or pnfs_get_state* with pnfs_get_state->status set.
++ *
 + */
 +int
-+objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg)
 +{
-+	struct panfs_shim_io_state *p;
++	struct nfs4_stateid *stp;
++	int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */
++	int status = -EINVAL;
++	struct inode *ino;
++	struct nfs4_delegation *dl;
++	stateid_t *stid = (stateid_t *)&arg->stid;
 +
-+	dprintk("%s: allocating io_state\n", __func__);
-+	p = kzalloc(sizeof(*p), GFP_KERNEL);
-+	if (!p)
-+		return -ENOMEM;
++	dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__,
++		STATEID_VAL(stid), arg->ino);
 +
-+	*outp = &p->ol_state;
-+	return 0;
-+}
++	nfs4_lock_state();
++	stp = find_stateid(stid, flags);
++	if (!stp) {
++		ino = iget_locked(sb, arg->ino);
++		if (!ino)
++			goto out;
 +
-+/*
-+ * Free an I/O state
-+ */
-+void
-+objio_free_io_state(struct objlayout_io_state *ol_state)
-+{
-+	struct panfs_shim_io_state *state = container_of(ol_state,
-+					struct panfs_shim_io_state, ol_state);
-+	int i;
++		if (ino->i_state & I_NEW) {
++			iget_failed(ino);
++			goto out;
++		}
 +
-+	dprintk("%s: freeing io_state\n", __func__);
-+	for (i = 0; i < state->ol_state.nr_pages; i++)
-+		kunmap(state->ol_state.pages[i]);
++		dl = find_delegation_stateid(ino, stid);
++		if (dl)
++			status = 0;
 +
-+	if (state->ucreds)
-+		panfs_export_ops->ucreds_put(state->ucreds);
-+	kfree(state->sg_list);
-+	kfree(state);
++		iput(ino);
++	} else {
++		/* XXX ANDROS: marc removed nfs4_check_fh - how come? */
++
++		/* arg->devid is the Data server id, set by the cluster fs */
++		status = nfs4_add_pnfs_ds_dev(stp, arg->dsid);
++		if (status)
++			goto out;
++
++		arg->access = stp->st_access_bmap;
++		*(clientid_t *)&arg->clid =
++			stp->st_stateowner->so_client->cl_clientid;
++	}
++out:
++	nfs4_unlock_state();
++	return status;
 +}
 +
 +static int
-+panfs_shim_pages_to_sg(
-+	struct panfs_shim_io_state *state,
-+	struct page **pages,
-+	unsigned int pgbase,
-+	unsigned nr_pages,
-+	size_t count)
++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile,
++		   stateid_t *lsid)
 +{
-+	unsigned i, n;
-+	pan_sg_entry_t *sg;
-+
-+	dprintk("%s pgbase %u nr_pages %u count %d "
-+		"pg0 %p flags 0x%x index %llu\n",
-+		__func__, pgbase, nr_pages, (int)count, pages[0],
-+		(unsigned)pages[0]->flags, (unsigned long long)pages[0]->index);
-+
-+	sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL);
-+	if (sg == NULL)
-+		return -ENOMEM;
++	int found = 0;
++	struct nfs4_layout *lp;
++	struct nfs4_layout_state *ls;
 +
-+	dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n",
-+		__func__, sg, pages, pgbase, nr_pages);
++	spin_lock(&layout_lock);
++	list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) {
++		if (lp->lo_file != lrfile)
++			continue;
 +
-+	for (i = 0; i < nr_pages; i++) {
-+		sg[i].buffer = (char *)kmap(pages[i]) + pgbase;
-+		n = PAGE_SIZE - pgbase;
-+		pgbase = 0;
-+		if (n > count)
-+			n = count;
-+		sg[i].chunk_size = n;
-+		count -= n;
-+		if (likely(count)) {
-+			sg[i].next = &sg[i+1];
-+		} else {
-+			/* we're done */
-+			sg[i].next = NULL;
++		ls = find_get_layout_state(clp, lrfile);
++		if (!ls) {
++			/* This shouldn't happen as the file should have a
++			 * layout stateid if it has a layout.
++			 */
++			printk(KERN_ERR "%s: file %p has no layout stateid\n",
++				__func__, lrfile);
++			WARN_ON(1);
 +			break;
 +		}
++		update_layout_stateid(ls, lsid);
++		put_layout_state_locked(ls);
++		found = 1;
++		break;
 +	}
-+	BUG_ON(count);
++	spin_unlock(&layout_lock);
 +
-+	state->sg_list = sg;
-+	return 0;
++	return found;
 +}
 +
-+/*
-+ * Callback function for async reads
-+ */
-+static void
-+panfs_shim_read_done(
-+	void *arg1,
-+	void *arg2,
-+	pan_sam_read_res_t *res_p,
-+	pan_status_t rc)
++static int
++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid)
 +{
-+	struct panfs_shim_io_state *state = arg1;
-+	ssize_t status;
++	int found = 0;
++	struct nfs4_layout *lp;
 +
-+	dprintk("%s: Begin\n", __func__);
-+	if (!res_p)
-+		res_p = &state->u.read.res;
-+	if (rc == PAN_SUCCESS)
-+		rc = res_p->result;
-+	if (rc == PAN_SUCCESS) {
-+		status = res_p->length;
-+		WARN_ON(status < 0);
-+	} else {
-+		status = -panfs_export_ops->convert_rc(rc);
-+		dprintk("%s: pan_sam_read rc %d: status %Zd\n",
-+			__func__, rc, status);
-+	}
-+	dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc);
-+	objlayout_read_done(&state->ol_state, status, true);
++	/* note: minor version unused */
++	spin_lock(&layout_lock);
++	list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt)
++		if (lp->lo_file->fi_fsid.major == fsid->major) {
++			found = 1;
++			break;
++		}
++	spin_unlock(&layout_lock);
++	return found;
 +}
 +
-+ssize_t
-+objio_read_pagelist(struct objlayout_io_state *ol_state)
++static int
++cl_has_any_layout(struct nfs4_client *clp)
 +{
-+	struct panfs_shim_io_state *state = container_of(ol_state,
-+					struct panfs_shim_io_state, ol_state);
-+	pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal;
-+	ssize_t status = 0;
-+	pan_status_t rc = PAN_SUCCESS;
-+
-+	dprintk("%s: Begin\n", __func__);
-+
-+	status = panfs_shim_pages_to_sg(state, ol_state->pages,
-+					ol_state->pgbase, ol_state->nr_pages,
-+					ol_state->count);
-+	if (unlikely(status))
-+		goto err;
-+
-+	state->obj_sec.min_security = 0;
-+	state->obj_sec.map_ccaps = mcs;
++	return !list_empty(&clp->cl_layouts);
++}
 +
-+	rc = panfs_export_ops->ucreds_get(&state->ucreds);
-+	if (unlikely(rc)) {
-+		status = -EACCES;
-+		goto err;
++static int
++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl,
++	      struct nfs4_file *lrfile, stateid_t *lsid)
++{
++	switch (cbl->cbl_recall_type) {
++	case RETURN_FILE:
++		return cl_has_file_layout(clp, lrfile, lsid);
++	case RETURN_FSID:
++		return cl_has_fsid_layout(clp, &cbl->cbl_fsid);
++	default:
++		return cl_has_any_layout(clp);
 +	}
-+
-+	state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id;
-+	state->u.read.args.offset = ol_state->offset;
-+	rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP,
-+					&state->u.read.args,
-+					&state->obj_sec,
-+					state->sg_list,
-+					state->ucreds,
-+					ol_state->sync ?
-+						NULL : panfs_shim_read_done,
-+					state, NULL,
-+					&state->u.read.res);
-+	if (rc != PAN_ERR_IN_PROGRESS)
-+		panfs_shim_read_done(state, NULL, &state->u.read.res, rc);
-+ err:
-+	dprintk("%s: Return %Zd\n", __func__, status);
-+	return status;
 +}
 +
 +/*
-+ * Callback function for async writes
++ * Called without the layout_lock.
 + */
-+static void
-+panfs_shim_write_done(
-+	void *arg1,
-+	void *arg2,
-+	pan_sam_write_res_t *res_p,
-+	pan_status_t rc)
++void
++nomatching_layout(struct nfs4_layoutrecall *clr)
 +{
-+	struct panfs_shim_io_state *state = arg1;
-+	ssize_t status;
-+
-+	dprintk("%s: Begin\n", __func__);
-+	if (!res_p)
-+		res_p = &state->u.write.res;
-+	if (rc == PAN_SUCCESS)
-+		rc = res_p->result;
-+	if (rc == PAN_SUCCESS) {
-+/*		state->ol_state.committed = NFS_FILE_SYNC;*/
-+		state->ol_state.committed = NFS_UNSTABLE;
-+		status = res_p->length;
-+		WARN_ON(status < 0);
++	struct nfsd4_pnfs_layoutreturn lr = {
++		.args.lr_return_type = clr->cb.cbl_recall_type,
++		.args.lr_seg = clr->cb.cbl_seg,
++	};
++	struct inode *inode;
++	void *recall_cookie;
 +
-+		objlayout_add_delta_space_used(&state->ol_state,
-+					       res_p->delta_capacity_used);
++	if (clr->clr_file) {
++		inode = igrab(clr->clr_file->fi_inode);
++		if (WARN_ON(!inode))
++			return;
 +	} else {
-+		status = -panfs_export_ops->convert_rc(rc);
-+		dprintk("%s: pan_sam_write rc %u: status %Zd\n",
-+			__func__, rc, status);
++		inode = NULL;
 +	}
-+	dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc);
-+	objlayout_write_done(&state->ol_state, status, true);
-+}
-+
-+ssize_t
-+objio_write_pagelist(struct objlayout_io_state *ol_state,
-+		     bool stable /* unused, PanOSD writes are stable */)
-+{
-+	struct panfs_shim_io_state *state = container_of(ol_state,
-+					struct panfs_shim_io_state, ol_state);
-+	pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal;
-+	ssize_t status = 0;
-+	pan_status_t rc = PAN_SUCCESS;
-+
-+	dprintk("%s: Begin\n", __func__);
 +
-+	status = panfs_shim_pages_to_sg(state, ol_state->pages,
-+					ol_state->pgbase, ol_state->nr_pages,
-+					ol_state->count);
-+	if (unlikely(status))
-+		goto err;
++	dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__,
++		clr->clr_client, clr->clr_file);
 +
-+	state->obj_sec.min_security = 0;
-+	state->obj_sec.map_ccaps = mcs;
++	if (clr->cb.cbl_recall_type == RETURN_FILE)
++		pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr,
++					 NULL);
++	else
++		pnfs_return_client_layouts(clr->clr_client, &lr,
++					   clr->cb.cbl_fsid.major);
 +
-+	rc = panfs_export_ops->ucreds_get(&state->ucreds);
-+	if (unlikely(rc)) {
-+		status = -EACCES;
-+		goto err;
-+	}
++	spin_lock(&layout_lock);
++	recall_cookie = layoutrecall_done(clr);
++	spin_unlock(&layout_lock);
 +
-+	state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id;
-+	state->u.write.args.offset = ol_state->offset;
-+	rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE,
-+					 &state->u.write.args,
-+					 &state->obj_sec,
-+					 state->sg_list,
-+					 state->ucreds,
-+					 ol_state->sync ?
-+						NULL : panfs_shim_write_done,
-+					 state,
-+					 NULL,
-+					 &state->u.write.res);
-+	if (rc != PAN_ERR_IN_PROGRESS)
-+		panfs_shim_write_done(state, NULL, &state->u.write.res, rc);
-+ err:
-+	dprintk("%s: Return %Zd\n", __func__, status);
-+	return status;
++	fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN,
++			 recall_cookie);
++	iput(inode);
 +}
 +
-+int
-+panfs_shim_register(struct panfs_export_operations *ops)
++void pnfs_expire_client(struct nfs4_client *clp)
 +{
-+	if (panfs_export_ops) {
-+		printk(KERN_INFO
-+		       "%s: panfs already registered (panfs ops %p)\n",
-+		       __func__, panfs_export_ops);
-+		return -EINVAL;
++	for (;;) {
++		struct nfs4_layoutrecall *lrp = NULL;
++
++		spin_lock(&layout_lock);
++		if (!list_empty(&clp->cl_layoutrecalls)) {
++			lrp = list_entry(clp->cl_layoutrecalls.next,
++					 struct nfs4_layoutrecall, clr_perclnt);
++			get_layoutrecall(lrp);
++		}
++		spin_unlock(&layout_lock);
++		if (!lrp)
++			break;
++
++		dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file);
++		BUG_ON(lrp->clr_client != clp);
++		nomatching_layout(lrp);
++		put_layoutrecall(lrp);
 +	}
 +
-+	printk(KERN_INFO "%s: registering panfs ops %p\n",
-+	       __func__, ops);
++	for (;;) {
++		struct nfs4_layout *lp = NULL;
++		struct inode *inode = NULL;
++		struct nfsd4_pnfs_layoutreturn lr;
++		bool empty = false;
 +
-+	panfs_export_ops = ops;
-+	return 0;
-+}
-+EXPORT_SYMBOL(panfs_shim_register);
++		spin_lock(&layout_lock);
++		if (!list_empty(&clp->cl_layouts)) {
++			lp = list_entry(clp->cl_layouts.next,
++					struct nfs4_layout, lo_perclnt);
++			inode = igrab(lp->lo_file->fi_inode);
++			memset(&lr, 0, sizeof(lr));
++			lr.args.lr_return_type = RETURN_FILE;
++			lr.args.lr_seg = lp->lo_seg;
++			empty = list_empty(&lp->lo_file->fi_layouts);
++			BUG_ON(lp->lo_client != clp);
++			dequeue_layout(lp);
++			destroy_layout(lp); /* do not access lp after this */
++		}
++		spin_unlock(&layout_lock);
++		if (!lp)
++			break;
 +
-+int
-+panfs_shim_unregister(void)
-+{
-+	if (!panfs_export_ops) {
-+		printk(KERN_INFO "%s: panfs is not registered\n", __func__);
-+		return -EINVAL;
-+	}
++		if (WARN_ON(!inode))
++			break;
 +
-+	printk(KERN_INFO "%s: unregistering panfs ops %p\n",
-+	       __func__, panfs_export_ops);
++		dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino,
++			lp, clp);
 +
-+	panfs_export_ops = NULL;
-+	return 0;
++		fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE,
++				 empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL);
++		iput(inode);
++	}
 +}
-+EXPORT_SYMBOL(panfs_shim_unregister);
-+
-+/*
-+ * Policy Operations
-+ */
 +
-+#define PANLAYOUT_DEF_STRIPE_UNIT    (64*1024)
-+#define PANLAYOUT_DEF_STRIPE_WIDTH   9
-+#define PANLAYOUT_MAX_STRIPE_WIDTH   11
-+#define PANLAYOUT_MAX_GATHER_STRIPES 8
++struct create_recall_list_arg {
++	struct nfsd4_pnfs_cb_layout *cbl;
++	struct nfs4_file *lrfile;
++	struct list_head *todolist;
++	unsigned todo_count;
++};
 +
 +/*
-+ * Get the max [rw]size
++ * look for matching layout for the given client
++ * and add a pending layout recall to the todo list
++ * if found any.
++ * returns:
++ *   0 if layouts found or negative error.
 + */
-+static ssize_t
-+panlayout_get_blocksize(void)
++static int
++lo_recall_per_client(struct nfs4_client *clp, void *p)
 +{
-+	ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) *
-+		      PANLAYOUT_DEF_STRIPE_UNIT *
-+		      PANLAYOUT_MAX_GATHER_STRIPES;
-+	dprintk("%s: Return %Zd\n", __func__, sz);
-+	return sz;
-+}
-+
-+/*
-+ * Don't gather across stripes, but rather gather (coalesce) up to
-+ * the stripe size.
-+ *
-+ * FIXME: change interface to use merge_align, merge_count
-+ */
-+#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS)
-+
-+static struct pnfs_layoutdriver_type panlayout_type = {
-+	.id = PNFS_LAYOUT_PANOSD,
-+	.name = "PNFS_LAYOUT_PANOSD",
-+	.flags                   = PNFS_LAYOUTRET_ON_SETATTR,
-+
-+	.initialize_mountpoint   = objlayout_initialize_mountpoint,
-+	.uninitialize_mountpoint = objlayout_uninitialize_mountpoint,
-+
-+	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
-+	.free_layout_hdr         = objlayout_free_layout_hdr,
++	stateid_t lsid;
++	struct nfs4_layoutrecall *pending;
++	struct create_recall_list_arg *arg = p;
 +
-+	.alloc_lseg              = objlayout_alloc_lseg,
-+	.free_lseg               = objlayout_free_lseg,
++	memset(&lsid, 0, sizeof(lsid));
++	if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid))
++		return 0;
 +
-+	.get_blocksize           = panlayout_get_blocksize,
++	/* Matching put done by layoutreturn */
++	pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile);
++	/* out of memory, drain todo queue */
++	if (!pending)
++		return -ENOMEM;
 +
-+	.read_pagelist           = objlayout_read_pagelist,
-+	.write_pagelist          = objlayout_write_pagelist,
-+	.commit                  = objlayout_commit,
++	*(stateid_t *)&pending->cb.cbl_sid = lsid;
++	list_add(&pending->clr_perclnt, arg->todolist);
++	arg->todo_count++;
++	return 0;
++}
 +
-+	.encode_layoutcommit	 = objlayout_encode_layoutcommit,
-+	.encode_layoutreturn     = objlayout_encode_layoutreturn,
-+};
++/* Create a layoutrecall structure for each client based on the
++ * original structure. */
++int
++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len,
++			  struct nfsd4_pnfs_cb_layout *cbl,
++			  struct nfs4_file *lrfile)
++{
++	struct nfs4_client *clp;
++	struct create_recall_list_arg arg = {
++		.cbl = cbl,
++		.lrfile = lrfile,
++		.todolist = todolist,
++	};
++	int status = 0;
 +
-+MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs");
-+MODULE_AUTHOR("Benny Halevy <bhalevy at panasas.com>");
-+MODULE_LICENSE("GPL");
++	dprintk("%s: -->\n", __func__);
 +
-+static int __init
-+panlayout_init(void)
-+{
-+	int ret = pnfs_register_layoutdriver(&panlayout_type);
++	/* If client given by fs, just do single client */
++	if (cbl->cbl_seg.clientid) {
++		clp = find_confirmed_client(
++				(clientid_t *)&cbl->cbl_seg.clientid);
++		if (!clp) {
++			status = -ENOENT;
++			dprintk("%s: clientid %llx not found\n", __func__,
++				(unsigned long long)cbl->cbl_seg.clientid);
++			goto out;
++		}
 +
-+	if (ret)
-+		printk(KERN_INFO
-+			"%s: Registering Panasas OSD pNFS Layout Driver failed: error=%d\n",
-+			__func__, ret);
-+	else
-+		printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n",
-+			__func__);
-+	return ret;
-+}
++		status = lo_recall_per_client(clp, &arg);
++	} else {
++		/* Check all clients for layout matches */
++		status = filter_confirmed_clients(lo_recall_per_client, &arg);
++	}
 +
-+static void __exit
-+panlayout_exit(void)
-+{
-+	pnfs_unregister_layoutdriver(&panlayout_type);
-+	printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n",
-+	       __func__);
++out:
++	*todo_len = arg.todo_count;
++	dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status);
++	return status;
 +}
 +
-+module_init(panlayout_init);
-+module_exit(panlayout_exit);
-diff --git a/fs/nfs/objlayout/panfs_shim.h b/fs/nfs/objlayout/panfs_shim.h
-new file mode 100644
-index 0000000..18ef6db
---- /dev/null
-+++ b/fs/nfs/objlayout/panfs_shim.h
-@@ -0,0 +1,482 @@
 +/*
-+ *  panfs_shim.h
-+ *
-+ *  Data types and external function declerations for interfacing with
-+ *  panfs (Panasas DirectFlow) I/O stack
-+ *
-+ *  Copyright (C) 2007 Panasas Inc.
-+ *  All rights reserved.
-+ *
-+ *  Benny Halevy <bhalevy at panasas.com>
-+ *
-+ *  Redistribution and use in source and binary forms, with or without
-+ *  modification, are permitted provided that the following conditions
-+ *  are met:
-+ *
-+ *  1. Redistributions of source code must retain the above copyright
-+ *     notice, this list of conditions and the following disclaimer.
-+ *  2. Redistributions in binary form must reproduce the above copyright
-+ *     notice, this list of conditions and the following disclaimer in the
-+ *     documentation and/or other materials provided with the distribution.
-+ *  3. Neither the name of the Panasas company nor the names of its
-+ *     contributors may be used to endorse or promote products derived
-+ *     from this software without specific prior written permission.
-+ *
-+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
-+ * See the file COPYING included with this distribution for more details.
-+ *
++ * Recall layouts asynchronously
++ * Called with state lock.
 + */
++static int
++spawn_layout_recall(struct super_block *sb, struct list_head *todolist,
++		    unsigned todo_len)
++{
++	struct nfs4_layoutrecall *pending;
++	struct nfs4_layoutrecall *parent = NULL;
++	int status = 0;
 +
-+#ifndef _PANLAYOUT_PANFS_SHIM_H
-+#define _PANLAYOUT_PANFS_SHIM_H
-+
-+typedef s8 pan_int8_t;
-+typedef u8 pan_uint8_t;
-+typedef s16 pan_int16_t;
-+typedef u16 pan_uint16_t;
-+typedef s32 pan_int32_t;
-+typedef u32 pan_uint32_t;
-+typedef s64 pan_int64_t;
-+typedef u64 pan_uint64_t;
++	dprintk("%s: -->\n", __func__);
 +
-+/*
-+ * from pan_base_types.h
-+ */
-+typedef  pan_uint64_t pan_rpc_none_t;
-+typedef pan_uint32_t  pan_rpc_arrdim_t;
-+typedef pan_uint32_t  pan_status_t;
-+typedef pan_uint8_t   pan_otw_t;
-+typedef pan_uint8_t   pan_pad_t;
++	if (todo_len > 1) {
++		pending = list_entry(todolist->next, struct nfs4_layoutrecall,
++				     clr_perclnt);
 +
-+typedef pan_uint32_t  pan_timespec_sec_t;
-+typedef pan_uint32_t  pan_timespec_nsec_t;
++		parent = alloc_init_layoutrecall(&pending->cb, NULL,
++						 pending->clr_file);
++		if (unlikely(!parent)) {
++			/* We want forward progress. If parent cannot be
++			 * allocated, take the first one as parent but don't
++			 * execute it.  Caller must check for -EAGAIN; if so,
++			 * when the partial recalls return,
++			 * nfsd_layout_recall_cb should be called again.
++			 */
++			list_del_init(&pending->clr_perclnt);
++			if (todo_len > 2) {
++				parent = pending;
++			} else {
++				parent = NULL;
++				put_layoutrecall(pending);
++			}
++			--todo_len;
++				status = -ENOMEM;
++		}
++	}
 +
-+typedef  struct pan_timespec_s  pan_timespec_t;
-+struct pan_timespec_s {
-+  pan_timespec_sec_t   ts_sec;
-+  pan_timespec_nsec_t  ts_nsec;
-+};
++	while (!list_empty(todolist)) {
++		pending = list_entry(todolist->next, struct nfs4_layoutrecall,
++				     clr_perclnt);
++		list_del_init(&pending->clr_perclnt);
++		dprintk("%s: clp %p cb_client %p fp %p\n", __func__,
++			pending->clr_client,
++			pending->clr_client->cl_cb_client,
++			pending->clr_file);
++		if (unlikely(!pending->clr_client->cl_cb_client)) {
++			printk(KERN_INFO
++				"%s: clientid %08x/%08x has no callback path\n",
++				__func__,
++				pending->clr_client->cl_clientid.cl_boot,
++				pending->clr_client->cl_clientid.cl_id);
++			put_layoutrecall(pending);
++			continue;
++		}
 +
-+/*
-+ * from pan_std_types.h
-+ */
-+typedef pan_uint32_t pan_size_t;
-+typedef  int  pan_bool_t;
++		pending->clr_time = CURRENT_TIME;
++		pending->clr_sb = sb;
++		if (parent) {
++			/* If we created a parent its initial ref count is 1.
++			 * We will need to de-ref it eventually. So we just
++			 * don't increment on behalf of the last one.
++			 */
++			if (todo_len != 1)
++				get_layoutrecall(parent);
++		}
++		pending->parent = parent;
++		get_layoutrecall(pending);
++		/* Add to list so corresponding layoutreturn can find req */
++		list_add(&pending->clr_perclnt,
++			 &pending->clr_client->cl_layoutrecalls);
 +
-+/*
-+ * from pan_common_error.h
-+ */
-+#define PAN_SUCCESS                                         ((pan_status_t)0)
-+#define PAN_ERR_IN_PROGRESS                                 ((pan_status_t)55)
++		nfsd4_cb_layout(pending);
++		--todo_len;
++	}
 +
-+/*
-+ * from pan_sg.h
-+ */
-+typedef struct pan_sg_entry_s pan_sg_entry_t;
-+struct pan_sg_entry_s {
-+  void                  *buffer;       /* pointer to memory */
-+  pan_uint32_t           chunk_size;   /* size of each chunk (bytes) */
-+  pan_sg_entry_t        *next;
-+};
++	return status;
++}
 +
 +/*
-+ * from pan_storage.h
++ * Spawn a thread to perform a layout recall
++ *
 + */
-+typedef pan_uint64_t pan_stor_dev_id_t;
-+typedef pan_uint32_t pan_stor_obj_grp_id_t;
-+typedef pan_uint64_t pan_stor_obj_uniq_t;
-+typedef pan_uint32_t pan_stor_action_t;
-+typedef pan_uint8_t pan_stor_cap_key_t[20];
-+
-+typedef pan_uint8_t pan_stor_key_type_t;
-+typedef pan_uint64_t pan_stor_len_t;
-+typedef pan_int64_t pan_stor_delta_len_t;
-+typedef pan_uint64_t pan_stor_offset_t;
-+typedef pan_uint16_t pan_stor_op_t;
-+
-+typedef pan_uint16_t pan_stor_sec_level_t;
++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode,
++			  struct nfsd4_pnfs_cb_layout *cbl)
++{
++	int status;
++	struct nfs4_file *lrfile = NULL;
++	struct list_head todolist;
++	unsigned todo_len = 0;
 +
-+struct pan_stor_obj_id_s {
-+  pan_stor_dev_id_t      dev_id;
-+  pan_stor_obj_uniq_t    obj_id;
-+  pan_stor_obj_grp_id_t  grp_id;
-+};
++	dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl);
++	BUG_ON(!cbl);
++	BUG_ON(cbl->cbl_recall_type != RETURN_FILE &&
++	       cbl->cbl_recall_type != RETURN_FSID &&
++	       cbl->cbl_recall_type != RETURN_ALL);
++	BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode);
++	BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ &&
++	       cbl->cbl_seg.iomode != IOMODE_RW &&
++	       cbl->cbl_seg.iomode != IOMODE_ANY);
 +
-+typedef struct pan_stor_obj_id_s pan_stor_obj_id_t;
++	if (nfsd_serv == NULL) {
++		dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n");
++		return -ENOENT;
++	}
 +
-+#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U)
-+#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U)
-+#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U)
-+#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U)
-+#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U)
-+#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U)
-+#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U)
-+#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U)
++	nfs4_lock_state();
++	status = -ENOENT;
++	if (inode) {
++		lrfile = find_file(inode);
++		if (!lrfile) {
++			dprintk("NFSD nfsd_layout_recall_cb: "
++				"nfs4_file not found\n");
++			goto err;
++		}
++		if (cbl->cbl_recall_type == RETURN_FSID)
++			cbl->cbl_fsid = lrfile->fi_fsid;
++	}
 +
-+/*
-+ * from pan_aggregation_map.h
-+ */
-+typedef pan_uint8_t pan_agg_type_t;
-+typedef pan_uint64_t pan_agg_map_version_t;
-+typedef pan_uint8_t pan_agg_obj_state_t;
-+typedef pan_uint8_t pan_agg_comp_state_t;
-+typedef pan_uint8_t pan_agg_comp_flag_t;
++	INIT_LIST_HEAD(&todolist);
 +
-+#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00)
-+#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01)
-+#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02)
-+#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03)
-+#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04)
-+#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05)
-+#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06)
-+#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07)
-+#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00)
-+#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01)
-+#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02)
-+#define PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03)
-+#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00)
-+#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01)
-+#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02)
-+#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04)
++	/* If no cookie provided by FS, return a default one */
++	if (!cbl->cbl_cookie)
++		cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
 +
-+struct pan_aggregation_map_s {
-+  pan_agg_map_version_t  version;
-+  pan_agg_obj_state_t    avail_state;
-+  pan_stor_obj_id_t      obj_id;
-+};
++	status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile);
++	if (list_empty(&todolist)) {
++		status = -ENOENT;
++	} else {
++		/* process todolist even if create_layout_recall_list
++		 * returned an error */
++		int status2 = spawn_layout_recall(sb, &todolist, todo_len);
++		if (status2)
++			status = status2;
++	}
 +
-+typedef struct pan_aggregation_map_s pan_aggregation_map_t;
++err:
++	nfs4_unlock_state();
++	if (lrfile)
++		put_nfs4_file(lrfile);
++	return (todo_len && status) ? -EAGAIN : status;
++}
 +
-+struct pan_agg_comp_obj_s {
-+  pan_stor_dev_id_t     dev_id;
-+  pan_agg_comp_state_t  avail_state;
-+  pan_agg_comp_flag_t   comp_flags;
++struct create_device_notify_list_arg {
++	struct list_head *todolist;
++	struct nfsd4_pnfs_cb_dev_list *ndl;
 +};
 +
-+typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t;
-+
-+struct pan_agg_simple_header_s {
-+  pan_uint8_t  unused;
-+};
++static int
++create_device_notify_per_cl(struct nfs4_client *clp, void *p)
++{
++	struct nfs4_notify_device *cbnd;
++	struct create_device_notify_list_arg *arg = p;
 +
-+typedef struct pan_agg_simple_header_s pan_agg_simple_header_t;
++	if (atomic_read(&clp->cl_deviceref) <= 0)
++		return 0;
 +
-+struct pan_agg_raid1_header_s {
-+  pan_uint16_t  num_comps;
-+};
++	cbnd = kzalloc(sizeof(*cbnd), GFP_KERNEL);
++	if (!cbnd)
++		return -ENOMEM;
 +
-+typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t;
++	cbnd->nd_list = arg->ndl;
++	cbnd->nd_client = clp;
++	INIT_WORK(&cbnd->nd_recall.cb_work, nfsd4_do_callback_rpc);
++	list_add(&cbnd->nd_perclnt, arg->todolist);
++	return 0;
++}
 +
-+struct pan_agg_raid0_header_s {
-+  pan_uint16_t  num_comps;
-+  pan_uint32_t  stripe_unit;
-+};
++/* Create a list of clients to send device notifications. */
++int
++create_device_notify_list(struct list_head *todolist,
++			  struct nfsd4_pnfs_cb_dev_list *ndl)
++{
++	int status;
++	struct create_device_notify_list_arg arg = {
++		.todolist = todolist,
++		.ndl = ndl,
++	};
 +
-+typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t;
++	nfs4_lock_state();
++	status = filter_confirmed_clients(create_device_notify_per_cl, &arg);
++	nfs4_unlock_state();
 +
-+struct pan_agg_raid5_left_header_s {
-+  pan_uint16_t  num_comps;
-+  pan_uint32_t  stripe_unit0;
-+  pan_uint32_t  stripe_unit1;
-+  pan_uint32_t  stripe_unit2;
-+};
++	return status;
++}
 +
-+typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t;
++/*
++ * For each client that has a device, send a device notification.
++ * XXX: Need to track which clients have which devices.
++ */
++int nfsd_device_notify_cb(struct super_block *sb,
++			  struct nfsd4_pnfs_cb_dev_list *ndl)
++{
++	struct nfs4_notify_device *cbnd;
++	struct nfs4_client *nd_client;
++	unsigned int notify_num = 0;
++	int status = 0;
++	struct list_head todolist;
 +
-+typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t;
++	BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list);
 +
-+struct pan_agg_grp_raid5_left_header_s {
-+  pan_uint16_t  num_comps;
-+  pan_uint32_t  stripe_unit;
-+  pan_uint16_t  rg_width;
-+  pan_uint16_t  rg_depth;
-+  pan_uint8_t   group_layout_policy;
-+};
++	dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len);
 +
-+#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00)
-+#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01)
++	if (nfsd_serv == NULL)
++		return -ENOENT;
 +
-+#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00)
-+#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01)
-+#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02)
-+#define PAN_AGG_RAID0 ((pan_agg_type_t) 0x03)
-+#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04)
-+#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06)
-+#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01)
-+#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06)
++	INIT_LIST_HEAD(&todolist);
 +
-+struct pan_agg_layout_hdr_s {
-+  pan_agg_type_t type;
-+  pan_pad_t pad[3];
-+  union {
-+    pan_uint64_t                        null;
-+    pan_agg_simple_header_t             simple;
-+    pan_agg_raid1_header_t              raid1;
-+    pan_agg_raid0_header_t              raid0;
-+    pan_agg_raid5_left_header_t         raid5_left;
-+    pan_agg_grp_raid5_left_header_t     grp_raid5_left;
-+  } hdr;
-+};
++	status = create_device_notify_list(&todolist, ndl);
 +
-+typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t;
++	while (!list_empty(&todolist)) {
++		cbnd = list_entry(todolist.next, struct nfs4_notify_device,
++				  nd_perclnt);
++		list_del_init(&cbnd->nd_perclnt);
++		nd_client = cbnd->nd_client;
++		nfsd4_cb_notify_device(cbnd);
++		pnfs_clear_device_notify(nd_client);
++		notify_num++;
++	}
 +
-+struct pan_agg_comp_obj_a_s {
-+  pan_rpc_arrdim_t size;
-+  pan_agg_comp_obj_t *data;
-+};
-+typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a;
++	dprintk("NFSD %s: status %d clients %u\n",
++		__func__, status, notify_num);
++	return status;
++}
+diff -up linux-2.6.37.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4pnfsdlm.c
+--- linux-2.6.37.noarch/fs/nfsd/nfs4pnfsdlm.c.orig	2011-01-28 09:43:53.354769959 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfs4pnfsdlm.c	2011-01-28 09:43:53.354769959 -0500
+@@ -0,0 +1,461 @@
++/******************************************************************************
++ *
++ * (c) 2007 Network Appliance, Inc.  All Rights Reserved.
++ * (c) 2009 NetApp.  All Rights Reserved.
++ *
++ * NetApp provides this source code under the GPL v2 License.
++ * The GPL v2 license is available at
++ * http://opensource.org/licenses/gpl-license.php.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ ******************************************************************************/
 +
-+struct pan_agg_full_map_s {
-+  pan_aggregation_map_t  map_hdr;
-+  pan_agg_layout_hdr_t   layout_hdr;
-+  pan_agg_comp_obj_a     components;
-+};
++#include <linux/nfs4.h>
++#include <linux/nfsd/const.h>
++#include <linux/nfsd/debug.h>
++#include <linux/nfsd/nfs4pnfsdlm.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
++#include <linux/sunrpc/clnt.h>
 +
-+typedef struct pan_agg_full_map_s pan_agg_full_map_t;
++#include "nfsfh.h"
++#include "nfsd.h"
 +
-+/*
-+ * from pan_obsd_rpc_types.h
++#define NFSDDBG_FACILITY                NFSDDBG_PROC
++
++/* Just use a linked list. We do not expect more than 32 dlm_device_entries;
++ * the first implementation will just use one device per cluster file system.
 + */
-+typedef pan_uint8_t pan_obsd_security_key_a[16];
 +
-+typedef pan_uint8_t pan_obsd_capability_key_a[20];
++static LIST_HEAD(dlm_device_list);
++static DEFINE_SPINLOCK(dlm_device_list_lock);
 +
-+typedef pan_uint8_t pan_obsd_key_holder_id_t;
++struct dlm_device_entry {
++	struct list_head	dlm_dev_list;
++	char			disk_name[DISK_NAME_LEN];
++	int			num_ds;
++	char			ds_list[NFSD_DLM_DS_LIST_MAX];
++};
 +
-+#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01)
-+#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02)
++static struct dlm_device_entry *
++_nfsd4_find_pnfs_dlm_device(char *disk_name)
++{
++	struct dlm_device_entry *dlm_pdev;
 +
-+struct pan_obsd_key_holder_s {
-+  pan_obsd_key_holder_id_t select;
-+  pan_pad_t pad[3];
-+  union {
-+    pan_obsd_security_key_a    basis_key;
-+    pan_obsd_capability_key_a  cap_key;
-+  } key;
-+};
++	dprintk("--> %s  disk name %s\n", __func__, disk_name);
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) {
++		dprintk("%s Look for dlm_pdev %s\n", __func__,
++			dlm_pdev->disk_name);
++		if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) {
++			spin_unlock(&dlm_device_list_lock);
++			return dlm_pdev;
++		}
++	}
++	spin_unlock(&dlm_device_list_lock);
++	return NULL;
++}
 +
-+typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t;
++static struct dlm_device_entry *
++nfsd4_find_pnfs_dlm_device(struct super_block *sb) {
++	char dname[BDEVNAME_SIZE];
 +
-+/*
-+ * from pan_sm_sec.h
-+ */
-+typedef pan_uint8_t pan_sm_sec_type_t;
-+typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t;
++	bdevname(sb->s_bdev, dname);
++	return _nfsd4_find_pnfs_dlm_device(dname);
++}
 +
-+struct pan_obsd_capability_generic_otw_t_s {
-+  pan_rpc_arrdim_t size;
-+  pan_uint8_t *data;
-+};
-+typedef struct pan_obsd_capability_generic_otw_t_s
-+				pan_obsd_capability_generic_otw_t;
++ssize_t
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
++{
++	char *pos = buf;
++	ssize_t size = 0;
++	struct dlm_device_entry *dlm_pdev;
++	int ret = -EINVAL;
 +
-+struct pan_sm_sec_obsd_s {
-+  pan_obsd_key_holder_t              key;
-+  pan_obsd_capability_generic_otw_t  cap_otw;
-+  pan_sm_sec_otw_allo_mode_t         allo_mode;
-+};
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list)
++	{
++		int advanced;
++		advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list);
++		if (advanced >= buflen - size)
++			goto out;
++		size += advanced;
++		pos += advanced;
++	}
++	ret = size;
 +
-+typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t;
++out:
++	spin_unlock(&dlm_device_list_lock);
++	return ret;
++}
 +
-+struct pan_sm_sec_s {
-+  pan_sm_sec_type_t type;
-+  pan_pad_t pad[3];
-+  union {
-+    pan_rpc_none_t     none;
-+    pan_sm_sec_obsd_t  obsd;
-+  } variant;
-+};
++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
++{
++	char *start = ds_list;
 +
-+typedef struct pan_sm_sec_s pan_sm_sec_t;
++	*num_ds = 0;
 +
-+struct pan_sm_sec_a_s {
-+  pan_rpc_arrdim_t size;
-+  pan_sm_sec_t *data;
-+};
-+typedef struct pan_sm_sec_a_s pan_sm_sec_a;
-+typedef pan_otw_t *pan_sm_sec_otw_t;
++	while (*start) {
++		struct sockaddr_storage tempAddr;
++		int ipLen = strcspn(start, ",");
++
++		if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
++			return false;
++		(*num_ds)++;
++		start += ipLen + 1;
++	}
++	return true;
++}
 +
 +/*
-+ * from pan_sm_types.h
++ * pnfs_dlm_device string format:
++ *     block-device-path:<ds1 ipv4 address>,<ds2 ipv4 address>
++ *
++ * Examples
++ *     '/dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
++ *     two data servers for the dlm cluster file system mounted on /dev/sda.
++ *
++ *     '/dev/sda:192.168.1.96,192.168.1.100'
++ *     replaces the data server list for /dev/sda
++ *
++ *     Only the deviceid == 1 is supported. Can add device id to
++ *     pnfs_dlm_device string when needed.
++ *
++ *     Only the round-robin stripe index (each data server used once) is supported.
 + */
-+typedef pan_uint64_t pan_sm_cap_handle_t;
++int
++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
 +
-+struct pan_sm_map_cap_s {
-+  pan_agg_full_map_t   full_map;
-+  pan_stor_offset_t    offset;
-+  pan_stor_len_t       length;
-+  pan_sm_sec_a         secs;
-+  pan_sm_cap_handle_t  handle;
-+  pan_timespec_t       expiration_time;
-+  pan_stor_action_t    action_mask;
-+  pan_uint32_t         flags;
-+};
++{
++	struct dlm_device_entry *new, *found;
++	char *bufp = pnfs_dlm_device;
++	char *endp = bufp + strlen(bufp);
++	int err = -ENOMEM;
 +
-+typedef struct pan_sm_map_cap_s pan_sm_map_cap_t;
++	dprintk("--> %s len %d\n", __func__, len);
 +
-+/*
-+ * from pan_sm_ops.h
-+ */
-+typedef pan_rpc_none_t pan_sm_cache_ptr_t;
++	new = kzalloc(sizeof(*new), GFP_KERNEL);
++	if (!new)
++		return err;
 +
-+/*
-+ * from pan_sam_api.h
-+ */
-+typedef pan_uint32_t    pan_sam_access_flags_t;
++	err = -EINVAL;
++	/* disk_name */
++	/* FIXME: need to check for valid disk_name. search superblocks?
++	 * check for slash dev slash ?
++	 */
++	len = strcspn(bufp, ":");
++	if (len > DISK_NAME_LEN)
++		goto out_free;
++	memcpy(new->disk_name, bufp, len);
 +
-+typedef struct pan_sam_dev_error_s  pan_sam_dev_error_t;
-+struct pan_sam_dev_error_s {
-+    pan_stor_dev_id_t       dev_id;
-+    pan_stor_op_t           stor_op;
-+    pan_status_t            error;
-+};
++	err = -EINVAL;
++	bufp += len + 1;
++	if (bufp >= endp)
++		goto out_free;
 +
-+typedef struct pan_sam_ext_status_s pan_sam_ext_status_t;
-+struct pan_sam_ext_status_s {
-+    pan_uint32_t        available;
-+    pan_uint32_t        size;
-+    pan_sam_dev_error_t *errors;
-+};
++	/* data server list */
++	/* FIXME: need to check for comma separated valid ip format */
++	len = strcspn(bufp, ":");
++	if (len > NFSD_DLM_DS_LIST_MAX)
++		goto out_free;
++	memcpy(new->ds_list, bufp, len);
 +
-+enum pan_sam_rpc_sec_sel_e {
-+    PAN_SAM_RPC_SEC_DEFAULT,
-+    PAN_SAM_RPC_SEC_ATLEAST,
-+    PAN_SAM_RPC_SEC_EXACTLY
-+};
-+typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t;
 +
-+typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t;
-+struct pan_sam_obj_sec_s {
-+    pan_stor_sec_level_t    min_security;
-+    pan_sm_map_cap_t        *map_ccaps;
-+};
++	/*  validate the ips */
++	if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds)))
++		goto out_free;
 +
-+typedef struct  pan_sam_rpc_sec_s   pan_sam_rpc_sec_t;
-+struct pan_sam_rpc_sec_s {
-+    pan_sam_rpc_sec_sel_t   selector;
-+};
++	dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__,
++		new->disk_name, new->num_ds, new->ds_list);
 +
-+typedef struct pan_sam_read_args_s pan_sam_read_args_t;
-+struct pan_sam_read_args_s {
-+    pan_stor_obj_id_t                obj_id;
-+    pan_sm_cache_ptr_t               obj_ent;
-+    void                            *return_attr;
-+    void                            *checksum;
-+    pan_stor_offset_t                offset;
-+    pan_uint16_t                     sm_options;
-+    void                            *callout;
-+    void                            *callout_arg;
-+};
++	found = _nfsd4_find_pnfs_dlm_device(new->disk_name);
++	if (found) {
++		/* FIXME: should compare found->ds_list with new->ds_list
++		 * and if it is different, kick off a CB_NOTIFY change
++		 * deviceid.
++		 */
++		dprintk("%s pnfs_dlm_device %s:%s already in cache "
++			" replace ds_list with new ds_list %s\n", __func__,
++			found->disk_name, found->ds_list, new->ds_list);
++		memset(found->ds_list, 0, DISK_NAME_LEN);
++		memcpy(found->ds_list, new->ds_list, strlen(new->ds_list));
++		found->num_ds = new->num_ds;
++		kfree(new);
++	} else {
++		dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__,
++				new->disk_name, new->ds_list);
++		spin_lock(&dlm_device_list_lock);
++		list_add(&new->dlm_dev_list, &dlm_device_list);
++		spin_unlock(&dlm_device_list_lock);
++	}
++	dprintk("<-- %s Success\n", __func__);
++	return 0;
 +
-+typedef struct pan_sam_read_res_s pan_sam_read_res_t;
-+struct pan_sam_read_res_s {
-+    pan_status_t             result;
-+    pan_sam_ext_status_t     ext_status;
-+    pan_stor_len_t           length;
-+    void                    *attr;
-+    void                    *checksum;
-+};
++out_free:
++	kfree(new);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
++}
 +
-+typedef void (*pan_sam_read_cb_t)(
-+    void                *user_arg1,
-+    void                *user_arg2,
-+    pan_sam_read_res_t  *res_p,
-+    pan_status_t        status);
++void nfsd4_pnfs_dlm_shutdown(void)
++{
++	struct dlm_device_entry *dlm_pdev, *next;
 +
-+#define PAN_SAM_ACCESS_NONE                             0x0000
-+#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP                 0x0020
++	dprintk("--> %s\n", __func__);
 +
-+typedef struct pan_sam_write_args_s pan_sam_write_args_t;
-+struct pan_sam_write_args_s {
-+    pan_stor_obj_id_t   obj_id;
-+    pan_sm_cache_ptr_t  obj_ent;
-+    pan_stor_offset_t   offset;
-+    void                *attr;
-+    void                *return_attr;
-+};
++	spin_lock(&dlm_device_list_lock);
++	list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list,
++				  dlm_dev_list) {
++		list_del(&dlm_pdev->dlm_dev_list);
++		kfree(dlm_pdev);
++	}
++	spin_unlock(&dlm_device_list_lock);
++}
 +
-+typedef struct pan_sam_write_res_s pan_sam_write_res_t;
-+struct pan_sam_write_res_s {
-+    pan_status_t            result;
-+    pan_sam_ext_status_t    ext_status;
-+    pan_stor_len_t          length;
-+    pan_stor_delta_len_t    delta_capacity_used;
-+    pan_bool_t              parity_dirty;
-+    void                   *attr;
-+};
++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb,
++				     u32 layout_type,
++				     struct nfsd4_pnfs_dev_iter_res *res)
++{
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return -ENOTSUPP;
++	}
 +
-+typedef void (*pan_sam_write_cb_t)(
-+    void                *user_arg1,
-+    void                *user_arg2,
-+    pan_sam_write_res_t *res_p,
-+    pan_status_t        status);
++	res->gd_eof = 1;
++	if (res->gd_cookie)
++		return -ENOENT;
 +
-+/*
-+ * from pan_mgr_types.h
-+ */
-+#define PAN_MGR_ID_TYPE_SHIFT 56
-+#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL)
-+#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL)
++	res->gd_cookie = 1;
++	res->gd_verf = 1;
++	res->gd_devid = 1;
++	return 0;
++}
 +
-+typedef pan_uint16_t pan_mgr_type_t;
-+typedef pan_uint64_t pan_mgr_id_t;
++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
++				     struct exp_xdr_stream *xdr,
++				     u32 layout_type,
++				     const struct nfsd4_pnfs_deviceid *devid)
++{
++	int err, len, i = 0;
++	struct pnfs_filelayout_device fdev;
++	struct pnfs_filelayout_devaddr *daddr;
++	struct dlm_device_entry *dlm_pdev;
++	char   *bufp;
 +
-+#define PAN_MGR_SM ((pan_mgr_type_t) 2U)
-+#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U)
++	err = -ENOTSUPP;
++	if (layout_type != LAYOUT_NFSV4_1_FILES) {
++		dprintk("%s: ERROR: layout type isn't 'file' "
++			"(type: %x)\n", __func__, layout_type);
++		return err;
++	}
 +
-+/*
-+ * from pan_mgr_types_c.h
-+ */
-+#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \
-+  pan_mgr_id_t  _id1, _id2; \
-+\
-+  _id1 = (_mgr_type_); \
-+  _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \
-+  _id1 &= PAN_MGR_ID_TYPE_MASK; \
-+  _id2 = (_mgr_uniq_); \
-+  _id2 &= PAN_MGR_ID_UNIQ_MASK; \
-+  _id1 |= _id2; \
-+  *(_mgr_id_p_) = _id1; \
-+}
++	/* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
++	 * with a gdia_device_id != 1 is invalid.
++	 */
++	err = -EINVAL;
++	if (devid->devid != 1) {
++		dprintk("%s: WARNING: didn't receive a deviceid of "
++			"1 (got: 0x%llx)\n", __func__, devid->devid);
++		return err;
++	}
 +
-+/*
-+ * from pan_storage_c.h
-+ */
-+#define pan_stor_is_device_id_an_obsd_id(_device_id_) \
-+    ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \
-+	== PAN_MGR_OBSD)
++	/*
++	 * If the DS list has not been established, return -EINVAL
++	 */
++	dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
++	if (!dlm_pdev) {
++		dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
++			sb->s_bdev->bd_disk->disk_name);
++		return err;
++	}
 +
-+/*
-+ * pnfs_shim internal definitions
-+ */
++	dprintk("%s: Found disk %s with DS list |%s|\n",
++		__func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
 +
-+struct panfs_shim_io_state {
-+	struct objlayout_io_state ol_state;
++	memset(&fdev, '\0', sizeof(fdev));
++	fdev.fl_device_length = dlm_pdev->num_ds;
 +
-+	pan_sg_entry_t *sg_list;
-+	pan_sam_obj_sec_t obj_sec;
-+	void *ucreds;
-+	union {
-+		struct {
-+			pan_sam_read_args_t args;
-+			pan_sam_read_res_t res;
-+		} read;
-+		struct {
-+			pan_sam_write_args_t args;
-+			pan_sam_write_res_t res;
-+		} write;
-+	} u;
-+};
++	err = -ENOMEM;
++	len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
++	fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
++	if (!fdev.fl_device_list) {
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
++			"buffer for %d DSes.\n", __func__, i);
++		fdev.fl_device_length = 0;
++		goto out;
++	}
 +
-+#endif /* _PANLAYOUT_PANFS_SHIM_H */
-diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
-new file mode 100644
-index 0000000..d05c6be
---- /dev/null
-+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
-@@ -0,0 +1,435 @@
-+/*
-+ *  pnfs_osd_xdr.c
-+ *
-+ *  Object-Based pNFS Layout XDR layer
-+ *
-+ *  Copyright (C) 2007-2009 Panasas Inc.
-+ *  All rights reserved.
-+ *
-+ *  Benny Halevy <bhalevy at panasas.com>
-+ *
-+ *  This program is free software; you can redistribute it and/or modify
-+ *  it under the terms of the GNU General Public License version 2
-+ *  See the file COPYING included with this distribution for more details.
-+ *
-+ *  Redistribution and use in source and binary forms, with or without
-+ *  modification, are permitted provided that the following conditions
-+ *  are met:
-+ *
-+ *  1. Redistributions of source code must retain the above copyright
-+ *     notice, this list of conditions and the following disclaimer.
-+ *  2. Redistributions in binary form must reproduce the above copyright
-+ *     notice, this list of conditions and the following disclaimer in the
-+ *     documentation and/or other materials provided with the distribution.
-+ *  3. Neither the name of the Panasas company nor the names of its
-+ *     contributors may be used to endorse or promote products derived
-+ *     from this software without specific prior written permission.
-+ *
-+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ */
++	/* Set up a simple stripe index list */
++	fdev.fl_stripeindices_length = fdev.fl_device_length;
++	fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
++				     fdev.fl_stripeindices_length, GFP_KERNEL);
 +
-+#include <linux/pnfs_osd_xdr.h>
++	if (!fdev.fl_stripeindices_list) {
++		printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
++			"list buffer for %d DSes.\n", __func__, i);
++		goto out;
++	}
++	for (i = 0; i < fdev.fl_stripeindices_length; i++)
++		fdev.fl_stripeindices_list[i] = i;
 +
-+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
++	/* Transfer the data server list with a single multipath entry */
++	bufp = dlm_pdev->ds_list;
++	for (i = 0; i < fdev.fl_device_length; i++) {
++		daddr = kmalloc(sizeof(*daddr), GFP_KERNEL);
++		if (!daddr) {
++			printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++				"addr buffer.\n", __func__);
++			goto out;
++		}
 +
-+/*
-+ * The following implementation is based on these Internet Drafts:
-+ *
-+ * draft-ietf-nfsv4-minorversion-21
-+ * draft-ietf-nfsv4-pnfs-obj-12
-+ */
++		daddr->r_netid.data = "tcp";
++		daddr->r_netid.len = 3;
 +
-+/*
-+ * struct pnfs_osd_objid {
-+ * 	struct pnfs_deviceid	oid_device_id;
-+ * 	u64			oid_partition_id;
-+ * 	u64			oid_object_id;
-+ * };
-+ */
-+static inline u32 *
-+pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid)
-+{
-+	COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data));
-+	READ64(objid->oid_partition_id);
-+	READ64(objid->oid_object_id);
-+	return p;
++		len = strcspn(bufp, ",");
++		daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
++		memcpy(daddr->r_addr.data, bufp, len);
++		/*
++		 * append the port number.  interpreted as two more bytes
++		 * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
++		 */
++		memcpy(daddr->r_addr.data + len, ".8.1", 4);
++		daddr->r_addr.len = len + 4;
++
++		fdev.fl_device_list[i].fl_multipath_length = 1;
++		fdev.fl_device_list[i].fl_multipath_list = daddr;
++
++		dprintk("%s: encoding DS |%s|\n", __func__, bufp);
++
++		bufp += len + 1;
++	}
++
++	/* have nfsd encode the device info */
++	err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++	for (i = 0; i < fdev.fl_device_length; i++)
++		kfree(fdev.fl_device_list[i].fl_multipath_list);
++	kfree(fdev.fl_device_list);
++	kfree(fdev.fl_stripeindices_list);
++	dprintk("<-- %s returns %d\n", __func__, err);
++	return err;
 +}
 +
-+static inline u32 *
-+pnfs_osd_xdr_decode_opaque_cred(u32 *p,
-+				struct pnfs_osd_opaque_cred *opaque_cred)
++static int get_stripe_unit(int blocksize)
 +{
-+	READ32(opaque_cred->cred_len);
-+	COPYMEM(opaque_cred->cred, opaque_cred->cred_len);
-+	return p;
++	if (blocksize >= NFSSVC_MAXBLKSIZE)
++		return blocksize;
++	return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
 +}
 +
 +/*
-+ * struct pnfs_osd_object_cred {
-+ * 	struct pnfs_osd_objid		oc_object_id;
-+ * 	u32				oc_osd_version;
-+ * 	u32				oc_cap_key_sec;
-+ * 	struct pnfs_osd_opaque_cred	oc_cap_key
-+ * 	struct pnfs_osd_opaque_cred	oc_cap;
-+ * };
++ * Look up inode block device in pnfs_dlm_device list.
++ * Hash on the inode->i_ino and number of data servers.
 + */
-+static inline u32 *
-+pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp,
-+				u8 **credp)
++static int dlm_ino_hash(struct inode *ino)
 +{
-+	u8 *cred;
++	struct dlm_device_entry *de;
++	u32 hash_mask = 0;
 +
-+	p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id);
-+	READ32(comp->oc_osd_version);
-+	READ32(comp->oc_cap_key_sec);
++	/* If we can't find the inode's block device in the pnfs_dlm_device
++	 * list then don't hand out a layout
++	 */
++	de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
++	if (!de)
++		return -1;
++	hash_mask = de->num_ds - 1;
++	return ino->i_ino & hash_mask;
++}
 +
-+	cred = *credp;
-+	comp->oc_cap_key.cred = cred;
-+	p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key);
-+	cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len));
-+	comp->oc_cap.cred = cred;
-+	p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap);
-+	cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len));
-+	*credp = cred;
++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
++			   struct exp_xdr_stream *xdr,
++			   const struct nfsd4_pnfs_layoutget_arg *args,
++			   struct nfsd4_pnfs_layoutget_res *res)
++{
++	struct pnfs_filelayout_layout *layout = NULL;
++	struct knfsd_fh *fhp = NULL;
++	int index;
++	enum nfsstat4 rc = NFS4_OK;
 +
-+	return p;
++	dprintk("%s: LAYOUT_GET\n", __func__);
++
++	/* DLM exported file systems only support layouts for READ */
++	if (res->lg_seg.iomode == IOMODE_RW)
++		return NFS4ERR_BADIOMODE;
++
++	index = dlm_ino_hash(inode);
++	dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
++		inode->i_ino);
++	if (index < 0)
++		return NFS4ERR_LAYOUTUNAVAILABLE;
++
++	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	/* Always give out whole file layouts */
++	res->lg_seg.offset = 0;
++	res->lg_seg.length = NFS4_MAX_UINT64;
++	/* Always give out READ ONLY layouts */
++	res->lg_seg.iomode = IOMODE_READ;
++
++	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++	if (layout == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++	/* Set file layout response args */
++	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++	layout->lg_stripe_type = STRIPE_SPARSE;
++	layout->lg_commit_through_mds = false;
++	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++	layout->lg_fh_length = 1;
++	layout->device_id.sbid = args->lg_sbid;
++	layout->device_id.devid = 1;                                /*FSFTEMP*/
++	layout->lg_first_stripe_index = index;                      /*FSFTEMP*/
++	layout->lg_pattern_offset = 0;
++
++	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++	if (fhp == NULL) {
++		rc = NFS4ERR_LAYOUTTRYLATER;
++		goto error;
++	}
++
++	memcpy(fhp, args->lg_fh, sizeof(*fhp));
++	pnfs_fh_mark_ds(fhp);
++	layout->lg_fh_list = fhp;
++
++	/* Call nfsd to encode layout */
++	rc = filelayout_encode_layout(xdr, layout);
++exit:
++	kfree(layout);
++	kfree(fhp);
++	return rc;
++
++error:
++	res->lg_seg.length = 0;
++	goto exit;
 +}
 +
-+/*
-+ * struct pnfs_osd_data_map {
-+ * 	u32	odm_num_comps;
-+ * 	u64	odm_stripe_unit;
-+ * 	u32	odm_group_width;
-+ * 	u32	odm_group_depth;
-+ * 	u32	odm_mirror_cnt;
-+ * 	u32	odm_raid_algorithm;
-+ * };
-+ */
-+static inline u32 *
-+pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map)
++static int
++nfsd4_pnfs_dlm_layouttype(struct super_block *sb)
 +{
-+	READ32(data_map->odm_num_comps);
-+	READ64(data_map->odm_stripe_unit);
-+	READ32(data_map->odm_group_width);
-+	READ32(data_map->odm_group_depth);
-+	READ32(data_map->odm_mirror_cnt);
-+	READ32(data_map->odm_raid_algorithm);
-+	dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
-+		"odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
-+		__func__,
-+		data_map->odm_num_comps,
-+		(unsigned long long)data_map->odm_stripe_unit,
-+		data_map->odm_group_width,
-+		data_map->odm_group_depth,
-+		data_map->odm_mirror_cnt,
-+		data_map->odm_raid_algorithm);
-+	return p;
++	return LAYOUT_NFSV4_1_FILES;
 +}
 +
-+struct pnfs_osd_layout *
-+pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p)
-+{
-+	int i;
-+	u32 *start = p;
-+	struct pnfs_osd_object_cred *comp;
-+	u8 *cred;
++/* For use by DLM cluster file systems exported by pNFSD */
++const struct pnfs_export_operations pnfs_dlm_export_ops = {
++	.layout_type = nfsd4_pnfs_dlm_layouttype,
++	.get_device_info = nfsd4_pnfs_dlm_getdevinfo,
++	.get_device_iter = nfsd4_pnfs_dlm_getdeviter,
++	.layout_get = nfsd4_pnfs_dlm_layoutget,
++};
++EXPORT_SYMBOL(pnfs_dlm_export_ops);
+diff -up linux-2.6.37.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4pnfsds.c
+--- linux-2.6.37.noarch/fs/nfsd/nfs4pnfsds.c.orig	2011-01-28 09:43:53.355769845 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfs4pnfsds.c	2011-01-28 09:43:53.355769845 -0500
+@@ -0,0 +1,620 @@
++/*
++*  linux/fs/nfsd/nfs4pnfsds.c
++*
++*  Copyright (c) 2005 The Regents of the University of Michigan.
++*  All rights reserved.
++*
++*  Andy Adamson <andros at umich.edu>
++*
++*  Redistribution and use in source and binary forms, with or without
++*  modification, are permitted provided that the following conditions
++*  are met:
++*
++*  1. Redistributions of source code must retain the above copyright
++*     notice, this list of conditions and the following disclaimer.
++*  2. Redistributions in binary form must reproduce the above copyright
++*     notice, this list of conditions and the following disclaimer in the
++*     documentation and/or other materials provided with the distribution.
++*  3. Neither the name of the University nor the names of its
++*     contributors may be used to endorse or promote products derived
++*     from this software without specific prior written permission.
++*
++*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++*  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++*  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++*  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++*  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++*  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++*  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++*  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*
++*/
++#if defined(CONFIG_PNFSD)
++
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
 +
-+	p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map);
-+	READ32(layout->olo_comps_index);
-+	READ32(layout->olo_num_comps);
-+	layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1);
-+	comp = layout->olo_comps;
-+	cred = (u8 *)(comp + layout->olo_num_comps);
-+	dprintk("%s: comps_index=%u num_comps=%u\n",
-+		__func__, layout->olo_comps_index, layout->olo_num_comps);
-+	for (i = 0; i < layout->olo_num_comps; i++) {
-+		p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred);
-+		dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx "
-+			"key_len=%u cap_len=%u\n",
-+			__func__, i,
-+			_DEVID_LO(&comp->oc_object_id.oid_device_id),
-+			_DEVID_HI(&comp->oc_object_id.oid_device_id),
-+			comp->oc_object_id.oid_partition_id,
-+			comp->oc_object_id.oid_object_id,
-+			comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
-+		comp++;
-+	}
-+	dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__,
-+	       (char *)p - (char *)start, cred, (char *)cred - (char *)layout);
-+	return layout;
-+}
++#include <linux/param.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/debug.h>
++#include <linux/nfs4.h>
++#include <linux/exportfs.h>
++#include <linux/sched.h>
++
++#include "nfsd.h"
++#include "pnfsd.h"
++#include "state.h"
 +
 +/*
-+ * Get Device Information Decoding
++ *******************
++ *   	 PNFS
++ *******************
++ */
++/*
++ * Hash tables for pNFS Data Server state
++ *
++ * mds_id_tbl:	list of struct pnfs_mds_id one per Metadata server (MDS) using
++ *		this data server (DS).
++ *
++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained
++ *			from any MDS.
++ *
++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained
++ *			from any MDS.
 + *
-+ * Note: since Device Information is currently done synchronously, most
-+ *       of the actual fields are left inside the rpc buffer and are only
-+ *       pointed to by the pnfs_osd_deviceaddr members. So the read buffer
-+ *       should not be freed while the returned information is in use.
 + */
++/* Hash tables for clientid state */
++#define CLIENT_HASH_BITS                 4
++#define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS)
++#define CLIENT_HASH_MASK                (CLIENT_HASH_SIZE - 1)
 +
-+u32 *__xdr_read_calc_nfs4_string(
-+	u32 *p, struct nfs4_string *str, u8 **freespace)
-+{
-+	u32 len;
-+	char *data;
-+	bool need_copy;
-+
-+	READ32(len);
-+	data = (char *)p;
-+
-+	if (data[len]) { /* Not null terminated we'll need extra space */
-+		data = *freespace;
-+		*freespace += len + 1;
-+		need_copy = true;
-+	} else {
-+		need_copy = false;
-+	}
++#define clientid_hashval(id) \
++	((id) & CLIENT_HASH_MASK)
 +
-+	if (str) {
-+		str->len = len;
-+		str->data = data;
-+		if (need_copy) {
-+			memcpy(data, p, len);
-+			data[len] = 0;
-+		}
-+	}
++/* hash table for pnfs_ds_stateid */
++#define STATEID_HASH_BITS              10
++#define STATEID_HASH_SIZE              (1 << STATEID_HASH_BITS)
++#define STATEID_HASH_MASK              (STATEID_HASH_SIZE - 1)
 +
-+	p += XDR_QUADLEN(len);
-+	return p;
-+}
++#define stateid_hashval(owner_id, file_id)  \
++	(((owner_id) + (file_id)) & STATEID_HASH_MASK)
 +
-+u32 *__xdr_read_calc_u8_opaque(
-+	u32 *p, struct nfs4_string *str)
-+{
-+	u32 len;
++static struct list_head mds_id_tbl;
++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE];
++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE];
 +
-+	READ32(len);
++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp);
++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp);
 +
-+	if (str) {
-+		str->len = len;
-+		str->data = (char *)p;
-+	}
++/* Mutex for data server state.  Needs to be separate from
++ * mds state mutex since a node can be both mds and ds */
++static DEFINE_MUTEX(ds_mutex);
++static struct thread_info *ds_mutex_owner;
 +
-+	p += XDR_QUADLEN(len);
-+	return p;
++static void
++ds_lock_state(void)
++{
++	mutex_lock(&ds_mutex);
++	ds_mutex_owner = current_thread_info();
 +}
 +
-+/*
-+ * struct pnfs_osd_targetid {
-+ * 	u32			oti_type;
-+ * 	struct nfs4_string	oti_scsi_device_id;
-+ * };
-+ */
-+u32 *__xdr_read_calc_targetid(
-+	u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace)
++static void
++ds_unlock_state(void)
 +{
-+	u32 oti_type;
-+
-+	READ32(oti_type);
-+	if (targetid)
-+		targetid->oti_type = oti_type;
-+
-+	switch (oti_type) {
-+	case OBJ_TARGET_SCSI_NAME:
-+	case OBJ_TARGET_SCSI_DEVICE_ID:
-+		p = __xdr_read_calc_u8_opaque(p,
-+			targetid ? &targetid->oti_scsi_device_id : NULL);
-+	}
++	BUG_ON(ds_mutex_owner != current_thread_info());
++	ds_mutex_owner = NULL;
++	mutex_unlock(&ds_mutex);
++}
 +
-+	return p;
++static int
++cmp_clid(const clientid_t *cl1, const clientid_t *cl2)
++{
++	return (cl1->cl_boot == cl2->cl_boot) &&
++	       (cl1->cl_id == cl2->cl_id);
 +}
 +
-+/*
-+ * struct pnfs_osd_net_addr {
-+ * 	struct nfs4_string	r_netid;
-+ * 	struct nfs4_string	r_addr;
-+ * };
-+ */
-+u32 *__xdr_read_calc_net_addr(
-+	u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace)
++void
++nfs4_pnfs_state_init(void)
 +{
++	int i;
 +
-+	p = __xdr_read_calc_nfs4_string(p,
-+			netaddr ? &netaddr->r_netid : NULL,
-+			freespace);
++	for (i = 0; i < CLIENT_HASH_SIZE; i++)
++		INIT_LIST_HEAD(&mds_clid_hashtbl[i]);
 +
-+	p = __xdr_read_calc_nfs4_string(p,
-+			netaddr ? &netaddr->r_addr : NULL,
-+			freespace);
++	for (i = 0; i < STATEID_HASH_SIZE; i++)
++		INIT_LIST_HEAD(&ds_stid_hashtbl[i]);
 +
-+	return p;
++	INIT_LIST_HEAD(&mds_id_tbl);
 +}
 +
-+/*
-+ * struct pnfs_osd_targetaddr {
-+ * 	u32				ota_available;
-+ * 	struct pnfs_osd_net_addr	ota_netaddr;
-+ * };
-+ */
-+u32 *__xdr_read_calc_targetaddr(
-+	u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace)
++static struct pnfs_mds_id *
++find_pnfs_mds_id(u32 mdsid)
 +{
-+	u32 ota_available;
-+
-+	READ32(ota_available);
-+	if (targetaddr)
-+		targetaddr->ota_available = ota_available;
++	struct pnfs_mds_id *local = NULL;
 +
-+	if (ota_available) {
-+		p = __xdr_read_calc_net_addr(p,
-+				targetaddr ? &targetaddr->ota_netaddr : NULL,
-+				freespace);
++	dprintk("pNFSD: %s\n", __func__);
++	list_for_each_entry(local, &mds_id_tbl, di_hash) {
++		if (local->di_mdsid == mdsid)
++			return local;
 +	}
-+
-+	return p;
++	return NULL;
 +}
 +
-+/*
-+ * struct pnfs_osd_deviceaddr {
-+ * 	struct pnfs_osd_targetid	oda_targetid;
-+ * 	struct pnfs_osd_targetaddr	oda_targetaddr;
-+ * 	u8				oda_lun[8];
-+ * 	struct nfs4_string		oda_systemid;
-+ * 	struct pnfs_osd_object_cred	oda_root_obj_cred;
-+ * 	struct nfs4_string		oda_osdname;
-+ * };
-+ */
-+u32 *__xdr_read_calc_deviceaddr(
-+	u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace)
++static struct pnfs_ds_clientid *
++find_pnfs_ds_clientid(const clientid_t *clid)
 +{
-+	p = __xdr_read_calc_targetid(p,
-+			deviceaddr ? &deviceaddr->oda_targetid : NULL,
-+			freespace);
-+
-+	p = __xdr_read_calc_targetaddr(p,
-+			deviceaddr ? &deviceaddr->oda_targetaddr : NULL,
-+			freespace);
-+
-+	if (deviceaddr)
-+		COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun));
-+	else
-+		p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun));
-+
-+	p = __xdr_read_calc_u8_opaque(p,
-+			deviceaddr ? &deviceaddr->oda_systemid : NULL);
-+
-+	if (deviceaddr) {
-+		p = pnfs_osd_xdr_decode_object_cred(p,
-+				&deviceaddr->oda_root_obj_cred, freespace);
-+	} else {
-+		*freespace += pnfs_osd_object_cred_incore_sz(p);
-+		p += pnfs_osd_object_cred_xdr_sz(p);
-+	}
++	struct pnfs_ds_clientid *local = NULL;
++	unsigned int hashval;
 +
-+	p = __xdr_read_calc_u8_opaque(p,
-+			deviceaddr ? &deviceaddr->oda_osdname : NULL);
++	dprintk("pNFSD: %s\n", __func__);
 +
-+	return p;
++	hashval = clientid_hashval(clid->cl_id);
++	list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) {
++		if (cmp_clid(&local->dc_mdsclid, clid))
++			return local;
++	}
++	return NULL;
 +}
 +
-+size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p)
++static struct pnfs_ds_stateid *
++find_pnfs_ds_stateid(stateid_t *stid)
 +{
-+	u8 *null_freespace = NULL;
-+	size_t sz;
++	struct pnfs_ds_stateid *local = NULL;
++	u32 st_id = stid->si_stateownerid;
++	u32 f_id = stid->si_fileid;
++	unsigned int hashval;
 +
-+	__xdr_read_calc_deviceaddr(p, NULL, &null_freespace);
-+	sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace;
++	dprintk("pNFSD: %s\n", __func__);
 +
-+	return sz;
++	hashval = stateid_hashval(st_id, f_id);
++	list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash)
++		if ((local->ds_stid.si_stateownerid == st_id) &&
++				(local->ds_stid.si_fileid == f_id) &&
++				(local->ds_stid.si_boot == stid->si_boot)) {
++			stateid_t *sid = &local->ds_stid;
++			dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n",
++				__func__, local, local->ds_flags,
++				STATEID_VAL(sid));
++			return local;
++		}
++	return NULL;
 +}
 +
-+void pnfs_osd_xdr_decode_deviceaddr(
-+	struct pnfs_osd_deviceaddr *deviceaddr, u32 *p)
++static void
++release_ds_mdsid(struct kref *kref)
 +{
-+	u8 *freespace = (u8 *)(deviceaddr + 1);
++	struct pnfs_mds_id *mdp =
++		container_of(kref, struct pnfs_mds_id, di_ref);
++	dprintk("pNFSD: %s\n", __func__);
 +
-+	__xdr_read_calc_deviceaddr(p, deviceaddr, &freespace);
++	list_del(&mdp->di_hash);
++	list_del(&mdp->di_mdsclid);
++	kfree(mdp);
 +}
 +
-+/*
-+ * struct pnfs_osd_layoutupdate {
-+ * 	u32	dsu_valid;
-+ * 	s64	dsu_delta;
-+ * 	u32	olu_ioerr_flag;
-+ * };
-+ */
-+int
-+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
-+				 struct pnfs_osd_layoutupdate *lou)
++static void
++release_ds_clientid(struct kref *kref)
 +{
-+	__be32 *p = xdr_reserve_space(xdr, 16);
++	struct pnfs_ds_clientid *dcp =
++		container_of(kref, struct pnfs_ds_clientid, dc_ref);
++	struct pnfs_mds_id *mdp;
++	dprintk("pNFSD: %s\n", __func__);
 +
-+	if (!p)
-+		return -E2BIG;
++	mdp = find_pnfs_mds_id(dcp->dc_mdsid);
++	if (mdp)
++		put_ds_mdsid(mdp);
 +
-+	*p++ = cpu_to_be32(lou->dsu_valid);
-+	if (lou->dsu_valid)
-+		p = xdr_encode_hyper(p, lou->dsu_delta);
-+	*p++ = cpu_to_be32(lou->olu_ioerr_flag);
-+	return 0;
++	list_del(&dcp->dc_hash);
++	list_del(&dcp->dc_stateid);
++	list_del(&dcp->dc_permdsid);
++	kfree(dcp);
 +}
 +
-+/*
-+ * struct pnfs_osd_objid {
-+ * 	struct pnfs_deviceid	oid_device_id;
-+ * 	u64			oid_partition_id;
-+ * 	u64			oid_object_id;
-+ */
-+static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr,
-+					    struct pnfs_osd_objid *object_id)
++static void
++release_ds_stateid(struct kref *kref)
 +{
-+	__be32 *p;
-+
-+	p = xdr_reserve_space(xdr, 32);
-+	if (!p)
-+		return -E2BIG;
++	struct pnfs_ds_stateid *dsp =
++		container_of(kref, struct pnfs_ds_stateid, ds_ref);
++	struct pnfs_ds_clientid *dcp;
++	dprintk("pNFS %s: dsp %p\n", __func__, dsp);
 +
-+	p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
-+				    sizeof(object_id->oid_device_id.data));
-+	p = xdr_encode_hyper(p, object_id->oid_partition_id);
-+	p = xdr_encode_hyper(p, object_id->oid_object_id);
++	dcp = find_pnfs_ds_clientid(&dsp->ds_mdsclid);
++	if (dcp)
++		put_ds_clientid(dcp);
 +
-+	return 0;
++	list_del(&dsp->ds_hash);
++	list_del(&dsp->ds_perclid);
++	kfree(dsp);
 +}
 +
-+/*
-+ * struct pnfs_osd_ioerr {
-+ * 	struct pnfs_osd_objid	oer_component;
-+ * 	u64			oer_comp_offset;
-+ * 	u64			oer_comp_length;
-+ * 	u32			oer_iswrite;
-+ * 	u32			oer_errno;
-+ * };
-+ */
-+int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
-+			      struct pnfs_osd_ioerr *ioerr)
++static inline void
++put_ds_clientid(struct pnfs_ds_clientid *dcp)
 +{
-+	__be32 *p;
-+	int ret;
-+
-+	ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component);
-+	if (ret)
-+		return ret;
-+
-+	p = xdr_reserve_space(xdr, 24);
-+	if (!p)
-+		return -E2BIG;
-+
-+	p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
-+	p = xdr_encode_hyper(p, ioerr->oer_comp_length);
-+	*p++ = cpu_to_be32(ioerr->oer_iswrite);
-+	*p   = cpu_to_be32(ioerr->oer_errno);
-+
-+	return 0;
++	dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
++		atomic_read(&dcp->dc_ref.refcount));
++	kref_put(&dcp->dc_ref, release_ds_clientid);
 +}
-diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
-index 9194902..96e375e 100644
---- a/fs/nfs/pagelist.c
-+++ b/fs/nfs/pagelist.c
-@@ -20,6 +20,7 @@
- #include <linux/nfs_mount.h>
- 
- #include "internal.h"
-+#include "pnfs.h"
- 
- static struct kmem_cache *nfs_page_cachep;
- 
-@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p)
- struct nfs_page *
- nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
- 		   struct page *page,
--		   unsigned int offset, unsigned int count)
-+		   unsigned int offset, unsigned int count,
-+		   struct pnfs_layout_segment *lseg)
- {
- 	struct nfs_page		*req;
- 
-@@ -81,6 +83,9 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
- 	req->wb_context = get_nfs_open_context(ctx);
- 	req->wb_lock_context = nfs_get_lock_context(ctx);
- 	kref_init(&req->wb_kref);
-+	req->wb_lseg    = lseg;
-+	if (lseg)
-+		get_lseg(lseg);
- 	return req;
- }
- 
-@@ -156,9 +161,12 @@ void nfs_clear_request(struct nfs_page *req)
- 		put_nfs_open_context(ctx);
- 		req->wb_context = NULL;
- 	}
-+	if (req->wb_lseg != NULL) {
-+		put_lseg(req->wb_lseg);
-+		req->wb_lseg = NULL;
-+	}
- }
- 
--
- /**
-  * nfs_release_request - Release the count on an NFS read/write request
-  * @req: request to release
-@@ -237,7 +245,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
-  * Return 'true' if this is the case, else return 'false'.
-  */
- static int nfs_can_coalesce_requests(struct nfs_page *prev,
--				     struct nfs_page *req)
-+				     struct nfs_page *req,
-+				     struct nfs_pageio_descriptor *pgio)
- {
- 	if (req->wb_context->cred != prev->wb_context->cred)
- 		return 0;
-@@ -251,6 +260,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
- 		return 0;
- 	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
- 		return 0;
-+	if (req->wb_lseg != prev->wb_lseg)
-+		return 0;
-+#ifdef CONFIG_NFS_V4_1
-+	if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
-+		return 0;
-+#endif /* CONFIG_NFS_V4_1 */
- 	return 1;
- }
- 
-@@ -283,7 +298,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
- 		if (newlen > desc->pg_bsize)
- 			return 0;
- 		prev = nfs_list_entry(desc->pg_list.prev);
--		if (!nfs_can_coalesce_requests(prev, req))
-+		if (!nfs_can_coalesce_requests(prev, req, desc))
- 			return 0;
- 	} else
- 		desc->pg_base = req->wb_pgbase;
-@@ -372,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
-  * @idx_start: lower bound of page->index to scan
-  * @npages: idx_start + npages sets the upper bound to scan.
-  * @tag: tag to scan for
-+ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver
-  *
-  * Moves elements from one of the inode request lists.
-  * If the number of requests is set to 0, the entire address_space
-@@ -381,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
-  */
- int nfs_scan_list(struct nfs_inode *nfsi,
- 		struct list_head *dst, pgoff_t idx_start,
--		unsigned int npages, int tag)
-+		  unsigned int npages, int tag, int *use_pnfs)
- {
- 	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
- 	struct nfs_page *req;
-@@ -412,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi,
- 				radix_tree_tag_clear(&nfsi->nfs_page_tree,
- 						req->wb_index, tag);
- 				nfs_list_add_request(req, dst);
-+				if (req->wb_lseg)
-+					*use_pnfs = 1;
- 				res++;
- 				if (res == INT_MAX)
- 					goto out;
-diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
-new file mode 100644
-index 0000000..dfdf661
---- /dev/null
-+++ b/fs/nfs/pnfs.c
-@@ -0,0 +1,1723 @@
-+/*
-+ *  pNFS functions to call and manage layout drivers.
-+ *
-+ *  Copyright (c) 2002 [year of first publication]
-+ *  The Regents of the University of Michigan
-+ *  All Rights Reserved
-+ *
-+ *  Dean Hildebrand <dhildebz at umich.edu>
-+ *
-+ *  Permission is granted to use, copy, create derivative works, and
-+ *  redistribute this software and such derivative works for any purpose,
-+ *  so long as the name of the University of Michigan is not used in
-+ *  any advertising or publicity pertaining to the use or distribution
-+ *  of this software without specific, written prior authorization. If
-+ *  the above copyright notice or any other identification of the
-+ *  University of Michigan is included in any copy of any portion of
-+ *  this software, then the disclaimer below must also be included.
-+ *
-+ *  This software is provided as is, without representation or warranty
-+ *  of any kind either express or implied, including without limitation
-+ *  the implied warranties of merchantability, fitness for a particular
-+ *  purpose, or noninfringement.  The Regents of the University of
-+ *  Michigan shall not be liable for any damages, including special,
-+ *  indirect, incidental, or consequential damages, with respect to any
-+ *  claim arising out of or in connection with the use of the software,
-+ *  even if it has been or is hereafter advised of the possibility of
-+ *  such damages.
-+ */
-+
-+#include <linux/nfs_fs.h>
-+#include "internal.h"
-+#include "pnfs.h"
-+#include "iostat.h"
-+
-+#define NFSDBG_FACILITY		NFSDBG_PNFS
-+
-+/* Locking:
-+ *
-+ * pnfs_spinlock:
-+ *      protects pnfs_modules_tbl.
-+ */
-+static DEFINE_SPINLOCK(pnfs_spinlock);
-+
-+/*
-+ * pnfs_modules_tbl holds all pnfs modules
-+ */
-+static LIST_HEAD(pnfs_modules_tbl);
 +
-+/* Return the registered pnfs layout driver module matching given id */
-+static struct pnfs_layoutdriver_type *
-+find_pnfs_driver_locked(u32 id)
++static inline void
++get_ds_clientid(struct pnfs_ds_clientid *dcp)
 +{
-+	struct pnfs_layoutdriver_type *local;
++	dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
++		atomic_read(&dcp->dc_ref.refcount));
++	kref_get(&dcp->dc_ref);
++}
 +
-+	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
-+		if (local->id == id)
-+			goto out;
-+	local = NULL;
-+out:
-+	dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
-+	return local;
++static inline void
++put_ds_mdsid(struct pnfs_mds_id *mdp)
++{
++	dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
++		atomic_read(&mdp->di_ref.refcount));
++	kref_put(&mdp->di_ref, release_ds_mdsid);
 +}
 +
-+static struct pnfs_layoutdriver_type *
-+find_pnfs_driver(u32 id)
++static inline void
++get_ds_mdsid(struct pnfs_mds_id *mdp)
 +{
-+	struct pnfs_layoutdriver_type *local;
-+
-+	spin_lock(&pnfs_spinlock);
-+	local = find_pnfs_driver_locked(id);
-+	spin_unlock(&pnfs_spinlock);
-+	return local;
++	dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
++		atomic_read(&mdp->di_ref.refcount));
++	kref_get(&mdp->di_ref);
 +}
 +
-+/* Set cred to indicate we require a layoutcommit
-+ * If we don't even have a layout, we don't need to commit it.
-+ */
-+void
-+pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx)
++static inline void
++put_ds_stateid(struct pnfs_ds_stateid *dsp)
 +{
-+	dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx);
-+	spin_lock(&nfsi->vfs_inode.i_lock);
-+	if (has_layout(nfsi) &&
-+	    !test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state)) {
-+		nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred);
-+		__set_bit(NFS_LAYOUT_NEED_LCOMMIT,
-+			  &nfsi->layout->state);
-+		nfsi->change_attr++;
-+		spin_unlock(&nfsi->vfs_inode.i_lock);
-+		dprintk("%s: Set layoutcommit\n", __func__);
-+		return;
-+	}
-+	spin_unlock(&nfsi->vfs_inode.i_lock);
++	dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
++		atomic_read(&dsp->ds_ref.refcount));
++	kref_put(&dsp->ds_ref, release_ds_stateid);
 +}
 +
-+/* Update last_write_offset for layoutcommit.
-+ * TODO: We should only use commited extents, but the current nfs
-+ * implementation does not calculate the written range in nfs_commit_done.
-+ * We therefore update this field in writeback_done.
-+ */
-+void
-+pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent)
++static inline void
++get_ds_stateid(struct pnfs_ds_stateid *dsp)
 +{
-+	loff_t end_pos;
-+
-+	spin_lock(&nfsi->vfs_inode.i_lock);
-+	if (offset < nfsi->layout->write_begin_pos)
-+		nfsi->layout->write_begin_pos = offset;
-+	end_pos = offset + extent - 1; /* I'm being inclusive */
-+	if (end_pos > nfsi->layout->write_end_pos)
-+		nfsi->layout->write_end_pos = end_pos;
-+	dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n",
-+		__func__,
-+		(unsigned long) extent,
-+		(unsigned long) offset ,
-+		(unsigned long) nfsi->layout->write_begin_pos,
-+		(unsigned long) nfsi->layout->write_end_pos);
-+	spin_unlock(&nfsi->vfs_inode.i_lock);
++	dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
++		atomic_read(&dsp->ds_ref.refcount));
++	kref_get(&dsp->ds_ref);
 +}
 +
 +void
-+unset_pnfs_layoutdriver(struct nfs_server *nfss)
++nfs4_pnfs_state_shutdown(void)
 +{
-+	if (nfss->pnfs_curr_ld) {
-+		nfss->pnfs_curr_ld->uninitialize_mountpoint(nfss);
-+		module_put(nfss->pnfs_curr_ld->owner);
++	struct pnfs_ds_stateid *dsp;
++	int i;
++
++	dprintk("pNFSD %s: -->\n", __func__);
++
++	ds_lock_state();
++	for (i = 0; i < STATEID_HASH_SIZE; i++) {
++		while (!list_empty(&ds_stid_hashtbl[i])) {
++			dsp = list_entry(ds_stid_hashtbl[i].next,
++					 struct pnfs_ds_stateid, ds_hash);
++			put_ds_stateid(dsp);
++		}
 +	}
-+	nfss->pnfs_curr_ld = NULL;
++	ds_unlock_state();
 +}
 +
-+/*
-+ * Try to set the server's pnfs module to the pnfs layout type specified by id.
-+ * Currently only one pNFS layout driver per filesystem is supported.
-+ *
-+ * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
-+ */
-+void
-+set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
-+		      u32 id)
++static struct pnfs_mds_id *
++alloc_init_mds_id(struct pnfs_get_state *gsp)
 +{
-+	struct pnfs_layoutdriver_type *ld_type = NULL;
-+
-+	if (id == 0)
-+		goto out_no_driver;
-+	if ((server->nfs_client->rpc_ops->version != 4) ||
-+	    (server->nfs_client->cl_minorversion != 1))
-+		goto out_no_driver;
-+	if (!(server->nfs_client->cl_exchange_flags &
-+		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
-+		printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
-+		       id, server->nfs_client->cl_exchange_flags);
-+		goto out_no_driver;
-+	}
-+	ld_type = find_pnfs_driver(id);
-+	if (!ld_type) {
-+		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
-+		ld_type = find_pnfs_driver(id);
-+		if (!ld_type) {
-+			dprintk("%s: No pNFS module found for %u.\n",
-+				__func__, id);
-+			goto out_no_driver;
-+		}
-+	}
-+	if (!try_module_get(ld_type->owner)) {
-+		dprintk("%s: Could not grab reference on module\n", __func__);
-+		goto out_no_driver;
-+	}
-+	server->pnfs_curr_ld = ld_type;
-+	if (ld_type->initialize_mountpoint(server, mntfh)) {
-+		printk(KERN_ERR
-+		       "%s: Error initializing mount point for layout driver %u.\n",
-+		       __func__, id);
-+		module_put(ld_type->owner);
-+		goto out_no_driver;
-+	}
-+	dprintk("%s: pNFS module for %u set\n", __func__, id);
-+	return;
++	struct pnfs_mds_id *mdp;
++
++	dprintk("pNFSD: %s\n", __func__);
 +
-+out_no_driver:
-+	dprintk("%s: Using NFSv4 I/O\n", __func__);
-+	server->pnfs_curr_ld = NULL;
++	mdp = kmalloc(sizeof(*mdp), GFP_KERNEL);
++	if (!mdp)
++		return NULL;
++	INIT_LIST_HEAD(&mdp->di_hash);
++	INIT_LIST_HEAD(&mdp->di_mdsclid);
++	list_add(&mdp->di_hash, &mds_id_tbl);
++	mdp->di_mdsid = gsp->dsid;
++	mdp->di_mdsboot = 0;
++	kref_init(&mdp->di_ref);
++	return mdp;
 +}
 +
-+int
-+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
++static struct pnfs_ds_clientid *
++alloc_init_ds_clientid(struct pnfs_get_state *gsp)
 +{
-+	int status = -EINVAL;
-+	struct pnfs_layoutdriver_type *tmp;
-+
-+	if (ld_type->id == 0) {
-+		printk(KERN_ERR "%s id 0 is reserved\n", __func__);
-+		return status;
-+	}
-+	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
-+		printk(KERN_ERR "%s Layout driver must provide "
-+		       "alloc_lseg and free_lseg.\n", __func__);
-+		return status;
-+	}
++	struct pnfs_mds_id *mdp;
++	struct pnfs_ds_clientid *dcp;
++	clientid_t *clid = (clientid_t *)&gsp->clid;
++	unsigned int hashval = clientid_hashval(clid->cl_id);
 +
-+	if (!ld_type->read_pagelist || !ld_type->write_pagelist ||
-+	    !ld_type->commit) {
-+		printk(KERN_ERR "%s Layout driver must provide "
-+		       "read_pagelist, write_pagelist, and commit.\n",
-+		       __func__);
-+		return status;
-+	}
++	dprintk("pNFSD: %s\n", __func__);
 +
-+	spin_lock(&pnfs_spinlock);
-+	tmp = find_pnfs_driver_locked(ld_type->id);
-+	if (!tmp) {
-+		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
-+		status = 0;
-+		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
-+			ld_type->name);
++	mdp = find_pnfs_mds_id(gsp->dsid);
++	if (!mdp) {
++		mdp = alloc_init_mds_id(gsp);
++		if (!mdp)
++			return NULL;
 +	} else {
-+		printk(KERN_ERR "%s Module with id %d already loaded!\n",
-+			__func__, ld_type->id);
++		get_ds_mdsid(mdp);
 +	}
-+	spin_unlock(&pnfs_spinlock);
 +
-+	return status;
++	dcp = kmalloc(sizeof(*dcp), GFP_KERNEL);
++	if (!dcp)
++		return NULL;
++
++	INIT_LIST_HEAD(&dcp->dc_hash);
++	INIT_LIST_HEAD(&dcp->dc_stateid);
++	INIT_LIST_HEAD(&dcp->dc_permdsid);
++	list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]);
++	list_add(&dcp->dc_permdsid, &mdp->di_mdsclid);
++	dcp->dc_mdsclid = *clid;
++	kref_init(&dcp->dc_ref);
++	dcp->dc_mdsid = gsp->dsid;
++	return dcp;
 +}
-+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
 +
-+void
-+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
++static struct pnfs_ds_stateid *
++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp)
 +{
-+	dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
-+	spin_lock(&pnfs_spinlock);
-+	list_del(&ld_type->pnfs_tblid);
-+	spin_unlock(&pnfs_spinlock);
-+}
-+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
++	struct pnfs_ds_stateid *dsp;
++	u32 st_id = stidp->si_stateownerid;
++	u32 f_id  = stidp->si_fileid;
++	unsigned int hashval;
 +
-+/*
-+ * pNFS client layout cache
-+ */
++	dprintk("pNFSD: %s\n", __func__);
 +
-+static struct pnfs_layout_hdr *
-+pnfs_alloc_layout_hdr(struct inode *ino)
-+{
-+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
-+	return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino) :
-+		kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
-+}
++	dsp = kmalloc(sizeof(*dsp), GFP_KERNEL);
++	if (!dsp)
++		return dsp;
 +
-+static void
-+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
-+{
-+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->inode)->pnfs_curr_ld;
-+	return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
-+}
++	INIT_LIST_HEAD(&dsp->ds_hash);
++	INIT_LIST_HEAD(&dsp->ds_perclid);
++	memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t));
++	fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle);
++	dsp->ds_access = 0;
++	dsp->ds_status = 0;
++	dsp->ds_flags = 0L;
++	kref_init(&dsp->ds_ref);
++	set_bit(DS_STATEID_NEW, &dsp->ds_flags);
++	clear_bit(DS_STATEID_VALID, &dsp->ds_flags);
++	clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++	init_waitqueue_head(&dsp->ds_waitq);
 +
-+static void
-+get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
-+{
-+	assert_spin_locked(&lo->inode->i_lock);
-+	lo->refcount++;
++	hashval = stateid_hashval(st_id, f_id);
++	list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]);
++	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return dsp;
 +}
 +
-+static void
-+put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
++static int
++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh,
++		  struct pnfs_get_state *gsp)
 +{
-+	assert_spin_locked(&lo->inode->i_lock);
-+	BUG_ON(lo->refcount == 0);
++	struct pnfs_ds_clientid *dcp;
++	int new = 0;
 +
-+	lo->refcount--;
-+	if (!lo->refcount) {
-+		dprintk("%s: freeing layout cache %p\n", __func__, lo);
-+		BUG_ON(!list_empty(&lo->layouts));
-+		NFS_I(lo->inode)->layout = NULL;
-+		pnfs_free_layout_hdr(lo);
++	dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++
++	dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid);
++	if (!dcp) {
++		dcp = alloc_init_ds_clientid(gsp);
++		if (!dcp)
++			return 1;
++		new = 1;
++	}
++	if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) {
++		list_add(&dsp->ds_perclid, &dcp->dc_stateid);
++		if (!new)
++			get_ds_clientid(dcp);
 +	}
++
++	memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t));
++	dsp->ds_access = gsp->access;
++	dsp->ds_status = 0;
++	dsp->ds_verifier[0] = gsp->verifier[0];
++	dsp->ds_verifier[1] = gsp->verifier[1];
++	memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t));
++	set_bit(DS_STATEID_VALID, &dsp->ds_flags);
++	clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++	clear_bit(DS_STATEID_NEW, &dsp->ds_flags);
++	return 0;
 +}
 +
-+void
-+put_layout_hdr(struct inode *inode)
++int
++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs)
 +{
-+	spin_lock(&inode->i_lock);
-+	put_layout_hdr_locked(NFS_I(inode)->layout);
-+	spin_unlock(&inode->i_lock);
++	stateid_t *stid = (stateid_t *)&gs->stid;
++	struct pnfs_ds_stateid *dsp;
 +
-+}
++	dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__,
++		STATEID_VAL(stid));
 +
-+static void
-+init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
-+{
-+	INIT_LIST_HEAD(&lseg->fi_list);
-+	kref_init(&lseg->kref);
-+	lseg->valid = true;
-+	lseg->layout = lo;
++	ds_lock_state();
++	dsp = find_pnfs_ds_stateid(stid);
++	if (dsp)
++		put_ds_stateid(dsp);
++	ds_unlock_state();
++
++	dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++
++	if (dsp)
++		return 0;
++	return -ENOENT;
 +}
 +
-+/* Called without i_lock held */
-+static void
-+destroy_lseg(struct kref *kref)
++/* Retrieves and validates stateid.
++ * If stateid exists and its fields match, return it.
++ * If stateid exists but either the generation or
++ * ownerids don't match, check with mds to see if it is valid.
++ * If the stateid doesn't exist, the first thread creates an
++ * invalid *marker* stateid, then checks to see if the
++ * stateid exists on the mds.  If so, it validates the *marker*
++ * stateid and updates its fields.  Subsequent threads that
++ * find the *marker* stateid wait until it is valid or an error
++ * occurs.
++ * Called with ds_state_lock held.
++ */
++static struct pnfs_ds_stateid *
++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
 +{
-+	struct pnfs_layout_segment *lseg =
-+		container_of(kref, struct pnfs_layout_segment, kref);
-+	struct pnfs_layout_hdr *local = lseg->layout;
++	struct inode *ino = cfh->fh_dentry->d_inode;
++	struct super_block *sb;
++	struct pnfs_ds_stateid *dsp = NULL;
++	struct pnfs_get_state gs = {
++		.access = 0,
++	};
++	int status = 0, waiter = 0;
 +
-+	dprintk("--> %s\n", __func__);
-+	NFS_SERVER(local->inode)->pnfs_curr_ld->free_lseg(lseg);
-+	/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
-+	put_layout_hdr(local->inode);
-+}
++	dprintk("pNFSD: %s -->\n", __func__);
 +
-+void
-+put_lseg(struct pnfs_layout_segment *lseg)
-+{
-+	bool do_wake_up;
-+	struct nfs_inode *nfsi;
++	dsp = find_pnfs_ds_stateid(stidp);
++	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
++	    (stidp->si_generation == dsp->ds_stid.si_generation))
++		goto out_noput;
 +
-+	if (!lseg)
-+		return;
++	sb = ino->i_sb;
++	if (!sb || !sb->s_pnfs_op->get_state)
++		goto out_noput;
 +
-+	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
-+		atomic_read(&lseg->kref.refcount), lseg->valid);
-+	do_wake_up = !lseg->valid;
-+	nfsi = NFS_I(lseg->layout->inode);
-+	kref_put(&lseg->kref, destroy_lseg);
-+	if (do_wake_up)
-+		rpc_wake_up(&nfsi->lo_rpcwaitq);
-+}
-+EXPORT_SYMBOL_GPL(put_lseg);
++	/* Uninitialize current state if it exists but doesn't match.
++	 * If it is already invalid, another thread is checking state */
++	if (dsp) {
++		if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags))
++			waiter = 1;
++	} else {
++		dsp = alloc_init_ds_stateid(cfh, stidp);
++		if (!dsp)
++			goto out_noput;
++	}
 +
-+void get_lseg(struct pnfs_layout_segment *lseg)
-+{
-+	kref_get(&lseg->kref);
-+}
-+EXPORT_SYMBOL_GPL(get_lseg);
++	dprintk("pNFSD: %s Starting loop\n", __func__);
++	get_ds_stateid(dsp);
++	while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++		ds_unlock_state();
 +
-+static inline u64
-+end_offset(u64 start, u64 len)
-+{
-+	u64 end;
++		/* Another thread is checking the state */
++		if (waiter) {
++			dprintk("pNFSD: %s waiting\n", __func__);
++			wait_event_interruptible_timeout(dsp->ds_waitq,
++				(test_bit(DS_STATEID_VALID, &dsp->ds_flags) ||
++				 test_bit(DS_STATEID_ERROR, &dsp->ds_flags)),
++				 msecs_to_jiffies(1024));
++			dprintk("pNFSD: %s awake\n", __func__);
++			ds_lock_state();
++			if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++				goto out;
 +
-+	end = start + len;
-+	return end >= start ? end: NFS4_MAX_UINT64;
-+}
++			continue;
++		}
 +
-+/* last octet in a range */
-+static inline u64
-+last_byte_offset(u64 start, u64 len)
-+{
-+	u64 end;
++		/* Validate stateid on mds */
++		dprintk("pNFSD: %s Checking state on MDS\n", __func__);
++		memcpy(&gs.stid, stidp, sizeof(stateid_t));
++		status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs);
++		dprintk("pNFSD: %s from MDS status %d\n", __func__, status);
++		ds_lock_state();
++		/* if !status and stateid is valid, update id and mark valid */
++		if (status || update_ds_stateid(dsp, cfh, &gs)) {
++			set_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++			/* remove invalid stateid from list */
++			put_ds_stateid(dsp);
++			wake_up(&dsp->ds_waitq);
++			goto out;
++		}
 +
-+	BUG_ON(!len);
-+	end = start + len;
-+	return end > start ? end - 1: NFS4_MAX_UINT64;
++		wake_up(&dsp->ds_waitq);
++	}
++out:
++	if (dsp)
++		put_ds_stateid(dsp);
++out_noput:
++	if (dsp)
++		dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n",
++			__func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid));
++	/* If error, return null */
++	if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++		dsp = NULL;
++	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return dsp;
 +}
 +
-+/*
-+ * is l2 fully contained in l1?
-+ *   start1                             end1
-+ *   [----------------------------------)
-+ *           start2           end2
-+ *           [----------------)
-+ */
-+static inline int
-+lo_seg_contained(struct pnfs_layout_range *l1,
-+		 struct pnfs_layout_range *l2)
++int
++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid)
 +{
-+	u64 start1 = l1->offset;
-+	u64 end1 = end_offset(start1, l1->length);
-+	u64 start2 = l2->offset;
-+	u64 end2 = end_offset(start2, l2->length);
++	struct pnfs_ds_stateid *dsp;
++	int status = 0;
 +
-+	return (start1 <= start2) && (end1 >= end2);
-+}
++	dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__,
++		STATEID_VAL(stateid));
 +
-+/*
-+ * is l1 and l2 intersecting?
-+ *   start1                             end1
-+ *   [----------------------------------)
-+ *                              start2           end2
-+ *                              [----------------)
-+ */
-+static inline int
-+lo_seg_intersecting(struct pnfs_layout_range *l1,
-+		    struct pnfs_layout_range *l2)
-+{
-+	u64 start1 = l1->offset;
-+	u64 end1 = end_offset(start1, l1->length);
-+	u64 start2 = l2->offset;
-+	u64 end2 = end_offset(start2, l2->length);
++	/* Must release state lock while verifying stateid on mds */
++	nfs4_unlock_state();
++	ds_lock_state();
++	dsp = nfsv4_ds_get_state(cfh, stateid);
++	if (dsp) {
++		get_ds_stateid(dsp);
++		dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__,
++			STATEID_VAL(&dsp->ds_stid));
 +
-+	return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
-+	       (end2 == NFS4_MAX_UINT64 || end2 > start1);
-+}
++		dprintk("NFSD: %s: dsp %p fh_size %u:%u "
++			"fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] "
++			"gen %x:%x\n",
++			__func__, dsp,
++			cfh->fh_handle.fh_size, dsp->ds_fh.fh_size,
++			((unsigned *)&cfh->fh_handle.fh_base)[0],
++			((unsigned *)&cfh->fh_handle.fh_base)[1],
++			((unsigned *)&cfh->fh_handle.fh_base)[2],
++			((unsigned *)&cfh->fh_handle.fh_base)[3],
++			((unsigned *)&dsp->ds_fh.fh_base)[0],
++			((unsigned *)&dsp->ds_fh.fh_base)[1],
++			((unsigned *)&dsp->ds_fh.fh_base)[2],
++			((unsigned *)&dsp->ds_fh.fh_base)[3],
++			stateid->si_generation, dsp->ds_stid.si_generation);
++	}
 +
-+/*
-+ * iomode matching rules:
-+ * range	lseg	match
-+ * -----	-----	-----
-+ * ANY		READ	true
-+ * ANY		RW	true
-+ * RW		READ	false
-+ * RW		RW	true
-+ * READ		READ	true
-+ * READ		RW	false
-+ */
-+static int
-+should_free_lseg(struct pnfs_layout_segment *lseg,
-+		   struct pnfs_layout_range *range)
-+{
-+	return (range->iomode == IOMODE_ANY ||
-+		lseg->range.iomode == range->iomode) &&
-+	       lo_seg_intersecting(&lseg->range, range);
-+}
++	if (!dsp ||
++	    (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) ||
++	    (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base,
++		    dsp->ds_fh.fh_size) != 0) ||
++	    (stateid->si_generation > dsp->ds_stid.si_generation))
++		status = nfserr_bad_stateid;
++	else if (stateid->si_generation < dsp->ds_stid.si_generation)
++		status = nfserr_old_stateid;
 +
-+static bool
-+_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg)
-+{
-+	return atomic_read(&lseg->kref.refcount) == 1;
++	if (dsp)
++		put_ds_stateid(dsp);
++	ds_unlock_state();
++	nfs4_lock_state();
++	dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status));
++	return status;
 +}
 +
-+static void
-+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
-+		     struct pnfs_layout_range *range)
++void
++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p)
 +{
-+	struct pnfs_layout_segment *lseg, *next;
++	struct pnfs_ds_stateid *dsp = NULL;
 +
-+	dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
-+		__func__, lo, range->offset, range->length, range->iomode);
++	dprintk("pNFSD: %s --> stid %p\n", __func__, stateid);
 +
-+	assert_spin_locked(&lo->inode->i_lock);
-+	list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
-+		if (!should_free_lseg(lseg, range) ||
-+		    !_pnfs_can_return_lseg(lseg))
-+			continue;
-+		dprintk("%s: freeing lseg %p iomode %d "
-+			"offset %llu length %llu\n", __func__,
-+			lseg, lseg->range.iomode, lseg->range.offset,
-+			lseg->range.length);
-+		list_move(&lseg->fi_list, tmp_list);
++	ds_lock_state();
++	if (stateid != NULL) {
++		dsp = find_pnfs_ds_stateid(stateid);
++		if (dsp)
++			get_ds_stateid(dsp);
 +	}
-+	if (list_empty(&lo->segs)) {
-+		struct nfs_client *clp;
 +
-+		clp = NFS_SERVER(lo->inode)->nfs_client;
-+		spin_lock(&clp->cl_lock);
-+		/* List does not take a reference, so no need for put here */
-+		list_del_init(&lo->layouts);
-+		spin_unlock(&clp->cl_lock);
-+		pnfs_invalidate_layout_stateid(lo);
++	/* XXX: Should we fetch the stateid or wait if some other
++	 * thread is currently retrieving the stateid ? */
++	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++		*p++ = dsp->ds_verifier[0];
++		*p++ = dsp->ds_verifier[1];
++		put_ds_stateid(dsp);
++	} else {
++		/* must be on MDS */
++		ds_unlock_state();
++		sb->s_pnfs_op->get_verifier(sb, p);
++		ds_lock_state();
++		p += 2;
 +	}
-+
-+	dprintk("%s:Return\n", __func__);
++	ds_unlock_state();
++	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++	return;
 +}
 +
++#endif /* CONFIG_PNFSD */
+diff -up linux-2.6.37.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4proc.c
+--- linux-2.6.37.noarch/fs/nfsd/nfs4proc.c.orig	2011-01-28 09:37:32.559979357 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfs4proc.c	2011-01-28 09:43:53.357769620 -0500
+@@ -34,10 +34,14 @@
+  */
+ #include <linux/file.h>
+ #include <linux/slab.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
++#include <linux/nfsd4_spnfs.h>
++#include <linux/nfsd4_block.h>
+ 
+ #include "cache.h"
+ #include "xdr4.h"
+ #include "vfs.h"
++#include "pnfsd.h"
+ 
+ #define NFSDDBG_FACILITY		NFSDDBG_PROC
+ 
+@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc
+ 	 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
+ 	 */
+ 	status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
++#if defined(CONFIG_SPNFS)
++	if (!status && spnfs_enabled()) {
++		struct inode *inode = cstate->current_fh.fh_dentry->d_inode;
++
++		status = spnfs_open(inode, open);
++		if (status) {
++			dprintk(
++			     "nfsd: pNFS could not be enabled for inode: %lu\n",
++			     inode->i_ino);
++			/*
++			 * XXX When there's a failure then need to indicate to
++			 * future ops that no pNFS is available.  Should I save
++			 * the status in the inode?  It's kind of a big hammer.
++			 * But there may be no stripes available?
++			 */
++		}
++	}
++#endif /* CONFIG_SPNFS */
+ out:
+ 	if (open->op_stateowner) {
+ 		nfs4_get_stateowner(open->op_stateowner);
+@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str
+ 			   &access->ac_supported);
+ }
+ 
 +static void
-+pnfs_free_lseg_list(struct list_head *tmp_list)
++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf)
 +{
-+	struct pnfs_layout_segment *lseg;
++	u32 *p = (u32 *)verf->data;
 +
-+	while (!list_empty(tmp_list)) {
-+		lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
-+				fi_list);
-+		dprintk("%s calling put_lseg on %p\n", __func__, lseg);
-+		list_del(&lseg->fi_list);
-+		put_lseg(lseg);
++#if defined(CONFIG_PNFSD)
++	if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) {
++		nfs4_ds_get_verifier(NULL, sb, p);
++		return;
 +	}
-+}
-+
-+void
-+pnfs_destroy_layout(struct nfs_inode *nfsi)
-+{
-+	struct pnfs_layout_hdr *lo;
-+	struct pnfs_layout_range range = {
-+		.iomode = IOMODE_ANY,
-+		.offset = 0,
-+		.length = NFS4_MAX_UINT64,
-+	};
-+	LIST_HEAD(tmp_list);
-+
-+	spin_lock(&nfsi->vfs_inode.i_lock);
-+	lo = nfsi->layout;
-+	if (lo) {
-+		pnfs_clear_lseg_list(lo, &tmp_list,  &range);
-+		WARN_ON(!list_empty(&nfsi->layout->segs));
-+		WARN_ON(!list_empty(&nfsi->layout->layouts));
-+		WARN_ON(nfsi->layout->refcount != 1);
++#endif /* CONFIG_PNFSD */
 +
-+		/* Matched by refcount set to 1 in alloc_init_layout_hdr */
-+		put_layout_hdr_locked(lo);
-+	}
-+	spin_unlock(&nfsi->vfs_inode.i_lock);
-+	pnfs_free_lseg_list(&tmp_list);
++	*p++ = nfssvc_boot.tv_sec;
++	*p++ = nfssvc_boot.tv_usec;
 +}
 +
-+/*
-+ * Called by the state manger to remove all layouts established under an
-+ * expired lease.
-+ */
-+void
-+pnfs_destroy_all_layouts(struct nfs_client *clp)
-+{
-+	struct pnfs_layout_hdr *lo;
-+	LIST_HEAD(tmp_list);
+ static __be32
+ nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ 	     struct nfsd4_commit *commit)
+ {
+ 	__be32 status;
+ 
+-	u32 *p = (u32 *)commit->co_verf.data;
+-	*p++ = nfssvc_boot.tv_sec;
+-	*p++ = nfssvc_boot.tv_usec;
+-
++	nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb,
++			   &commit->co_verf);
+ 	status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+ 			     commit->co_count);
+ 	if (status == nfserr_symlink)
+@@ -846,7 +882,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru
+ {
+ 	stateid_t *stateid = &write->wr_stateid;
+ 	struct file *filp = NULL;
+-	u32 *p;
+ 	__be32 status = nfs_ok;
+ 	unsigned long cnt;
+ 
+@@ -868,13 +903,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru
+ 
+ 	cnt = write->wr_buflen;
+ 	write->wr_how_written = write->wr_stable_how;
+-	p = (u32 *)write->wr_verifier.data;
+-	*p++ = nfssvc_boot.tv_sec;
+-	*p++ = nfssvc_boot.tv_usec;
+ 
++	nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb,
++			   &write->wr_verifier);
++#if defined(CONFIG_SPNFS)
++#if defined(CONFIG_SPNFS_BLOCK)
++	if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) {
++                status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode,
++		    RETURN_FILE, write->wr_offset, write->wr_buflen);
++                if (!status) {
++                        status =  nfsd_write(rqstp, &cstate->current_fh, filp,
++			     write->wr_offset, rqstp->rq_vec, write->wr_vlen,
++			     &cnt, &write->wr_how_written);
++                }
++        } else
++#endif
++		
++	if (spnfs_enabled()) {
++		status = spnfs_write(cstate->current_fh.fh_dentry->d_inode,
++			write->wr_offset, write->wr_buflen, write->wr_vlen,
++			rqstp);
++		if (status == nfs_ok) {
++			/* DMXXX: HACK to get filesize set */
++			/* write one byte at offset+length-1 */
++			struct kvec k[1];
++			char zero = 0;
++			unsigned long cnt = 1;
 +
-+	spin_lock(&clp->cl_lock);
-+	list_splice_init(&clp->cl_layouts, &tmp_list);
-+	spin_unlock(&clp->cl_lock);
++			k[0].iov_base = (void *)&zero;
++			k[0].iov_len = 1;
++			nfsd_write(rqstp, &cstate->current_fh, filp,
++				   write->wr_offset+write->wr_buflen-1, k, 1,
++				   &cnt, &write->wr_how_written);
++		}
++	} else /* we're not an MDS */
++		status =  nfsd_write(rqstp, &cstate->current_fh, filp,
++			     write->wr_offset, rqstp->rq_vec, write->wr_vlen,
++			     &cnt, &write->wr_how_written);
++#else
+ 	status =  nfsd_write(rqstp, &cstate->current_fh, filp,
+ 			     write->wr_offset, rqstp->rq_vec, write->wr_vlen,
+ 			     &cnt, &write->wr_how_written);
++#endif /* CONFIG_SPNFS */
 +
-+	while (!list_empty(&tmp_list)) {
-+		lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
-+				layouts);
-+		dprintk("%s freeing layout for inode %lu\n", __func__,
-+			lo->inode->i_ino);
-+		pnfs_destroy_layout(NFS_I(lo->inode));
-+	}
-+}
+ 	if (filp)
+ 		fput(filp);
+ 
+@@ -965,6 +1036,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str
+ 	return status == nfserr_same ? nfs_ok : status;
+ }
+ 
++#if defined(CONFIG_PNFSD)
 +
-+/* update lo->stateid with new if is more recent
-+ *
-+ * lo->stateid could be the open stateid, in which case we just use what given.
-+ */
-+static void
-+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
-+			const nfs4_stateid *new)
++static __be32
++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp,
++		    unsigned int layout_type)
 +{
-+	nfs4_stateid *old = &lo->stateid;
-+	bool overwrite = false;
-+
-+	write_seqlock(&lo->seqlock);
-+	if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
-+	    memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
-+		overwrite = true;
-+	else {
-+		u32 oldseq, newseq;
++	int status, type;
 +
-+		oldseq = be32_to_cpu(old->stateid.seqid);
-+		newseq = be32_to_cpu(new->stateid.seqid);
-+		if ((int)(newseq - oldseq) > 0)
-+			overwrite = true;
++	/* check to see if pNFS  is supported. */
++	status = nfserr_layoutunavailable;
++	if (exp && exp->ex_pnfs == 0) {
++		dprintk("%s: Underlying file system "
++			"is not exported over pNFS\n", __func__);
++		goto out;
++	}
++	if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) {
++		dprintk("%s: Underlying file system "
++			"does not support pNFS\n", __func__);
++		goto out;
 +	}
-+	if (overwrite)
-+		memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
-+	write_sequnlock(&lo->seqlock);
-+}
 +
-+static void
-+pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
-+			      struct nfs4_state *state)
-+{
-+	int seq;
++	type = sb->s_pnfs_op->layout_type(sb);
 +
-+	dprintk("--> %s\n", __func__);
-+	write_seqlock(&lo->seqlock);
-+	do {
-+		seq = read_seqbegin(&state->seqlock);
-+		memcpy(lo->stateid.data, state->stateid.data,
-+		       sizeof(state->stateid.data));
-+	} while (read_seqretry(&state->seqlock, seq));
-+	set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
-+	write_sequnlock(&lo->seqlock);
-+	dprintk("<-- %s\n", __func__);
++	/* check to see if requested layout type is supported. */
++	status = nfserr_unknown_layouttype;
++	if (!type)
++		dprintk("BUG: %s: layout_type 0 is reserved and must not be "
++			"used by filesystem\n", __func__);
++	else if (type != layout_type)
++		dprintk("%s: requested layout type %d "
++		       "does not match supported type %d\n",
++			__func__, layout_type, type);
++	else
++		status = nfs_ok;
++out:
++	return status;
 +}
 +
-+void
-+pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-+			struct nfs4_state *open_state)
++static __be32
++nfsd4_getdevlist(struct svc_rqst *rqstp,
++		struct nfsd4_compound_state *cstate,
++		struct nfsd4_pnfs_getdevlist *gdlp)
 +{
-+	int seq;
++	struct super_block *sb;
++	struct svc_fh *current_fh = &cstate->current_fh;
++	int status;
 +
-+	dprintk("--> %s\n", __func__);
-+	do {
-+		seq = read_seqbegin(&lo->seqlock);
-+		if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
-+			/* This will trigger retry of the read */
-+			pnfs_layout_from_open_stateid(lo, open_state);
-+		} else
-+			memcpy(dst->data, lo->stateid.data,
-+			       sizeof(lo->stateid.data));
-+	} while (read_seqretry(&lo->seqlock, seq));
-+	dprintk("<-- %s\n", __func__);
-+}
++	dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n",
++		__func__, gdlp->gd_layout_type, gdlp->gd_maxdevices,
++		gdlp->gd_cookie, gdlp->gd_verf);
 +
-+/*
-+* Get layout from server.
-+*    for now, assume that whole file layouts are requested.
-+*    arg->offset: 0
-+*    arg->length: all ones
-+*/
-+static struct pnfs_layout_segment *
-+send_layoutget(struct pnfs_layout_hdr *lo,
-+	   struct nfs_open_context *ctx,
-+	   struct pnfs_layout_range *range)
-+{
-+	struct inode *ino = lo->inode;
-+	struct nfs_server *server = NFS_SERVER(ino);
-+	struct nfs4_layoutget *lgp;
-+	struct pnfs_layout_segment *lseg = NULL;
 +
-+	dprintk("--> %s\n", __func__);
++	status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
++	if (status)
++		goto out;
 +
-+	BUG_ON(ctx == NULL);
-+	lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
-+	if (lgp == NULL) {
-+		put_layout_hdr(ino);
-+		return NULL;
-+	}
-+	lgp->args.minlength = PAGE_CACHE_SIZE;
-+	if (lgp->args.minlength > range->length)
-+		lgp->args.minlength = range->length;
-+	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
-+	lgp->args.range = *range;
-+	lgp->args.type = server->pnfs_curr_ld->id;
-+	lgp->args.inode = ino;
-+	lgp->args.ctx = get_nfs_open_context(ctx);
-+	lgp->lsegpp = &lseg;
++	status = nfserr_inval;
++	sb = current_fh->fh_dentry->d_inode->i_sb;
++	if (!sb)
++		goto out;
 +
-+	/* Synchronously retrieve layout information from server and
-+	 * store in lseg.
-+	 */
-+	nfs4_proc_layoutget(lgp);
-+	if (!lseg) {
-+		/* remember that LAYOUTGET failed and suspend trying */
-+		set_bit(lo_fail_bit(range->iomode), &lo->state);
-+	}
-+	return lseg;
-+}
++	/* We must be able to encode at least one device */
++	if (!gdlp->gd_maxdevices)
++		goto out;
 +
-+static struct pnfs_layout_segment *
-+has_layout_to_return(struct pnfs_layout_hdr *lo,
-+		     struct pnfs_layout_range *range)
-+{
-+	struct pnfs_layout_segment *out = NULL, *lseg;
-+	dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
-+		__func__, lo, range->offset, range->length, range->iomode);
++	/* Ensure underlying file system supports pNFS and,
++	 * if so, the requested layout type
++	 */
++	status = nfsd4_layout_verify(sb, current_fh->fh_export,
++				     gdlp->gd_layout_type);
++	if (status)
++		goto out;
 +
-+	assert_spin_locked(&lo->inode->i_lock);
-+	list_for_each_entry(lseg, &lo->segs, fi_list)
-+		if (should_free_lseg(lseg, range)) {
-+			out = lseg;
-+			break;
-+		}
++	/* Return nfserr_notsupp if the underlying file system does not
++	 * support getdevicelist */
++	if (!sb->s_pnfs_op->get_device_iter) {
++		status = nfserr_notsupp;
++		goto out;
++	}
 +
-+	dprintk("%s:Return lseg=%p\n", __func__, out);
-+	return out;
++	/* Set up arguments so device can be retrieved at encode time */
++	gdlp->gd_fhp = &cstate->current_fh;
++out:
++	return status;
 +}
 +
-+bool
-+pnfs_return_layout_barrier(struct nfs_inode *nfsi,
-+			   struct pnfs_layout_range *range)
++static __be32
++nfsd4_getdevinfo(struct svc_rqst *rqstp,
++		struct nfsd4_compound_state *cstate,
++		struct nfsd4_pnfs_getdevinfo *gdp)
 +{
-+	struct pnfs_layout_segment *lseg;
-+	bool ret = false;
++	struct super_block *sb;
++	int status;
++	clientid_t clid;
 +
-+	spin_lock(&nfsi->vfs_inode.i_lock);
-+	list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) {
-+		if (!should_free_lseg(lseg, range))
-+			continue;
-+		lseg->valid = false;
-+		if (!_pnfs_can_return_lseg(lseg)) {
-+			dprintk("%s: wait on lseg %p refcount %d\n",
-+				__func__, lseg,
-+				atomic_read(&lseg->kref.refcount));
-+			ret = true;
-+		}
++	dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n",
++	       __func__, gdp->gd_layout_type, gdp->gd_devid.sbid,
++	       gdp->gd_devid.devid, gdp->gd_maxcount);
++
++	status = nfserr_inval;
++	sb = find_sbid_id(gdp->gd_devid.sbid);
++	dprintk("%s: sb %p\n", __func__, sb);
++	if (!sb) {
++		status = nfserr_noent;
++		goto out;
 +	}
-+	spin_unlock(&nfsi->vfs_inode.i_lock);
-+	dprintk("%s:Return %d\n", __func__, ret);
-+	return ret;
-+}
 +
-+void
-+pnfs_layoutreturn_release(struct nfs4_layoutreturn *lrp)
-+{
-+	struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
-+	LIST_HEAD(tmp_list);
++	/* Ensure underlying file system supports pNFS and,
++	 * if so, the requested layout type
++	 */
++	status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type);
++	if (status)
++		goto out;
 +
-+	if (lrp->args.return_type != RETURN_FILE)
-+		return;
-+	spin_lock(&lrp->args.inode->i_lock);
-+	pnfs_clear_lseg_list(lo, &tmp_list, &lrp->args.range);
-+	if (!lrp->res.valid)
-+		;	/* forgetful model internal release */
-+	else if (!lrp->res.lrs_present)
-+		pnfs_invalidate_layout_stateid(lo);
-+	else 
-+		pnfs_set_layout_stateid(lo, &lrp->res.stateid);
-+	put_layout_hdr_locked(lo); /* Matched in _pnfs_return_layout */
-+	spin_unlock(&lrp->args.inode->i_lock);
-+	pnfs_free_lseg_list(&tmp_list);
++	/* Set up arguments so device can be retrieved at encode time */
++	gdp->gd_sb = sb;
++
++	/* Update notifications */
++	copy_clientid(&clid, cstate->session);
++	pnfs_set_device_notify(&clid, gdp->gd_notify_types);
++out:
++	return status;
 +}
 +
-+static int
-+return_layout(struct inode *ino, struct pnfs_layout_range *range,
-+	      enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo,
-+	      bool wait, const nfs4_stateid *stateid)
++static __be32
++nfsd4_layoutget(struct svc_rqst *rqstp,
++		struct nfsd4_compound_state *cstate,
++		struct nfsd4_pnfs_layoutget *lgp)
 +{
-+	struct nfs4_layoutreturn *lrp;
-+	struct nfs_server *server = NFS_SERVER(ino);
-+	int status = -ENOMEM;
++	int status;
++	struct super_block *sb;
++	struct svc_fh *current_fh = &cstate->current_fh;
 +
-+	dprintk("--> %s\n", __func__);
++	status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
++	if (status)
++		goto out;
++
++	status = nfserr_inval;
++	sb = current_fh->fh_dentry->d_inode->i_sb;
++	if (!sb)
++		goto out;
 +
-+	BUG_ON(type != RETURN_FILE);
++	/* Ensure underlying file system supports pNFS and,
++	 * if so, the requested layout type
++	 */
++	status = nfsd4_layout_verify(sb, current_fh->fh_export,
++				     lgp->lg_seg.layout_type);
++	if (status)
++		goto out;
 +
-+	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
-+	if (lrp == NULL) {
-+		if (lo && (type == RETURN_FILE))
-+			put_layout_hdr(lo->inode);
++	status = nfserr_badiomode;
++	if (lgp->lg_seg.iomode != IOMODE_READ &&
++	    lgp->lg_seg.iomode != IOMODE_RW) {
++		dprintk("pNFS %s: invalid iomode %d\n", __func__,
++			lgp->lg_seg.iomode);
 +		goto out;
 +	}
-+	lrp->args.reclaim = 0;
-+	lrp->args.layout_type = server->pnfs_curr_ld->id;
-+	lrp->args.return_type = type;
-+	lrp->args.range = *range;
-+	lrp->args.inode = ino;
-+	lrp->stateid = stateid;
 +
-+	status = nfs4_proc_layoutreturn(lrp, wait);
++	/* Set up arguments so layout can be retrieved at encode time */
++	lgp->lg_fhp = current_fh;
++	copy_clientid((clientid_t *)&lgp->lg_seg.clientid, cstate->session);
++	status = nfs_ok;
 +out:
-+	dprintk("<-- %s status: %d\n", __func__, status);
 +	return status;
 +}
 +
-+int
-+_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
-+		    const nfs4_stateid *stateid, /* optional */
-+		    enum pnfs_layoutreturn_type type,
-+		    bool wait)
++static __be32
++nfsd4_layoutcommit(struct svc_rqst *rqstp,
++		struct nfsd4_compound_state *cstate,
++		struct nfsd4_pnfs_layoutcommit *lcp)
 +{
-+	struct pnfs_layout_hdr *lo = NULL;
-+	struct nfs_inode *nfsi = NFS_I(ino);
-+	struct pnfs_layout_range arg;
-+	int status = 0;
++	int status;
++	struct inode *ino = NULL;
++	struct iattr ia;
++	struct super_block *sb;
++	struct svc_fh *current_fh = &cstate->current_fh;
++
++	dprintk("NFSD: nfsd4_layoutcommit \n");
++	status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
++	if (status)
++		goto out;
++
++	status = nfserr_inval;
++	ino = current_fh->fh_dentry->d_inode;
++	if (!ino)
++		goto out;
 +
-+	dprintk("--> %s type %d\n", __func__, type);
++	status = nfserr_inval;
++	sb = ino->i_sb;
++	if (!sb)
++		goto out;
 +
++	/* Ensure underlying file system supports pNFS and,
++	 * if so, the requested layout type
++	 */
++	status = nfsd4_layout_verify(sb, current_fh->fh_export,
++				     lcp->args.lc_seg.layout_type);
++	if (status)
++		goto out;
 +
-+	arg.iomode = range ? range->iomode : IOMODE_ANY;
-+	arg.offset = 0;
-+	arg.length = NFS4_MAX_UINT64;
++	/* This will only extend the file length.  Do a quick
++	 * check to see if there is any point in waiting for the update
++	 * locks.
++	 * TODO: Is this correct for all back ends?
++	 */
++	dprintk("%s:new offset: %d new size: %llu old size: %lld\n",
++		__func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1,
++		ino->i_size);
 +
-+	if (type == RETURN_FILE) {
-+		spin_lock(&ino->i_lock);
-+		lo = nfsi->layout;
-+		if (lo && !has_layout_to_return(lo, &arg))
-+			lo = NULL;
-+		if (!lo) {
-+			spin_unlock(&ino->i_lock);
-+			dprintk("%s: no layout segments to return\n", __func__);
++	/* Set clientid from sessionid */
++	copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session);
++	lcp->res.lc_size_chg = 0;
++	if (sb->s_pnfs_op->layout_commit) {
++		status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res);
++		dprintk("%s:layout_commit result %d\n", __func__, status);
++	} else {
++		fh_lock(current_fh);
++		if ((lcp->args.lc_newoffset == 0) ||
++		    ((lcp->args.lc_last_wr + 1) <= ino->i_size)) {
++			status = 0;
++			lcp->res.lc_size_chg = 0;
++			fh_unlock(current_fh);
 +			goto out;
 +		}
 +
-+		/* Reference matched in pnfs_layoutreturn_release */
-+		get_layout_hdr_locked(lo);
-+
-+		spin_unlock(&ino->i_lock);
++		/* Try our best to update the file size */
++		dprintk("%s: Modifying file size\n", __func__);
++		ia.ia_valid = ATTR_SIZE;
++		ia.ia_size = lcp->args.lc_last_wr + 1;
++		status = notify_change(current_fh->fh_dentry, &ia);
++		fh_unlock(current_fh);
++		dprintk("%s:notify_change result %d\n", __func__, status);
++	}
 +
-+		if (layoutcommit_needed(nfsi)) {
-+			if (stateid && !wait) { /* callback */
-+				dprintk("%s: layoutcommit pending\n", __func__);
-+				status = -EAGAIN;
-+				goto out_put;
-+			}
-+			status = pnfs_layoutcommit_inode(ino, wait);
-+			if (status) {
-+				/* Return layout even if layoutcommit fails */
-+				dprintk("%s: layoutcommit failed, status=%d. "
-+					"Returning layout anyway\n",
-+					__func__, status);
-+			}
-+		}
-+		status = return_layout(ino, &arg, type, lo, wait, stateid);
++	if (!status && lcp->res.lc_size_chg &&
++	    EX_ISSYNC(current_fh->fh_export)) {
++		dprintk("%s: Synchronously writing inode size %llu\n",
++			__func__, ino->i_size);
++		write_inode_now(ino, 1);
++		lcp->res.lc_newsize = i_size_read(ino);
 +	}
 +out:
-+	dprintk("<-- %s status: %d\n", __func__, status);
 +	return status;
-+out_put:
-+	put_layout_hdr(ino);
-+	goto out;
 +}
 +
-+/*
-+ * Compare two layout segments for sorting into layout cache.
-+ * We want to preferentially return RW over RO layouts, so ensure those
-+ * are seen first.
-+ */
-+static s64
-+cmp_layout(struct pnfs_layout_range *l1,
-+	   struct pnfs_layout_range *l2)
++static __be32
++nfsd4_layoutreturn(struct svc_rqst *rqstp,
++		struct nfsd4_compound_state *cstate,
++		struct nfsd4_pnfs_layoutreturn *lrp)
 +{
-+	s64 d;
++	int status;
++	struct super_block *sb;
++	struct svc_fh *current_fh = &cstate->current_fh;
 +
-+	/* higher offset > lower offset */
-+	d = l1->offset - l2->offset;
-+	if (d)
-+		return d;
++	status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
++	if (status)
++		goto out;
 +
-+	/* longer length > shorter length */
-+	d = l1->length - l2->length;
-+	if (d)
-+		return d;
++	status = nfserr_inval;
++	sb = current_fh->fh_dentry->d_inode->i_sb;
++	if (!sb)
++		goto out;
 +
-+	/* read > read/write */
-+	return (int)(l2->iomode == IOMODE_READ) -
-+		(int)(l1->iomode == IOMODE_READ);
-+}
++	/* Ensure underlying file system supports pNFS and,
++	 * if so, the requested layout type
++	 */
++	status = nfsd4_layout_verify(sb, current_fh->fh_export,
++				     lrp->args.lr_seg.layout_type);
++	if (status)
++		goto out;
 +
-+static void
-+pnfs_insert_layout(struct pnfs_layout_hdr *lo,
-+		   struct pnfs_layout_segment *lseg)
-+{
-+	struct pnfs_layout_segment *lp;
-+	int found = 0;
++	status = nfserr_inval;
++	if (lrp->args.lr_return_type != RETURN_FILE &&
++	    lrp->args.lr_return_type != RETURN_FSID &&
++	    lrp->args.lr_return_type != RETURN_ALL) {
++		dprintk("pNFS %s: invalid return_type %d\n", __func__,
++			lrp->args.lr_return_type);
++		goto out;
++	}
 +
-+	dprintk("%s:Begin\n", __func__);
++	status = nfserr_inval;
++	if (lrp->args.lr_seg.iomode != IOMODE_READ &&
++	    lrp->args.lr_seg.iomode != IOMODE_RW &&
++	    lrp->args.lr_seg.iomode != IOMODE_ANY) {
++		dprintk("pNFS %s: invalid iomode %d\n", __func__,
++			lrp->args.lr_seg.iomode);
++		goto out;
++	}
 +
-+	assert_spin_locked(&lo->inode->i_lock);
-+	if (list_empty(&lo->segs)) {
-+		struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
++	/* Set clientid from sessionid */
++	copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session);
++	lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE);
++	status = nfs4_pnfs_return_layout(sb, current_fh, lrp);
++out:
++	dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n",
++		__func__, status, lrp->args.lr_return_type, lrp->lrs_present);
++	return status;
++}
++#endif /* CONFIG_PNFSD */
 +
-+		spin_lock(&clp->cl_lock);
-+		BUG_ON(!list_empty(&lo->layouts));
-+		list_add_tail(&lo->layouts, &clp->cl_layouts);
-+		spin_unlock(&clp->cl_lock);
-+	}
-+	list_for_each_entry(lp, &lo->segs, fi_list) {
-+		if (cmp_layout(&lp->range, &lseg->range) > 0)
-+			continue;
-+		list_add_tail(&lseg->fi_list, &lp->fi_list);
-+		dprintk("%s: inserted lseg %p "
-+			"iomode %d offset %llu length %llu before "
-+			"lp %p iomode %d offset %llu length %llu\n",
-+			__func__, lseg, lseg->range.iomode,
-+			lseg->range.offset, lseg->range.length,
-+			lp, lp->range.iomode, lp->range.offset,
-+			lp->range.length);
-+		found = 1;
-+		break;
-+	}
-+	if (!found) {
-+		list_add_tail(&lseg->fi_list, &lo->segs);
-+		dprintk("%s: inserted lseg %p "
-+			"iomode %d offset %llu length %llu at tail\n",
-+			__func__, lseg, lseg->range.iomode,
-+			lseg->range.offset, lseg->range.length);
-+	}
-+	get_layout_hdr_locked(lo);
+ /*
+  * NULL call.
+  */
+@@ -1355,6 +1726,29 @@ static struct nfsd4_operation nfsd4_ops[
+ 		.op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
+ 		.op_name = "OP_SECINFO_NO_NAME",
+ 	},
++#if defined(CONFIG_PNFSD)
++	[OP_GETDEVICELIST] = {
++		.op_func = (nfsd4op_func)nfsd4_getdevlist,
++		.op_name = "OP_GETDEVICELIST",
++	},
++	[OP_GETDEVICEINFO] = {
++		.op_func = (nfsd4op_func)nfsd4_getdevinfo,
++		.op_flags = ALLOWED_WITHOUT_FH,
++		.op_name = "OP_GETDEVICEINFO",
++	},
++	[OP_LAYOUTGET] = {
++		.op_func = (nfsd4op_func)nfsd4_layoutget,
++		.op_name = "OP_LAYOUTGET",
++	},
++	[OP_LAYOUTCOMMIT] = {
++		.op_func = (nfsd4op_func)nfsd4_layoutcommit,
++		.op_name = "OP_LAYOUTCOMMIT",
++	},
++	[OP_LAYOUTRETURN] = {
++		.op_func = (nfsd4op_func)nfsd4_layoutreturn,
++		.op_name = "OP_LAYOUTRETURN",
++	},
++#endif /* CONFIG_PNFSD */
+ };
+ 
+ static const char *nfsd4_op_name(unsigned opnum)
+diff -up linux-2.6.37.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4state.c
+--- linux-2.6.37.noarch/fs/nfsd/nfs4state.c.orig	2011-01-28 09:37:32.562979253 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfs4state.c	2011-01-28 09:43:53.359769399 -0500
+@@ -42,6 +42,8 @@
+ #include "xdr4.h"
+ #include "vfs.h"
+ 
++#include "pnfsd.h"
 +
-+	dprintk("%s:Return\n", __func__);
+ #define NFSDDBG_FACILITY                NFSDDBG_PROC
+ 
+ /* Globals */
+@@ -59,8 +61,6 @@ static u64 current_sessionid = 1;
+ #define ONE_STATEID(stateid)  (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
+ 
+ /* forward declarations */
+-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
+-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
+ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
+ static void nfs4_set_recdir(char *recdir);
+ 
+@@ -68,6 +68,7 @@ static void nfs4_set_recdir(char *recdir
+ 
+ /* Currently used for almost all code touching nfsv4 state: */
+ static DEFINE_MUTEX(client_mutex);
++struct task_struct *client_mutex_owner;
+ 
+ /*
+  * Currently used for the del_recall_lru and file hash table.  In an
+@@ -85,11 +86,21 @@ void
+ nfs4_lock_state(void)
+ {
+ 	mutex_lock(&client_mutex);
++	client_mutex_owner = current;
 +}
 +
-+static struct pnfs_layout_hdr *
-+alloc_init_layout_hdr(struct inode *ino)
-+{
-+	struct pnfs_layout_hdr *lo;
-+
-+	lo = pnfs_alloc_layout_hdr(ino);
-+	if (!lo)
-+		return NULL;
-+	lo->refcount = 1;
-+	INIT_LIST_HEAD(&lo->layouts);
-+	INIT_LIST_HEAD(&lo->segs);
-+	seqlock_init(&lo->seqlock);
-+	lo->inode = ino;
-+	return lo;
-+}
++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current)
 +
-+static struct pnfs_layout_hdr *
-+pnfs_find_alloc_layout(struct inode *ino)
++void
++nfs4_bug_on_unlocked_state(void)
 +{
-+	struct nfs_inode *nfsi = NFS_I(ino);
-+	struct pnfs_layout_hdr *new = NULL;
-+
-+	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
-+
-+	assert_spin_locked(&ino->i_lock);
-+	if (nfsi->layout)
-+		return nfsi->layout;
++	BUG_ON(client_mutex_owner != current);
+ }
+ 
+ void
+ nfs4_unlock_state(void)
+ {
++	client_mutex_owner = NULL;
+ 	mutex_unlock(&client_mutex);
+ }
+ 
+@@ -108,7 +119,7 @@ opaque_hashval(const void *ptr, int nbyt
+ 
+ static struct list_head del_recall_lru;
+ 
+-static inline void
++inline void
+ put_nfs4_file(struct nfs4_file *fi)
+ {
+ 	if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
+@@ -119,7 +130,7 @@ put_nfs4_file(struct nfs4_file *fi)
+ 	}
+ }
+ 
+-static inline void
++inline void
+ get_nfs4_file(struct nfs4_file *fi)
+ {
+ 	atomic_inc(&fi->fi_ref);
+@@ -179,10 +190,16 @@ static void nfs4_file_get_access(struct 
+ 
+ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
+ {
+-	if (fp->fi_fds[oflag]) {
+-		fput(fp->fi_fds[oflag]);
+-		fp->fi_fds[oflag] = NULL;
+-	}
++	struct file *fd = fp->fi_fds[oflag];
 +
-+	spin_unlock(&ino->i_lock);
-+	new = alloc_init_layout_hdr(ino);
-+	spin_lock(&ino->i_lock);
++	if (!fd)
++		return;
 +
-+	if (likely(nfsi->layout == NULL))	/* Won the race? */
-+		nfsi->layout = new;
-+	else
-+		pnfs_free_layout_hdr(new);
-+	return nfsi->layout;
-+}
++	fp->fi_fds[oflag] = NULL;
++	BUG_ON_UNLOCKED_STATE();
++	nfs4_unlock_state();	/* allow nested layout recall/return */
++	fput(fd);
++	nfs4_lock_state();
+ }
+ 
+ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
+@@ -306,8 +323,8 @@ static DEFINE_SPINLOCK(client_lock);
+  * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
+  * used in reboot/reset lease grace period processing
+  *
+- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
+- * setclientid_confirmed info. 
++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold
++ * confirmed setclientid_confirmed info.
+  *
+  * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed 
+  * setclientid info.
+@@ -332,6 +349,7 @@ static void unhash_generic_stateid(struc
+ 	list_del(&stp->st_hash);
+ 	list_del(&stp->st_perfile);
+ 	list_del(&stp->st_perstateowner);
++	release_pnfs_ds_dev_list(stp);
+ }
+ 
+ static void free_generic_stateid(struct nfs4_stateid *stp)
+@@ -954,6 +972,8 @@ expire_client(struct nfs4_client *clp)
+ 	struct nfs4_delegation *dp;
+ 	struct list_head reaplist;
+ 
++	BUG_ON_UNLOCKED_STATE();
 +
-+/*
-+ * iomode matching rules:
-+ * range	lseg	match
-+ * -----	-----	-----
-+ * ANY		READ	true
-+ * ANY		RW	true
-+ * RW		READ	false
-+ * RW		RW	true
-+ * READ		READ	true
-+ * READ		RW	true
-+ */
-+static int
-+is_matching_lseg(struct pnfs_layout_segment *lseg,
-+		 struct pnfs_layout_range *range)
+ 	INIT_LIST_HEAD(&reaplist);
+ 	spin_lock(&recall_lock);
+ 	while (!list_empty(&clp->cl_delegations)) {
+@@ -973,6 +993,7 @@ expire_client(struct nfs4_client *clp)
+ 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
+ 		release_openowner(sop);
+ 	}
++	pnfs_expire_client(clp);
+ 	nfsd4_shutdown_callback(clp);
+ 	if (clp->cl_cb_conn.cb_xprt)
+ 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+@@ -985,6 +1006,13 @@ expire_client(struct nfs4_client *clp)
+ 	spin_unlock(&client_lock);
+ }
+ 
++void expire_client_lock(struct nfs4_client *clp)
 +{
-+	struct pnfs_layout_range range1;
-+
-+	if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) ||
-+	    !lo_seg_intersecting(&lseg->range, range))
-+		return 0;
-+
-+	/* range1 covers only the first byte in the range */
-+	range1 = *range;
-+	range1.length = 1;
-+	return lo_seg_contained(&lseg->range, &range1);
++	nfs4_lock_state();
++	expire_client(clp);
++	nfs4_unlock_state();
 +}
 +
-+/*
-+ * lookup range in layout
-+ */
-+struct pnfs_layout_segment *
-+pnfs_has_layout(struct pnfs_layout_hdr *lo,
-+		struct pnfs_layout_range *range)
+ static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
+ {
+ 	memcpy(target->cl_verifier.data, source->data,
+@@ -1076,6 +1104,11 @@ static struct nfs4_client *create_client
+ 	INIT_LIST_HEAD(&clp->cl_strhash);
+ 	INIT_LIST_HEAD(&clp->cl_openowners);
+ 	INIT_LIST_HEAD(&clp->cl_delegations);
++#if defined(CONFIG_PNFSD)
++	INIT_LIST_HEAD(&clp->cl_layouts);
++	INIT_LIST_HEAD(&clp->cl_layoutrecalls);
++	atomic_set(&clp->cl_deviceref, 0);
++#endif /* CONFIG_PNFSD */
+ 	INIT_LIST_HEAD(&clp->cl_lru);
+ 	INIT_LIST_HEAD(&clp->cl_callbacks);
+ 	spin_lock_init(&clp->cl_lock);
+@@ -1127,7 +1160,7 @@ move_to_confirmed(struct nfs4_client *cl
+ 	renew_client(clp);
+ }
+ 
+-static struct nfs4_client *
++struct nfs4_client *
+ find_confirmed_client(clientid_t *clid)
+ {
+ 	struct nfs4_client *clp;
+@@ -1182,6 +1215,24 @@ find_unconfirmed_client_by_str(const cha
+ 	return NULL;
+ }
+ 
++int
++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *),
++			 void *arg)
 +{
-+	struct pnfs_layout_segment *lseg, *ret = NULL;
-+
-+	dprintk("%s:Begin\n", __func__);
++	struct nfs4_client *clp, *next;
++	int i, status = 0;
 +
-+	assert_spin_locked(&lo->inode->i_lock);
-+	list_for_each_entry(lseg, &lo->segs, fi_list) {
-+		if (is_matching_lseg(lseg, range)) {
-+			ret = lseg;
-+			break;
++	for (i = 0; i < CLIENT_HASH_SIZE; i++)
++		list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i],
++					  cl_strhash) {
++			status = func(clp, arg);
++			if (status)
++				break;
 +		}
-+		if (cmp_layout(range, &lseg->range) > 0)
-+			break;
-+	}
 +
-+	dprintk("%s:Return lseg %p ref %d valid %d\n",
-+		__func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0,
-+		ret ? ret->valid : 0);
-+	return ret;
++	return status;
 +}
 +
-+/*
-+ * Layout segment is retreived from the server if not cached.
-+ * The appropriate layout segment is referenced and returned to the caller.
-+ */
-+struct pnfs_layout_segment *
-+pnfs_update_layout(struct inode *ino,
-+		   struct nfs_open_context *ctx,
-+		   loff_t pos,
-+		   u64 count,
-+		   enum pnfs_iomode iomode)
-+{
-+	struct pnfs_layout_range arg = {
-+		.iomode = iomode,
-+		.offset = pos,
-+		.length = count,
-+	};
-+	struct nfs_inode *nfsi = NFS_I(ino);
-+	struct pnfs_layout_hdr *lo;
-+	struct pnfs_layout_segment *lseg = NULL;
-+
-+	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
-+		return NULL;
-+	spin_lock(&ino->i_lock);
-+	lo = pnfs_find_alloc_layout(ino);
-+	if (lo == NULL) {
-+		dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
-+		goto out_unlock;
-+	}
+ static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
+ {
+ 	switch (family) {
+@@ -1330,8 +1381,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co
+ static void
+ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
+ {
+-	/* pNFS is not supported */
++#if defined(CONFIG_PNFSD)
++	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS |
++				  EXCHGID4_FLAG_USE_PNFS_DS;
++#else  /* CONFIG_PNFSD */
+ 	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
++#endif /* CONFIG_PNFSD */
+ 
+ 	/* Referrals are supported, Migration is not. */
+ 	new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
+@@ -1527,6 +1582,13 @@ nfsd4_create_session(struct svc_rqst *rq
+ 	bool confirm_me = false;
+ 	int status = 0;
+ 
++#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
++	/* XXX hack to get local ip address */
++	memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local,
++		sizeof(pnfsd_lexp_addr));
++	pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen;
++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
 +
-+	/* Check to see if the layout for the given range already exists */
-+	lseg = pnfs_has_layout(lo, &arg);
-+	if (lseg) {
-+		if (lseg->valid) {
-+			dprintk("%s: Using cached lseg %p for %llu@%llu "
-+				"iomode %d)\n",
-+				__func__,
-+				lseg,
-+				arg.length,
-+				arg.offset,
-+				arg.iomode);
-+			get_lseg(lseg);
-+			goto out_unlock;
-+		}
-+		/* someone is cleaning the layout */
-+		lseg = NULL;
-+	}
+ 	nfs4_lock_state();
+ 	unconf = find_unconfirmed_client(&cr_ses->clientid);
+ 	conf = find_confirmed_client(&cr_ses->clientid);
+@@ -1562,6 +1624,9 @@ nfsd4_create_session(struct svc_rqst *rq
+ 			goto out;
+ 		}
+ 
++		if (is_ds_only_session(unconf->cl_exchange_flags))
++			cr_ses->flags &= ~SESSION4_BACK_CHAN;
 +
-+	/* if LAYOUTGET already failed once we don't try again */
-+	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
-+		goto out_unlock;
+ 		confirm_me = true;
+ 		conf = unconf;
+ 	} else {
+@@ -2064,7 +2129,7 @@ out:
+ 
+ /* OPEN Share state helper functions */
+ static inline struct nfs4_file *
+-alloc_init_file(struct inode *ino)
++alloc_init_file(struct inode *ino, struct svc_fh *current_fh)
+ {
+ 	struct nfs4_file *fp;
+ 	unsigned int hashval = file_hashval(ino);
+@@ -2080,6 +2145,16 @@ alloc_init_file(struct inode *ino)
+ 		fp->fi_had_conflict = false;
+ 		memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+ 		memset(fp->fi_access, 0, sizeof(fp->fi_access));
++#if defined(CONFIG_PNFSD)
++		INIT_LIST_HEAD(&fp->fi_layouts);
++		INIT_LIST_HEAD(&fp->fi_layout_states);
++		fp->fi_fsid.major = current_fh->fh_export->ex_fsid;
++		fp->fi_fsid.minor = 0;
++		fp->fi_fhlen = current_fh->fh_handle.fh_size;
++		BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval));
++		memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base,
++		       fp->fi_fhlen);
++#endif /* CONFIG_PNFSD */
+ 		spin_lock(&recall_lock);
+ 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+ 		spin_unlock(&recall_lock);
+@@ -2088,7 +2163,7 @@ alloc_init_file(struct inode *ino)
+ 	return NULL;
+ }
+ 
+-static void
++void
+ nfsd4_free_slab(struct kmem_cache **slab)
+ {
+ 	if (*slab == NULL)
+@@ -2104,6 +2179,7 @@ nfsd4_free_slabs(void)
+ 	nfsd4_free_slab(&file_slab);
+ 	nfsd4_free_slab(&stateid_slab);
+ 	nfsd4_free_slab(&deleg_slab);
++	nfsd4_free_pnfs_slabs();
+ }
+ 
+ static int
+@@ -2125,6 +2201,8 @@ nfsd4_init_slabs(void)
+ 			sizeof(struct nfs4_delegation), 0, 0, NULL);
+ 	if (deleg_slab == NULL)
+ 		goto out_nomem;
++	if (nfsd4_init_pnfs_slabs())
++		goto out_nomem;
+ 	return 0;
+ out_nomem:
+ 	nfsd4_free_slabs();
+@@ -2198,6 +2276,9 @@ init_stateid(struct nfs4_stateid *stp, s
+ 	INIT_LIST_HEAD(&stp->st_perstateowner);
+ 	INIT_LIST_HEAD(&stp->st_lockowners);
+ 	INIT_LIST_HEAD(&stp->st_perfile);
++#if defined(CONFIG_PNFSD)
++	INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
++#endif /* CONFIG_PNFSD */
+ 	list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
+ 	list_add(&stp->st_perstateowner, &sop->so_stateids);
+ 	list_add(&stp->st_perfile, &fp->fi_stateids);
+@@ -2239,6 +2320,7 @@ find_openstateowner_str(unsigned int has
+ {
+ 	struct nfs4_stateowner *so = NULL;
+ 
++	BUG_ON_UNLOCKED_STATE();
+ 	list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
+ 		if (same_owner_str(so, &open->op_owner, &open->op_clientid))
+ 			return so;
+@@ -2247,7 +2329,7 @@ find_openstateowner_str(unsigned int has
+ }
+ 
+ /* search file_hashtbl[] for file */
+-static struct nfs4_file *
++struct nfs4_file *
+ find_file(struct inode *ino)
+ {
+ 	unsigned int hashval = file_hashval(ino);
+@@ -2265,6 +2347,18 @@ find_file(struct inode *ino)
+ 	return NULL;
+ }
+ 
++struct nfs4_file *
++find_alloc_file(struct inode *ino, struct svc_fh *current_fh)
++{
++	struct nfs4_file *fp;
 +
-+	get_layout_hdr_locked(lo); /* Matched in pnfs_layoutget_release */
-+	spin_unlock(&ino->i_lock);
++	fp = find_file(ino);
++	if (fp)
++		return fp;
 +
-+	lseg = send_layoutget(lo, ctx, &arg);
-+out:
-+	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
-+		nfsi->layout->state, lseg);
-+	return lseg;
-+out_unlock:
-+	spin_unlock(&ino->i_lock);
-+	goto out;
++	return alloc_init_file(ino, current_fh);
 +}
 +
+ static inline int access_valid(u32 x, u32 minorversion)
+ {
+ 	if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
+@@ -2758,7 +2852,7 @@ nfsd4_process_open2(struct svc_rqst *rqs
+ 		if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
+ 			goto out;
+ 		status = nfserr_resource;
+-		fp = alloc_init_file(ino);
++		fp = alloc_init_file(ino, current_fh);
+ 		if (fp == NULL)
+ 			goto out;
+ 	}
+@@ -2979,7 +3073,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct
+ 	return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
+ }
+ 
+-static int
 +int
-+pnfs_layout_process(struct nfs4_layoutget *lgp)
+ STALE_STATEID(stateid_t *stateid)
+ {
+ 	if (stateid->si_boot == boot_time)
+@@ -2989,6 +3083,16 @@ STALE_STATEID(stateid_t *stateid)
+ 	return 1;
+ }
+ 
++__be32
++nfs4_check_stateid(stateid_t *stateid)
 +{
-+	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
-+	struct nfs4_layoutget_res *res = &lgp->res;
-+	struct pnfs_layout_segment *lseg;
-+	struct inode *ino = lo->inode;
-+	int status = 0;
++	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
++		return nfserr_bad_stateid;
++	if (STALE_STATEID(stateid))
++		return nfserr_stale_stateid;
++	return 0;
++}
 +
-+	/* Inject layout blob into I/O device driver */
-+	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
-+	if (!lseg || IS_ERR(lseg)) {
-+		if (!lseg)
-+			status = -ENOMEM;
+ static inline int
+ access_permit_read(unsigned long access_bmap)
+ {
+@@ -3100,6 +3204,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_
+ 	if (grace_disallows_io(ino))
+ 		return nfserr_grace;
+ 
++#if defined(CONFIG_PNFSD)
++	if (pnfs_fh_is_ds(&current_fh->fh_handle)) {
++		if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
++			status = nfserr_bad_stateid;
 +		else
-+			status = PTR_ERR(lseg);
-+		dprintk("%s: Could not allocate layout: error %d\n",
-+		       __func__, status);
++#ifdef CONFIG_GFS2_FS_LOCKING_DLM
++		{
++			dprintk("%s Don't check DS stateid\n", __func__);
++			return 0;
++		}
++#else /* CONFIG_GFS2_FS_LOCKING_DLM */
++			status = nfs4_preprocess_pnfs_ds_stateid(current_fh,
++								 stateid);
++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
 +		goto out;
 +	}
++#endif /* CONFIG_PNFSD */
 +
-+	spin_lock(&ino->i_lock);
-+	init_lseg(lo, lseg);
-+	lseg->range = res->range;
-+	get_lseg(lseg);
-+	*lgp->lsegpp = lseg;
-+	pnfs_insert_layout(lo, lseg);
-+
-+	if (res->return_on_close) {
-+		/* FI: This needs to be re-examined.  At lo level,
-+		 * all it needs is a bit indicating whether any of
-+		 * the lsegs in the list have the flags set.
-+		 */
-+		lo->roc_iomode |= res->range.iomode;
-+	}
-+
-+	/* Done processing layoutget. Set the layout stateid */
-+	pnfs_set_layout_stateid(lo, &res->stateid);
-+	spin_unlock(&ino->i_lock);
-+out:
-+	return status;
-+}
-+
-+void
-+readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset,
-+		size_t *count)
+ 	if (nfsd4_has_session(cstate))
+ 		flags |= HAS_SESSION;
+ 
+@@ -3187,13 +3309,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co
+ 	*stpp = NULL;
+ 	*sopp = NULL;
+ 
+-	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
+-		dprintk("NFSD: preprocess_seqid_op: magic stateid!\n");
+-		return nfserr_bad_stateid;
+-	}
+-
+-	if (STALE_STATEID(stateid))
+-		return nfserr_stale_stateid;
++	status = nfs4_check_stateid(stateid);
++	if (status)
++		return status;
+ 
+ 	if (nfsd4_has_session(cstate))
+ 		flags |= HAS_SESSION;
+@@ -3468,11 +3586,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp
+ 	if (nfsd4_has_session(cstate))
+ 		flags |= HAS_SESSION;
+ 	nfs4_lock_state();
+-	status = nfserr_bad_stateid;
+-	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+-		goto out;
+-	status = nfserr_stale_stateid;
+-	if (STALE_STATEID(stateid))
++	status = nfs4_check_stateid(stateid);
++	if (status)
+ 		goto out;
+ 	status = nfserr_bad_stateid;
+ 	if (!is_delegation_stateid(stateid))
+@@ -3502,26 +3617,6 @@ out:
+ #define LOCK_HASH_SIZE             (1 << LOCK_HASH_BITS)
+ #define LOCK_HASH_MASK             (LOCK_HASH_SIZE - 1)
+ 
+-static inline u64
+-end_offset(u64 start, u64 len)
+-{
+-	u64 end;
+-
+-	end = start + len;
+-	return end >= start ? end: NFS4_MAX_UINT64;
+-}
+-
+-/* last octet in a range */
+-static inline u64
+-last_byte_offset(u64 start, u64 len)
+-{
+-	u64 end;
+-
+-	BUG_ON(!len);
+-	end = start + len;
+-	return end > start ? end - 1: NFS4_MAX_UINT64;
+-}
+-
+ #define lockownerid_hashval(id) \
+         ((id) & LOCK_HASH_MASK)
+ 
+@@ -3538,7 +3633,7 @@ static struct list_head lock_ownerid_has
+ static struct list_head	lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
+ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
+ 
+-static struct nfs4_stateid *
++struct nfs4_stateid *
+ find_stateid(stateid_t *stid, int flags)
+ {
+ 	struct nfs4_stateid *local;
+@@ -3567,7 +3662,7 @@ find_stateid(stateid_t *stid, int flags)
+ 	return NULL;
+ }
+ 
+-static struct nfs4_delegation *
++struct nfs4_delegation *
+ find_delegation_stateid(struct inode *ino, stateid_t *stid)
+ {
+ 	struct nfs4_file *fp;
+@@ -3698,6 +3793,9 @@ alloc_init_lock_stateid(struct nfs4_stat
+ 	INIT_LIST_HEAD(&stp->st_perfile);
+ 	INIT_LIST_HEAD(&stp->st_perstateowner);
+ 	INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
++#if defined(CONFIG_PNFSD)
++	INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
++#endif /* CONFIG_PNFSD */
+ 	list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
+ 	list_add(&stp->st_perfile, &fp->fi_stateids);
+ 	list_add(&stp->st_perstateowner, &sop->so_stateids);
+@@ -4274,6 +4372,9 @@ nfs4_state_init(void)
+ 	INIT_LIST_HEAD(&client_lru);
+ 	INIT_LIST_HEAD(&del_recall_lru);
+ 	reclaim_str_hashtbl_size = 0;
++#if defined(CONFIG_PNFSD)
++	nfs4_pnfs_state_init();
++#endif /* CONFIG_PNFSD */
+ 	return 0;
+ }
+ 
+@@ -4378,6 +4479,7 @@ __nfs4_state_shutdown(void)
+ 	}
+ 
+ 	nfsd4_shutdown_recdir();
++	nfs4_pnfs_state_shutdown();
+ }
+ 
+ void
+diff -up linux-2.6.37.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.37.noarch/fs/nfsd/nfs4xdr.c
+--- linux-2.6.37.noarch/fs/nfsd/nfs4xdr.c.orig	2011-01-28 09:37:32.564979184 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfs4xdr.c	2011-01-28 09:43:53.361769183 -0500
+@@ -45,11 +45,16 @@
+ #include <linux/statfs.h>
+ #include <linux/utsname.h>
+ #include <linux/sunrpc/svcauth_gss.h>
++#include <linux/exportfs.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
++#include <linux/nfsd4_spnfs.h>
++#include <linux/nfsd4_block.h>
+ 
+ #include "idmap.h"
+ #include "acl.h"
+ #include "xdr4.h"
+ #include "vfs.h"
++#include "pnfsd.h"
+ 
+ 
+ #define NFSDDBG_FACILITY		NFSDDBG_XDR
+@@ -1279,6 +1284,138 @@ static __be32 nfsd4_decode_reclaim_compl
+ 	DECODE_TAIL;
+ }
+ 
++#if defined(CONFIG_PNFSD)
++static __be32
++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp,
++			struct nfsd4_pnfs_getdevlist *gdevl)
 +{
-+	struct page *first, *last;
-+	loff_t foff, i_size = i_size_read(inode);
-+	pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
-+	size_t range;
-+
-+	first = list_entry((pages)->prev, struct page, lru);
-+	last = list_entry((pages)->next, struct page, lru);
++	DECODE_HEAD;
 +
-+	foff = (loff_t)first->index << PAGE_CACHE_SHIFT;
++	READ_BUF(16 + sizeof(nfs4_verifier));
++	READ32(gdevl->gd_layout_type);
++	READ32(gdevl->gd_maxdevices);
++	READ64(gdevl->gd_cookie);
++	COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
 +
-+	range = (last->index - first->index) * PAGE_CACHE_SIZE;
-+	if (last->index == end_index)
-+		range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
-+	else
-+		range += PAGE_CACHE_SIZE;
-+	dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff,
-+		range);
-+	*offset = foff;
-+	*count = range;
++	DECODE_TAIL;
 +}
 +
-+void
-+pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
++static __be32
++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp,
++			struct nfsd4_pnfs_getdevinfo *gdev)
 +{
-+	struct pnfs_layout_hdr *lo;
-+	struct pnfs_layoutdriver_type *ld;
-+
-+	pgio->pg_test = NULL;
++	u32 num;
++	DECODE_HEAD;
 +
-+	lo = NFS_I(inode)->layout;
-+	ld = NFS_SERVER(inode)->pnfs_curr_ld;
-+	if (!ld || !lo)
-+		return;
++	READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid));
++	READ64(gdev->gd_devid.sbid);
++	READ64(gdev->gd_devid.devid);
++	READ32(gdev->gd_layout_type);
++	READ32(gdev->gd_maxcount);
++	READ32(num);
++	if (num) {
++		READ_BUF(4);
++		READ32(gdev->gd_notify_types);
++	} else {
++		gdev->gd_notify_types = 0;
++	}
 +
-+	pgio->pg_test = ld->pg_test;
++	DECODE_TAIL;
 +}
 +
-+/*
-+ * rsize is already set by caller to MDS rsize.
-+ */
-+void
-+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
-+		  struct inode *inode,
-+		  struct nfs_open_context *ctx,
-+		  struct list_head *pages,
-+		  size_t *rsize)
++static __be32
++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
++			struct nfsd4_pnfs_layoutget *lgp)
 +{
-+	struct nfs_server *nfss = NFS_SERVER(inode);
-+	size_t count = 0;
-+	loff_t loff;
-+
-+	pgio->pg_iswrite = 0;
-+	pgio->pg_test = NULL;
-+	pgio->pg_lseg = NULL;
-+
-+	if (!pnfs_enabled_sb(nfss))
-+		return;
-+
-+	readahead_range(inode, pages, &loff, &count);
-+	pgio->pg_lseg = pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ);
-+	if (pgio->pg_lseg) {
-+		pnfs_set_pg_test(inode, pgio);
-+		*rsize = NFS_SERVER(inode)->ds_rsize;
-+	}
-+}
++	DECODE_HEAD;
 +
-+void
-+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
-+		       size_t *wsize)
-+{
-+	struct nfs_server *server = NFS_SERVER(inode);
++	READ_BUF(36);
++	READ32(lgp->lg_signal);
++	READ32(lgp->lg_seg.layout_type);
++	READ32(lgp->lg_seg.iomode);
++	READ64(lgp->lg_seg.offset);
++	READ64(lgp->lg_seg.length);
++	READ64(lgp->lg_minlength);
++	nfsd4_decode_stateid(argp, &lgp->lg_sid);
++	READ_BUF(4);
++	READ32(lgp->lg_maxcount);
 +
-+	pgio->pg_iswrite = 1;
-+	if (!pnfs_enabled_sb(server))
-+		pgio->pg_test = NULL;
-+	else {
-+		pnfs_set_pg_test(inode, pgio);
-+		*wsize = server->ds_wsize;
-+	}
++	DECODE_TAIL;
 +}
 +
-+/* Set buffer size for data servers */
-+void
-+pnfs_set_ds_iosize(struct nfs_server *server)
++static __be32
++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
++			  struct nfsd4_pnfs_layoutcommit *lcp)
 +{
-+	unsigned dssize = 0;
-+
-+	if (server->pnfs_curr_ld && server->pnfs_curr_ld->get_blocksize)
-+		dssize = server->pnfs_curr_ld->get_blocksize();
-+	if (dssize)
-+		server->ds_rsize = server->ds_wsize =
-+			nfs_block_size(dssize, NULL);
-+	else {
-+		server->ds_wsize = server->wsize;
-+		server->ds_rsize = server->rsize;
-+	}
-+}
++	DECODE_HEAD;
++	u32 timechange;
 +
-+static int
-+pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data)
-+{
-+	put_lseg(pdata->lseg);
-+	pdata->lseg = NULL;
-+	pdata->call_ops->rpc_call_done(task, data);
-+	if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN)
-+		return -EAGAIN;
-+	if (pdata->pnfsflags & PNFS_NO_RPC) {
-+		pdata->call_ops->rpc_release(data);
++	READ_BUF(20);
++	READ64(lcp->args.lc_seg.offset);
++	READ64(lcp->args.lc_seg.length);
++	READ32(lcp->args.lc_reclaim);
++	nfsd4_decode_stateid(argp, &lcp->lc_sid);
++	READ_BUF(4);
++	READ32(lcp->args.lc_newoffset);
++	if (lcp->args.lc_newoffset) {
++		READ_BUF(8);
++		READ64(lcp->args.lc_last_wr);
++	} else
++		lcp->args.lc_last_wr = 0;
++	READ_BUF(4);
++	READ32(timechange);
++	if (timechange) {
++		READ_BUF(12);
++		READ64(lcp->args.lc_mtime.seconds);
++		READ32(lcp->args.lc_mtime.nseconds);
 +	} else {
-+		/*
-+		 * just restore original rpc call ops
-+		 * rpc_release will be called later by the rpc scheduling layer.
-+		 */
-+		task->tk_ops = pdata->call_ops;
++		lcp->args.lc_mtime.seconds = 0;
++		lcp->args.lc_mtime.nseconds = 0;
 +	}
-+	return 0;
-+}
-+
-+/* Post-write completion function
-+ * Invoked by all layout drivers when write_pagelist is done.
-+ *
-+ * NOTE: callers set data->pnfsflags PNFS_NO_RPC
-+ * so that the NFS cleanup routines perform only the page cache
-+ * cleanup.
-+ */
-+static void
-+pnfs_write_retry(struct work_struct *work)
-+{
-+	struct rpc_task *task;
-+	struct nfs_write_data *wdata;
-+	struct pnfs_layout_range range;
-+
-+	dprintk("%s enter\n", __func__);
-+	task = container_of(work, struct rpc_task, u.tk_work);
-+	wdata = container_of(task, struct nfs_write_data, task);
-+	range.iomode = IOMODE_RW;
-+	range.offset = wdata->args.offset;
-+	range.length = wdata->args.count;
-+	_pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true);
-+	pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode),
-+			    wdata->pdata.call_ops, wdata->pdata.how);
-+}
-+
-+void
-+pnfs_writeback_done(struct nfs_write_data *data)
-+{
-+	struct pnfs_call_data *pdata = &data->pdata;
-+
-+	dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
-+
-+	/* update last write offset and need layout commit
-+	 * for non-files layout types (files layout calls
-+	 * pnfs4_write_done for this)
++	READ_BUF(8);
++	READ32(lcp->args.lc_seg.layout_type);
++	/* XXX: saving XDR'ed layout update. Since we don't have the
++	 * current_fh yet, and therefore no export_ops, we can't call
++	 * the layout specific decode routines. File and pVFS2
++	 * do not use the layout update....
 +	 */
-+	if ((pdata->pnfsflags & PNFS_NO_RPC) &&
-+	    data->task.tk_status >= 0 && data->res.count > 0) {
-+		struct nfs_inode *nfsi = NFS_I(data->inode);
-+
-+		pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
-+		pnfs_need_layoutcommit(nfsi, data->args.context);
++	READ32(lcp->args.lc_up_len);
++	if (lcp->args.lc_up_len > 0) {
++		READ_BUF(lcp->args.lc_up_len);
++		READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len);
 +	}
 +
-+	if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
-+		INIT_WORK(&data->task.u.tk_work, pnfs_write_retry);
-+		queue_work(nfsiod_workqueue, &data->task.u.tk_work);
-+	}
++	DECODE_TAIL;
 +}
-+EXPORT_SYMBOL_GPL(pnfs_writeback_done);
 +
-+static void _pnfs_clear_lseg_from_pages(struct list_head *head)
++static __be32
++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
++			  struct nfsd4_pnfs_layoutreturn *lrp)
 +{
-+	struct nfs_page *req;
++	DECODE_HEAD;
 +
-+	list_for_each_entry(req, head, wb_list) {
-+		put_lseg(req->wb_lseg);
-+		req->wb_lseg = NULL;
++	READ_BUF(16);
++	READ32(lrp->args.lr_reclaim);
++	READ32(lrp->args.lr_seg.layout_type);
++	READ32(lrp->args.lr_seg.iomode);
++	READ32(lrp->args.lr_return_type);
++	if (lrp->args.lr_return_type == RETURN_FILE) {
++		READ_BUF(16);
++		READ64(lrp->args.lr_seg.offset);
++		READ64(lrp->args.lr_seg.length);
++		nfsd4_decode_stateid(argp, &lrp->lr_sid);
++		READ_BUF(4);
++		READ32(lrp->args.lrf_body_len);
++		if (lrp->args.lrf_body_len > 0) {
++			READ_BUF(lrp->args.lrf_body_len);
++			READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len);
++		}
 +	}
-+}
-+
-+/*
-+ * Call the appropriate parallel I/O subsystem write function.
-+ * If no I/O device driver exists, or one does match the returned
-+ * fstype, then return a positive status for regular NFS processing.
-+ *
-+ * TODO: Is wdata->how and wdata->args.stable always the same value?
-+ * TODO: It seems in NFS, the server may not do a stable write even
-+ * though it was requested (and vice-versa?).  To check, it looks
-+ * in data->res.verf->committed.  Do we need this ability
-+ * for non-file layout drivers?
-+ */
-+enum pnfs_try_status
-+pnfs_try_to_write_data(struct nfs_write_data *wdata,
-+			const struct rpc_call_ops *call_ops, int how)
-+{
-+	struct inode *inode = wdata->inode;
-+	enum pnfs_try_status trypnfs;
-+	struct nfs_server *nfss = NFS_SERVER(inode);
-+	struct pnfs_layout_segment *lseg = wdata->req->wb_lseg;
-+
-+	wdata->pdata.call_ops = call_ops;
-+	wdata->pdata.pnfs_error = 0;
-+	wdata->pdata.how = how;
 +
-+	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
-+		inode->i_ino, wdata->args.count, wdata->args.offset, how);
++	DECODE_TAIL;
++}
++#endif /* CONFIG_PNFSD */
 +
-+	get_lseg(lseg);
+ static __be32
+ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
+ {
+@@ -1380,11 +1517,19 @@ static nfsd4_dec nfsd41_dec_ops[] = {
+ 	[OP_DESTROY_SESSION]	= (nfsd4_dec)nfsd4_decode_destroy_session,
+ 	[OP_FREE_STATEID]	= (nfsd4_dec)nfsd4_decode_notsupp,
+ 	[OP_GET_DIR_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
++#if defined(CONFIG_PNFSD)
++	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_getdevinfo,
++	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_getdevlist,
++	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_layoutcommit,
++	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_layoutget,
++	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_layoutreturn,
++#else  /* CONFIG_PNFSD */
+ 	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_notsupp,
+ 	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_notsupp,
+ 	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_notsupp,
+ 	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_notsupp,
+ 	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_notsupp,
++#endif /* CONFIG_PNFSD */
+ 	[OP_SECINFO_NO_NAME]	= (nfsd4_dec)nfsd4_decode_secinfo_no_name,
+ 	[OP_SEQUENCE]		= (nfsd4_dec)nfsd4_decode_sequence,
+ 	[OP_SET_SSV]		= (nfsd4_dec)nfsd4_decode_notsupp,
+@@ -2191,6 +2336,36 @@ out_acl:
+ 		}
+ 		WRITE64(stat.ino);
+ 	}
++#if defined(CONFIG_PNFSD)
++	if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
++		struct super_block *sb = dentry->d_inode->i_sb;
++		int type = 0;
 +
-+	if (!pnfs_use_rpc(nfss))
-+		wdata->pdata.pnfsflags |= PNFS_NO_RPC;
-+	wdata->pdata.lseg = lseg;
-+	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata,
-+		nfs_page_array_len(wdata->args.pgbase, wdata->args.count),
-+		how);
++		/* Query the filesystem for supported pNFS layout types.
++		 * Currently, we only support one layout type per file system.
++		 * The export_ops->layout_type() returns the pnfs_layouttype4.
++		 */
++		buflen -= 4;
++		if (buflen < 0)		/* length */
++			goto out_resource;
 +
-+	if (trypnfs == PNFS_NOT_ATTEMPTED) {
-+		wdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
-+		wdata->pdata.lseg = NULL;
-+		put_lseg(lseg);
-+		_pnfs_clear_lseg_from_pages(&wdata->pages);
-+	} else {
-+		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
++		if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type)
++			type = sb->s_pnfs_op->layout_type(sb);
++		if (type) {
++			if ((buflen -= 4) < 0)	/* type */
++				goto out_resource;
++			WRITE32(1); 	/* length */
++			WRITE32(type);  /* type */
++		} else
++			WRITE32(0);  /* length */
 +	}
-+	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
-+	return trypnfs;
-+}
 +
-+/* Post-read completion function.  Invoked by all layout drivers when
-+ * read_pagelist is done
-+ */
-+static void
-+pnfs_read_retry(struct work_struct *work)
-+{
-+	struct rpc_task *task;
-+	struct nfs_read_data *rdata;
-+	struct pnfs_layout_range range;
++	if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
++		if ((buflen -= 4) < 0)
++			goto out_resource;
++		WRITE32(stat.blksize);
++	}
++#endif /* CONFIG_PNFSD */
+ 	if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+ 		WRITE32(3);
+ 		WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
+@@ -2434,6 +2609,10 @@ nfsd4_encode_commit(struct nfsd4_compoun
+ 	if (!nfserr) {
+ 		RESERVE_SPACE(8);
+ 		WRITEMEM(commit->co_verf.data, 8);
++		dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n",
++			((u32 *)(&commit->co_verf.data))[0],
++			((u32 *)(&commit->co_verf.data))[1]);
 +
-+	dprintk("%s enter\n", __func__);
-+	task = container_of(work, struct rpc_task, u.tk_work);
-+	rdata = container_of(task, struct nfs_read_data, task);
-+	range.iomode = IOMODE_RW;
-+	range.offset = rdata->args.offset;
-+	range.length = rdata->args.count;
-+	_pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true);
-+	pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode),
-+			   rdata->pdata.call_ops);
-+}
+ 		ADJUST_ARGS();
+ 	}
+ 	return nfserr;
+@@ -2688,6 +2867,13 @@ nfsd4_encode_read(struct nfsd4_compoundr
+ 	}
+ 	read->rd_vlen = v;
+ 
++#if defined(CONFIG_SPNFS)
++	if (spnfs_enabled())
++		nfserr = spnfs_read(read->rd_fhp->fh_dentry->d_inode,
++				    read->rd_offset, &maxcount, read->rd_vlen,
++				    resp->rqstp);
++	else /* we're not an MDS */
++#endif /* CONFIG_SPNFS */
+ 	nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
+ 			read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
+ 			&maxcount);
+@@ -3007,6 +3193,9 @@ nfsd4_encode_write(struct nfsd4_compound
+ 		WRITE32(write->wr_bytes_written);
+ 		WRITE32(write->wr_how_written);
+ 		WRITEMEM(write->wr_verifier.data, 8);
++		dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n",
++			((u32 *)(&write->wr_verifier.data))[0],
++			((u32 *)(&write->wr_verifier.data))[1]);
+ 		ADJUST_ARGS();
+ 	}
+ 	return nfserr;
+@@ -3146,6 +3335,343 @@ nfsd4_encode_sequence(struct nfsd4_compo
+ 	return 0;
+ }
+ 
++#if defined(CONFIG_PNFSD)
 +
-+void
-+pnfs_read_done(struct nfs_read_data *data)
++/* Uses the export interface to iterate through the available devices
++ * and encodes them on the response stream.
++ */
++static  __be32
++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp,
++			      struct nfsd4_pnfs_getdevlist *gdevl,
++			      unsigned int *dev_count)
 +{
-+	struct pnfs_call_data *pdata = &data->pdata;
++	struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb;
++	__be32 nfserr;
++	int status;
++	__be32 *p;
++	struct nfsd4_pnfs_dev_iter_res res = {
++		.gd_cookie = gdevl->gd_cookie,
++		.gd_verf = gdevl->gd_verf,
++		.gd_eof = 0
++	};
++	u64 sbid;
 +
-+	dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
++	dprintk("%s: Begin\n", __func__);
 +
-+	if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
-+		INIT_WORK(&data->task.u.tk_work, pnfs_read_retry);
-+		queue_work(nfsiod_workqueue, &data->task.u.tk_work);
-+	}
-+}
-+EXPORT_SYMBOL_GPL(pnfs_read_done);
++	sbid = find_create_sbid(sb);
++	*dev_count = 0;
++	do {
++		status = sb->s_pnfs_op->get_device_iter(sb,
++							gdevl->gd_layout_type,
++							&res);
++		if (status) {
++			if (status == -ENOENT) {
++				res.gd_eof = 1;
++				/* return success */
++				break;
++			}
++			nfserr = nfserrno(status);
++			goto out_err;
++		}
 +
-+/*
-+ * Call the appropriate parallel I/O subsystem read function.
-+ * If no I/O device driver exists, or one does match the returned
-+ * fstype, then return a positive status for regular NFS processing.
-+ */
-+enum pnfs_try_status
-+pnfs_try_to_read_data(struct nfs_read_data *rdata,
-+		       const struct rpc_call_ops *call_ops)
-+{
-+	struct inode *inode = rdata->inode;
-+	struct nfs_server *nfss = NFS_SERVER(inode);
-+	struct pnfs_layout_segment *lseg = rdata->req->wb_lseg;
-+	enum pnfs_try_status trypnfs;
++		/* Encode device id and layout type */
++		RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid));
++		WRITE64((__be64)sbid);
++		WRITE64(res.gd_devid);	/* devid minor */
++		ADJUST_ARGS();
++		(*dev_count)++;
++	} while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof);
++	gdevl->gd_cookie = res.gd_cookie;
++	gdevl->gd_verf = res.gd_verf;
++	gdevl->gd_eof = res.gd_eof;
++	nfserr = nfs_ok;
++out_err:
++	dprintk("%s: Encoded %u devices\n", __func__, *dev_count);
++	return nfserr;
++}
 +
-+	rdata->pdata.call_ops = call_ops;
-+	rdata->pdata.pnfs_error = 0;
++/* Encodes the response of get device list.
++*/
++static __be32
++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr,
++			struct nfsd4_pnfs_getdevlist *gdevl)
++{
++	unsigned int dev_count = 0, lead_count;
++	u32 *p_in = resp->p;
++	__be32 *p;
 +
-+	dprintk("%s: Reading ino:%lu %u@%llu\n",
-+		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);
++	dprintk("%s: err %d\n", __func__, nfserr);
++	if (nfserr)
++		return nfserr;
 +
-+	get_lseg(lseg);
++	/* Ensure we have room for cookie, verifier, and devlist len,
++	 * which we will backfill in after we encode as many devices as possible
++	 */
++	lead_count = 8 + sizeof(nfs4_verifier) + 4;
++	RESERVE_SPACE(lead_count);
++	/* skip past these values */
++	p += XDR_QUADLEN(lead_count);
++	ADJUST_ARGS();
 +
-+	if (!pnfs_use_rpc(nfss))
-+		rdata->pdata.pnfsflags |= PNFS_NO_RPC;
-+	rdata->pdata.lseg = lseg;
-+	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata,
-+		nfs_page_array_len(rdata->args.pgbase, rdata->args.count));
-+	if (trypnfs == PNFS_NOT_ATTEMPTED) {
-+		rdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
-+		rdata->pdata.lseg = NULL;
-+		put_lseg(lseg);
-+		_pnfs_clear_lseg_from_pages(&rdata->pages);
-+	} else {
-+		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
-+	}
-+	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
-+	return trypnfs;
-+}
++	/* Iterate over as many device ids as possible on the xdr stream */
++	nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count);
++	if (nfserr)
++		goto out_err;
 +
-+/*
-+ * This gives the layout driver an opportunity to read in page "around"
-+ * the data to be written.  It returns 0 on success, otherwise an error code
-+ * which will either be passed up to user, or ignored if
-+ * some previous part of write succeeded.
-+ * Note the range [pos, pos+len-1] is entirely within the page.
-+ */
-+int _pnfs_write_begin(struct inode *inode, struct page *page,
-+		      loff_t pos, unsigned len,
-+		      struct pnfs_layout_segment *lseg,
-+		      struct pnfs_fsdata **fsdata)
-+{
-+	struct pnfs_fsdata *data;
-+	int status = 0;
++	/* Backfill in cookie, verf and number of devices encoded */
++	p = p_in;
++	WRITE64(gdevl->gd_cookie);
++	WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
++	WRITE32(dev_count);
 +
-+	dprintk("--> %s: pos=%llu len=%u\n",
-+		__func__, (unsigned long long)pos, len);
-+	data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
-+	if (!data) {
-+		status = -ENOMEM;
-+		goto out;
-+	}
-+	data->lseg = lseg; /* refcount passed into data to be managed there */
-+	status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin(
-+						lseg, page, pos, len, data);
-+	if (status) {
-+		kfree(data);
-+		data = NULL;
-+	}
-+out:
-+	*fsdata = data;
-+	dprintk("<-- %s: status=%d\n", __func__, status);
-+	return status;
-+}
++	/* Skip over devices */
++	p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid));
++	ADJUST_ARGS();
 +
-+/* pNFS Commit callback function for all layout drivers */
-+void
-+pnfs_commit_done(struct nfs_write_data *data)
-+{
-+	struct pnfs_call_data *pdata = &data->pdata;
++	/* are we at the end of devices? */
++	RESERVE_SPACE(4);
++	WRITE32(gdevl->gd_eof);
++	ADJUST_ARGS();
 +
-+	dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
++	dprintk("%s: done.\n", __func__);
 +
-+	if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
-+		struct pnfs_layout_range range = {
-+			.iomode = IOMODE_RW,
-+			.offset = data->args.offset,
-+			.length = data->args.count,
-+		};
-+		dprintk("%s: retrying\n", __func__);
-+		_pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE,
-+				    true);
-+		pnfs_initiate_commit(data, NFS_CLIENT(data->inode),
-+				     pdata->call_ops, pdata->how, 1);
-+	}
++	nfserr = nfs_ok;
++out:
++	return nfserr;
++out_err:
++	p = p_in;
++	ADJUST_ARGS();
++	goto out;
 +}
-+EXPORT_SYMBOL_GPL(pnfs_commit_done);
 +
-+enum pnfs_try_status
-+pnfs_try_to_commit(struct nfs_write_data *data,
-+		    const struct rpc_call_ops *call_ops, int sync)
++/* For a given device id, have the file system retrieve and encode the
++ * associated device.  For file layout, the encoding function is
++ * passed down to the file system.  The file system then has the option
++ * of using this encoding function or one of its own.
++ *
++ * Note: the file system must return the XDR size of struct device_addr4
++ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the
++ * gdir_mincount calculation.
++ */
++static __be32
++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
++			struct nfsd4_pnfs_getdevinfo *gdev)
 +{
-+	struct inode *inode = data->inode;
-+	struct nfs_server *nfss = NFS_SERVER(data->inode);
-+	enum pnfs_try_status trypnfs;
++	struct super_block *sb;
++	int maxcount = 0, type_notify_len = 12;
++	__be32 *p, *p_save = NULL, *p_in = resp->p;
++	struct exp_xdr_stream xdr;
 +
-+	dprintk("%s: Begin\n", __func__);
++	dprintk("%s: err %d\n", __func__, nfserr);
++	if (nfserr)
++		return nfserr;
 +
-+	if (!pnfs_use_rpc(nfss))
-+		data->pdata.pnfsflags |= PNFS_NO_RPC;
-+	/* We need to account for possibility that
-+	 * each nfs_page can point to a different lseg (or be NULL).
-+	 * For the immediate case of whole-file-only layouts, we at
-+	 * least know there can be only a single lseg.
-+	 * We still have to account for the possibility of some being NULL.
-+	 * This will be done by passing the buck to the layout driver.
-+	 */
-+	data->pdata.call_ops = call_ops;
-+	data->pdata.pnfs_error = 0;
-+	data->pdata.how = sync;
-+	data->pdata.lseg = NULL;
-+	trypnfs = nfss->pnfs_curr_ld->commit(data, sync);
-+	if (trypnfs == PNFS_NOT_ATTEMPTED) {
-+		data->pdata.pnfsflags &= ~PNFS_NO_RPC;
-+		_pnfs_clear_lseg_from_pages(&data->pages);
-+	} else
-+		nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT);
-+	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
-+	return trypnfs;
-+}
++	sb = gdev->gd_sb;
 +
-+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
-+{
-+	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
++	if (gdev->gd_maxcount != 0) {
++		/* FIXME: this will be bound by the session max response */
++		maxcount = svc_max_payload(resp->rqstp);
++		if (maxcount > gdev->gd_maxcount)
++			maxcount = gdev->gd_maxcount;
 +
-+	/* TODO: Maybe we should avoid this by allowing the layout driver
-+	* to directly xdr its layout on the wire.
-+	*/
-+	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
-+		nfss->pnfs_curr_ld->cleanup_layoutcommit(
-+					NFS_I(data->args.inode)->layout,
-+					&data->args, data->status);
-+}
++		/* Ensure have room for type and notify field */
++		maxcount -= type_notify_len;
++		if (maxcount < 0) {
++			nfserr = -ETOOSMALL;
++			goto toosmall;
++		}
++	}
 +
-+/*
-+ * Set up the argument/result storage required for the RPC call.
-+ */
-+static int
-+pnfs_layoutcommit_setup(struct inode *inode,
-+			struct nfs4_layoutcommit_data *data,
-+			loff_t write_begin_pos, loff_t write_end_pos)
-+{
-+	struct nfs_server *nfss = NFS_SERVER(inode);
-+	int result = 0;
++	RESERVE_SPACE(4);
++	WRITE32(gdev->gd_layout_type);
++	ADJUST_ARGS();
 +
-+	dprintk("--> %s\n", __func__);
++	/* If maxcount is 0 then just update notifications */
++	if (gdev->gd_maxcount == 0)
++		goto handle_notifications;
 +
-+	data->args.inode = inode;
-+	data->args.fh = NFS_FH(inode);
-+	data->args.layout_type = nfss->pnfs_curr_ld->id;
-+	data->res.fattr = &data->fattr;
-+	nfs_fattr_init(&data->fattr);
++	xdr.p = p_save = resp->p;
++	xdr.end = resp->end;
++	if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3))
++		xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3);
 +
-+	/* TODO: Need to determine the correct values */
-+	data->args.time_modify_changed = 0;
++	nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type,
++						&gdev->gd_devid);
++	if (nfserr)
++		goto err;
 +
-+	/* Set values from inode so it can be reset
++	/* The file system should never write 0 bytes without
++	 * returning an error
 +	 */
-+	data->args.range.iomode = IOMODE_RW;
-+	data->args.range.offset = write_begin_pos;
-+	data->args.range.length = write_end_pos - write_begin_pos + 1;
-+	data->args.lastbytewritten =  min(write_end_pos,
-+					  i_size_read(inode) - 1);
-+	data->args.bitmask = nfss->attr_bitmask;
-+	data->res.server = nfss;
++	BUG_ON(xdr.p == p_save);
++	BUG_ON(xdr.p > xdr.end);
 +
-+	/* Call layout driver to set the arguments */
-+	if (nfss->pnfs_curr_ld->setup_layoutcommit)
-+		result = nfss->pnfs_curr_ld->setup_layoutcommit(
-+				NFS_I(inode)->layout, &data->args);
++	/* Update the xdr stream with the number of bytes encoded
++	 * by the file system.
++	 */
++	p = xdr.p;
++	ADJUST_ARGS();
 +
-+	dprintk("<-- %s Status %d\n", __func__, result);
-+	return result;
++handle_notifications:
++	/* Encode supported device notifications */
++	RESERVE_SPACE(4);
++	if (sb->s_pnfs_op->set_device_notify) {
++		struct pnfs_devnotify_arg dn_args;
++
++		dn_args.dn_layout_type = gdev->gd_layout_type;
++		dn_args.dn_devid = gdev->gd_devid;
++		dn_args.dn_notify_types = gdev->gd_notify_types;
++		nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args);
++		if (nfserr)
++			goto err;
++		WRITE32(dn_args.dn_notify_types);
++	} else {
++		WRITE32(0);
++	}
++	ADJUST_ARGS();
++
++out:
++	return nfserrno(nfserr);
++toosmall:
++	dprintk("%s: maxcount too small\n", __func__);
++	RESERVE_SPACE(4);
++	WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len);
++	ADJUST_ARGS();
++	goto out;
++err:
++	/* Rewind to the beginning */
++	p = p_in;
++	ADJUST_ARGS();
++	if (nfserr == -ETOOSMALL)
++		goto toosmall;
++	printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr);
++	goto out;
 +}
 +
-+/* Issue a async layoutcommit for an inode.
-+ */
-+int
-+pnfs_layoutcommit_inode(struct inode *inode, int sync)
++static __be32
++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp,
++		       __be32 nfserr,
++		       struct nfsd4_pnfs_layoutget *lgp)
 +{
-+	struct nfs4_layoutcommit_data *data;
-+	struct nfs_inode *nfsi = NFS_I(inode);
-+	loff_t write_begin_pos;
-+	loff_t write_end_pos;
++	int maxcount, leadcount;
++	struct super_block *sb;
++	struct exp_xdr_stream xdr;
++	__be32 *p, *p_save, *p_start = resp->p;
 +
-+	int status = 0;
++	dprintk("%s: err %d\n", __func__, nfserr);
++	if (nfserr)
++		return nfserr;
 +
-+	dprintk("%s Begin (sync:%d)\n", __func__, sync);
++	sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb;
++	maxcount = PAGE_SIZE;
++	if (maxcount > lgp->lg_maxcount)
++		maxcount = lgp->lg_maxcount;
 +
-+	BUG_ON(!has_layout(nfsi));
++	/* Check for space on xdr stream */
++	leadcount = 36 + sizeof(stateid_opaque_t);
++	RESERVE_SPACE(leadcount);
++	/* encode layout metadata after file system encodes layout */
++	p += XDR_QUADLEN(leadcount);
++	ADJUST_ARGS();
 +
-+	data = kzalloc(sizeof(*data), GFP_NOFS);
-+	if (!data)
-+		return -ENOMEM;
++	/* Ensure have room for ret_on_close, off, len, iomode, type */
++	maxcount -= leadcount;
++	if (maxcount < 0) {
++		printk(KERN_ERR "%s: buffer too small\n", __func__);
++		nfserr = nfserr_toosmall;
++		goto err;
++	}
 +
-+	spin_lock(&inode->i_lock);
-+	if (!layoutcommit_needed(nfsi)) {
-+		spin_unlock(&inode->i_lock);
-+		goto out_free;
++	/* Set xdr info so file system can encode layout */
++	xdr.p = p_save = resp->p;
++	xdr.end = resp->end;
++	if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3))
++		xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3);
++
++	/* Retrieve, encode, and merge layout; process stateid */
++	nfserr = nfs4_pnfs_get_layout(lgp, &xdr);
++	if (nfserr)
++		goto err;
++
++	/* Ensure file system returned enough bytes for the client
++	 * to access.
++	 */
++	if (lgp->lg_seg.length < lgp->lg_minlength) {
++		nfserr = nfserr_badlayout;
++		goto err;
 +	}
 +
-+	/* Clear layoutcommit properties in the inode so
-+	 * new lc info can be generated
++	/* The file system should never write 0 bytes without
++	 * returning an error
 +	 */
-+	write_begin_pos = nfsi->layout->write_begin_pos;
-+	write_end_pos = nfsi->layout->write_end_pos;
-+	data->cred = nfsi->layout->cred;
-+	nfsi->layout->write_begin_pos = 0;
-+	nfsi->layout->write_end_pos = 0;
-+	nfsi->layout->cred = NULL;
-+	__clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state);
-+	pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout, NULL);
++	BUG_ON(xdr.p == p_save);
 +
-+	/* Reference for layoutcommit matched in pnfs_layoutcommit_release */
-+	get_layout_hdr_locked(NFS_I(inode)->layout);
++	/* Rewind to beginning and encode attrs */
++	resp->p = p_start;
++	RESERVE_SPACE(4);
++	WRITE32(lgp->lg_roc);	/* return on close */
++	ADJUST_ARGS();
++	nfsd4_encode_stateid(resp, &lgp->lg_sid);
++	RESERVE_SPACE(28);
++	/* Note: response logr_layout array count, always one for now */
++	WRITE32(1);
++	WRITE64(lgp->lg_seg.offset);
++	WRITE64(lgp->lg_seg.length);
++	WRITE32(lgp->lg_seg.iomode);
++	WRITE32(lgp->lg_seg.layout_type);
 +
-+	spin_unlock(&inode->i_lock);
++	/* Update the xdr stream with the number of bytes written
++	 * by the file system
++	 */
++	p = xdr.p;
++	ADJUST_ARGS();
 +
-+	/* Set up layout commit args */
-+	status = pnfs_layoutcommit_setup(inode, data, write_begin_pos,
-+					 write_end_pos);
-+	if (status) {
-+		/* The layout driver failed to setup the layoutcommit */
-+		put_rpccred(data->cred);
-+		put_layout_hdr(inode);
-+		goto out_free;
-+	}
-+	status = nfs4_proc_layoutcommit(data, sync);
-+out:
-+	dprintk("%s end (err:%d)\n", __func__, status);
-+	return status;
-+out_free:
-+	kfree(data);
-+	goto out;
++	return nfs_ok;
++err:
++	resp->p = p_start;
++	return nfserr;
 +}
 +
-+void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
++static __be32
++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
++			  struct nfsd4_pnfs_layoutcommit *lcp)
 +{
-+	/* lseg refcounting handled directly in nfs_write_end */
-+	kfree(fsdata);
-+}
++	__be32 *p;
 +
-+/*
-+ * Device ID cache. Currently supports one layout type per struct nfs_client.
-+ * Add layout type to the lookup key to expand to support multiple types.
-+ */
-+int
-+pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
-+			 void (*free_callback)(struct pnfs_deviceid_node *))
-+{
-+	struct pnfs_deviceid_cache *c;
++	if (nfserr)
++		goto out;
 +
-+	c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
-+	if (!c)
-+		return -ENOMEM;
-+	spin_lock(&clp->cl_lock);
-+	if (clp->cl_devid_cache != NULL) {
-+		atomic_inc(&clp->cl_devid_cache->dc_ref);
-+		dprintk("%s [kref [%d]]\n", __func__,
-+			atomic_read(&clp->cl_devid_cache->dc_ref));
-+		kfree(c);
-+	} else {
-+		/* kzalloc initializes hlists */
-+		spin_lock_init(&c->dc_lock);
-+		atomic_set(&c->dc_ref, 1);
-+		c->dc_free_callback = free_callback;
-+		clp->cl_devid_cache = c;
-+		dprintk("%s [new]\n", __func__);
++	RESERVE_SPACE(4);
++	WRITE32(lcp->res.lc_size_chg);
++	ADJUST_ARGS();
++	if (lcp->res.lc_size_chg) {
++		RESERVE_SPACE(8);
++		WRITE64(lcp->res.lc_newsize);
++		ADJUST_ARGS();
 +	}
-+	spin_unlock(&clp->cl_lock);
-+	return 0;
++out:
++	return nfserr;
 +}
-+EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
 +
-+/* Must be called with locked c->dc_lock */
-+static struct pnfs_deviceid_node *
-+pnfs_unhash_deviceid(struct pnfs_deviceid_cache *c,
-+		     struct nfs4_deviceid *id)
++static __be32
++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
++			  struct nfsd4_pnfs_layoutreturn *lrp)
 +{
-+	struct pnfs_deviceid_node *d;
-+	struct hlist_node *n;
-+	long h = nfs4_deviceid_hash(id);
++	__be32 *p;
 +
-+	dprintk("%s hash %ld\n", __func__, h);
-+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
-+		if (!memcmp(&d->de_id, id, sizeof(*id))) {
-+			hlist_del_rcu(&d->de_node);
-+			synchronize_rcu();
-+			return d;
-+		}
++	if (nfserr)
++		goto out;
 +
-+	return NULL;
++	RESERVE_SPACE(4);
++	WRITE32(lrp->lrs_present != 0);    /* got stateid? */
++	ADJUST_ARGS();
++	if (lrp->lrs_present)
++		nfsd4_encode_stateid(resp, &lrp->lr_sid);
++out:
++	return nfserr;
 +}
++#endif /* CONFIG_PNFSD */
 +
-+/*
-+ * Called from pnfs_layoutdriver_type->free_lseg
-+ * last layout segment reference frees deviceid
-+ */
-+void
-+pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
-+		  struct pnfs_deviceid_node *devid)
-+{
-+	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
-+	if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
-+		return;
-+
-+	pnfs_unhash_deviceid(c, &devid->de_id);
-+	spin_unlock(&c->dc_lock);
+ static __be32
+ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
+ {
+@@ -3206,11 +3732,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
+ 	[OP_DESTROY_SESSION]	= (nfsd4_enc)nfsd4_encode_destroy_session,
+ 	[OP_FREE_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
+ 	[OP_GET_DIR_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
++#if defined(CONFIG_PNFSD)
++	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_getdevinfo,
++	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_getdevlist,
++	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_layoutcommit,
++	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_layoutget,
++	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_layoutreturn,
++#else  /* CONFIG_PNFSD */
+ 	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_noop,
+ 	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_noop,
+ 	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_noop,
+ 	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_noop,
+ 	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
++#endif /* CONFIG_PNFSD */
+ 	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_secinfo_no_name,
+ 	[OP_SEQUENCE]		= (nfsd4_enc)nfsd4_encode_sequence,
+ 	[OP_SET_SSV]		= (nfsd4_enc)nfsd4_encode_noop,
+diff -up linux-2.6.37.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.37.noarch/fs/nfsd/nfsctl.c
+--- linux-2.6.37.noarch/fs/nfsd/nfsctl.c.orig	2011-01-28 09:37:32.565979149 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfsctl.c	2011-01-28 09:43:53.363768972 -0500
+@@ -12,11 +12,16 @@
+ #include <linux/nfsd/syscall.h>
+ #include <linux/lockd/lockd.h>
+ #include <linux/sunrpc/clnt.h>
++#include <linux/nfsd/nfs4pnfsdlm.h>
+ 
+ #include "idmap.h"
+ #include "nfsd.h"
+ #include "cache.h"
+ 
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
++#include <linux/nfsd4_spnfs.h>
++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
 +
-+	c->dc_free_callback(devid);
-+}
-+EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
+ /*
+  *	We have a single directory with 9 nodes in it.
+  */
+@@ -51,6 +56,9 @@ enum {
+ 	NFSD_Gracetime,
+ 	NFSD_RecoveryDir,
+ #endif
++#ifdef CONFIG_PNFSD
++	NFSD_pnfs_dlm_device,
++#endif
+ };
+ 
+ /*
+@@ -78,6 +86,9 @@ static ssize_t write_leasetime(struct fi
+ static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
+ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
+ #endif
++#ifdef CONFIG_PNFSD
++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size);
++#endif
+ 
+ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+ #ifdef CONFIG_NFSD_DEPRECATED
+@@ -102,6 +113,9 @@ static ssize_t (*write_op[])(struct file
+ 	[NFSD_Gracetime] = write_gracetime,
+ 	[NFSD_RecoveryDir] = write_recoverydir,
+ #endif
++#ifdef CONFIG_PNFSD
++	[NFSD_pnfs_dlm_device] = write_pnfs_dlm_device,
++#endif
+ };
+ 
+ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
+@@ -1366,6 +1380,68 @@ static ssize_t write_recoverydir(struct 
+ 
+ #endif
+ 
++#ifdef CONFIG_PNFSD
 +
-+void
-+pnfs_delete_deviceid(struct pnfs_deviceid_cache *c,
-+		     struct nfs4_deviceid *id)
++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf,
++				       size_t size)
 +{
-+	struct pnfs_deviceid_node *devid;
-+
-+	spin_lock(&c->dc_lock);
-+	devid = pnfs_unhash_deviceid(c, id);
-+	spin_unlock(&c->dc_lock);
-+
-+	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
-+	if (atomic_dec_and_test(&devid->de_ref))
-+		c->dc_free_callback(devid);
-+}
-+EXPORT_SYMBOL_GPL(pnfs_delete_deviceid);
++	char *mesg = buf;
++	char *pnfs_dlm_device;
++	int max_size = NFSD_PNFS_DLM_DEVICE_MAX;
++	int len, ret = 0;
 +
-+/* Find and reference a deviceid */
-+struct pnfs_deviceid_node *
-+pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
-+{
-+	struct pnfs_deviceid_node *d;
-+	struct hlist_node *n;
-+	long hash = nfs4_deviceid_hash(id);
++	if (size > 0) {
++		ret = -EINVAL;
++		if (size > max_size || buf[size-1] != '\n')
++			return ret;
++		buf[size-1] = 0;
 +
-+	dprintk("--> %s hash %ld\n", __func__, hash);
-+	rcu_read_lock();
-+	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
-+		if (!memcmp(&d->de_id, id, sizeof(*id))) {
-+			if (!atomic_inc_not_zero(&d->de_ref)) {
-+				goto fail;
-+			} else {
-+				rcu_read_unlock();
-+				return d;
-+			}
-+		}
-+	}
-+fail:
-+	rcu_read_unlock();
-+	return NULL;
-+}
-+EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
++		pnfs_dlm_device = mesg;
++		len = qword_get(&mesg, pnfs_dlm_device, size);
++		if (len <= 0)
++			return ret;
 +
-+/*
-+ * Add a deviceid to the cache.
-+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
-+ */
-+struct pnfs_deviceid_node *
-+pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
-+{
-+	struct pnfs_deviceid_node *d;
-+	long hash = nfs4_deviceid_hash(&new->de_id);
++		ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len);
++	} else
++		return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT);
 +
-+	dprintk("--> %s hash %ld\n", __func__, hash);
-+	spin_lock(&c->dc_lock);
-+	d = pnfs_find_get_deviceid(c, &new->de_id);
-+	if (d) {
-+		spin_unlock(&c->dc_lock);
-+		dprintk("%s [discard]\n", __func__);
-+		c->dc_free_callback(new);
-+		return d;
-+	}
-+	INIT_HLIST_NODE(&new->de_node);
-+	atomic_set(&new->de_ref, 1);
-+	hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
-+	spin_unlock(&c->dc_lock);
-+	dprintk("%s [new]\n", __func__);
-+	return new;
++	return ret <= 0 ? ret : strlen(buf);
 +}
-+EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
-+
-+void
-+pnfs_put_deviceid_cache(struct nfs_client *clp)
-+{
-+	struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
 +
-+	dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
-+	if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
-+		int i;
-+		/* Verify cache is empty */
-+		for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
-+			BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
-+		clp->cl_devid_cache = NULL;
-+		spin_unlock(&clp->cl_lock);
-+		kfree(local);
-+	}
-+}
-+EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
-diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
-new file mode 100644
-index 0000000..0e90b0e
---- /dev/null
-+++ b/fs/nfs/pnfs.h
-@@ -0,0 +1,595 @@
-+/*
-+ *  pNFS client data structures.
-+ *
-+ *  Copyright (c) 2002
-+ *  The Regents of the University of Michigan
-+ *  All Rights Reserved
++/**
++ * write_pnfs_dlm_device - Set or report the current pNFS data server list
 + *
-+ *  Dean Hildebrand <dhildebz at umich.edu>
++ * Input:
++ *			buf:		ignored
++ *			size:		zero
 + *
-+ *  Permission is granted to use, copy, create derivative works, and
-+ *  redistribute this software and such derivative works for any purpose,
-+ *  so long as the name of the University of Michigan is not used in
-+ *  any advertising or publicity pertaining to the use or distribution
-+ *  of this software without specific, written prior authorization. If
-+ *  the above copyright notice or any other identification of the
-+ *  University of Michigan is included in any copy of any portion of
-+ *  this software, then the disclaimer below must also be included.
++ * OR
 + *
-+ *  This software is provided as is, without representation or warranty
-+ *  of any kind either express or implied, including without limitation
-+ *  the implied warranties of merchantability, fitness for a particular
-+ *  purpose, or noninfringement.  The Regents of the University of
-+ *  Michigan shall not be liable for any damages, including special,
-+ *  indirect, incidental, or consequential damages, with respect to any
-+ *  claim arising out of or in connection with the use of the software,
-+ *  even if it has been or is hereafter advised of the possibility of
-+ *  such damages.
++ * Input:
++ *			buf:		C string containing a block device name,
++ *					a colon, and then a comma separated
++ *					list of pNFS data server IPv4 addresses
++ *			size:		non-zero length of C string in @buf
++ * Output:
++ *	On success:	passed-in buffer filled with '\n'-terminated C
++ *			string containing a block device name, a colon, and
++ *			then a comma separated list of pNFS
++ *			data server IPv4 addresses.
++ *			return code is the size in bytes of the string
++ *	On error:	return code is a negative errno value
 + */
++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size)
++{
++	ssize_t rv;
 +
-+#ifndef FS_NFS_PNFS_H
-+#define FS_NFS_PNFS_H
-+
-+#include <linux/nfs_page.h>
-+
-+struct pnfs_layout_segment {
-+	struct list_head fi_list;
-+	struct pnfs_layout_range range;
-+	struct kref kref;
-+	bool valid;
-+	struct pnfs_layout_hdr *layout;
-+};
-+
-+enum pnfs_try_status {
-+	PNFS_ATTEMPTED     = 0,
-+	PNFS_NOT_ATTEMPTED = 1,
-+};
-+
-+struct pnfs_fsdata {
-+	struct pnfs_layout_segment *lseg;
-+	int bypass_eof;
-+	void *private;
-+};
-+
-+#ifdef CONFIG_NFS_V4_1
-+
-+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
-+
-+enum {
-+	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
-+	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
-+	NFS_LAYOUT_STATEID_SET,		/* have a valid layout stateid */
-+	NFS_LAYOUT_NEED_LCOMMIT,	/* LAYOUTCOMMIT needed */
-+};
-+
-+enum layoutdriver_policy_flags {
-+	/* Should the full nfs rpc cleanup code be used after io */
-+	PNFS_USE_RPC_CODE		= 1 << 0,
-+
-+	/* Should the pNFS client commit and return the layout upon a setattr */
-+	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 1,
-+};
-+
-+/* Per-layout driver specific registration structure */
-+struct pnfs_layoutdriver_type {
-+	struct list_head pnfs_tblid;
-+	const u32 id;
-+	const char *name;
-+	struct module *owner;
-+	unsigned flags;
-+	int (*initialize_mountpoint) (struct nfs_server *, const struct nfs_fh *);
-+	int (*uninitialize_mountpoint) (struct nfs_server *);
-+
-+	struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode);
-+	void (*free_layout_hdr) (struct pnfs_layout_hdr *);
-+
-+	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
-+	void (*free_lseg) (struct pnfs_layout_segment *lseg);
-+
-+	/* test for nfs page cache coalescing */
-+	int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
-+
-+	/* Retreive the block size of the file system.
-+	 * If gather_across_stripes == 1, then the file system will gather
-+	 * requests into the block size.
-+	 * TODO: Where will the layout driver get this info?  It is hard
-+	 * coded in PVFS2.
-+	 */
-+	ssize_t (*get_blocksize) (void);
-+
-+/* read and write pagelist should return just 0 (to indicate that
-+	 * the layout code has taken control) or 1 (to indicate that the
-+	 * layout code wishes to fall back to normal nfs.)  If 0 is returned,
-+	 * information can be passed back through nfs_data->res and
-+	 * nfs_data->task.tk_status, and the appropriate pnfs done function
-+	 * MUST be called.
-+	 */
-+	enum pnfs_try_status
-+	(*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages);
-+	enum pnfs_try_status
-+	(*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how);
-+	int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page,
-+			    loff_t pos, unsigned count,
-+			    struct pnfs_fsdata *fsdata);
-+	int (*write_end)(struct inode *inode, struct page *page, loff_t pos,
-+			 unsigned count, unsigned copied,
-+			 struct pnfs_layout_segment *lseg);
-+	void (*write_end_cleanup)(struct file *filp,
-+				  struct pnfs_fsdata *fsdata);
-+
-+	/* Consistency ops */
-+	/* 2 problems:
-+	 * 1) the page list contains nfs_pages, NOT pages
-+	 * 2) currently the NFS code doesn't create a page array (as it does with read/write)
-+	 */
-+	enum pnfs_try_status
-+	(*commit) (struct nfs_write_data *nfs_data, int how);
-+
-+	int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
-+				   struct nfs4_layoutcommit_args *args);
-+
-+	void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
-+				     struct xdr_stream *xdr,
-+				     const struct nfs4_layoutcommit_args *args);
-+
-+	void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
-+				      struct nfs4_layoutcommit_args *args,
-+				      int status);
-+
-+	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
-+				     struct xdr_stream *xdr,
-+				     const struct nfs4_layoutreturn_args *args);
-+};
++	mutex_lock(&nfsd_mutex);
++	rv = __write_pnfs_dlm_device(file, buf, size);
++	mutex_unlock(&nfsd_mutex);
++	return rv;
++}
 +
-+struct pnfs_layout_hdr {
-+	unsigned long		refcount;
-+	struct list_head	layouts;   /* other client layouts */
-+	struct list_head	segs;      /* layout segments list */
-+	int			roc_iomode;/* return on close iomode, 0=none */
-+	seqlock_t		seqlock;   /* Protects the stateid */
-+	nfs4_stateid		stateid;
-+	unsigned long		state;
-+	struct rpc_cred		*cred;     /* layoutcommit credential */
-+	/* DH: These vars keep track of the maximum write range
-+	 * so the values can be used for layoutcommit.
-+	 */
-+	loff_t			write_begin_pos;
-+	loff_t			write_end_pos;
-+	struct inode		*inode;
-+};
++#endif /* CONFIG_PNFSD */
 +
-+struct pnfs_device {
-+	struct nfs4_deviceid dev_id;
-+	unsigned int  layout_type;
-+	unsigned int  mincount;
-+	struct page **pages;
-+	void          *area;
-+	unsigned int  pgbase;
-+	unsigned int  pglen;
-+};
+ /*----------------------------------------------------------------------------*/
+ /*
+  *	populating the filesystem.
+@@ -1402,6 +1478,10 @@ static int nfsd_fill_super(struct super_
+ 		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
+ 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
+ #endif
++#ifdef CONFIG_PNFSD
++		[NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops,
++					   S_IWUSR|S_IRUSR},
++#endif
+ 		/* last one */ {""}
+ 	};
+ 	return simple_fill_super(sb, 0x6e667364, nfsd_files);
+@@ -1440,6 +1520,9 @@ static int create_proc_exports_entry(voi
+ }
+ #endif
+ 
++#if defined(CONFIG_SPNFS_BLOCK)
++int nfsd_bl_init(void);
++#endif
+ static int __init init_nfsd(void)
+ {
+ 	int retval;
+@@ -1462,6 +1545,15 @@ static int __init init_nfsd(void)
+ 	retval = create_proc_exports_entry();
+ 	if (retval)
+ 		goto out_free_idmap;
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
++	retval = spnfs_init_proc();
++	if (retval != 0)
++		goto out_free_idmap;
++#if defined(CONFIG_SPNFS_BLOCK)
++	nfsd_bl_init();
++#endif /* CONFIG_SPNFS_BLOCK */
++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
 +
-+#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
+ 	retval = register_filesystem(&nfsd_fs_type);
+ 	if (retval)
+ 		goto out_free_all;
+@@ -1484,7 +1576,22 @@ out_free_stat:
+ 
+ static void __exit exit_nfsd(void)
+ {
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
++	remove_proc_entry("fs/nfs/spnfs/recall", NULL);
++	remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL);
++	remove_proc_entry("fs/nfs/spnfs/getfh", NULL);
++	remove_proc_entry("fs/nfs/spnfs/config", NULL);
++	remove_proc_entry("fs/nfs/spnfs/ctl", NULL);
++	remove_proc_entry("fs/nfs/spnfs", NULL);
++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
 +
-+struct pnfs_devicelist {
-+	unsigned int		eof;
-+	unsigned int		num_devs;
-+	struct nfs4_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
-+};
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
++	remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL);
++	remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL);
++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */
++
+ 	nfsd_export_shutdown();
++	nfsd4_pnfs_dlm_shutdown();
+ 	nfsd_reply_cache_shutdown();
+ 	remove_proc_entry("fs/nfs/exports", NULL);
+ 	remove_proc_entry("fs/nfs", NULL);
+diff -up linux-2.6.37.noarch/fs/nfsd/nfsd.h.orig linux-2.6.37.noarch/fs/nfsd/nfsd.h
+--- linux-2.6.37.noarch/fs/nfsd/nfsd.h.orig	2011-01-28 09:37:32.566979114 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfsd.h	2011-01-28 09:43:53.363768972 -0500
+@@ -287,11 +287,22 @@ extern time_t nfsd4_grace;
+ #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
+ 	NFSD4_SUPPORTED_ATTRS_WORD0
+ 
++#if defined(CONFIG_PNFSD)
++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
++	(NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES)
++#else /* CONFIG_PNFSD */
+ #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
+ 	NFSD4_SUPPORTED_ATTRS_WORD1
++#endif /* CONFIG_PNFSD */
+ 
++#if defined(CONFIG_PNFSD)
++#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
++	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \
++	 FATTR4_WORD2_LAYOUT_BLKSIZE)
++#else /* CONFIG_PNFSD */
+ #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
+ 	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
++#endif /* CONFIG_PNFSD */
+ 
+ static inline u32 nfsd_suppattrs0(u32 minorversion)
+ {
+diff -up linux-2.6.37.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.37.noarch/fs/nfsd/nfsfh.c
+--- linux-2.6.37.noarch/fs/nfsd/nfsfh.c.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfsfh.c	2011-01-28 09:43:53.364768868 -0500
+@@ -10,6 +10,7 @@
+ #include <linux/exportfs.h>
+ 
+ #include <linux/sunrpc/svcauth_gss.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
+ #include "nfsd.h"
+ #include "vfs.h"
+ #include "auth.h"
+@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s
+ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+ {
+ 	struct knfsd_fh	*fh = &fhp->fh_handle;
++	int fsid_type;
+ 	struct fid *fid = NULL, sfid;
+ 	struct svc_export *exp;
+ 	struct dentry *dentry;
+@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct 
+ 			return error;
+ 		if (fh->fh_auth_type != 0)
+ 			return error;
+-		len = key_len(fh->fh_fsid_type) / 4;
++		fsid_type = pnfs_fh_fsid_type(fh);
++		len = key_len(fsid_type) / 4;
+ 		if (len == 0)
+ 			return error;
+ 		if  (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct 
+ 		data_left -= len;
+ 		if (data_left < 0)
+ 			return error;
+-		exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth);
++		exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth);
+ 		fid = (struct fid *)(fh->fh_auth + len);
+ 	} else {
+ 		__u32 tfh[2];
+diff -up linux-2.6.37.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.37.noarch/fs/nfsd/nfsfh.h
+--- linux-2.6.37.noarch/fs/nfsd/nfsfh.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfsfh.h	2011-01-28 09:43:53.365768765 -0500
+@@ -14,6 +14,7 @@ enum nfsd_fsid {
+ 	FSID_UUID8,
+ 	FSID_UUID16,
+ 	FSID_UUID16_INUM,
++	FSID_MAX
+ };
+ 
+ enum fsid_source {
+@@ -203,4 +204,42 @@ fh_unlock(struct svc_fh *fhp)
+ 	}
+ }
+ 
++#if defined(CONFIG_PNFSD)
 +
 +/*
-+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied
++ * to a DS by LAYOUTGET.  nfs4_preprocess_stateid_op() uses this to decide how
++ * to handle a given stateid.
 + */
-+#define NFS4_DEVICE_ID_HASH_BITS	5
-+#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
-+#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
-+
-+static inline u32
-+nfs4_deviceid_hash(struct nfs4_deviceid *id)
++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
 +{
-+	unsigned char *cptr = (unsigned char *)id->data;
-+	unsigned int nbytes = NFS4_DEVICEID4_SIZE;
-+	u32 x = 0;
-+
-+	while (nbytes--) {
-+		x *= 37;
-+		x += *cptr++;
-+	}
-+	return x & NFS4_DEVICE_ID_HASH_MASK;
++	return fh->fh_fsid_type >= FSID_MAX;
 +}
 +
-+struct pnfs_deviceid_node {
-+	struct hlist_node	de_node;
-+	struct nfs4_deviceid	de_id;
-+	atomic_t		de_ref;
-+};
-+
-+struct pnfs_deviceid_cache {
-+	spinlock_t		dc_lock;
-+	atomic_t		dc_ref;
-+	void			(*dc_free_callback)(struct pnfs_deviceid_node *);
-+	struct hlist_head	dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
-+};
-+
-+extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
-+			void (*free_callback)(struct pnfs_deviceid_node *));
-+extern void pnfs_put_deviceid_cache(struct nfs_client *);
-+extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
-+				struct pnfs_deviceid_cache *,
-+				struct nfs4_deviceid *);
-+extern struct pnfs_deviceid_node *pnfs_add_deviceid(
-+				struct pnfs_deviceid_cache *,
-+				struct pnfs_deviceid_node *);
-+extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
-+			      struct pnfs_deviceid_node *devid);
-+extern void pnfs_delete_deviceid(struct pnfs_deviceid_cache *,
-+				 struct nfs4_deviceid *);
-+
-+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
-+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
-+
-+/* nfs4proc.c */
-+extern int nfs4_proc_getdevicelist(struct nfs_server *server,
-+				   const struct nfs_fh *fh,
-+				   struct pnfs_devicelist *devlist);
-+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
-+				   struct pnfs_device *dev);
-+extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
-+extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
-+				   int issync);
-+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
-+
-+/* pnfs.c */
-+void get_lseg(struct pnfs_layout_segment *lseg);
-+void put_lseg(struct pnfs_layout_segment *lseg);
-+struct pnfs_layout_segment *
-+pnfs_has_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range);
-+struct pnfs_layout_segment *
-+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
-+		   loff_t pos, u64 count, enum pnfs_iomode access_type);
-+bool pnfs_return_layout_barrier(struct nfs_inode *, struct pnfs_layout_range *);
-+int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *,
-+			const nfs4_stateid *stateid, /* optional */
-+			enum pnfs_layoutreturn_type, bool wait);
-+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id);
-+void unset_pnfs_layoutdriver(struct nfs_server *);
-+enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
-+					     const struct rpc_call_ops *, int);
-+enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
-+					    const struct rpc_call_ops *);
-+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
-+int pnfs_layoutcommit_inode(struct inode *inode, int sync);
-+void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent);
-+void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx);
-+void pnfs_set_ds_iosize(struct nfs_server *server);
-+enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
-+					 const struct rpc_call_ops *, int);
-+void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
-+			   struct nfs_open_context *, struct list_head *,
-+			   size_t *);
-+void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *,
-+			    size_t *);
-+void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
-+int pnfs_layout_process(struct nfs4_layoutget *lgp);
-+void pnfs_layoutreturn_release(struct nfs4_layoutreturn *lpr);
-+void pnfs_destroy_layout(struct nfs_inode *);
-+void pnfs_destroy_all_layouts(struct nfs_client *);
-+void put_layout_hdr(struct inode *inode);
-+void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-+			     struct nfs4_state *open_state);
-+void pnfs_read_done(struct nfs_read_data *);
-+void pnfs_writeback_done(struct nfs_write_data *);
-+void pnfs_commit_done(struct nfs_write_data *);
-+int _pnfs_write_begin(struct inode *inode, struct page *page,
-+		      loff_t pos, unsigned len,
-+		      struct pnfs_layout_segment *lseg,
-+		      struct pnfs_fsdata **fsdata);
-+
-+static inline bool
-+has_layout(struct nfs_inode *nfsi)
++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh)
 +{
-+	return nfsi->layout != NULL;
++	BUG_ON(fh->fh_version != 1);
++	BUG_ON(pnfs_fh_is_ds(fh));
++	fh->fh_fsid_type += FSID_MAX;
 +}
 +
-+static inline int lo_fail_bit(u32 iomode)
-+{
-+	return iomode == IOMODE_RW ?
-+			 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
-+}
++#else  /* CONFIG_PNFSD */
 +
-+static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)
++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
 +{
-+	write_seqlock(&lo->seqlock);
-+	clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
-+	write_sequnlock(&lo->seqlock);
++	return 0;
 +}
 +
-+/* Return true if a layout driver is being used for this mountpoint */
-+static inline int pnfs_enabled_sb(struct nfs_server *nfss)
-+{
-+	return nfss->pnfs_curr_ld != NULL;
-+}
++#endif /* CONFIG_PNFSD */
 +
-+static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
-+			       struct pnfs_fsdata *fsdata)
++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). */
++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh)
 +{
-+	return !fsdata  || ((struct pnfs_layout_segment *)fsdata == lseg) ||
-+		!fsdata->bypass_eof;
-+}
++	int fsid_type = fh->fh_fsid_type;
 +
-+/* Should the pNFS client commit and return the layout upon a setattr */
-+static inline bool
-+pnfs_ld_layoutret_on_setattr(struct inode *inode)
-+{
-+	if (!pnfs_enabled_sb(NFS_SERVER(inode)))
-+		return false;
-+	return NFS_SERVER(inode)->pnfs_curr_ld->flags &
-+		PNFS_LAYOUTRET_ON_SETATTR;
++	if (pnfs_fh_is_ds(fh))
++		return fsid_type - FSID_MAX;
++	return fsid_type;
 +}
 +
-+static inline bool pnfs_use_rpc(struct nfs_server *nfss)
-+{
-+	if (pnfs_enabled_sb(nfss))
-+		return nfss->pnfs_curr_ld->flags & PNFS_USE_RPC_CODE;
+ #endif /* _LINUX_NFSD_FH_INT_H */
+diff -up linux-2.6.37.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.37.noarch/fs/nfsd/nfssvc.c
+--- linux-2.6.37.noarch/fs/nfsd/nfssvc.c.orig	2011-01-28 09:37:32.567979080 -0500
++++ linux-2.6.37.noarch/fs/nfsd/nfssvc.c	2011-01-28 09:43:53.365768765 -0500
+@@ -116,7 +116,7 @@ struct svc_program		nfsd_program = {
+ 
+ };
+ 
+-u32 nfsd_supported_minorversion;
++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION;
+ 
+ int nfsd_vers(int vers, enum vers_op change)
+ {
+diff -up linux-2.6.37.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.37.noarch/fs/nfsd/pnfsd.h
+--- linux-2.6.37.noarch/fs/nfsd/pnfsd.h.orig	2011-01-28 09:43:53.366768664 -0500
++++ linux-2.6.37.noarch/fs/nfsd/pnfsd.h	2011-01-28 09:43:53.366768664 -0500
+@@ -0,0 +1,144 @@
++/*
++ *  Copyright (c) 2005 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Andy Adamson <andros at umich.edu>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
 +
-+	return true;
-+}
++#ifndef LINUX_NFSD_PNFSD_H
++#define LINUX_NFSD_PNFSD_H
 +
-+/* Should the pNFS client commit and return the layout on close
-+ */
-+static inline int
-+pnfs_layout_roc_iomode(struct nfs_inode *nfsi)
-+{
-+	return nfsi->layout->roc_iomode;
-+}
++#include <linux/list.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
 +
-+static inline int pnfs_write_begin(struct file *filp, struct page *page,
-+				   loff_t pos, unsigned len,
-+				   struct pnfs_layout_segment *lseg,
-+				   void **fsdata)
-+{
-+	struct inode *inode = filp->f_dentry->d_inode;
-+	struct nfs_server *nfss = NFS_SERVER(inode);
-+	int status = 0;
++#include "state.h"
++#include "xdr4.h"
 +
-+	*fsdata = lseg;
-+	if (lseg && nfss->pnfs_curr_ld->write_begin)
-+		status = _pnfs_write_begin(inode, page, pos, len, lseg,
-+					   (struct pnfs_fsdata **) fsdata);
-+	return status;
-+}
++/* outstanding layout stateid */
++struct nfs4_layout_state {
++	struct list_head	ls_perfile;
++	struct list_head	ls_layouts; /* list of nfs4_layouts */
++	struct kref		ls_ref;
++	struct nfs4_client	*ls_client;
++	struct nfs4_file	*ls_file;
++	stateid_t		ls_stateid;
++};
++
++/* outstanding layout */
++struct nfs4_layout {
++	struct list_head		lo_perfile;	/* hash by f_id */
++	struct list_head		lo_perclnt;	/* hash by clientid */
++	struct list_head		lo_perstate;
++	struct nfs4_file		*lo_file;	/* backpointer */
++	struct nfs4_client		*lo_client;
++	struct nfs4_layout_state	*lo_state;
++	struct nfsd4_layout_seg 	lo_seg;
++};
 +
-+/* CAREFUL - what happens if copied < len??? */
-+static inline int pnfs_write_end(struct file *filp, struct page *page,
-+				 loff_t pos, unsigned len, unsigned copied,
-+				 struct pnfs_layout_segment *lseg)
-+{
-+	struct inode *inode = filp->f_dentry->d_inode;
-+	struct nfs_server *nfss = NFS_SERVER(inode);
++struct pnfs_inval_state {
++	struct knfsd_fh		mdsfh; /* needed only by invalidate all */
++	stateid_t		stid;
++	clientid_t		clid;
++	u32			status;
++};
 +
-+	if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end)
-+		return nfss->pnfs_curr_ld->write_end(inode, page, pos, len,
-+						     copied, lseg);
-+	else
-+		return 0;
-+}
++/* pNFS Data Server state */
++#define DS_STATEID_VALID   0
++#define DS_STATEID_ERROR   1
++#define DS_STATEID_NEW     2
 +
-+static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
-+{
-+	struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
++struct pnfs_ds_stateid {
++	struct list_head	ds_hash;        /* ds_stateid hash entry */
++	struct list_head	ds_perclid;     /* per client hash entry */
++	stateid_t		ds_stid;
++	struct knfsd_fh		ds_fh;
++	unsigned long		ds_access;
++	u32			ds_status;      /* from MDS */
++	u32			ds_verifier[2]; /* from MDS */
++	wait_queue_head_t	ds_waitq;
++	unsigned long		ds_flags;
++	struct kref		ds_ref;
++	clientid_t		ds_mdsclid;
++};
 +
-+	if (fsdata && nfss->pnfs_curr_ld) {
-+		if (nfss->pnfs_curr_ld->write_end_cleanup)
-+			nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata);
-+		if (nfss->pnfs_curr_ld->write_begin)
-+			pnfs_free_fsdata(fsdata);
-+	}
-+}
++struct pnfs_ds_clientid {
++	struct list_head	dc_hash;        /* mds_clid_hashtbl entry */
++	struct list_head	dc_stateid;     /* ds_stateid head */
++	struct list_head	dc_permdsid;    /* per mdsid hash entry */
++	clientid_t		dc_mdsclid;
++	struct kref		dc_ref;
++	uint32_t		dc_mdsid;
++};
 +
-+static inline int pnfs_return_layout(struct inode *ino,
-+				     struct pnfs_layout_range *range,
-+				     const nfs4_stateid *stateid, /* optional */
-+				     enum pnfs_layoutreturn_type type,
-+				     bool wait)
-+{
-+	struct nfs_inode *nfsi = NFS_I(ino);
-+	struct nfs_server *nfss = NFS_SERVER(ino);
++struct pnfs_mds_id {
++	struct list_head	di_hash;        /* mds_nodeid list entry */
++	struct list_head	di_mdsclid;     /* mds_clientid head */
++	uint32_t		di_mdsid;
++	time_t			di_mdsboot;	/* mds boot time */
++	struct kref		di_ref;
++};
 +
-+	if (pnfs_enabled_sb(nfss) &&
-+	    (type != RETURN_FILE || has_layout(nfsi)))
-+		return _pnfs_return_layout(ino, range, stateid, type, wait);
++/* notify device request (from exported filesystem) */
++struct nfs4_notify_device {
++	struct nfsd4_pnfs_cb_dev_list  *nd_list;
++	struct nfs4_client	       *nd_client;
++	struct list_head	        nd_perclnt;
 +
-+	return 0;
-+}
++	/* nfsd internal */
++	struct nfsd4_callback		nd_recall;
++};
 +
-+static inline bool
-+layoutcommit_needed(struct nfs_inode *nfsi)
-+{
-+	return has_layout(nfsi) &&
-+	       test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state);
-+}
++u64 find_create_sbid(struct super_block *);
++struct super_block *find_sbid_id(u64);
++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *);
++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *,
++					struct nfsd4_pnfs_layoutreturn *);
++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *);
++int nfs4_pnfs_cb_change_state(struct pnfs_get_state *);
++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
++int put_layoutrecall(struct nfs4_layoutrecall *);
++void nomatching_layout(struct nfs4_layoutrecall *);
++void *layoutrecall_done(struct nfs4_layoutrecall *);
++void nfsd4_cb_layout(struct nfs4_layoutrecall *);
++int nfsd_layout_recall_cb(struct super_block *, struct inode *,
++			  struct nfsd4_pnfs_cb_layout *);
++int nfsd_device_notify_cb(struct super_block *,
++			  struct nfsd4_pnfs_cb_dev_list *);
++void nfsd4_cb_notify_device(struct nfs4_notify_device *);
++void pnfs_set_device_notify(clientid_t *, unsigned int types);
++void pnfs_clear_device_notify(struct nfs4_client *);
 +
-+static inline int pnfs_get_write_status(struct nfs_write_data *data)
-+{
-+	return data->pdata.pnfs_error;
-+}
++#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
++extern struct sockaddr pnfsd_lexp_addr;
++extern size_t pnfs_lexp_addr_len;
 +
-+static inline int pnfs_get_read_status(struct nfs_read_data *data)
-+{
-+	return data->pdata.pnfs_error;
-+}
++extern void pnfsd_lexp_init(struct inode *);
++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
 +
-+static inline struct pnfs_layout_segment *
-+nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
-+{
-+	if (fsdata) {
-+		struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
++#endif /* LINUX_NFSD_PNFSD_H */
+diff -up linux-2.6.37.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.37.noarch/fs/nfsd/pnfsd_lexp.c
+--- linux-2.6.37.noarch/fs/nfsd/pnfsd_lexp.c.orig	2011-01-28 09:43:53.367768598 -0500
++++ linux-2.6.37.noarch/fs/nfsd/pnfsd_lexp.c	2011-01-28 09:43:53.367768598 -0500
+@@ -0,0 +1,225 @@
++/*
++ * linux/fs/nfsd/pnfsd_lexp.c
++ *
++ * pNFS export of local filesystems.
++ *
++ * Export local file systems over the files layout type.
++ * The MDS (metadata server) functions also as a single DS (data server).
++ * This is mostly useful for development and debugging purposes.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * Copyright (C) 2008 Benny Halevy, <bhalevy at panasas.com>
++ *
++ * Initial implementation was based on the pnfs-gfs2 patches done
++ * by David M. Richter <richterd at citi.umich.edu>
++ */
 +
-+		if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin)
-+			return ((struct pnfs_fsdata *) fsdata)->lseg;
-+		return (struct pnfs_layout_segment *)fsdata;
-+	}
-+	return NULL;
-+}
++#include <linux/sunrpc/svc_xprt.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
 +
-+#else  /* CONFIG_NFS_V4_1 */
++#include "pnfsd.h"
 +
-+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
-+{
-+}
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
 +
-+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
-+{
-+}
++struct sockaddr pnfsd_lexp_addr;
++size_t pnfs_lexp_addr_len;
 +
-+static inline void get_lseg(struct pnfs_layout_segment *lseg)
++static int
++pnfsd_lexp_layout_type(struct super_block *sb)
 +{
++	int ret = LAYOUT_NFSV4_1_FILES;
++	dprintk("<-- %s: return %d\n", __func__, ret);
++	return ret;
 +}
 +
-+static inline void put_lseg(struct pnfs_layout_segment *lseg)
++static int
++pnfsd_lexp_get_device_iter(struct super_block *sb,
++			   u32 layout_type,
++			   struct nfsd4_pnfs_dev_iter_res *res)
 +{
-+}
++	dprintk("--> %s: sb=%p\n", __func__, sb);
 +
-+static inline struct pnfs_layout_segment *
-+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
-+		   loff_t pos, u64 count, enum pnfs_iomode access_type)
-+{
-+	return NULL;
-+}
++	BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
 +
-+static inline bool
-+has_layout(struct nfs_inode *nfsi)
-+{
-+	return false;
-+}
++	res->gd_eof = 1;
++	if (res->gd_cookie)
++		return -ENOENT;
++	res->gd_cookie = 1;
++	res->gd_verf = 1;
++	res->gd_devid = 1;
 +
-+static inline bool
-+layoutcommit_needed(struct nfs_inode *nfsi)
-+{
++	dprintk("<-- %s: return 0\n", __func__);
 +	return 0;
 +}
 +
-+static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
-+			       struct pnfs_fsdata *fsdata)
++static int
++pnfsd_lexp_get_device_info(struct super_block *sb,
++			   struct exp_xdr_stream *xdr,
++			   u32 layout_type,
++			   const struct nfsd4_pnfs_deviceid *devid)
 +{
-+	return 1;
-+}
++	int err;
++	struct pnfs_filelayout_device fdev;
++	struct pnfs_filelayout_multipath fl_devices[1];
++	u32 fl_stripe_indices[1] = { 0 };
++	struct pnfs_filelayout_devaddr daddr;
++	/* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */
++	char daddr_buf[8*4 + 2*3 + 10];
 +
-+static inline enum pnfs_try_status
-+pnfs_try_to_read_data(struct nfs_read_data *data,
-+		      const struct rpc_call_ops *call_ops)
-+{
-+	return PNFS_NOT_ATTEMPTED;
-+}
++	dprintk("--> %s: sb=%p\n", __func__, sb);
 +
-+static inline enum pnfs_try_status
-+pnfs_try_to_write_data(struct nfs_write_data *data,
-+		       const struct rpc_call_ops *call_ops, int how)
-+{
-+	return PNFS_NOT_ATTEMPTED;
-+}
++	BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
 +
-+static inline enum pnfs_try_status
-+pnfs_try_to_commit(struct nfs_write_data *data,
-+		   const struct rpc_call_ops *call_ops, int how)
-+{
-+	return PNFS_NOT_ATTEMPTED;
-+}
++	memset(&fdev, '\0', sizeof(fdev));
++
++	if (devid->devid != 1) {
++		printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 "
++			"(got: 0x%llx)\n", __func__, devid->devid);
++		err = -EINVAL;
++		goto out;
++	}
++
++	/* count the number of comma-delimited DS IPs */
++	fdev.fl_device_length = 1;
++	fdev.fl_device_list = fl_devices;
++
++	fdev.fl_stripeindices_length = fdev.fl_device_length;
++	fdev.fl_stripeindices_list = fl_stripe_indices;
++
++	daddr.r_addr.data = daddr_buf;
++	daddr.r_addr.len = sizeof(daddr_buf);
++	err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr);
++	if (err < 0)
++		goto out;
++	daddr.r_addr.len = err;
++	switch (pnfsd_lexp_addr.sa_family) {
++	case AF_INET:
++		daddr.r_netid.data = "tcp";
++		daddr.r_netid.len = 3;
++		break;
++	case AF_INET6:
++		daddr.r_netid.data = "tcp6";
++		daddr.r_netid.len = 4;
++		break;
++	default:
++		BUG();
++	}
++	fdev.fl_device_list[0].fl_multipath_length = 1;
++	fdev.fl_device_list[0].fl_multipath_list = &daddr;
 +
-+static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync)
-+{
-+	return 0;
++	/* have nfsd encode the device info */
++	err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++	dprintk("<-- %s: return %d\n", __func__, err);
++	return err;
 +}
 +
-+static inline bool
-+pnfs_ld_layoutret_on_setattr(struct inode *inode)
++static int get_stripe_unit(int blocksize)
 +{
-+	return false;
++	if (blocksize < NFSSVC_MAXBLKSIZE)
++		blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++	dprintk("%s: return %d\n", __func__, blocksize);
++	return blocksize;
 +}
 +
-+static inline bool pnfs_use_rpc(struct nfs_server *nfss)
++static enum nfsstat4
++pnfsd_lexp_layout_get(struct inode *inode,
++		      struct exp_xdr_stream *xdr,
++		      const struct nfsd4_pnfs_layoutget_arg *arg,
++		      struct nfsd4_pnfs_layoutget_res *res)
 +{
-+	return true;
-+}
++	enum nfsstat4 rc = NFS4_OK;
++	struct pnfs_filelayout_layout *layout = NULL;
++	struct knfsd_fh *fhp = NULL;
 +
-+static inline int
-+pnfs_layout_roc_iomode(struct nfs_inode *nfsi)
-+{
-+	return 0;
-+}
++	dprintk("--> %s: inode=%p\n", __func__, inode);
 +
-+static inline int pnfs_return_layout(struct inode *ino,
-+				     struct pnfs_layout_range *range,
-+				     const nfs4_stateid *stateid, /* optional */
-+				     enum pnfs_layoutreturn_type type,
-+				     bool wait)
-+{
-+	return 0;
-+}
++	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	res->lg_seg.offset = 0;
++	res->lg_seg.length = NFS4_MAX_UINT64;
 +
-+static inline void set_pnfs_layoutdriver(struct nfs_server *s, const struct nfs_fh *mntfh, u32 id)
-+{
-+}
++	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++	if (layout == NULL) {
++		rc = -ENOMEM;
++		goto error;
++	}
 +
-+static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
-+{
-+}
++	/* Set file layout response args */
++	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++	layout->lg_stripe_type = STRIPE_SPARSE;
++	layout->lg_commit_through_mds = true;
++	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++	layout->lg_fh_length = 1;
++	layout->device_id.sbid = arg->lg_sbid;
++	layout->device_id.devid = 1;				/*FSFTEMP*/
++	layout->lg_first_stripe_index = 0;			/*FSFTEMP*/
++	layout->lg_pattern_offset = 0;
 +
-+static inline void pnfs_set_ds_iosize(struct nfs_server *server)
-+{
-+	server->ds_wsize = server->ds_rsize = -1;
-+}
++	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++	if (fhp == NULL) {
++		rc = -ENOMEM;
++		goto error;
++	}
 +
-+static inline int pnfs_write_begin(struct file *filp, struct page *page,
-+				   loff_t pos, unsigned len,
-+				   struct pnfs_layout_segment *lseg,
-+				   void **fsdata)
-+{
-+	*fsdata = NULL;
-+	return 0;
-+}
++	memcpy(fhp, arg->lg_fh, sizeof(*fhp));
++	pnfs_fh_mark_ds(fhp);
++	layout->lg_fh_list = fhp;
 +
-+static inline int pnfs_write_end(struct file *filp, struct page *page,
-+				 loff_t pos, unsigned len, unsigned copied,
-+				 struct pnfs_layout_segment *lseg)
-+{
-+	return 0;
-+}
++	/* Call nfsd to encode layout */
++	rc = filelayout_encode_layout(xdr, layout);
++exit:
++	kfree(layout);
++	kfree(fhp);
++	dprintk("<-- %s: return %d\n", __func__, rc);
++	return rc;
 +
-+static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
-+{
++error:
++	res->lg_seg.length = 0;
++	goto exit;
 +}
 +
-+static inline int pnfs_get_write_status(struct nfs_write_data *data)
++static int
++pnfsd_lexp_layout_commit(struct inode *inode,
++			 const struct nfsd4_pnfs_layoutcommit_arg *args,
++			 struct nfsd4_pnfs_layoutcommit_res *res)
 +{
++	dprintk("%s: (unimplemented)\n", __func__);
++
 +	return 0;
 +}
 +
-+static inline int pnfs_get_read_status(struct nfs_read_data *data)
++static int
++pnfsd_lexp_layout_return(struct inode *inode,
++			 const struct nfsd4_pnfs_layoutreturn_arg *args)
 +{
++	dprintk("%s: (unimplemented)\n", __func__);
++
 +	return 0;
 +}
 +
-+static inline void
-+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino,
-+		      struct nfs_open_context *ctx, struct list_head *pages,
-+		      size_t *rsize)
++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh,
++				struct pnfs_get_state *p)
 +{
-+	pgio->pg_lseg = NULL;
++	return 0;	/* just use the current stateid */
 +}
 +
-+static inline void
-+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino,
-+		       size_t *wsize)
-+{
-+	pgio->pg_lseg = NULL;
-+}
++static struct pnfs_export_operations pnfsd_lexp_ops = {
++	.layout_type = pnfsd_lexp_layout_type,
++	.get_device_info = pnfsd_lexp_get_device_info,
++	.get_device_iter = pnfsd_lexp_get_device_iter,
++	.layout_get = pnfsd_lexp_layout_get,
++	.layout_commit = pnfsd_lexp_layout_commit,
++	.layout_return = pnfsd_lexp_layout_return,
++	.get_state = pnfsd_lexp_get_state,
++};
 +
-+static inline struct pnfs_layout_segment *
-+nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
++void
++pnfsd_lexp_init(struct inode *inode)
 +{
-+	return NULL;
++	dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops);
++	inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops;
 +}
+diff -up linux-2.6.37.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.37.noarch/fs/nfsd/spnfs_com.c
+--- linux-2.6.37.noarch/fs/nfsd/spnfs_com.c.orig	2011-01-28 09:43:53.368768479 -0500
++++ linux-2.6.37.noarch/fs/nfsd/spnfs_com.c	2011-01-28 09:43:53.368768479 -0500
+@@ -0,0 +1,535 @@
++/*
++ * fs/nfsd/spnfs_com.c
++ *
++ * Communication layer between spNFS kernel and userspace
++ * Based heavily on idmap.c
++ *
++ */
 +
-+#endif /* CONFIG_NFS_V4_1 */
++/*
++ *  Copyright (c) 2002 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Marius Aamodt Eriksen <marius at umich.edu>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++#include <linux/namei.h>
++#include <linux/mount.h>
++#include <linux/path.h>
++#include <linux/sunrpc/clnt.h>
++#include <linux/sunrpc/rpc_pipe_fs.h>
++#include <linux/nfsd/debug.h>
 +
-+#endif /* FS_NFS_PNFS_H */
-diff --git a/fs/nfs/read.c b/fs/nfs/read.c
-index 87adc27..1df536a 100644
---- a/fs/nfs/read.c
-+++ b/fs/nfs/read.c
-@@ -18,8 +18,12 @@
- #include <linux/sunrpc/clnt.h>
- #include <linux/nfs_fs.h>
- #include <linux/nfs_page.h>
-+#include <linux/smp_lock.h>
-+#include <linux/module.h>
- 
- #include <asm/system.h>
-+#include <linux/module.h>
-+#include "pnfs.h"
- 
- #include "nfs4_fs.h"
- #include "internal.h"
-@@ -117,11 +121,16 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
- 	LIST_HEAD(one_request);
- 	struct nfs_page	*new;
- 	unsigned int len;
-+	loff_t pgoffs;
-+	struct pnfs_layout_segment *lseg;
- 
- 	len = nfs_page_length(page);
- 	if (len == 0)
- 		return nfs_return_empty_page(page);
--	new = nfs_create_request(ctx, inode, page, 0, len);
-+	pgoffs = (loff_t)page->index << PAGE_CACHE_SHIFT;
-+	lseg = pnfs_update_layout(inode, ctx, pgoffs, len, IOMODE_READ);
-+	new = nfs_create_request(ctx, inode, page, 0, len, lseg);
-+	put_lseg(lseg);
- 	if (IS_ERR(new)) {
- 		unlock_page(page);
- 		return PTR_ERR(new);
-@@ -155,24 +164,20 @@ static void nfs_readpage_release(struct nfs_page *req)
- 	nfs_release_request(req);
- }
- 
--/*
-- * Set up the NFS read request struct
-- */
--static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
--		const struct rpc_call_ops *call_ops,
--		unsigned int count, unsigned int offset)
-+int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
-+		      const struct rpc_call_ops *call_ops)
- {
--	struct inode *inode = req->wb_context->path.dentry->d_inode;
-+	struct inode *inode = data->inode;
- 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
- 	struct rpc_task *task;
- 	struct rpc_message msg = {
- 		.rpc_argp = &data->args,
- 		.rpc_resp = &data->res,
--		.rpc_cred = req->wb_context->cred,
-+		.rpc_cred = data->cred,
- 	};
- 	struct rpc_task_setup task_setup_data = {
- 		.task = &data->task,
--		.rpc_client = NFS_CLIENT(inode),
-+		.rpc_client = clnt,
- 		.rpc_message = &msg,
- 		.callback_ops = call_ops,
- 		.callback_data = data,
-@@ -180,9 +185,46 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
- 		.flags = RPC_TASK_ASYNC | swap_flags,
- 	};
- 
-+	/* Set up the initial task struct. */
-+	NFS_PROTO(inode)->read_setup(data, &msg);
++#include <linux/nfsd4_spnfs.h>
 +
-+	dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-+			data->task.tk_pid,
-+			inode->i_sb->s_id,
-+			(long long)NFS_FILEID(inode),
-+			data->args.count,
-+			(unsigned long long)data->args.offset);
++#define	NFSDDBG_FACILITY		NFSDDBG_PROC
 +
-+	task = rpc_run_task(&task_setup_data);
-+	if (IS_ERR(task))
-+		return PTR_ERR(task);
-+	rpc_put_task(task);
-+	return 0;
-+}
-+EXPORT_SYMBOL(nfs_initiate_read);
++static ssize_t   spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *,
++		     char __user *, size_t);
++static ssize_t   spnfs_pipe_downcall(struct file *, const char __user *,
++		     size_t);
++static void      spnfs_pipe_destroy_msg(struct rpc_pipe_msg *);
 +
-+int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
-+		       const struct rpc_call_ops *call_ops)
-+{
-+	if (data->req->wb_lseg &&
-+	    (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
-+		return pnfs_get_read_status(data);
++static struct rpc_pipe_ops spnfs_upcall_ops = {
++	.upcall		= spnfs_pipe_upcall,
++	.downcall	= spnfs_pipe_downcall,
++	.destroy_msg	= spnfs_pipe_destroy_msg,
++};
 +
-+	return nfs_initiate_read(data, clnt, call_ops);
-+}
++/* evil global variable */
++struct spnfs *global_spnfs;
++struct spnfs_config *spnfs_config;
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++int spnfs_use_layoutsegments;
++uint64_t layoutsegment_size;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
 +
 +/*
-+ * Set up the NFS read request struct
++ * Used by spnfs_enabled()
++ * Tracks if the subsystem has been initialized at some point.  It doesn't
++ * matter if it's not currently initialized.
 + */
-+static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
-+		const struct rpc_call_ops *call_ops,
-+		unsigned int count, unsigned int offset)
++static int spnfs_enabled_at_some_point;
++
++/* call this to start the ball rolling */
++/* code it like we're going to avoid the global variable in the future */
++int
++nfsd_spnfs_new(void)
 +{
-+	struct inode *inode = req->wb_context->path.dentry->d_inode;
++	struct spnfs *spnfs = NULL;
++	struct path path;
++	struct nameidata nd;
++	int rc;
 +
- 	data->req	  = req;
- 	data->inode	  = inode;
--	data->cred	  = msg.rpc_cred;
-+	data->cred	  = req->wb_context->cred;
- 
- 	data->args.fh     = NFS_FH(inode);
- 	data->args.offset = req_offset(req) + offset;
-@@ -197,21 +239,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
- 	data->res.eof     = 0;
- 	nfs_fattr_init(&data->fattr);
- 
--	/* Set up the initial task struct. */
--	NFS_PROTO(inode)->read_setup(data, &msg);
--
--	dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
--			data->task.tk_pid,
--			inode->i_sb->s_id,
--			(long long)NFS_FILEID(inode),
--			count,
--			(unsigned long long)data->args.offset);
--
--	task = rpc_run_task(&task_setup_data);
--	if (IS_ERR(task))
--		return PTR_ERR(task);
--	rpc_put_task(task);
--	return 0;
-+	return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
- }
- 
- static void
-@@ -355,7 +383,14 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
- {
- 	struct nfs_readargs *argp = &data->args;
- 	struct nfs_readres *resp = &data->res;
-+	struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client;
- 
-+#ifdef CONFIG_NFS_V4_1
-+	if (data->fldata.ds_nfs_client) {
-+		dprintk("%s DS read\n", __func__);
-+		clp = data->fldata.ds_nfs_client;
-+	}
-+#endif /* CONFIG_NFS_V4_1 */
- 	if (resp->eof || resp->count == argp->count)
- 		return;
- 
-@@ -369,7 +404,10 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
- 	argp->offset += resp->count;
- 	argp->pgbase += resp->count;
- 	argp->count -= resp->count;
--	nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
-+#ifdef CONFIG_NFS_V4_1
-+	data->pdata.pnfs_error = -EAGAIN;
-+#endif /* CONFIG_NFS_V4_1 */
-+	nfs_restart_rpc(task, clp);
- }
- 
- /*
-@@ -410,13 +448,19 @@ static void nfs_readpage_release_partial(void *calldata)
- void nfs_read_prepare(struct rpc_task *task, void *calldata)
- {
- 	struct nfs_read_data *data = calldata;
-+	struct nfs4_session *ds_session = NULL;
- 
--	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
-+	if (data->fldata.ds_nfs_client) {
-+		dprintk("%s DS read\n", __func__);
-+		ds_session = data->fldata.ds_nfs_client->cl_session;
++	if (global_spnfs != NULL)
++		return -EEXIST;
++
++	path.mnt = rpc_get_mount();
++	if (IS_ERR(path.mnt))
++		return PTR_ERR(path.mnt);
++
++	/* FIXME: do not abuse rpc_pipefs/nfs */
++	rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
++	if (rc)
++		goto err;
++
++	spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL);
++	if (spnfs == NULL){
++		rc = -ENOMEM;
++		goto err;
 +	}
-+	if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
- 				&data->args.seq_args, &data->res.seq_res,
- 				0, task))
- 		return;
- 	rpc_call_start(task);
- }
-+EXPORT_SYMBOL(nfs_read_prepare);
- #endif /* CONFIG_NFS_V4_1 */
- 
- static const struct rpc_call_ops nfs_read_partial_ops = {
-@@ -569,7 +613,20 @@ readpage_async_filler(void *data, struct page *page)
- 	if (len == 0)
- 		return nfs_return_empty_page(page);
- 
--	new = nfs_create_request(desc->ctx, inode, page, 0, len);
-+	if (desc->pgio->pg_lseg) {
-+		loff_t pgoff = (loff_t)page->index << PAGE_CACHE_SHIFT;
-+		struct pnfs_layout_range *range = &desc->pgio->pg_lseg->range;
 +
-+		/* retry later with the right lseg? */
-+		if (range->offset > pgoff + len ||
-+		    range->offset + range->length < pgoff) {
-+			new = ERR_PTR(-EAGAIN);
-+			goto out_error;
-+		}
++	spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs,
++					 &spnfs_upcall_ops, 0);
++	if (IS_ERR(spnfs->spnfs_dentry)) {
++		rc = -EPIPE;
++		goto err;
 +	}
 +
-+	new = nfs_create_request(desc->ctx, inode, page, 0, len,
-+				 desc->pgio->pg_lseg);
- 	if (IS_ERR(new))
- 		goto out_error;
- 
-@@ -625,6 +682,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
- 	if (ret == 0)
- 		goto read_complete; /* all pages were read */
- 
-+	pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize);
- 	if (rsize < PAGE_CACHE_SIZE)
- 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
- 	else
-@@ -633,6 +691,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
- 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
- 
- 	nfs_pageio_complete(&pgio);
-+	put_lseg(pgio.pg_lseg);
- 	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- 	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
- read_complete:
-diff --git a/fs/nfs/super.c b/fs/nfs/super.c
-index f4cbf0c..91606fb 100644
---- a/fs/nfs/super.c
-+++ b/fs/nfs/super.c
-@@ -64,6 +64,7 @@
- #include "iostat.h"
- #include "internal.h"
- #include "fscache.h"
-+#include "pnfs.h"
- 
- #define NFSDBG_FACILITY		NFSDBG_VFS
- 
-@@ -687,6 +688,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
- 
- 	return 0;
- }
-+#ifdef CONFIG_NFS_V4_1
-+void show_sessions(struct seq_file *m, struct nfs_server *server)
-+{
-+	if (nfs4_has_session(server->nfs_client))
-+		seq_printf(m, ",sessions");
-+}
-+#else
-+void show_sessions(struct seq_file *m, struct nfs_server *server) {}
-+#endif
++	mutex_init(&spnfs->spnfs_lock);
++	mutex_init(&spnfs->spnfs_plock);
++	init_waitqueue_head(&spnfs->spnfs_wq);
 +
-+#ifdef CONFIG_NFS_V4_1
-+void show_pnfs(struct seq_file *m, struct nfs_server *server)
-+{
-+	seq_printf(m, ",pnfs=");
-+	if (server->pnfs_curr_ld)
-+		seq_printf(m, "%s", server->pnfs_curr_ld->name);
-+	else
-+		seq_printf(m, "not configured");
-+}
-+#else  /* CONFIG_NFS_V4_1 */
-+void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
-+#endif /* CONFIG_NFS_V4_1 */
- 
- /*
-  * Present statistical information for this VFS mountpoint
-@@ -725,6 +748,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
- 		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
- 		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
- 		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
-+		show_sessions(m, nfss);
-+		show_pnfs(m, nfss);
- 	}
- #endif
- 
-diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
-index 2f84ada..51ae53b 100644
---- a/fs/nfs/unlink.c
-+++ b/fs/nfs/unlink.c
-@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
- 	struct nfs_unlinkdata *data = calldata;
- 	struct nfs_server *server = NFS_SERVER(data->dir);
- 
--	if (nfs4_setup_sequence(server, &data->args.seq_args,
-+	if (nfs4_setup_sequence(server, NULL, &data->args.seq_args,
- 				&data->res.seq_res, 1, task))
- 		return;
- 	rpc_call_start(task);
-diff --git a/fs/nfs/write.c b/fs/nfs/write.c
-index 874972d..988b65a 100644
---- a/fs/nfs/write.c
-+++ b/fs/nfs/write.c
-@@ -28,6 +28,7 @@
- #include "iostat.h"
- #include "nfs4_fs.h"
- #include "fscache.h"
-+#include "pnfs.h"
- 
- #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
- 
-@@ -59,6 +60,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
- 	}
- 	return p;
- }
-+EXPORT_SYMBOL(nfs_commitdata_alloc);
- 
- void nfs_commit_free(struct nfs_write_data *p)
- {
-@@ -429,6 +431,17 @@ static void nfs_inode_remove_request(struct nfs_page *req)
- 	nfs_clear_request(req);
- 	nfs_release_request(req);
- }
-+static void
-+nfs_mark_request_nopnfs(struct nfs_page *req)
++	global_spnfs = spnfs;
++	spnfs_enabled_at_some_point = 1;
++
++	return 0;
++err:
++	rpc_put_mount();
++	kfree(spnfs);
++	return rc;
++}
++
++/* again, code it like we're going to remove the global variable */
++void
++nfsd_spnfs_delete(void)
 +{
-+	struct pnfs_layout_segment *lseg = req->wb_lseg;
++	struct spnfs *spnfs = global_spnfs;
 +
-+	if (req->wb_lseg == NULL)
++	if (!spnfs)
 +		return;
-+	req->wb_lseg = NULL;
-+	put_lseg(lseg);
-+	dprintk(" retry through MDS\n");
++	rpc_unlink(spnfs->spnfs_dentry);
++	rpc_put_mount();
++	global_spnfs = NULL;
++	kfree(spnfs);
 +}
- 
- static void
- nfs_mark_request_dirty(struct nfs_page *req)
-@@ -534,7 +547,7 @@ nfs_need_commit(struct nfs_inode *nfsi)
-  * The requests are *not* checked to ensure that they form a contiguous set.
-  */
- static int
--nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
-+nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs)
- {
- 	struct nfs_inode *nfsi = NFS_I(inode);
- 	int ret;
-@@ -542,7 +555,8 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u
- 	if (!nfs_need_commit(nfsi))
- 		return 0;
- 
--	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
-+	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT,
-+			    use_pnfs);
- 	if (ret > 0)
- 		nfsi->ncommit -= ret;
- 	if (nfs_need_commit(NFS_I(inode)))
-@@ -571,7 +585,8 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg
- static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
- 		struct page *page,
- 		unsigned int offset,
--		unsigned int bytes)
-+		unsigned int bytes,
-+		struct pnfs_layout_segment *lseg)
- {
- 	struct nfs_page *req;
- 	unsigned int rqend;
-@@ -596,8 +611,8 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
- 		 * Note: nfs_flush_incompatible() will already
- 		 * have flushed out requests having wrong owners.
- 		 */
--		if (offset > rqend
--		    || end < req->wb_offset)
-+		if (offset > rqend || end < req->wb_offset ||
-+		    req->wb_lseg != lseg)
- 			goto out_flushme;
- 
- 		if (nfs_set_page_tag_locked(req))
-@@ -645,16 +660,17 @@ out_err:
-  * already called nfs_flush_incompatible() if necessary.
-  */
- static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
--		struct page *page, unsigned int offset, unsigned int bytes)
-+		struct page *page, unsigned int offset, unsigned int bytes,
-+		struct pnfs_layout_segment *lseg)
- {
- 	struct inode *inode = page->mapping->host;
- 	struct nfs_page	*req;
- 	int error;
- 
--	req = nfs_try_to_update_request(inode, page, offset, bytes);
-+	req = nfs_try_to_update_request(inode, page, offset, bytes, lseg);
- 	if (req != NULL)
- 		goto out;
--	req = nfs_create_request(ctx, inode, page, offset, bytes);
-+	req = nfs_create_request(ctx, inode, page, offset, bytes, lseg);
- 	if (IS_ERR(req))
- 		goto out;
- 	error = nfs_inode_add_request(inode, req);
-@@ -667,23 +683,27 @@ out:
- }
- 
- static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
--		unsigned int offset, unsigned int count)
-+			       unsigned int offset, unsigned int count,
-+			       struct pnfs_layout_segment *lseg,
-+			       void *fsdata)
- {
- 	struct nfs_page	*req;
- 
--	req = nfs_setup_write_request(ctx, page, offset, count);
-+	req = nfs_setup_write_request(ctx, page, offset, count, lseg);
- 	if (IS_ERR(req))
- 		return PTR_ERR(req);
- 	nfs_mark_request_dirty(req);
- 	/* Update file length */
--	nfs_grow_file(page, offset, count);
-+	if (pnfs_grow_ok(lseg, fsdata))
-+		nfs_grow_file(page, offset, count);
- 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
- 	nfs_mark_request_dirty(req);
- 	nfs_clear_page_tag_locked(req);
- 	return 0;
- }
- 
--int nfs_flush_incompatible(struct file *file, struct page *page)
-+int nfs_flush_incompatible(struct file *file, struct page *page,
-+			   struct pnfs_layout_segment *lseg)
- {
- 	struct nfs_open_context *ctx = nfs_file_open_context(file);
- 	struct nfs_page	*req;
-@@ -702,7 +722,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
- 			return 0;
- 		do_flush = req->wb_page != page || req->wb_context != ctx ||
- 			req->wb_lock_context->lockowner != current->files ||
--			req->wb_lock_context->pid != current->tgid;
-+			req->wb_lock_context->pid != current->tgid ||
-+			req->wb_lseg != lseg;
- 		nfs_release_request(req);
- 		if (!do_flush)
- 			return 0;
-@@ -729,7 +750,8 @@ static int nfs_write_pageuptodate(struct page *page, struct inode *inode)
-  * things with a page scheduled for an RPC call (e.g. invalidate it).
-  */
- int nfs_updatepage(struct file *file, struct page *page,
--		unsigned int offset, unsigned int count)
-+		   unsigned int offset, unsigned int count,
-+		   struct pnfs_layout_segment *lseg, void *fsdata)
- {
- 	struct nfs_open_context *ctx = nfs_file_open_context(file);
- 	struct inode	*inode = page->mapping->host;
-@@ -754,7 +776,7 @@ int nfs_updatepage(struct file *file, struct page *page,
- 		offset = 0;
- 	}
- 
--	status = nfs_writepage_setup(ctx, page, offset, count);
-+	status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata);
- 	if (status < 0)
- 		nfs_set_pageerror(page);
- 
-@@ -784,25 +806,21 @@ static int flush_task_priority(int how)
- 	return RPC_PRIORITY_NORMAL;
- }
- 
--/*
-- * Set up the argument/result storage required for the RPC call.
-- */
--static int nfs_write_rpcsetup(struct nfs_page *req,
--		struct nfs_write_data *data,
--		const struct rpc_call_ops *call_ops,
--		unsigned int count, unsigned int offset,
--		int how)
-+int nfs_initiate_write(struct nfs_write_data *data,
-+		       struct rpc_clnt *clnt,
-+		       const struct rpc_call_ops *call_ops,
-+		       int how)
- {
--	struct inode *inode = req->wb_context->path.dentry->d_inode;
-+	struct inode *inode = data->inode;
- 	int priority = flush_task_priority(how);
- 	struct rpc_task *task;
- 	struct rpc_message msg = {
- 		.rpc_argp = &data->args,
- 		.rpc_resp = &data->res,
--		.rpc_cred = req->wb_context->cred,
-+		.rpc_cred = data->cred,
- 	};
- 	struct rpc_task_setup task_setup_data = {
--		.rpc_client = NFS_CLIENT(inode),
-+		.rpc_client = clnt,
- 		.task = &data->task,
- 		.rpc_message = &msg,
- 		.callback_ops = call_ops,
-@@ -813,12 +831,62 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
- 	};
- 	int ret = 0;
- 
-+	/* Set up the initial task struct.  */
-+	NFS_PROTO(inode)->write_setup(data, &msg);
 +
-+	dprintk("NFS: %5u initiated write call "
-+		"(req %s/%lld, %u bytes @ offset %llu)\n",
-+		data->task.tk_pid,
-+		inode->i_sb->s_id,
-+		(long long)NFS_FILEID(inode),
-+		data->args.count,
-+		(unsigned long long)data->args.offset);
++/* RPC pipefs upcall/downcall routines.  spnfs_pipe_upcall is the .upcall
++ * method of spnfs_upcall_ops: it copies up to buflen unread bytes of a queued
++ * message to userspace, advances msg->copied and returns the byte count, or
++ * -EFAULT if nothing was copied.  See nfs_idmap_id for an enqueueing example. */
++static ssize_t
++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
++    char __user *dst, size_t buflen)
++{
++	char *data = (char *)msg->data + msg->copied;
++	ssize_t mlen = msg->len - msg->copied;
++	ssize_t left;
 +
-+	task = rpc_run_task(&task_setup_data);
-+	if (IS_ERR(task)) {
-+		ret = PTR_ERR(task);
-+		goto out;
++	if (mlen > buflen)
++		mlen = buflen;
++	/* copy_to_user() returns the number of bytes NOT copied, never < 0 */
++	left = copy_to_user(dst, data, mlen);
++	if (left == mlen && mlen != 0) {
++		msg->errno = -EFAULT;
++		return -EFAULT;
 +	}
-+	if (how & FLUSH_SYNC) {
-+		ret = rpc_wait_for_completion_task(task);
-+		if (ret == 0)
-+			ret = task->tk_status;
++	mlen -= left;
++	msg->copied += mlen;
++	msg->errno = 0;
++	return mlen;
++}
++
++static ssize_t
++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
++{
++	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
++	struct spnfs *spnfs = (struct spnfs *)rpci->private;
++	struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im;
++	int ret;
++
++	if (mlen != sizeof(struct spnfs_msg))
++		return -ENOSPC;
++
++	im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im_in == NULL)
++		return -ENOMEM;
++
++	if (copy_from_user(im_in, src, mlen) != 0)
++		return -EFAULT;
++
++	mutex_lock(&spnfs->spnfs_plock);
++
++	ret = mlen;
++	im->im_status = im_in->im_status;
++	/* If we got an error, terminate now, and wake up pending upcalls */
++	if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) {
++		wake_up(&spnfs->spnfs_wq);
++		goto out;
 +	}
-+	rpc_put_task(task);
++
++	ret = -EINVAL;
++	/* Did we match the current upcall? */
++	/* DMXXX: do not understand the comment above, from original code */
++	/* DMXXX: when do we _not_ match the current upcall? */
++	/* DMXXX: anyway, let's do a simplistic check */
++	if (im_in->im_type == im->im_type) {
++		/* copy the response into the spnfs struct */
++		memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res));
++		ret = mlen;
++	} else
++		dprintk("spnfs: downcall type != upcall type\n");
++
++
++	wake_up(&spnfs->spnfs_wq);
++/* DMXXX handle rval processing */
 +out:
++	mutex_unlock(&spnfs->spnfs_plock);
++	kfree(im_in);
 +	return ret;
 +}
-+EXPORT_SYMBOL(nfs_initiate_write);
 +
-+int pnfs_initiate_write(struct nfs_write_data *data,
-+			struct rpc_clnt *clnt,
-+			const struct rpc_call_ops *call_ops,
-+			int how)
++static void
++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg)
 +{
-+	if (data->req->wb_lseg &&
-+	    (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
-+		return pnfs_get_write_status(data);
++	struct spnfs_msg *im = msg->data;
++	struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im);
 +
-+	return nfs_initiate_write(data, clnt, call_ops, how);
++	if (msg->errno >= 0)
++		return;
++	mutex_lock(&spnfs->spnfs_plock);
++	im->im_status = SPNFS_STATUS_FAIL;  /* DMXXX */
++	wake_up(&spnfs->spnfs_wq);
++	mutex_unlock(&spnfs->spnfs_plock);
 +}
 +
-+/*
-+ * Set up the argument/result storage required for the RPC call.
-+ */
-+static int nfs_write_rpcsetup(struct nfs_page *req,
-+		struct nfs_write_data *data,
-+		const struct rpc_call_ops *call_ops,
-+		unsigned int count, unsigned int offset,
-+		int how)
++/* generic upcall.  called by functions in spnfs_ops.c  */
++int
++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg,
++		union spnfs_msg_res *res)
 +{
-+	struct inode *inode = req->wb_context->path.dentry->d_inode;
++	struct rpc_pipe_msg msg;
++	struct spnfs_msg *im;
++	DECLARE_WAITQUEUE(wq, current);
++	int ret = -EIO;
++	int rval;
 +
- 	/* Set up the RPC argument and reply structs
- 	 * NB: take care not to mess about with data->commit et al. */
- 
- 	data->req = req;
- 	data->inode = inode = req->wb_context->path.dentry->d_inode;
--	data->cred = msg.rpc_cred;
-+	data->cred = req->wb_context->cred;
- 
- 	data->args.fh     = NFS_FH(inode);
- 	data->args.offset = req_offset(req) + offset;
-@@ -839,30 +907,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
- 	data->res.verf    = &data->verf;
- 	nfs_fattr_init(&data->fattr);
- 
--	/* Set up the initial task struct.  */
--	NFS_PROTO(inode)->write_setup(data, &msg);
--
--	dprintk("NFS: %5u initiated write call "
--		"(req %s/%lld, %u bytes @ offset %llu)\n",
--		data->task.tk_pid,
--		inode->i_sb->s_id,
--		(long long)NFS_FILEID(inode),
--		count,
--		(unsigned long long)data->args.offset);
--
--	task = rpc_run_task(&task_setup_data);
--	if (IS_ERR(task)) {
--		ret = PTR_ERR(task);
--		goto out;
--	}
--	if (how & FLUSH_SYNC) {
--		ret = rpc_wait_for_completion_task(task);
--		if (ret == 0)
--			ret = task->tk_status;
--	}
--	rpc_put_task(task);
--out:
--	return ret;
-+	return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
- }
- 
- /* If a nfs_flush_* function fails, it should remove reqs from @head and
-@@ -873,6 +918,7 @@ static void nfs_redirty_request(struct nfs_page *req)
- {
- 	struct page *page = req->wb_page;
- 
-+	nfs_mark_request_nopnfs(req);
- 	nfs_mark_request_dirty(req);
- 	nfs_clear_page_tag_locked(req);
- 	nfs_end_page_writeback(page);
-@@ -985,6 +1031,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
- {
- 	size_t wsize = NFS_SERVER(inode)->wsize;
- 
-+	pnfs_pageio_init_write(pgio, inode, &wsize);
++	im = &spnfs->spnfs_im;
 +
- 	if (wsize < PAGE_CACHE_SIZE)
- 		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
- 	else
-@@ -1050,13 +1098,27 @@ out:
- void nfs_write_prepare(struct rpc_task *task, void *calldata)
- {
- 	struct nfs_write_data *data = calldata;
-+	struct nfs4_session *ds_session = NULL;
++	mutex_lock(&spnfs->spnfs_lock);
++	mutex_lock(&spnfs->spnfs_plock);
 +
-+	if (data->fldata.ds_nfs_client) {
-+		dprintk("%s DS read\n", __func__);
-+		ds_session = data->fldata.ds_nfs_client->cl_session;
-+	} else if (data->args.count > NFS_SERVER(data->inode)->wsize) {
-+		/* retrying via MDS? */
-+		data->pdata.orig_count = data->args.count;
-+		data->args.count = NFS_SERVER(data->inode)->wsize;
-+		dprintk("%s: trimmed count %u to wsize %u\n", __func__,
-+		data->pdata.orig_count, data->args.count);
-+	} else
-+		data->pdata.orig_count = 0;
- 
--	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
-+	if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
- 				&data->args.seq_args,
- 				&data->res.seq_res, 1, task))
- 		return;
- 	rpc_call_start(task);
- }
-+EXPORT_SYMBOL(nfs_write_prepare);
- #endif /* CONFIG_NFS_V4_1 */
- 
- static const struct rpc_call_ops nfs_write_partial_ops = {
-@@ -1140,10 +1202,11 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
- 	struct nfs_writeargs	*argp = &data->args;
- 	struct nfs_writeres	*resp = &data->res;
- 	struct nfs_server	*server = NFS_SERVER(data->inode);
-+	struct nfs_client	*clp = server->nfs_client;
- 	int status;
- 
--	dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
--		task->tk_pid, task->tk_status);
-+	dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n",
-+		task->tk_pid, task->tk_status, resp->count);
- 
- 	/*
- 	 * ->write_done will attempt to use post-op attributes to detect
-@@ -1156,6 +1219,13 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
- 	if (status != 0)
- 		return status;
- 	nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
-+#ifdef CONFIG_NFS_V4_1
-+	/* Is this a DS session */
-+	if (data->fldata.ds_nfs_client) {
-+		dprintk("%s DS write\n", __func__);
-+		clp = data->fldata.ds_nfs_client;
++	memset(im, 0, sizeof(*im));
++	memcpy(im, upmsg, sizeof(*upmsg));
++
++	memset(&msg, 0, sizeof(msg));
++	msg.data = im;
++	msg.len = sizeof(*im);
++
++	add_wait_queue(&spnfs->spnfs_wq, &wq);
++	rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg);
++	if (rval < 0) {
++		remove_wait_queue(&spnfs->spnfs_wq, &wq);
++		goto out;
 +	}
-+#endif /* CONFIG_NFS_V4_1 */
- 
- #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
- 	if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
-@@ -1172,7 +1242,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
- 		if (time_before(complain, jiffies)) {
- 			dprintk("NFS:       faulty NFS server %s:"
- 				" (committed = %d) != (stable = %d)\n",
--				server->nfs_client->cl_hostname,
-+				clp->cl_hostname,
- 				resp->verf->committed, argp->stable);
- 			complain = jiffies + 300 * HZ;
- 		}
-@@ -1198,6 +1268,9 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
- 				 */
- 				argp->stable = NFS_FILE_SYNC;
- 			}
-+#ifdef CONFIG_NFS_V4_1
-+			data->pdata.pnfs_error = -EAGAIN;
-+#endif /* CONFIG_NFS_V4_1 */
- 			nfs_restart_rpc(task, server->nfs_client);
- 			return -EAGAIN;
- 		}
-@@ -1242,40 +1315,73 @@ static void nfs_commitdata_release(void *data)
- 	nfs_commit_free(wdata);
- }
- 
--/*
-- * Set up the argument/result storage required for the RPC call.
-- */
--static int nfs_commit_rpcsetup(struct list_head *head,
--		struct nfs_write_data *data,
--		int how)
-+int nfs_initiate_commit(struct nfs_write_data *data,
-+			struct rpc_clnt *clnt,
-+			const struct rpc_call_ops *call_ops,
-+			int how)
- {
--	struct nfs_page *first = nfs_list_entry(head->next);
--	struct inode *inode = first->wb_context->path.dentry->d_inode;
-+	struct inode *inode = data->inode;
- 	int priority = flush_task_priority(how);
- 	struct rpc_task *task;
- 	struct rpc_message msg = {
- 		.rpc_argp = &data->args,
- 		.rpc_resp = &data->res,
--		.rpc_cred = first->wb_context->cred,
-+		.rpc_cred = data->cred,
- 	};
- 	struct rpc_task_setup task_setup_data = {
- 		.task = &data->task,
--		.rpc_client = NFS_CLIENT(inode),
-+		.rpc_client = clnt,
- 		.rpc_message = &msg,
--		.callback_ops = &nfs_commit_ops,
-+		.callback_ops = call_ops,
- 		.callback_data = data,
- 		.workqueue = nfsiod_workqueue,
- 		.flags = RPC_TASK_ASYNC,
- 		.priority = priority,
- 	};
- 
-+	/* Set up the initial task struct.  */
-+	NFS_PROTO(inode)->commit_setup(data, &msg);
 +
-+	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
++	set_current_state(TASK_UNINTERRUPTIBLE);
++	mutex_unlock(&spnfs->spnfs_plock);
++	schedule();
++	current->state = TASK_RUNNING;
++	remove_wait_queue(&spnfs->spnfs_wq, &wq);
++	mutex_lock(&spnfs->spnfs_plock);
++
++	if (im->im_status & SPNFS_STATUS_SUCCESS) {
++		/* copy our result from the upcall */
++		memcpy(res, &im->im_res, sizeof(*res));
++		ret = 0;
++	}
++
++out:
++	memset(im, 0, sizeof(*im));
++	mutex_unlock(&spnfs->spnfs_plock);
++	mutex_unlock(&spnfs->spnfs_lock);
++	return(ret);
++}
++
++/*
++ * This is used to determine if the spnfsd daemon has been started at
++ * least once since the system came up.  This is used by the export
++ * mechanism to decide if spnfs is in use.
++ *
++ * Returns non-zero if the spnfsd has initialized the communication pipe
++ * at least once.
++ */
++int spnfs_enabled(void)
++{
++	return spnfs_enabled_at_some_point;
++}
++
++#ifdef CONFIG_PROC_FS
++
++/*
++ * procfs virtual files for user/kernel space communication:
++ *
++ * ctl - currently just an on/off switch...can be expanded
++ * getfh - fd to fh conversion
++ * recall - recall a layout from the command line, for example:
++ *		echo <path> > /proc/fs/spnfs/recall
++ * config - configuration info, e.g., stripe size, num ds, etc.
++ */
 +
-+	task = rpc_run_task(&task_setup_data);
-+	if (IS_ERR(task))
-+		return PTR_ERR(task);
-+	rpc_put_task(task);
++/*-------------- start ctl -------------------------*/
++static ssize_t ctl_write(struct file *file, const char __user *buf,
++			 size_t count, loff_t *offset)
++{
++	int cmd, rc;
++
++	if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int)))
++		return -EFAULT;
++	if (cmd) {
++		rc = nfsd_spnfs_new();
++		if (rc != 0)
++			return rc;
++	} else
++		nfsd_spnfs_delete();
++
++	return count;
++}
++
++static const struct file_operations ctl_ops = {
++	.write		= ctl_write,
++};
++/*-------------- end ctl ---------------------------*/
++
++/*-------------- start config -------------------------*/
++static ssize_t config_write(struct file *file, const char __user *buf,
++			    size_t count, loff_t *offset)
++{
++	static struct spnfs_config cfg;
++
++	if (copy_from_user(&cfg, buf, count))
++		return -EFAULT;
++
++	spnfs_config = &cfg;
 +	return 0;
 +}
-+EXPORT_SYMBOL(nfs_initiate_commit);
 +
++static const struct file_operations config_ops = {
++	.write		= config_write,
++};
++/*-------------- end config ---------------------------*/
 +
-+int pnfs_initiate_commit(struct nfs_write_data *data,
-+			 struct rpc_clnt *clnt,
-+			 const struct rpc_call_ops *call_ops,
-+			 int how, int pnfs)
++/*-------------- start getfh -----------------------*/
++static int getfh_open(struct inode *inode, struct file *file)
 +{
-+	if (pnfs &&
-+	    (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED))
-+		return pnfs_get_write_status(data);
++	file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
++	if (file->private_data == NULL)
++		return -ENOMEM;
 +
-+	return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how);
++	return 0;
 +}
 +
-+/*
-+ * Set up the argument/result storage required for the RPC call.
-+ */
-+static int nfs_commit_rpcsetup(struct list_head *head,
-+		struct nfs_write_data *data,
-+		int how, int pnfs)
++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count,
++			  loff_t *offset)
 +{
-+	struct nfs_page *first = nfs_list_entry(head->next);
-+	struct inode *inode = first->wb_context->path.dentry->d_inode;
++	if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh)))
++		return -EFAULT;
 +
- 	/* Set up the RPC argument and reply structs
- 	 * NB: take care not to mess about with data->commit et al. */
- 
- 	list_splice_init(head, &data->pages);
- 
- 	data->inode	  = inode;
--	data->cred	  = msg.rpc_cred;
-+	data->cred	  = first->wb_context->cred;
- 
- 	data->args.fh     = NFS_FH(data->inode);
- 	/* Note: we always request a commit of the entire inode */
-@@ -1286,45 +1392,47 @@ static int nfs_commit_rpcsetup(struct list_head *head,
- 	data->res.fattr   = &data->fattr;
- 	data->res.verf    = &data->verf;
- 	nfs_fattr_init(&data->fattr);
-+	kref_init(&data->refcount);
-+	data->parent      = NULL;
-+	data->args.context = first->wb_context;  /* used by commit done */
- 
--	/* Set up the initial task struct.  */
--	NFS_PROTO(inode)->commit_setup(data, &msg);
-+	return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops,
-+				    how, pnfs);
++	return count;
 +}
- 
--	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
-+/* Handle memory error during commit */
-+void nfs_mark_list_commit(struct list_head *head)
++
++static ssize_t getfh_write(struct file *file, const char __user *buf,
++			   size_t count, loff_t *offset)
 +{
-+	struct nfs_page         *req;
- 
--	task = rpc_run_task(&task_setup_data);
--	if (IS_ERR(task))
--		return PTR_ERR(task);
--	rpc_put_task(task);
--	return 0;
-+	while (!list_empty(head)) {
-+		req = nfs_list_entry(head->next);
-+		nfs_list_remove_request(req);
-+		nfs_mark_request_commit(req);
-+		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-+				BDI_RECLAIMABLE);
-+		nfs_clear_page_tag_locked(req);
++	int fd;
++
++	if (copy_from_user((int *)&fd, (int *)buf, sizeof(int)))
++		return -EFAULT;
++	if (spnfs_getfh(fd, file->private_data) != 0)
++		return -EIO;
++
++	return count;
++}
++
++static int getfh_release(struct inode *inode, struct file *file)
++{
++	kfree(file->private_data);
++	return 0;
++}
++
++static const struct file_operations getfh_ops = {
++	.open		= getfh_open,
++	.read		= getfh_read,
++	.write		= getfh_write,
++	.release	= getfh_release,
++};
++/*-------------- end getfh ------------------------*/
++
++
++/*-------------- start recall layout --------------*/
++static ssize_t recall_write(struct file *file, const char __user *buf,
++			    size_t count, loff_t *offset)
++{
++	char input[128];
++	char *path, *str, *p;
++	int rc;
++	u64 off = 0, len = 0;
++
++	if (count > 128)
++		return -EINVAL;
++
++	if (copy_from_user(input, buf, count))
++		return -EFAULT;
++
++	/* assumes newline-terminated path */
++	p = memchr(input, '\n', count);
++	if (p == NULL)
++		return -EINVAL;
++	*p = '\0';
++
++	/*
++	 * Scan for path and, optionally, an offset and length
++	 * of a layout segment to be recalled; if there are two
++	 * fields, they're assumed to be path and offset.
++	 */
++	p = input;
++	path = strsep(&p, " ");
++	if (path == NULL)
++		return -EINVAL;
++
++	str = strsep(&p, " ");
++	if (str != NULL) {
++		rc = strict_strtoull(str, 10, &off);
++		if (rc != 0)
++			return -EINVAL;
++
++		str = strsep(&p, " ");
++		if (str != NULL) {
++			rc = strict_strtoull(str, 10, &len);
++			if (rc != 0)
++				return -EINVAL;
++		}
 +	}
- }
-+EXPORT_SYMBOL(nfs_mark_list_commit);
- 
- /*
-  * Commit dirty pages
-  */
- static int
--nfs_commit_list(struct inode *inode, struct list_head *head, int how)
-+nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs)
- {
- 	struct nfs_write_data	*data;
--	struct nfs_page         *req;
- 
- 	data = nfs_commitdata_alloc();
--
- 	if (!data)
- 		goto out_bad;
- 
- 	/* Set up the argument struct */
--	return nfs_commit_rpcsetup(head, data, how);
-+	return nfs_commit_rpcsetup(head, data, how, pnfs);
-  out_bad:
--	while (!list_empty(head)) {
--		req = nfs_list_entry(head->next);
--		nfs_list_remove_request(req);
--		nfs_mark_request_commit(req);
--		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
--		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
--				BDI_RECLAIMABLE);
--		nfs_clear_page_tag_locked(req);
--	}
-+	nfs_mark_list_commit(head);
- 	nfs_commit_clear_lock(NFS_I(inode));
- 	return -ENOMEM;
- }
-@@ -1344,6 +1452,19 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
- 		return;
- }
- 
-+static inline void nfs_commit_cleanup(struct kref *kref)
++
++	rc = spnfs_test_layoutrecall(path, off, len);
++	if (rc != 0)
++		return rc;
++
++	return count;
++}
++
++static const struct file_operations recall_ops = {
++	.write		= recall_write,
++};
++/*-------------- end recall layout --------------*/
++
++
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++/*-------------- start layoutseg -------------------------*/
++static ssize_t layoutseg_write(struct file *file, const char __user *buf,
++			       size_t count, loff_t *offset)
++{
++	char cmd[3];
++
++	if (copy_from_user(cmd, buf, 1))
++		return -EFAULT;
++	if (cmd[0] == '0')
++		spnfs_use_layoutsegments = 0;
++	else
++		spnfs_use_layoutsegments = 1;
++
++	return count;
++}
++
++static const struct file_operations layoutseg_ops = {
++	.write		= layoutseg_write,
++};
++/*-------------- end layoutseg ---------------------------*/
++
++/*-------------- start layoutsegsize -------------------------*/
++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf,
++				   size_t count, loff_t *offset)
 +{
-+	struct nfs_write_data *data;
++	char cmd[50];
 +
-+	data = container_of(kref, struct nfs_write_data, refcount);
-+	/* Clear lock only when all cloned commits are finished */
-+	if (data->parent)
-+		kref_put(&data->parent->refcount, nfs_commit_cleanup);
-+	else
-+		nfs_commit_clear_lock(NFS_I(data->inode));
-+	nfs_commitdata_release(data);
++	if (copy_from_user(cmd, buf, 49))
++		return -EFAULT;
++	layoutsegment_size = simple_strtoull(cmd, NULL, 10);
++
++	return count;
 +}
 +
- static void nfs_commit_release(void *calldata)
- {
- 	struct nfs_write_data	*data = calldata;
-@@ -1361,6 +1482,11 @@ static void nfs_commit_release(void *calldata)
- 			req->wb_bytes,
- 			(long long)req_offset(req));
- 		if (status < 0) {
-+			if (req->wb_lseg) {
-+				nfs_mark_request_nopnfs(req);
-+				nfs_mark_request_dirty(req);
-+				goto next;
-+			}
- 			nfs_context_set_write_error(req->wb_context, status);
- 			nfs_inode_remove_request(req);
- 			dprintk(", error = %d\n", status);
-@@ -1377,12 +1503,12 @@ static void nfs_commit_release(void *calldata)
- 		}
- 		/* We have a mismatch. Write the page again */
- 		dprintk(" mismatch\n");
-+		nfs_mark_request_nopnfs(req);
- 		nfs_mark_request_dirty(req);
- 	next:
- 		nfs_clear_page_tag_locked(req);
- 	}
--	nfs_commit_clear_lock(NFS_I(data->inode));
--	nfs_commitdata_release(calldata);
-+	kref_put(&data->refcount, nfs_commit_cleanup);
- }
- 
- static const struct rpc_call_ops nfs_commit_ops = {
-@@ -1398,21 +1524,22 @@ int nfs_commit_inode(struct inode *inode, int how)
- 	LIST_HEAD(head);
- 	int may_wait = how & FLUSH_SYNC;
- 	int res = 0;
-+	int use_pnfs = 0;
- 
- 	if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
- 		goto out_mark_dirty;
- 	spin_lock(&inode->i_lock);
--	res = nfs_scan_commit(inode, &head, 0, 0);
-+	res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs);
- 	spin_unlock(&inode->i_lock);
- 	if (res) {
--		int error = nfs_commit_list(inode, &head, how);
-+		int error = nfs_commit_list(inode, &head, how, use_pnfs);
- 		if (error < 0)
- 			return error;
--		if (may_wait)
-+		if (may_wait) {
- 			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
- 					nfs_wait_bit_killable,
- 					TASK_KILLABLE);
--		else
-+		} else
- 			goto out_mark_dirty;
- 	} else
- 		nfs_commit_clear_lock(NFS_I(inode));
-@@ -1465,7 +1592,18 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
- 
- int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
- {
--	return nfs_commit_unstable_pages(inode, wbc);
-+	int ret;
-+	ret = nfs_commit_unstable_pages(inode, wbc);
-+	if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) {
-+		int err, sync = wbc->sync_mode;
++static const struct file_operations layoutsegsize_ops = {
++	.write		= layoutsegsize_write,
++};
++/*-------------- end layoutsegsize ---------------------------*/
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
 +
-+		if (wbc->nonblocking || wbc->for_background)
-+			sync = 0;
-+		err = pnfs_layoutcommit_inode(inode, sync);
-+		if (err < 0)
-+			ret = err;
-+	}
-+	return ret;
- }
- 
- /*
-diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
-index 4264377..62033eb 100644
---- a/fs/nfsd/Kconfig
-+++ b/fs/nfsd/Kconfig
-@@ -79,3 +79,52 @@ config NFSD_V4
- 	  available from http://linux-nfs.org/.
- 
- 	  If unsure, say N.
++int
++spnfs_init_proc(void)
++{
++	struct proc_dir_entry *entry;
 +
-+config PNFSD
-+	bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)"
-+	depends on NFSD_V4 && EXPERIMENTAL
-+	select EXPORTFS_FILE_LAYOUT
-+	help
-+	  This option enables support for the parallel NFS features of the
-+	  minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1)
-+	  in the kernel's NFS server.
++	entry = proc_mkdir("fs/spnfs", NULL);
++	if (!entry)
++		return -ENOMEM;
 +
-+	  Unless you're an NFS developer, say N.
++	entry = create_proc_entry("fs/spnfs/ctl", 0, NULL);
++	if (!entry)
++		return -ENOMEM;
++	entry->proc_fops = &ctl_ops;
 +
-+config PNFSD_LOCAL_EXPORT
-+	bool "Enable pNFS support for exporting local filesystems for debugging purposes"
-+	depends on PNFSD
-+	help
-+	  Say Y here if you want your pNFS server to export local file systems
-+	  over the files layout type.  With this option the MDS (metadata
-+	  server) functions also as a single DS (data server).  This is mostly
-+	  useful for development and debugging purposes.
++	entry = create_proc_entry("fs/spnfs/config", 0, NULL);
++	if (!entry)
++		return -ENOMEM;
++	entry->proc_fops = &config_ops;
 +
-+	  If unsure, say N.
++	entry = create_proc_entry("fs/spnfs/getfh", 0, NULL);
++	if (!entry)
++		return -ENOMEM;
++	entry->proc_fops = &getfh_ops;
 +
-+config SPNFS
-+	bool "Provide spNFS server support (EXPERIMENTAL)"
-+	depends on PNFSD
-+	select RPCSEC_GSS_KRB5
-+	help
-+	  Say Y here if you want spNFS server support.
++	entry = create_proc_entry("fs/spnfs/recall", 0, NULL);
++	if (!entry)
++		return -ENOMEM;
++	entry->proc_fops = &recall_ops;
 +
-+	  If unsure, say N.
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++	entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL);
++	if (!entry)
++		return -ENOMEM;
++	entry->proc_fops = &layoutseg_ops;
 +
-+config SPNFS_LAYOUTSEGMENTS
-+	bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)"
-+	depends on SPNFS
-+	select RPCSEC_GSS_KRB5
-+	help
-+	  Say Y here if you want spNFS to be able to return layout segments.
++	entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL);
++	if (!entry)
++		return -ENOMEM;
++	entry->proc_fops = &layoutsegsize_ops;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
 +
-+	  If unsure, say N.
++	return 0;
++}
++#endif /* CONFIG_PROC_FS */
+diff -up linux-2.6.37.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.37.noarch/fs/nfsd/spnfs_ops.c
+--- linux-2.6.37.noarch/fs/nfsd/spnfs_ops.c.orig	2011-01-28 09:43:53.369768328 -0500
++++ linux-2.6.37.noarch/fs/nfsd/spnfs_ops.c	2011-01-28 09:43:53.369768328 -0500
+@@ -0,0 +1,878 @@
++/*
++ * fs/nfsd/spnfs_ops.c
++ *
++ * Communication layer between spNFS kernel and userspace
++ *
++ */
++/******************************************************************************
 +
-+config SPNFS_BLOCK
-+	bool "Provide Block Layout server support (EXPERIMENTAL)"
-+	depends on SPNFS
-+	select EXPORTFS_BLOCK_LAYOUT
-+	help
-+	  Say Y here if you want spNFS block layout support
++(c) 2007 Network Appliance, Inc.  All Rights Reserved.
 +
-+	  If unsure, say N.
-diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
-index 9b118ee..fed6c25 100644
---- a/fs/nfsd/Makefile
-+++ b/fs/nfsd/Makefile
-@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
- nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
- nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
- 			   nfs4acl.o nfs4callback.o nfs4recover.o
-+nfsd-$(CONFIG_PNFSD)	+= nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o
-+nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o
-+nfsd-$(CONFIG_SPNFS)	+= spnfs_com.o spnfs_ops.o
-+nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o
-diff --git a/fs/nfsd/bl_com.c b/fs/nfsd/bl_com.c
-new file mode 100644
-index 0000000..aac98c7
---- /dev/null
-+++ b/fs/nfsd/bl_com.c
-@@ -0,0 +1,292 @@
-+#if defined(CONFIG_SPNFS_BLOCK)
++Network Appliance provides this source code under the GPL v2 License.
++The GPL v2 license is available at
++http://opensource.org/licenses/gpl-license.php.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++******************************************************************************/
 +
-+#include <linux/module.h>
-+#include <linux/mutex.h>
-+#include <linux/init.h>
-+#include <linux/types.h>
-+#include <linux/slab.h>
-+#include <linux/socket.h>
-+#include <linux/in.h>
 +#include <linux/sched.h>
-+#include <linux/exportfs.h>
++#include <linux/file.h>
 +#include <linux/namei.h>
-+#include <linux/mount.h>
-+#include <linux/path.h>
-+#include <linux/sunrpc/clnt.h>
-+#include <linux/workqueue.h>
-+#include <linux/sunrpc/rpc_pipe_fs.h>
-+#include <linux/proc_fs.h>
 +#include <linux/nfs_fs.h>
-+
++#include <linux/nfsd4_spnfs.h>
 +#include <linux/nfsd/debug.h>
-+#include <linux/nfsd4_block.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
++
++#include "pnfsd.h"
++
++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */
++/* #define CONFIG_SPNFS_TEST 1 */
++
++#define	NFSDDBG_FACILITY		NFSDDBG_PNFS
++
++/*
++ * The functions that are called from elsewhere in the kernel
++ * to perform tasks in userspace
++ *
++ */
++
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++extern int spnfs_use_layoutsegments;
++extern uint64_t layoutsegment_size;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++extern struct spnfs *global_spnfs;
 +
-+#define NFSDDBG_FACILITY NFSDDBG_PNFS
++int
++spnfs_layout_type(struct super_block *sb)
++{
++	return LAYOUT_NFSV4_1_FILES;
++}
 +
-+static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-+    char __user *, size_t);
-+static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
-+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
++enum nfsstat4
++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr,
++		const struct nfsd4_pnfs_layoutget_arg *lg_arg,
++		struct nfsd4_pnfs_layoutget_res *lg_res)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	struct pnfs_filelayout_layout *flp = NULL;
++	int status, i;
++	enum nfsstat4 nfserr;
 +
-+static struct rpc_pipe_ops bl_upcall_ops = {
-+	.upcall		= bl_pipe_upcall,
-+	.downcall	= bl_pipe_downcall,
-+	.destroy_msg	= bl_pipe_destroy_msg,
-+};
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
 +
-+bl_comm_t	*bl_comm_global;
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
 +
-+int
-+nfsd_bl_start(void)
-+{
-+	bl_comm_t	*bl_comm = NULL;
-+	struct path path;
-+	struct nameidata nd;
-+	int rc;
++	im->im_type = SPNFS_TYPE_LAYOUTGET;
++	im->im_args.layoutget_args.inode = inode->i_ino;
++	im->im_args.layoutget_args.generation = inode->i_generation;
 +
-+	dprintk("%s: starting pipe\n", __func__);
-+	if (bl_comm_global)
-+		return -EEXIST;
++	/* call function to queue the msg for upcall */
++	if (spnfs_upcall(spnfs, im, res) != 0) {
++		dprintk("failed spnfs upcall: layoutget\n");
++		nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++		goto layoutget_cleanup;
++	}
++	status = res->layoutget_res.status;
++	if (status != 0) {
++		/* FIXME? until user mode is fixed, translate system error */
++		switch (status) {
++		case -E2BIG:
++		case -ETOOSMALL:
++			nfserr = NFS4ERR_TOOSMALL;
++			break;
++		case -ENOMEM:
++		case -EAGAIN:
++		case -EINTR:
++			nfserr = NFS4ERR_LAYOUTTRYLATER;
++			break;
++		case -ENOENT:
++			nfserr = NFS4ERR_BADLAYOUT;
++			break;
++ 		default:
++			nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++		}
++		dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n",
++			status, nfserr);
++		goto layoutget_cleanup;
++	}
 +
-+	path.mnt = rpc_get_mount();
-+	if (IS_ERR(path.mnt))
-+		return PTR_ERR(path.mnt);
++	lg_res->lg_return_on_close = 0;
++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
++	/* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */
++	/* the amount requested by the client.			      */
++	if (spnfs_use_layoutsegments) {
++		if (layoutsegment_size != 0)
++			lg_res->lg_seg.length = layoutsegment_size;
++	} else
++		lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#else
++	lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
 +
-+	/* FIXME: do not abuse rpc_pipefs/nfs */
-+	rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
-+	if (rc)
-+		goto err;
++	flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL);
++	if (flp == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	flp->device_id.sbid = lg_arg->lg_sbid;
++	flp->device_id.devid = res->layoutget_res.devid;
++	flp->lg_layout_type = 1; /* XXX */
++	flp->lg_stripe_type = res->layoutget_res.stripe_type;
++	flp->lg_commit_through_mds = 0;
++	flp->lg_stripe_unit =  res->layoutget_res.stripe_size;
++	flp->lg_first_stripe_index = 0;
++	flp->lg_pattern_offset = 0;
++	flp->lg_fh_length = res->layoutget_res.stripe_count;
 +
-+	bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL);
-+	if (!bl_comm) {
-+		rc = -ENOMEM;
-+		goto err;
++	flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh),
++				  GFP_KERNEL);
++	if (flp->lg_fh_list == NULL) {
++		nfserr = NFS4ERR_LAYOUTTRYLATER;
++		goto layoutget_cleanup;
++	}
++	/*
++	 * FIX: Doing an extra copy here.  Should group res.flist's fh_len
++	 * and fh_val into a knfsd_fh structure.
++	 */
++	for (i = 0; i < flp->lg_fh_length; i++) {
++		flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len;
++		memcpy(&flp->lg_fh_list[i].fh_base,
++		       res->layoutget_res.flist[i].fh_val,
++		       res->layoutget_res.flist[i].fh_len);
 +	}
 +
-+	/* FIXME: rename to "spnfs_block" */
-+	bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm,
-+					 &bl_upcall_ops, 0);
-+	if (IS_ERR(bl_comm->pipe_dentry)) {
-+		rc = -EPIPE;
-+		goto err;
++	/* encode the layoutget body */
++	nfserr = filelayout_encode_layout(xdr, flp);
++
++layoutget_cleanup:
++	if (flp) {
++		if (flp->lg_fh_list)
++			kfree(flp->lg_fh_list);
++		kfree(flp);
 +	}
-+	mutex_init(&bl_comm->lock);
-+	mutex_init(&bl_comm->pipe_lock);
-+	init_waitqueue_head(&bl_comm->pipe_wq);
++	kfree(im);
++	kfree(res);
 +
-+	bl_comm_global = bl_comm;
-+	return 0;
-+err:
-+	rpc_put_mount();
-+	kfree(bl_comm);
-+	return rc;
++	return nfserr;
 +}
 +
-+void
-+nfsd_bl_stop(void)
++int
++spnfs_layoutcommit(void)
 +{
-+	bl_comm_t	*c = bl_comm_global;
-+
-+	dprintk("%s: stopping pipe\n", __func__);
-+	if (!c)
-+		return;
-+	rpc_unlink(c->pipe_dentry);
-+	rpc_put_mount();
-+	bl_comm_global = NULL;
-+	kfree(c);
++	return 0;
 +}
 +
-+static ssize_t
-+bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst,
-+    size_t buflen)
++int
++spnfs_layoutreturn(struct inode *inode,
++		   const struct nfsd4_pnfs_layoutreturn_arg *args)
 +{
-+	char	*data	= (char *)msg->data + msg->copied;
-+	ssize_t	mlen	= msg->len - msg->copied,
-+		left;
-+
-+	if (mlen > buflen)
-+		mlen = buflen;
-+
-+	left = copy_to_user(dst, data, mlen);
-+	if (left < 0) {
-+		msg->errno = left;
-+		return left;
-+	}
-+	mlen		-= left;
-+	msg->copied	+= mlen;
-+	msg->errno	= 0;
-+
-+	return mlen;
++	return 0;
 +}
 +
-+static ssize_t
-+bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
++int
++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
 +{
-+	struct rpc_inode	*rpci	= RPC_I(filp->f_dentry->d_inode);
-+	bl_comm_t		*bc	= (bl_comm_t *)rpci->private;
-+	bl_comm_msg_t		*im	= &bc->msg;
-+	int			ret;
-+	bl_comm_res_t		*res;
-+	
++	struct super_block *sb;
++	struct nfsd4_pnfs_cb_layout lr;
 +
-+	if (mlen == 0) {
-+		im->msg_status = PNFS_BLOCK_FAILURE;
-+		im->msg_res = NULL;
-+		wake_up(&bc->pipe_wq);
-+		return -EFAULT;
-+	}
-+	
-+	if ((res = kmalloc(mlen, GFP_KERNEL)) == NULL)
-+		return -ENOMEM;
-+	
-+	if (copy_from_user(res, src, mlen)) {
-+		kfree(res);
-+		return -EFAULT;
++	switch (type) {
++	case RETURN_FILE:
++		sb = inode->i_sb;
++		dprintk("%s: recalling layout for ino = %lu\n",
++			__func__, inode->i_ino);
++		break;
++	case RETURN_FSID:
++		sb = inode->i_sb;
++		dprintk("%s: recalling layout for fsid x (unimplemented)\n",
++			__func__);
++		return 0;
++	case RETURN_ALL:
++		/* XXX figure out how to get a sb since there's no inode ptr */
++		dprintk("%s: recalling all layouts (unimplemented)\n",
++			__func__);
++		return 0;
++	default:
++		return -EINVAL;
 +	}
-+	
-+	mutex_lock(&bc->pipe_lock);
-+	
-+	ret		= mlen;
-+	im->msg_status	= res->res_status;
-+	im->msg_res	= res;
-+	
-+	wake_up(&bc->pipe_wq);
-+	mutex_unlock(&bc->pipe_lock);
-+	return ret;
-+}
 +
-+static void
-+bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
-+{
-+	bl_comm_msg_t	*im = msg->data;
-+	bl_comm_t	*bc = container_of(im, struct bl_comm, msg);
-+	
-+	if (msg->errno >= 0)
-+		return;
++	lr.cbl_recall_type = type;
++	lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++	lr.cbl_seg.clientid = 0;
++	lr.cbl_seg.offset = offset;
++	lr.cbl_seg.length = len;
++	lr.cbl_seg.iomode = IOMODE_ANY;
++	lr.cbl_layoutchanged = 0;
++
++	nfsd_layout_recall_cb(sb, inode, &lr);
 +
-+	mutex_lock(&bc->pipe_lock);
-+	im->msg_status = PNFS_BLOCK_FAILURE;
-+	wake_up(&bc->pipe_wq);
-+	mutex_unlock(&bc->pipe_lock);
++	return 0;
 +}
 +
++
 +int
-+bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res)
++spnfs_test_layoutrecall(char *path, u64 offset, u64 len)
 +{
-+	struct rpc_pipe_msg	msg;
-+	DECLARE_WAITQUEUE(wq, current);
-+	int			rval	= 1;
-+	bl_comm_msg_t		*m	= &bc->msg;
-+	
-+	if (bc == NULL) {
-+		dprintk("%s: No pNFS block daemon available\n", __func__);
-+		return 1;
-+	}
-+	
-+	mutex_lock(&bc->lock);
-+	mutex_lock(&bc->pipe_lock);
-+	
-+	memcpy(m, upmsg, sizeof (*m));
-+	
-+	memset(&msg, 0, sizeof (msg));
-+	msg.data = m;
-+	msg.len = sizeof (*m);
-+	
-+	add_wait_queue(&bc->pipe_wq, &wq);
-+	rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg);
-+	if (rval < 0) {
-+		remove_wait_queue(&bc->pipe_wq, &wq);
-+		goto out;
-+	}
-+	
-+	set_current_state(TASK_UNINTERRUPTIBLE);
-+	mutex_unlock(&bc->pipe_lock);
-+	schedule();
-+	__set_current_state(TASK_RUNNING);
-+	remove_wait_queue(&bc->pipe_wq, &wq);
-+	mutex_lock(&bc->pipe_lock);
-+	
-+	if (m->msg_status == PNFS_BLOCK_SUCCESS) {
-+		*res = m->msg_res;
-+		rval = 0;
-+	} else
-+		rval = 1;
-+	
-+out:
-+	mutex_unlock(&bc->pipe_lock);
-+	mutex_unlock(&bc->lock);
-+	return rval;
-+}
++	struct nameidata nd;
++	struct inode *inode;
++	int type, rc;
 +
-+static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len,
-+    loff_t *offset)
-+{
-+	int		cmd,
-+			rc;
-+	bl_comm_t	*bc	= bl_comm_global;
-+	bl_comm_msg_t	msg;
-+	bl_comm_res_t	*res;
++	dprintk("%s: path=%s, offset=%llu, len=%llu\n",
++		__func__, path, offset, len);
 +
-+	if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int)))
-+		return -EFAULT;
-+	switch (cmd) {
-+	case PNFS_BLOCK_CTL_STOP:
-+		msg.msg_type = PNFS_UPCALL_MSG_STOP;
-+		(void) bl_upcall(bc, &msg, &res);
-+		kfree(res);
-+		nfsd_bl_stop();
-+		break;
-+		
-+	case PNFS_BLOCK_CTL_START:
-+		rc = nfsd_bl_start();
++	if (strcmp(path, "all") == 0) {
++		inode = NULL;
++		type = RETURN_ALL;
++	} else {
++		rc = path_lookup(path, 0, &nd);
 +		if (rc != 0)
-+			return rc;
-+		break;
-+		
-+	case PNFS_BLOCK_CTL_VERS:
-+		msg.msg_type = PNFS_UPCALL_MSG_VERS;
-+		msg.u.msg_vers = PNFS_UPCALL_VERS;
-+		if (bl_upcall(bc, &msg, &res)) {
-+			dprintk("%s: Failed to contact pNFS block daemon\n",
-+			    __func__);
-+			return 0;
-+		}
-+		kfree(res);
-+		break;
-+		
-+	default:
-+		dprintk("%s: unknown ctl command %d\n", __func__, cmd);
-+		break;
++			return -ENOENT;
++
++		/*
++		 * XXX todo: add a RETURN_FSID scenario here...maybe if
++		 * inode is a dir...
++		 */
++
++		inode = nd.path.dentry->d_inode;
++		type = RETURN_FILE;
 +	}
-+	return len;
-+}
 +
-+static struct file_operations ctl_ops = {
-+	.write	= ctl_write,
-+};
++	if (len == 0)
++		len = NFS4_MAX_UINT64;
++
++	rc = spnfs_layoutrecall(inode, type, offset, len);
++
++	if (type != RETURN_ALL)
++		path_put(&nd.path);
++	return rc;
++}
 +
-+/*
-+ * bl_init_proc -- set up proc interfaces
-+ *
-+ * Creating a pnfs_block directory isn't really required at this point
-+ * since we've only got a single node in that directory. If the need for
-+ * more nodes doesn't present itself shortly this code should revert
-+ * to a single top level node. McNeal 11-Aug-2008.
-+ */
 +int
-+bl_init_proc(void)
++spnfs_getdeviceiter(struct super_block *sb,
++		    u32 layout_type,
++		    struct nfsd4_pnfs_dev_iter_res *gd_res)
 +{
-+	struct proc_dir_entry *e;
++	struct spnfs *spnfs = global_spnfs;   /* XXX keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
 +
-+	e = proc_mkdir("fs/pnfs_block", NULL);
-+	if (!e)
-+		return -ENOMEM;
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto getdeviceiter_out;
++	}
 +
-+	e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL);
-+	if (!e)
-+		return -ENOMEM;
-+	e->proc_fops = &ctl_ops;
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto getdeviceiter_out;
++	}
 +
-+	return 0;
++	im->im_type = SPNFS_TYPE_GETDEVICEITER;
++	im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie;
++	im->im_args.getdeviceiter_args.verf = gd_res->gd_verf;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto getdeviceiter_out;
++	}
++	status = res->getdeviceiter_res.status;
++
++	if (res->getdeviceiter_res.eof)
++		gd_res->gd_eof = 1;
++	else {
++		gd_res->gd_devid = res->getdeviceiter_res.devid;
++		gd_res->gd_cookie = res->getdeviceiter_res.cookie;
++		gd_res->gd_verf = res->getdeviceiter_res.verf;
++		gd_res->gd_eof = 0;
++	}
++
++getdeviceiter_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
 +}
-+#endif /* CONFIG_SPNFS_BLOCK */
-diff --git a/fs/nfsd/bl_ops.c b/fs/nfsd/bl_ops.c
-new file mode 100644
-index 0000000..e41b61b
---- /dev/null
-+++ b/fs/nfsd/bl_ops.c
-@@ -0,0 +1,1672 @@
++
++#ifdef CONFIG_SPNFS_TEST
 +/*
-+ *  bl_ops.c
-+ *  spNFS
++ * Setup the rq_res xdr_buf.  The svc_rqst rq_respages[1] page contains the
++ * 1024 encoded stripe indices.
 + *
-+ *  Created by Rick McNeal on 4/1/08.
-+ *  Copyright 2008 __MyCompanyName__. All rights reserved.
++ * Skip the devaddr4 length and encode the indicies count (1024) in the
++ * rq_res.head and set the rq_res.head length.
 + *
-+ */
-+
-+/*
-+ * Block layout operations.
++ * Set the rq_res page_len to 4096 (for the 1024 stripe indices).
++ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the
++ * rq_res head to hold the rest of the getdeviceinfo return.
 + *
-+ * These functions, with the exception of pnfs_block_enabled, are assigned to
-+ * the super block s_export_op structure.
++ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and
++ * rq_respages[rq_resused] contains the rq_res.pages.
 + */
-+#if defined(CONFIG_SPNFS_BLOCK)
-+
-+#include <linux/module.h>
-+#include <linux/genhd.h>
-+#include <linux/fs.h>
-+#include <linux/exportfs.h>
-+#include <linux/nfsd4_spnfs.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
-+#include <linux/nfsd/export.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+#include <linux/nfsd/debug.h>
-+#include <linux/spinlock_types.h>
-+#include <linux/dm-ioctl.h>
-+#include <asm/uaccess.h>
-+#include <linux/falloc.h>
-+#include <linux/nfsd4_block.h>
-+
-+#include "pnfsd.h"
++static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info,
++				  const struct pnfs_filelayout_device *fdev)
++{
++	struct nfsd4_compoundres *resp = info->resp;
++	struct svc_rqst *rqstp = resp->rqstp;
++	struct xdr_buf *xb = &resp->rqstp->rq_res;
++	__be32 *p;
 +
-+#define NFSDDBG_FACILITY	NFSDDBG_PNFS
++	p = nfsd4_xdr_reserve_space(resp, 8);
++	p++; /* Fill in length later */
++	*p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */
++	resp->p = p;
 +
-+#define MIN(a, b) ((a) < (b) ? (a) : (b))
++	xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base;
++	xb->pages = &rqstp->rq_respages[rqstp->rq_resused];
++	xb->page_base = 0;
++	xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */
++	xb->tail[0].iov_base = resp->p;
++	resp->end = xb->head[0].iov_base + PAGE_SIZE;
++	xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p;
++	return 0;
++}
++/*
++ * Return a stripeindices of length 1024 to test
++ * the pNFS client multipage getdeviceinfo implementation.
++ *
++ * Encode a page of stripe indices.
++ */
++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev,
++				  struct spnfs_device *dev,
++				  struct pnfs_devinfo_arg *info)
++{
++	struct svc_rqst *rqstp = info->xdr.resp->rqstp;
++	__be32 *p;
++	int i, j = 0;
 +
-+#define BL_LAYOUT_HASH_BITS	4
-+#define BL_LAYOUT_HASH_SIZE	(1 << BL_LAYOUT_HASH_BITS)
-+#define BL_LAYOUT_HASH_MASK	(BL_LAYOUT_HASH_SIZE - 1)
-+#define BL_LIST_REQ	(sizeof (struct dm_ioctl) + 256)
++	p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]);
++	fldev->fl_stripeindices_length = 1024;
++	/* round-robin the data servers device index into the stripe indicie */
++	for (i = 0; i < 1024; i++) {
++		*p++ = cpu_to_be32(j);
++		if (j < dev->dscount - 1)
++			j++;
++		else
++			j = 0;
++	}
++	fldev->fl_stripeindices_list = NULL;
++}
++#endif /* CONFIG_SPNFS_TEST */
 +
-+#define bl_layout_hashval(id) \
-+	((id) & BL_LAYOUT_HASH_MASK)
++int
++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
++		    u32 layout_type,
++		    const struct nfsd4_pnfs_deviceid *devid)
++{
++	struct spnfs *spnfs = global_spnfs;
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	struct spnfs_device *dev;
++	struct pnfs_filelayout_device *fldev = NULL;
++	struct pnfs_filelayout_multipath *mp = NULL;
++	struct pnfs_filelayout_devaddr *fldap = NULL;
++	int status = 0, i, len;
 +
-+#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len)
-+#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len)
-+#define _2SECTS(v) ((v) >> 9)
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
 +
-+#ifndef READ32
-+#define READ32(x)	(x) = ntohl(*p++)
-+#define READ64(x)	do {			\
-+(x) = (u64)ntohl(*p++) << 32;	\
-+(x) |= ntohl(*p++);		\
-+} while (0)
-+#endif
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
 +
++	im->im_type = SPNFS_TYPE_GETDEVICEINFO;
++	/* XXX FIX: figure out what to do about fsid */
++	im->im_args.getdeviceinfo_args.devid = devid->devid;
 +
-+typedef enum {True, False} boolean_t;
-+/* ---- block layoutget and commit structure ---- */
-+typedef struct bl_layout_rec {
-+	struct list_head	blr_hash,
-+				blr_layouts;
-+	dev_t			blr_rdev;
-+	struct inode		*blr_inode;
-+	int			blr_recalled;	// debug
-+	u64			blr_orig_size,
-+				blr_commit_size,
-+				blr_ext_size;
-+	spinlock_t		blr_lock;	// Protects blr_layouts
-+} bl_layout_rec_t;
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto getdeviceinfo_out;
++	}
++	status = res->getdeviceinfo_res.status;
++	if (status != 0)
++		goto getdeviceinfo_out;
 +
-+static struct list_head layout_hash;
-+static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE];
-+static spinlock_t layout_hashtbl_lock;
++	dev = &res->getdeviceinfo_res.devinfo;
 +
-+/* ---- prototypes ---- */
-+static boolean_t device_slice(dev_t devid);
-+static boolean_t device_dm(dev_t devid);
-+static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **);
-+static bl_layout_rec_t *layout_inode_find(struct inode *i);
-+static void layout_inode_del(struct inode *i);
-+static char *map_state2name(enum pnfs_block_extent_state4 s);
-+static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type);
-+static void bld_free(pnfs_blocklayout_devinfo_t *bld);
-+static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head *volumes,
-+    dev_t devid, int local_index);
-+static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes,
-+    dev_t devid, int my_loc, int idx);
-+static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h,
-+    struct nfsd4_layout_seg *seg);
-+struct list_head *layout_cache_iter(bl_layout_rec_t *r,
-+    struct list_head *bl_possible, struct nfsd4_layout_seg *seg);
-+static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h);
-+static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h);
-+static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg);
-+static void print_bll(pnfs_blocklayout_layout_t *b, char *);
-+static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r,
-+    struct list_head *h, struct nfsd4_layout_seg *seg);
-+static inline void bll_collapse(bl_layout_rec_t *r,
-+    pnfs_blocklayout_layout_t *c);
-+static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len,
-+    enum bl_cache_state state, struct list_head *h);
-+static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b,
-+    enum bl_cache_state c, struct list_head *h);
-+static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode,
-+    enum pnfs_block_extent_state4 *s);
-+static void extents_setup(struct fiemap_extent_info *fei);
-+static void extents_count(struct fiemap_extent_info *fei, struct inode *i,
-+    u64 foff, u64 len);
-+static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i,
-+    u64 foff, u64 len);
-+static boolean_t extents_process(struct fiemap_extent_info *fei,
-+    struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev,
-+    pnfs_blocklayout_layout_t *b);
-+static void extents_cleanup(struct fiemap_extent_info *fei);
++	/* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */
++	fldev = kzalloc(sizeof(struct pnfs_filelayout_device), GFP_KERNEL);
++	if (fldev == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
 +
-+void
-+nfsd_bl_init(void)
-+{
-+	int	i;
-+	dprintk("%s loaded\n", __func__);
++	/*
++	 * Stripe count is the same as data server count for our purposes
++	 */
++	fldev->fl_stripeindices_length = dev->dscount;
++	fldev->fl_device_length = dev->dscount;
 +
-+	spin_lock_init(&layout_hashtbl_lock);
-+	INIT_LIST_HEAD(&layout_hash);
-+	for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++)
-+		INIT_LIST_HEAD(&layout_hashtbl[i]);
-+	bl_init_proc();
-+}
++	/* Set stripe indices */
++#ifdef CONFIG_SPNFS_TEST
++	spnfs_set_test_indices(fldev, dev, info);
++	fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr;
++#else /* CONFIG_SPNFS_TEST */
++	fldev->fl_stripeindices_list =
++		kmalloc(fldev->fl_stripeindices_length * sizeof(u32),
++			GFP_KERNEL);
++	if (fldev->fl_stripeindices_list == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
++	}
++	for (i = 0; i < fldev->fl_stripeindices_length; i++)
++		fldev->fl_stripeindices_list[i] = i;
++#endif /* CONFIG_SPNFS_TEST */
 +
-+/*
-+ * pnfs_block_enabled -- check to see if this file system should be export as
-+ * block pnfs
-+ */
-+int
-+pnfs_block_enabled(struct inode *inode, int ex_flags)
-+{
-+	bl_comm_msg_t	msg;
-+	bl_comm_res_t	*res	= NULL;
-+	static int bl_comm_once	= 0;
-+	
-+	dprintk("--> %s\n", __func__);
 +	/*
-+	 * FIXME: Figure out method to determine if this file system should
-+	 * be exported. The following areas need to be checked.
-+	 * (1) Validate that this file system was exported as a pNFS
-+	 *     block-layout
-+	 * (2) Has there been successful communication with the
-+	 *     volume daemon?
++	 * Set the device's data server addresses  No multipath for spnfs,
++	 * so mp length is always 1.
++	 *
 +	 */
-+	/* Check #1 */
-+#ifdef notyet
-+	if (!(ex_flags & NFSEXP_PNFS_BLOCK)) {
-+		dprintk("%s: pnfs_block not set in export\n", __func__);
-+		return 0;
++	fldev->fl_device_list =
++		kmalloc(fldev->fl_device_length *
++			sizeof(struct pnfs_filelayout_multipath),
++			GFP_KERNEL);
++	if (fldev->fl_device_list == NULL) {
++		status = -ENOMEM;
++		goto getdeviceinfo_out;
 +	}
-+#endif
-+	
-+	/* Check #1 */
-+	if (!bl_comm_once) {
-+		msg.msg_type = PNFS_UPCALL_MSG_VERS;
-+		msg.u.msg_vers = PNFS_UPCALL_VERS;
-+		if (bl_upcall(bl_comm_global, &msg, &res)) {
-+			dprintk("%s: Failed to contact pNFS block daemon\n",
-+				__func__);
-+			return 0;
++	for (i = 0; i < fldev->fl_device_length; i++) {
++		mp = &fldev->fl_device_list[i];
++		mp->fl_multipath_length = 1;
++		mp->fl_multipath_list =
++			kmalloc(sizeof(struct pnfs_filelayout_devaddr),
++				GFP_KERNEL);
++		if (mp->fl_multipath_list == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
 +		}
-+		if (msg.u.msg_vers != res->u.vers) {
-+			dprintk("%s: vers mismatch, kernel != daemon\n",
-+				__func__);
-+			kfree(res);
-+			return 0;
++		fldap = mp->fl_multipath_list;
++
++		/*
++		 * Copy the netid into the device address, for example: "tcp"
++		 */
++		len = strlen(dev->dslist[i].netid);
++		fldap->r_netid.data = kmalloc(len, GFP_KERNEL);
++		if (fldap->r_netid.data == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		memcpy(fldap->r_netid.data, dev->dslist[i].netid, len);
++		fldap->r_netid.len = len;
++
++		/*
++		 * Copy the network address into the device address,
++		 * for example: "10.35.9.16.08.01"
++		 */
++		len = strlen(dev->dslist[i].addr);
++		fldap->r_addr.data = kmalloc(len, GFP_KERNEL);
++		if (fldap->r_addr.data == NULL) {
++			status = -ENOMEM;
++			goto getdeviceinfo_out;
++		}
++		memcpy(fldap->r_addr.data, dev->dslist[i].addr, len);
++		fldap->r_addr.len = len;
++	}
++
++	/* encode the device data */
++	status = filelayout_encode_devinfo(xdr, fldev);
++
++getdeviceinfo_out:
++	if (fldev) {
++		kfree(fldev->fl_stripeindices_list);
++		if (fldev->fl_device_list) {
++			for (i = 0; i < fldev->fl_device_length; i++) {
++				fldap =
++				    fldev->fl_device_list[i].fl_multipath_list;
++				kfree(fldap->r_netid.data);
++				kfree(fldap->r_addr.data);
++				kfree(fldap);
++			}
++			kfree(fldev->fl_device_list);
 +		}
++		kfree(fldev);
 +	}
-+	bl_comm_once = 1;
 +
++	kfree(im);
 +	kfree(res);
-+	
-+	dprintk("<-- %s okay\n", __func__);
-+	return 1;
++
++	return status;
 +}
 +
 +int
-+bl_layout_type(struct super_block *sb)
++spnfs_setattr(void)
 +{
-+	return LAYOUT_BLOCK_VOLUME;
++	return 0;
 +}
 +
 +int
-+bl_getdeviceiter(struct super_block *sb,
-+		 u32 layout_type,
-+		 struct nfsd4_pnfs_dev_iter_res *res)
++spnfs_open(struct inode *inode, struct nfsd4_open *open)
++{
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
++
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto open_out;
++	}
++
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto open_out;
++	}
++
++	im->im_type = SPNFS_TYPE_OPEN;
++	im->im_args.open_args.inode = inode->i_ino;
++	im->im_args.open_args.generation = inode->i_generation;
++	im->im_args.open_args.create = open->op_create;
++	im->im_args.open_args.createmode = open->op_createmode;
++	im->im_args.open_args.truncate = open->op_truncate;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto open_out;
++	}
++	status = res->open_res.status;
++
++open_out:
++	kfree(im);
++	kfree(res);
++
++	return status;
++}
++
++int
++spnfs_create(void)
 +{
-+	res->gd_eof = 1;	
-+	if (res->gd_cookie)
-+		return -ENOENT;
-+	res->gd_devid	= sb->s_dev;
-+	res->gd_verf	= 1;
-+	res->gd_cookie	= 1;
 +	return 0;
 +}
 +
-+static int
-+bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr,
-+		       const struct nfsd4_pnfs_deviceid *devid)
++/*
++ * Invokes the spnfsd with the inode number of the object to remove.
++ * The file has already been removed on the MDS, so all the spnsfd
++ * daemon does is remove the stripes.
++ * Returns 0 on success otherwise error code
++ */
++int
++spnfs_remove(unsigned long ino, unsigned long generation)
 +{
-+	pnfs_blocklayout_devinfo_t	*bld_slice_p,
-+					*bld_simple_p,
-+					*bld;
-+	int				status		= -EIO,
-+					location	= 0;
-+	struct list_head		volumes;
-+	
-+	dprintk("--> %s\n", __func__);
-+	INIT_LIST_HEAD(&volumes);
++	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++	struct spnfs_msg *im = NULL;
++	union spnfs_msg_res *res = NULL;
++	int status = 0;
 +
-+	bld_simple_p = bld_simple(&volumes, devid->devid,
-+				  location++);
-+	if (!bld_simple_p)
-+		goto out;
-+	bld_slice_p = bld_slice(&volumes, devid->devid, location++,
-+	    bld_simple_p->bld_index_loc);
++	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im == NULL) {
++		status = -ENOMEM;
++		goto remove_out;
++	}
 +
-+	if (!bld_slice_p)
-+		goto out;
-+	
-+	status = blocklayout_encode_devinfo(xdr, &volumes);
++	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++	if (res == NULL) {
++		status = -ENOMEM;
++		goto remove_out;
++	}
 +
-+out:
-+	while (!list_empty(&volumes)) {
-+		bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
-+		    bld_list);
-+		if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE)
-+			kfree(bld->u.simple.bld_sig);
-+		bld_free(bld);
++	im->im_type = SPNFS_TYPE_REMOVE;
++	im->im_args.remove_args.inode = ino;
++	im->im_args.remove_args.generation = generation;
++
++	/* call function to queue the msg for upcall */
++	status = spnfs_upcall(spnfs, im, res);
++	if (status != 0) {
++		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++		status = -EIO;
++		goto remove_out;
 +	}
-+	
-+	dprintk("<-- %s (rval %d)\n", __func__, status);
++	status = res->remove_res.status;
++
++remove_out:
++	kfree(im);
++	kfree(res);
++
 +	return status;
 +}
 +
 +static int
-+bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr,
-+		    const struct nfsd4_pnfs_deviceid *devid)
++read_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++	 struct file **filp)
 +{
-+	pnfs_blocklayout_devinfo_t	*bld		= NULL;
-+	int				status		= -EIO,	// default to error
-+					i,
-+					location	= 0;
-+	struct list_head		volumes;
-+	bl_comm_msg_t			msg;
-+	bl_comm_res_t			*res;
-+	
-+	dprintk("--> %s\n", __func__);
-+	INIT_LIST_HEAD(&volumes);
-+	
-+	msg.msg_type = PNFS_UPCALL_MSG_DMGET;
-+	msg.u.msg_dev = devid->devid;
-+	if (bl_upcall(bl_comm_global, &msg, &res)) {
-+		dprintk("%s: upcall for DMGET failed\n", __func__);
-+		goto out;
++	loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
++	size_t iolen;
++	int completed = 0, ds, err;
++
++	while (len > 0) {
++		tmp = offset;
++		soff = do_div(tmp, spnfs_config->stripe_size);
++		snum = tmp;
++		ds = do_div(tmp, spnfs_config->num_ds);
++		if (spnfs_config->dense_striping == 0)
++			soffset = offset;
++		else {
++			tmp = snum;
++			do_div(tmp, spnfs_config->num_ds);
++			soffset = tmp * spnfs_config->stripe_size + soff;
++		}
++		if (len < spnfs_config->stripe_size - soff)
++			iolen = len;
++		else
++			iolen = spnfs_config->stripe_size - soff;
++
++		pos = soffset;
++		err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos);
++		if (err < 0)
++			return -EIO;
++		if (err == 0)
++			break;
++		filp[ds]->f_pos = pos;
++		iolen = err;
++		completed += iolen;
++		len -= iolen;
++		offset += iolen;
++		bufoffset += iolen;
 +	}
-+		
++
++	return completed;
++}
++
++static __be32
++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++     struct svc_rqst *rqstp)
++{
++	int i, vnum, err, bytecount = 0;
++	char path[128];
++	struct file *filp[SPNFS_MAX_DATA_SERVERS];
++	size_t iolen;
++	__be32 status = nfs_ok;
++
 +	/*
-+	 * Don't use bld_alloc() here. If used this will be the first volume
-+	 * type added to the list whereas the protocol requires it to be the
-+	 * last.
++	 * XXX We should just be doing this at open time, but it gets
++	 * kind of messy storing this info in nfsd's state structures
++	 * and piggybacking its path through the various state handling
++	 * functions.  Revisit this.
 +	 */
-+	bld = kmalloc(sizeof (*bld), GFP_KERNEL);
-+	if (!bld)
-+		goto out;
-+	memset(bld, 0, sizeof (*bld));
-+	bld->bld_type			= PNFS_BLOCK_VOLUME_STRIPE;
-+	bld->u.stripe.bld_stripes	= res->u.stripe.num_stripes;
-+	bld->u.stripe.bld_chunk_size	= res->u.stripe.stripe_size * 512LL;
-+	dprintk("%s: stripes %d, chunk_size %Lu\n", __func__,
-+	    bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL);
-+	
-+	bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes *
-+						  sizeof (int), GFP_KERNEL);
-+	if (!bld->u.stripe.bld_stripe_indexs)
-+		goto out;
-+
-+	for (i = 0; i < bld->u.stripe.bld_stripes; i++) {
-+		dev_t			dev;
-+		pnfs_blocklayout_devinfo_t	*bldp;
-+		
-+		dev = MKDEV(res->u.stripe.devs[i].major,
-+			    res->u.stripe.devs[i].minor);
-+		if (dev == 0)
-+			goto out;
-+		
-+		bldp = bld_simple(&volumes, dev, location++);
-+		if (!bldp) {
-+			dprintk("%s: bld_simple failed\n", __func__);
-+			goto out;
++	memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i],
++			inode->i_ino, inode->i_generation);
++		filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0);
++		if (filp[i] == NULL) {
++			status = nfserr_io;
++			goto read_out;
 +		}
-+		bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc);
++		get_file(filp[i]);
++	}
 +
-+		if (!bldp) {
-+			dprintk("%s: bld_slice failed\n", __func__);
-+			goto out;
++	for (vnum = 0 ; vnum < vlen ; vnum++) {
++		iolen = rqstp->rq_vec[vnum].iov_len;
++		err = read_one(inode, offset + bytecount, iolen,
++			       (char *)rqstp->rq_vec[vnum].iov_base, filp);
++		if (err < 0) {
++			status = nfserr_io;
++			goto read_out;
 +		}
-+		bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc;
-+
++		if (err < iolen) {
++			bytecount += err;
++			goto read_out;
++		}
++		bytecount += rqstp->rq_vec[vnum].iov_len;
 +	}
-+	list_add_tail(&bld->bld_list, &volumes);
-+	status = blocklayout_encode_devinfo(xdr, &volumes);
-+	
-+out:
-+	while (!list_empty(&volumes)) {
-+		bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
-+		    bld_list);
-+		switch (bld->bld_type) {
-+			case PNFS_BLOCK_VOLUME_SLICE:
-+			case PNFS_BLOCK_VOLUME_CONCAT:
-+				// No memory to release for these
-+				break;
-+			case PNFS_BLOCK_VOLUME_SIMPLE:
-+				kfree(bld->u.simple.bld_sig);
-+				break;
-+			case PNFS_BLOCK_VOLUME_STRIPE:
-+				kfree(bld->u.stripe.bld_stripe_indexs);
-+				break;
++
++read_out:
++	*lenp = bytecount;
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		if (filp[i]) {
++			filp_close(filp[i], current->files);
++			fput(filp[i]);
 +		}
-+		bld_free(bld);
 +	}
-+	kfree(res);
-+	dprintk("<-- %s (rval %d)\n", __func__, status);
 +	return status;
 +}
 +
-+/*
-+ * bl_getdeviceinfo -- determine device tree for requested devid
-+ */
-+int
-+bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
-+		 u32 layout_type,
-+		 const struct nfsd4_pnfs_deviceid *devid)
++__be32
++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++	   struct svc_rqst *rqstp)
 +{
-+	if (device_slice(devid->devid) == True)
-+		return bl_getdeviceinfo_slice(sb, xdr, devid);
-+	else if (device_dm(devid->devid) == True)
-+		return bl_getdeviceinfo_dm(sb, xdr, devid);
-+	return -EINVAL;
++	if (spnfs_config)
++		return read(inode, offset, lenp, vlen, rqstp);
++	else {
++		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++		return nfserr_notsupp;
++	}
 +}
 +
-+enum nfsstat4
-+bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr,
-+	     const struct nfsd4_pnfs_layoutget_arg *arg,
-+	     struct nfsd4_pnfs_layoutget_res *res)
++static int
++write_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++	  struct file **filp)
 +{
-+	pnfs_blocklayout_layout_t	*b;
-+	bl_layout_rec_t			*r;
-+	struct list_head		bl_possible,
-+					*bl_candidates	= NULL;
-+	boolean_t			del_on_error	= False;
-+	int				adj;
-+	enum nfsstat4			nfserr		= NFS4_OK;
-+	
-+	dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n",
-+	    __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset),
-+	    _2SECTS(res->lg_seg.length), res->lg_seg.iomode);
++	loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
++	size_t iolen;
++	int completed = 0, ds, err;
 +
-+	if (res->lg_seg.length == 0) {
-+		printk("%s: request length of 0, error condition\n", __func__);
-+		return NFS4ERR_BADLAYOUT;
++	while (len > 0) {
++		tmp = offset;
++		soff = do_div(tmp, spnfs_config->stripe_size);
++		snum = tmp;
++		ds = do_div(tmp, spnfs_config->num_ds);
++		if (spnfs_config->dense_striping == 0)
++			soffset = offset;
++		else {
++			tmp = snum;
++			do_div(tmp, spnfs_config->num_ds);
++			soffset = tmp * spnfs_config->stripe_size + soff;
++		}
++		if (len < spnfs_config->stripe_size - soff)
++			iolen = len;
++		else
++			iolen = spnfs_config->stripe_size - soff;
++
++		pos = soffset;
++		err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos);
++		if (err < 0)
++			return -EIO;
++		filp[ds]->f_pos = pos;
++		iolen = err;
++		completed += iolen;
++		len -= iolen;
++		offset += iolen;
++		bufoffset += iolen;
 +	}
-+	
++
++	return completed;
++}
++
++static __be32
++write(struct inode *inode, loff_t offset, size_t len, int vlen,
++      struct svc_rqst *rqstp)
++{
++	int i, vnum, err, bytecount = 0;
++	char path[128];
++	struct file *filp[SPNFS_MAX_DATA_SERVERS];
++	size_t iolen;
++	__be32 status = nfs_ok;
++
 +	/*
-+	 * Adjust the length as required per spec.
-+	 * - First case is were the length is set to (u64)-1. Cheap means to
-+	 *   define the end of the file.
-+	 * - Second case is were the I/O mode is read-only, but the request is
-+	 *   past the end of the file so the request needs to be trimed.
++	 * XXX We should just be doing this at open time, but it gets
++	 * kind of messy storing this info in nfsd's state structures
++	 * and piggybacking its path through the various state handling
++	 * functions.  Revisit this.
 +	 */
-+	if ((res->lg_seg.length == NFS4_MAX_UINT64) ||
-+	    (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) &&
-+	     (res->lg_seg.iomode == IOMODE_READ)))
-+		res->lg_seg.length = i->i_size - res->lg_seg.offset;
-+	
-+	adj = (res->lg_seg.offset & 511) ? res->lg_seg.offset & 511 : 0;
-+	res->lg_seg.offset -= adj;
-+	res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511;
-+	
-+	if (res->lg_seg.iomode != IOMODE_READ)
-+		if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE,
-+				       res->lg_seg.offset, res->lg_seg.length))
-+			return NFS4ERR_IO;
-+		
-+	INIT_LIST_HEAD(&bl_possible);
-+	
-+	if ((r = layout_inode_find(i)) == NULL) {
-+		if (layout_inode_add(i, &r) == False) {
-+			printk("%s: layout_inode_add failed\n", __func__);
-+			return NFS4ERR_IO;
++	memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i],
++			inode->i_ino, inode->i_generation);
++		filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0);
++		if (filp[i] == NULL) {
++			status = nfserr_io;
++			goto write_out;
 +		}
-+		del_on_error = True;
-+	}
-+	BUG_ON(!r);
-+	
-+	spin_lock(&r->blr_lock);
-+	
-+	if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) {
-+		/*
-+		 * This will send LAYOUTTRYAGAIN error to the client.
-+		 */
-+		dprintk("%s: layout_cache_fill_from() failed\n", __func__);
-+		nfserr = NFS4ERR_LAYOUTTRYLATER;
-+		goto layoutget_cleanup;
-+	}
-+	
-+	res->lg_return_on_close	= 1;
-+	res->lg_seg.length	= 0;
-+	
-+	bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg);
-+	if (!bl_candidates) {
-+		nfserr = NFS4ERR_LAYOUTTRYLATER;
-+		goto layoutget_cleanup;
++		get_file(filp[i]);
 +	}
-+	
-+	layout_cache_merge(r, bl_candidates);
-+	if (layout_cache_update(r, bl_candidates)) {
-+		/* ---- Failed to allocate memory. ---- */
-+		dprintk("%s: layout_cache_update() failed\n", __func__);
-+		nfserr = NFS4ERR_LAYOUTTRYLATER;
-+		goto layoutget_cleanup;
++
++	for (vnum = 0; vnum < vlen; vnum++) {
++		iolen = rqstp->rq_vec[vnum].iov_len;
++		err = write_one(inode, offset + bytecount, iolen,
++				(char *)rqstp->rq_vec[vnum].iov_base, filp);
++		if (err != iolen) {
++			dprintk("spnfs_write: err=%d expected %Zd\n", err, len);
++			status = nfserr_io;
++			goto write_out;
++		}
++		bytecount += rqstp->rq_vec[vnum].iov_len;
 +	}
-+	
-+	nfserr = blocklayout_encode_layout(xdr, bl_candidates);
-+	if (nfserr)
-+		dprintk("%s: layoutget xdr routine failed\n", __func__);
-+	
-+layoutget_cleanup:
-+	if (bl_candidates) {
-+		while (!list_empty(bl_candidates)) {
-+			b = list_entry(bl_candidates->next,
-+			    struct pnfs_blocklayout_layout, bll_list);
-+			list_del(&b->bll_list);
-+			kfree(b);
++
++write_out:
++	for (i = 0; i < spnfs_config->num_ds; i++) {
++		if (filp[i]) {
++			filp_close(filp[i], current->files);
++			fput(filp[i]);
 +		}
 +	}
 +
-+	spin_unlock(&r->blr_lock);
-+	if (unlikely(nfserr)) {
-+		if (del_on_error == True)
-+			layout_inode_del(i);
-+		res->lg_seg.length = 0;
-+		res->lg_seg.offset = 0;
++	return status;
++}
++
++__be32
++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen,
++	    struct svc_rqst *rqstp)
++{
++	if (spnfs_config)
++		return write(inode, offset, len, vlen, rqstp);
++	else {
++		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++		return nfserr_notsupp;
 +	}
-+	
-+	dprintk("<-- %s (rval %u)\n", __func__, nfserr);
-+	return nfserr;
++}
++
++int
++spnfs_commit(void)
++{
++	return 0;
 +}
 +
 +/*
-+ * bl_layoutcommit -- commit changes, especially size, to file systemj
-+ *
-+ * Currently this routine isn't called and everything is handled within
-+ * nfsd4_layoutcommit(). By not calling this routine the server doesn't
-+ * handle a partial return, a set of extents, of the layout. The extents
-+ * are decoded here, but nothing is done with them. If this routine is
-+ * be called the interface must change to pass the 'dentry' pointer such
-+ * that notify_change() can be called.
++ * Return the state for this object.
++ * At this time simply return 0 to indicate success and use the existing state
 + */
 +int
-+bl_layoutcommit(struct inode *i,
-+		const struct nfsd4_pnfs_layoutcommit_arg *args,
-+		struct nfsd4_pnfs_layoutcommit_res *res)
++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg)
 +{
-+	bl_layout_rec_t			*r;
-+	int				status	= 0;
-+	u64				lw_plus;
-+	
-+	dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
-+	r = layout_inode_find(i);
-+	if (r) {
-+		lw_plus = args->lc_last_wr + 1;
-+		if (args->lc_newoffset) {
-+			dprintk("  lc_last_wr %Lu\n", lw_plus);
-+			if (r->blr_orig_size < lw_plus) {
-+				r->blr_orig_size	= lw_plus;
-+				res->lc_size_chg	= 1;
-+				res->lc_newsize		= lw_plus;
-+			}
-+		}
-+
-+		if (args->lc_up_len) {
-+			int	extents,
-+				i;
-+			struct pnfs_blocklayout_layout *b;
-+			__be32 *p = args->lc_up_layout;
-+			
-+			/*
-+			 * Client is returning a set of extents which
-+			 * should/could be used to update the file system.
-+			 * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08
-+			 */
-+			READ32(extents);
-+			dprintk("  Client returning %d extents: data size %d\n",
-+			    extents, args->lc_up_len);
-+			b = kmalloc(sizeof (struct pnfs_blocklayout_layout) *
-+				    extents, GFP_KERNEL);
-+			if (b) {
-+				for (i = 0; i < extents; i++) {
-+					READ64(b[i].bll_vol_id.sbid);
-+					READ64(b[i].bll_vol_id.devid);
-+					READ64(b[i].bll_foff);
-+					READ64(b[i].bll_len);
-+					READ64(b[i].bll_soff);
-+					READ32(b[i].bll_es);
-+					dprintk("  %d: foff %Lu, len %Lu, soff %Lu "
-+					    "state %s\n",
-+					    i, _2SECTS(b[i].bll_foff),
-+					    _2SECTS(b[i].bll_len),
-+					    _2SECTS(b[i].bll_soff),
-+					    map_state2name(b[i].bll_es));
-+				}
-+				kfree(b);
-+			} else {
-+				status = -ENOMEM;
-+			}
-+		}
-+	} else
-+		dprintk("%s: Unexpected commit to inode %p\n", __func__, i);
-+	
-+	dprintk("<-- %s (rval %d)\n", __func__, status);
-+	return status;
++	return 0;
 +}
 +
++/*
++ * Return the filehandle for the specified file descriptor
++ */
 +int
-+bl_layoutreturn(struct inode *i,
-+		const struct nfsd4_pnfs_layoutreturn_arg *args)
++spnfs_getfh(int fd, struct nfs_fh *fh)
 +{
-+	int				status	= 0;
-+	bl_layout_rec_t			*r;
++	struct file *file;
 +
-+	dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
-+	
-+	r = layout_inode_find(i);
-+	if (r) {
-+		spin_lock(&r->blr_lock);
-+		layout_cache_del(r, &args->lr_seg);
-+		spin_unlock(&r->blr_lock);
-+		dprintk("    ext_size %Lu, i_size %Lu, orig_size %Lu\n",
-+		    r->blr_ext_size, i->i_size, r->blr_orig_size);
-+	}
++	file = fget(fd);
++	if (file == NULL)
++		return -EIO;
 +
-+	layout_inode_del(i);
-+	dprintk("<-- %s (rval %d)\n", __func__, status);
-+	return status;
++	memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh));
++	fput(file);
++	return 0;
 +}
+diff -up linux-2.6.37.noarch/fs/nfsd/state.h.orig linux-2.6.37.noarch/fs/nfsd/state.h
+--- linux-2.6.37.noarch/fs/nfsd/state.h.orig	2011-01-28 09:37:32.568979046 -0500
++++ linux-2.6.37.noarch/fs/nfsd/state.h	2011-01-28 09:43:53.370768171 -0500
+@@ -37,6 +37,7 @@
+ 
+ #include <linux/sunrpc/svc_xprt.h>
+ #include <linux/nfsd/nfsfh.h>
++#include <linux/nfsd/export.h>
+ #include "nfsfh.h"
+ 
+ typedef struct {
+@@ -65,17 +66,6 @@ typedef struct {
+ 	(s)->si_fileid, \
+ 	(s)->si_generation
+ 
+-struct nfsd4_callback {
+-	void *cb_op;
+-	struct nfs4_client *cb_clp;
+-	struct list_head cb_per_client;
+-	u32 cb_minorversion;
+-	struct rpc_message cb_msg;
+-	const struct rpc_call_ops *cb_ops;
+-	struct work_struct cb_work;
+-	bool cb_done;
+-};
+-
+ struct nfs4_delegation {
+ 	struct list_head	dl_perfile;
+ 	struct list_head	dl_perclnt;
+@@ -267,6 +257,12 @@ struct nfs4_client {
+ 	unsigned long		cl_cb_slot_busy;
+ 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
+ 						/* wait here for slots */
++#if defined(CONFIG_PNFSD)
++	struct list_head	cl_layouts;	/* outstanding layouts */
++	struct list_head	cl_layoutrecalls; /* outstanding layoutrecall
++						     callbacks */
++	atomic_t		cl_deviceref;	/* Num outstanding devs */
++#endif /* CONFIG_PNFSD */
+ };
+ 
+ static inline void
+@@ -383,6 +379,14 @@ struct nfs4_file {
+ 	u32                     fi_id;      /* used with stateowner->so_id 
+ 					     * for stateid_hashtbl hash */
+ 	bool			fi_had_conflict;
++#if defined(CONFIG_PNFSD)
++	struct list_head	fi_layouts;
++	struct list_head	fi_layout_states;
++	/* used by layoutget / layoutrecall */
++	struct nfs4_fsid	fi_fsid;
++	u32			fi_fhlen;
++	u8			fi_fhval[NFS4_FHSIZE];
++#endif /* CONFIG_PNFSD */
+ };
+ 
+ /* XXX: for first cut may fall back on returning file that doesn't work
+@@ -411,6 +415,15 @@ static inline struct file *find_any_file
+ 		return f->fi_fds[O_RDONLY];
+ }
+ 
++#if defined(CONFIG_PNFSD)
++/* pNFS Metadata server state */
 +
-+int
-+bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
++struct pnfs_ds_dev_entry {
++	struct list_head	dd_dev_entry; /* st_pnfs_ds_id entry */
++	u32			dd_dsid;
++};
++#endif /* CONFIG_PNFSD */
++
+ /*
+ * nfs4_stateid can either be an open stateid or (eventually) a lock stateid
+ *
+@@ -433,6 +446,9 @@ struct nfs4_stateid {
+ 	struct list_head              st_perfile;
+ 	struct list_head              st_perstateowner;
+ 	struct list_head              st_lockowners;
++#if defined(CONFIG_PNFSD)
++	struct list_head              st_pnfs_ds_id;
++#endif /* CONFIG_PNFSD */
+ 	struct nfs4_stateowner      * st_stateowner;
+ 	struct nfs4_file            * st_file;
+ 	stateid_t                     st_stateid;
+@@ -485,6 +501,34 @@ extern void nfsd4_recdir_purge_old(void)
+ extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
+ extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
+ extern void release_session_client(struct nfsd4_session *);
++extern void nfsd4_free_slab(struct kmem_cache **);
++extern struct nfs4_file *find_file(struct inode *);
++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *);
++extern void put_nfs4_file(struct nfs4_file *);
++extern void get_nfs4_file(struct nfs4_file *);
++extern struct nfs4_client *find_confirmed_client(clientid_t *);
++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags);
++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *);
++extern __be32 nfs4_check_stateid(stateid_t *);
++extern void expire_client_lock(struct nfs4_client *);
++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *);
++
++#if defined(CONFIG_PNFSD)
++extern int nfsd4_init_pnfs_slabs(void);
++extern void nfsd4_free_pnfs_slabs(void);
++extern void pnfs_expire_client(struct nfs4_client *);
++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *);
++extern void nfs4_pnfs_state_init(void);
++extern void nfs4_pnfs_state_shutdown(void);
++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *);
++#else /* CONFIG_PNFSD */
++static inline void nfsd4_free_pnfs_slabs(void) {}
++static inline int nfsd4_init_pnfs_slabs(void) { return 0; }
++static inline void pnfs_expire_client(struct nfs4_client *clp) {}
++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {}
++static inline void nfs4_pnfs_state_shutdown(void) {}
++#endif /* CONFIG_PNFSD */
+ 
+ static inline void
+ nfs4_put_stateowner(struct nfs4_stateowner *so)
+@@ -498,4 +542,24 @@ nfs4_get_stateowner(struct nfs4_stateown
+ 	kref_get(&so->so_ref);
+ }
+ 
++static inline u64
++end_offset(u64 start, u64 len)
 +{
-+	struct super_block		*sb;
-+	struct nfsd4_pnfs_cb_layout	lr;
-+	bl_layout_rec_t			*r;
-+	pnfs_blocklayout_layout_t	*b;
-+	u64				adj;
-+	
-+	dprintk("--> %s\n", __func__);
-+	BUG_ON(!len);
-+	switch (type) {
-+		case RETURN_FILE:
-+			sb = inode->i_sb;
-+			dprintk("  recalling layout [0x%x:%lu], %Lu:%Lu\n",
-+			    inode->i_sb->s_dev, inode->i_ino,
-+				_2SECTS(offset), _2SECTS(len));
-+			break;
-+		case RETURN_FSID:
-+			sb = inode->i_sb;
-+			dprintk("%s: recalling layout for fsid x (unimplemented)\n",
-+				__func__);
-+			return 0;
-+		case RETURN_ALL:
-+			/*
-+			 * XXX figure out how to get a sb since there's no
-+			 * inode ptr
-+			 */
-+			dprintk("%s: recalling all layouts (unimplemented)\n",
-+				__func__);
-+			return 0;
-+		default:
-+			return -EINVAL;
-+	}
-+	
-+restart:
-+	r = layout_inode_find(inode);
-+	if (r && len && !r->blr_recalled) {
-+		spin_lock(&r->blr_lock);
-+		list_for_each_entry(b, &r->blr_layouts, bll_list) {
-+			if (!r->blr_recalled && !b->bll_recalled &&
-+			    (offset >= b->bll_foff) && (offset < BLL_F_END(b))) {
-+				b->bll_recalled		= 1;
-+				lr.cbl_recall_type	= type;
-+				lr.cbl_seg.layout_type	= LAYOUT_BLOCK_VOLUME;
-+				lr.cbl_seg.clientid	= 0;
-+				lr.cbl_seg.offset	= 0;
-+				lr.cbl_seg.length	= NFS4_MAX_UINT64;
-+				r->blr_recalled		= 1;
-+				dprintk("  FULL LAYOUTRECALL\n");
-+				lr.cbl_seg.iomode = IOMODE_ANY;
++	u64 end;
++
++	end = start + len;
++	return end >= start ? end : NFS4_MAX_UINT64;
++}
++
++/* last octet in a range */
++static inline u64
++last_byte_offset(u64 start, u64 len)
++{
++	u64 end;
 +
-+				/*
-+				 * Currently there are only two cases where the
-+				 * layout is being returned.
-+				 *    (1) Someone is issuing a NFS_WRITE operation
-+				 *        to this layout.
-+				 *    (2) The file has been truncated which means
-+				 *        the layout is immediately made invalid.
-+				 * In both cases the client must write any
-+				 * uncommitted modifications to the server via
-+				 * NFS_WRITE.
-+				 */
-+				lr.cbl_layoutchanged = 1;
++	BUG_ON(!len);
++	end = start + len;
++	return end > start ? end - 1 : NFS4_MAX_UINT64;
++}
 +
-+				/*
-+				 * Need to drop the lock because we'll get a
-+				 * layoutreturn which will block waiting for
-+				 * the lock. The request will come in on the
-+				 * same thread which will cause a deadlock.
-+				 */
-+				spin_unlock(&r->blr_lock);
-+				nfsd_layout_recall_cb(sb, inode, &lr);
-+				adj = MIN(b->bll_len - (offset - b->bll_foff),
-+				    len);
-+				offset += adj;
-+				len -= adj;
-+				if (!len) {
-+					spin_lock(&r->blr_lock);
-+					break;
-+				}
-+				/*
-+				 * Since layoutreturn will have been called we
-+				 * can't assume blr_layouts is still valid,
-+				 * so restart.
-+				 */
-+				goto restart;
+ #endif   /* NFSD4_STATE_H */
+diff -up linux-2.6.37.noarch/fs/nfsd/vfs.c.orig linux-2.6.37.noarch/fs/nfsd/vfs.c
+--- linux-2.6.37.noarch/fs/nfsd/vfs.c.orig	2011-01-28 09:37:32.569979012 -0500
++++ linux-2.6.37.noarch/fs/nfsd/vfs.c	2011-01-28 09:43:53.371768014 -0500
+@@ -36,7 +36,11 @@
+ #ifdef CONFIG_NFSD_V4
+ #include "acl.h"
+ #include "idmap.h"
++#include <linux/nfsd4_spnfs.h>
+ #endif /* CONFIG_NFSD_V4 */
++#if defined(CONFIG_SPNFS_BLOCK)
++#include <linux/nfsd4_block.h>
++#endif
+ 
+ #include "nfsd.h"
+ #include "vfs.h"
+@@ -380,6 +384,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str
+ 					NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
+ 			if (err)
+ 				goto out;
++#if defined(CONFIG_SPNFS_BLOCK)
++			if (pnfs_block_enabled(inode, 0)) {
++				err = bl_layoutrecall(inode, RETURN_FILE,
++				    iap->ia_size, inode->i_size - iap->ia_size);
 +			}
-+		}
-+		spin_unlock(&r->blr_lock);
++#endif /* CONFIG_SPNFS_BLOCK */
+ 		}
+ 
+ 		host_err = get_write_access(inode);
+@@ -1685,6 +1695,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru
+ 	struct inode	*fdir, *tdir;
+ 	__be32		err;
+ 	int		host_err;
++#ifdef CONFIG_SPNFS
++	unsigned long ino = 0;
++	unsigned long generation = 0;
++	unsigned int nlink = 0;
++#endif /* CONFIG_SPNFS */
+ 
+ 	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
+ 	if (err)
+@@ -1744,7 +1759,27 @@ nfsd_rename(struct svc_rqst *rqstp, stru
+ 	host_err = nfsd_break_lease(odentry->d_inode);
+ 	if (host_err)
+ 		goto out_drop_write;
++
++#ifdef CONFIG_SPNFS
++	/*
++	 * if the target is a preexisting regular file, remember the
++	 * inode number and generation so we can delete the stripes;
++	 * save the link count as well so that the stripes only get
++	 * get deleted when the last link is deleted
++	 */
++	if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) {
++		ino = ndentry->d_inode->i_ino;
++		generation = ndentry->d_inode->i_generation;
++		nlink = ndentry->d_inode->i_nlink;
 +	}
-+	
-+	dprintk("<-- %s\n", __func__);
-+	return 0;
-+}
++#endif /* CONFIG_SPNFS */
 +
-+/*
-+ * []------------------------------------------------------------------[]
-+ * | Support functions from here on down.				|
-+ * []------------------------------------------------------------------[]
-+ */
+ 	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
++#ifdef CONFIG_SPNFS
++	if (spnfs_enabled() && (!host_err && ino && nlink == 1))
++		spnfs_remove(ino, generation);
++#endif /* CONFIG_SPNFS */
 +
-+/*
-+ * bld_simple -- given a dev_t build a simple volume structure
-+ *
-+ * Simple volume contains the device signature and offset to that data in
-+ * the storage volume.
-+ */
-+static pnfs_blocklayout_devinfo_t *
-+bld_simple(struct list_head *volumes, dev_t devid, int local_index)
-+{
-+	pnfs_blocklayout_devinfo_t	*bld	= NULL;
-+	bl_comm_msg_t			msg;
-+	bl_comm_res_t			*res	= NULL;
-+	
-+	msg.msg_type = PNFS_UPCALL_MSG_GETSIG;
-+	msg.u.msg_dev = devid;
-+	if (bl_upcall(bl_comm_global, &msg, &res)) {
-+		dprintk("%s: Failed to get signature information\n", __func__);
-+		goto error;
-+	}
-+	
-+	bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE);
-+	if (!bld)
-+		return NULL;
-+	
-+	bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset;
-+	bld->u.simple.bld_sig_len = res->u.sig.len;
-+	bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL);
-+	if (!bld->u.simple.bld_sig)
-+		goto error;
-+	
-+	memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len);
-+	kfree(res);
-+	return bld;
-+	
-+error:
-+	if (bld)
-+		bld_free(bld);
-+	if (res)
-+		kfree(res);
-+	dprintk("%s: error in bld_simple\n", __func__);
-+	return NULL;
-+}
+ 	if (!host_err) {
+ 		host_err = commit_metadata(tfhp);
+ 		if (!host_err)
+@@ -1784,6 +1819,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru
+ 	struct inode	*dirp;
+ 	__be32		err;
+ 	int		host_err;
++#if defined(CONFIG_SPNFS)
++	unsigned long	ino;
++	unsigned long	generation;
++	unsigned int	nlink;
++#endif /* defined(CONFIG_SPNFS) */
+ 
+ 	err = nfserr_acces;
+ 	if (!flen || isdotent(fname, flen))
+@@ -1807,6 +1847,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru
+ 		goto out;
+ 	}
+ 
++#if defined(CONFIG_SPNFS)
++	/*
++	 * Remember the inode number to communicate to the spnfsd
++	 * for removal of stripes; save the link count as well so that
++	 * the stripes only get deleted when the last link is deleted
++	 */
++	ino = rdentry->d_inode->i_ino;
++	generation = rdentry->d_inode->i_generation;
++	nlink = rdentry->d_inode->i_nlink;
++#endif /* defined(CONFIG_SPNFS) */
 +
+ 	if (!type)
+ 		type = rdentry->d_inode->i_mode & S_IFMT;
+ 
+@@ -1827,6 +1878,29 @@ out_put:
+ 	if (!host_err)
+ 		host_err = commit_metadata(fhp);
+ 
++#if defined(CONFIG_SPNFS)
++	/*
++	 * spnfs: notify spnfsd of removal to destroy stripes
++	 */
 +/*
-+ * bld_slice -- given a dev_t build a slice volume structure
-+ *
-+ * A slice volume contains the length of the slice/partition and its offset
-+ * from the beginning of the storage volume. There's also a reference to
-+ * the "simple" volume which contains this slice.
-+ */
-+static pnfs_blocklayout_devinfo_t *
-+bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc)
-+{
-+	pnfs_blocklayout_devinfo_t	*bld;
-+	bl_comm_msg_t			msg;
-+	bl_comm_res_t			*res;
-+	
-+	dprintk("--> %s\n", __func__);
-+	bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE);
-+	if (!bld)
-+		return NULL;
-+	
-+	msg.msg_type	= PNFS_UPCALL_MSG_GETSLICE;
-+	msg.u.msg_dev	= devid;
-+	if (bl_upcall(bl_comm_global, &msg, &res)) {
-+		dprintk("Upcall to get slice info failed\n");
-+		bld_free(bld);
-+		return NULL;
++	sb = current_fh->fh_dentry->d_inode->i_sb;
++	if (sb->s_export_op->spnfs_remove) {
++*/
++	dprintk("%s check if spnfs_enabled\n", __FUNCTION__);
++	if (spnfs_enabled() && nlink == 1) {
++		BUG_ON(ino == 0);
++		dprintk("%s calling spnfs_remove inumber=%ld\n",
++			__FUNCTION__, ino);
++		if (spnfs_remove(ino, generation) == 0) {
++			dprintk("%s spnfs_remove success\n", __FUNCTION__);
++		} else {
++			/* XXX How do we make this atomic? */
++			printk(KERN_WARNING "nfsd: pNFS could not "
++				"remove stripes for inode: %ld\n", ino);
++		}
 +	}
-+	
-+	bld->bld_devid.devid = devid;
-+	bld->bld_index_loc	= my_loc;
-+	bld->u.slice.bld_start	= res->u.slice.start * 512LL;
-+	bld->u.slice.bld_len	= res->u.slice.length * 512LL;
-+	bld->u.slice.bld_index	= simple_loc;
++#endif /* defined(CONFIG_SPNFS) */
++
+ 	mnt_drop_write(fhp->fh_export->ex_path.mnt);
+ out_nfserr:
+ 	err = nfserrno(host_err);
+diff -up linux-2.6.37.noarch/fs/nfsd/xdr4.h.orig linux-2.6.37.noarch/fs/nfsd/xdr4.h
+--- linux-2.6.37.noarch/fs/nfsd/xdr4.h.orig	2011-01-28 09:37:32.570978977 -0500
++++ linux-2.6.37.noarch/fs/nfsd/xdr4.h	2011-01-28 09:43:53.372767858 -0500
+@@ -37,6 +37,8 @@
+ #ifndef _LINUX_NFSD_XDR4_H
+ #define _LINUX_NFSD_XDR4_H
+ 
++#include <linux/nfsd/nfsd4_pnfs.h>
++
+ #include "state.h"
+ #include "nfsd.h"
+ 
+@@ -390,6 +392,51 @@ struct nfsd4_reclaim_complete {
+ 	u32 rca_one_fs;
+ };
+ 
++struct nfsd4_pnfs_getdevinfo {
++	struct nfsd4_pnfs_deviceid gd_devid;	/* request */
++	u32			gd_layout_type;	/* request */
++	u32			gd_maxcount;	/* request */
++	u32			gd_notify_types;/* request */
++	struct super_block	*gd_sb;
++};
++
++struct nfsd4_pnfs_getdevlist {
++	u32             gd_layout_type;	/* request */
++	u32		gd_maxdevices;	/* request */
++	u64		gd_cookie;	/* request - response */
++	u64		gd_verf;	/* request - response */
++	struct svc_fh 	*gd_fhp;	/* response */
++	u32		gd_eof;		/* response */
++};
 +
-+	dprintk("%s: start %Lu, len %Lu\n", __func__,
-+		bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL);
++struct nfsd4_pnfs_layoutget {
++	u64			lg_minlength;	/* request */
++	u32			lg_signal;	/* request */
++	u32			lg_maxcount;	/* request */
++	struct svc_fh		*lg_fhp;	/* request */
++	stateid_t		lg_sid;		/* request/response */
++	struct nfsd4_layout_seg	lg_seg;		/* request/response */
++	u32			lg_roc;		/* response */
++};
 +
-+	kfree(res);
-+	dprintk("<-- %s (rval %p)\n", __func__, bld);
-+	return bld;
-+}
++struct nfsd4_pnfs_layoutcommit {
++	struct nfsd4_pnfs_layoutcommit_arg args;
++	stateid_t		lc_sid;		/* request */
++	struct nfsd4_pnfs_layoutcommit_res res;
++};
 +
-+static int
-+layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h,
-+    struct nfsd4_layout_seg *seg)
-+{
-+	pnfs_blocklayout_layout_t	*n;
-+	
-+	dprintk("--> %s\n", __func__);
-+	
-+	if (!list_empty(&r->blr_layouts))
-+		if (layout_cache_fill_from_list(r, h, seg) == False)
-+			return -EIO;
-+	
-+	/*
-+	 * This deals with two conditions.
-+	 *    (1) When blr_layouts is empty we need to create the first entry
-+	 *    (2) When the range requested falls past the end of any current
-+	 *        layout the residual must be taken care of.
-+	 */	
-+	if (seg->length) {
-+		n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h);
-+		if (!n)
-+			return -ENOMEM;
-+		dprintk("  remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff),
-+			_2SECTS(n->bll_len));
-+	}
-+	
-+	dprintk("<-- %s\n", __func__);
-+	return 0;
-+}
++enum layoutreturn_flags {
++	LR_FLAG_INTERN = 1 << 0,	/* internal return */
++	LR_FLAG_EXPIRE = 1 << 1,	/* return on client expiration */
++};
 +
-+struct list_head *
-+layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible,
-+    struct nfsd4_layout_seg *seg)
-+{
-+	pnfs_blocklayout_layout_t	*b,
-+					*n		= NULL;
-+	struct list_head		*bl_candidates	= NULL;
-+	struct fiemap_extent_info	fei;
-+	struct inode			*i;
-+	dev_t				dev;
-+	
-+	dev	= r->blr_rdev;
-+	i	= r->blr_inode;
-+	
-+	dprintk("--> %s\n", __func__);
-+	bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL);
-+	if (!bl_candidates)
-+		return NULL;
-+	INIT_LIST_HEAD(bl_candidates);
-+	extents_setup(&fei);
-+	
-+	list_for_each_entry(b, bl_possible, bll_list) {
-+		if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
-+			
-+			extents_count(&fei, i, b->bll_foff, b->bll_len);
-+			if (fei.fi_extents_mapped) {
-+				
-+				/*
-+				 * Common case here. Got a range which has
-+				 * extents. Now get those extents and process
-+				 * them into pNFS extents.
-+				 */
-+				if (extents_get(&fei, i, b->bll_foff,
-+				    b->bll_len) == False)
-+					goto cleanup;
-+				if (extents_process(&fei, bl_candidates,
-+				    seg, dev, b) == False)
-+					goto cleanup;
-+				extents_cleanup(&fei);
-+				
-+			} else if (seg->iomode == IOMODE_READ) {
-+				
-+				/*
-+				 * Found a hole in a file while reading. No 
-+				 * problem, just create a pNFS extent for the
-+				 * range and let the client know there's no
-+				 * backing store.
-+				 */
-+				n = bll_alloc(b->bll_foff, b->bll_len,
-+				    BLOCK_LAYOUT_NEW, bl_candidates);
-+				n->bll_es = PNFS_BLOCK_NONE_DATA;
-+				n->bll_vol_id.sbid = 0;
-+				n->bll_vol_id.devid = dev;
-+				seg->length += b->bll_len;
-+			} else {
-+				
-+				/*
-+				 * There's a problem here. Since the iomode
-+				 * is read/write fallocate should have allocated
-+				 * any necessary storage for the given range.
-+				 */
-+				dprintk("    Extent count for RW is 0\n");
-+				goto cleanup;
-+			}
-+			
-+		} else {
-+			n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates);
-+			seg->length += n->bll_len;
-+		}
++struct nfsd4_pnfs_layoutreturn {
++	struct nfsd4_pnfs_layoutreturn_arg args;
++	u32			lr_flags;
++	stateid_t		lr_sid;		/* request/response */
++	u32			lrs_present;	/* response */
++};
 +
-+		if (r->blr_ext_size < (b->bll_foff + b->bll_len))
-+			r->blr_ext_size = b->bll_foff + b->bll_len;
-+	}
-+	
-+	while (!list_empty(bl_possible)) {
-+		b = list_entry(bl_possible->next,
-+		    struct pnfs_blocklayout_layout, bll_list);
-+		list_del(&b->bll_list);
-+		kfree(b);
+ struct nfsd4_op {
+ 	int					opnum;
+ 	__be32					status;
+@@ -432,6 +479,13 @@ struct nfsd4_op {
+ 		struct nfsd4_destroy_session	destroy_session;
+ 		struct nfsd4_sequence		sequence;
+ 		struct nfsd4_reclaim_complete	reclaim_complete;
++#if defined(CONFIG_PNFSD)
++		struct nfsd4_pnfs_getdevlist	pnfs_getdevlist;
++		struct nfsd4_pnfs_getdevinfo	pnfs_getdevinfo;
++		struct nfsd4_pnfs_layoutget	pnfs_layoutget;
++		struct nfsd4_pnfs_layoutcommit	pnfs_layoutcommit;
++		struct nfsd4_pnfs_layoutreturn	pnfs_layoutreturn;
++#endif /* CONFIG_PNFSD */
+ 	} u;
+ 	struct nfs4_replay *			replay;
+ };
+diff -up linux-2.6.37.noarch/fs/nfs/file.c.orig linux-2.6.37.noarch/fs/nfs/file.c
+--- linux-2.6.37.noarch/fs/nfs/file.c.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfs/file.c	2011-01-28 09:43:53.316775510 -0500
+@@ -381,16 +381,16 @@ static int nfs_write_begin(struct file *
+ 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ 	struct page *page;
+ 	int once_thru = 0;
++	struct pnfs_layout_segment *lseg;
+ 
+ 	dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
+ 		file->f_path.dentry->d_parent->d_name.name,
+ 		file->f_path.dentry->d_name.name,
+ 		mapping->host->i_ino, len, (long long) pos);
+ 
+-	pnfs_update_layout(mapping->host,
+-			   nfs_file_open_context(file),
+-			   IOMODE_RW);
+-
++	lseg = pnfs_update_layout(mapping->host,
++				  nfs_file_open_context(file),
++				  pos, len, IOMODE_RW);
+ start:
+ 	/*
+ 	 * Prevent starvation issues if someone is doing a consistency
+@@ -399,17 +399,22 @@ start:
+ 	ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+ 			nfs_wait_bit_killable, TASK_KILLABLE);
+ 	if (ret)
+-		return ret;
++		goto out;
+ 
+ 	page = grab_cache_page_write_begin(mapping, index, flags);
+-	if (!page)
+-		return -ENOMEM;
++	if (!page) {
++		ret = -ENOMEM;
++		goto out;
 +	}
-+		
-+	b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout,
-+	    bll_list);
-+	seg->offset = b->bll_foff;
-+	dprintk("<-- %s okay\n", __func__);
-+	return bl_candidates;
-+	
-+cleanup:
-+	extents_cleanup(&fei);
-+	if (bl_candidates)
-+		kfree(bl_candidates);
-+	dprintk("<-- %s, error occurred\n", __func__);
-+	return NULL;
-+}
-+
-+/*
-+ * layout_cache_merge -- collapse layouts which make up a contiguous range.
-+ */
-+static void
-+layout_cache_merge(bl_layout_rec_t *r, struct list_head *h)
-+{
-+	pnfs_blocklayout_layout_t	*b,
-+					*p;
-+	
-+	dprintk("--> %s\n", __func__);
-+restart:
-+	p = NULL;
-+	list_for_each_entry(b, h, bll_list) {
-+		if (p && (BLL_S_END(p) == b->bll_soff) &&
-+		    (p->bll_es == b->bll_es) &&
-+		    (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
-+			/*
-+			 * We've got a condidate.
-+			 */
-+#ifdef too_verbose
-+			dprintk("  merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n",
-+				_2SECTS(b->bll_foff), _2SECTS(b->bll_len),
-+				_2SECTS(b->bll_soff),
-+				_2SECTS(p->bll_foff), _2SECTS(p->bll_len),
-+				_2SECTS(b->bll_soff));
-+#endif
-+			
-+			if (p->bll_cache_state == BLOCK_LAYOUT_CACHE)
-+				p->bll_cache_state = BLOCK_LAYOUT_UPDATE;
-+			p->bll_len += b->bll_len;
-+			list_del(&b->bll_list);
-+			kfree(b);
-+			goto restart;
-+		} else if (p && (BLL_F_END(p) == b->bll_foff) &&
-+			   (p->bll_es == b->bll_es) &&
-+			   (b->bll_es == PNFS_BLOCK_NONE_DATA)) {
-+			p->bll_len += b->bll_len;
-+			list_del(&b->bll_list);
-+			kfree(b);
-+			goto restart;
-+		} else
-+			p = b;
+ 	*pagep = page;
+ 
+-	ret = nfs_flush_incompatible(file, page);
++	ret = nfs_flush_incompatible(file, page, lseg);
+ 	if (ret) {
+ 		unlock_page(page);
+ 		page_cache_release(page);
++		*pagep = NULL;
++		*fsdata = NULL;
++		goto out;
+ 	} else if (!once_thru &&
+ 		   nfs_want_read_modify_write(file, page, pos, len)) {
+ 		once_thru = 1;
+@@ -418,6 +423,12 @@ start:
+ 		if (!ret)
+ 			goto start;
+ 	}
++	ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata);
++ out:
++	if (ret) {
++		put_lseg(lseg);
++		*fsdata = NULL;
 +	}
-+	dprintk("<-- %s\n", __func__);
-+}
+ 	return ret;
+ }
+ 
+@@ -427,6 +438,7 @@ static int nfs_write_end(struct file *fi
+ {
+ 	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ 	int status;
++	struct pnfs_layout_segment *lseg;
+ 
+ 	dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
+ 		file->f_path.dentry->d_parent->d_name.name,
+@@ -453,10 +465,17 @@ static int nfs_write_end(struct file *fi
+ 			zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+ 	}
+ 
+-	status = nfs_updatepage(file, page, offset, copied);
++	lseg = nfs4_pull_lseg_from_fsdata(file, fsdata);
++	status = pnfs_write_end(file, page, pos, len, copied, lseg);
++	if (status)
++		goto out;
++	status = nfs_updatepage(file, page, offset, copied, lseg, fsdata);
+ 
++ out:
+ 	unlock_page(page);
+ 	page_cache_release(page);
++	pnfs_write_end_cleanup(file, fsdata);
++	put_lseg(lseg);
+ 
+ 	if (status < 0)
+ 		return status;
+@@ -567,6 +586,8 @@ static int nfs_vm_page_mkwrite(struct vm
+ 	/* make sure the cache has finished storing the page */
+ 	nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
+ 
++	/* XXX Do we want to call pnfs_update_layout here? */
 +
-+static int
-+layout_cache_update(bl_layout_rec_t *r, struct list_head *h)
-+{
-+	pnfs_blocklayout_layout_t	*b,
-+					*c,
-+					*n;
-+	boolean_t			status = 0;
-+	
-+	dprintk("--> %s\n", __func__);
-+	if (list_empty(&r->blr_layouts)) {
-+		/* ---- Just add entries and return ---- */
-+		dprintk("  cache empty for inode 0x%x:%ld\n", r->blr_rdev,
-+			r->blr_inode->i_ino);
-+		list_for_each_entry(b, h, bll_list) {
-+			c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE,
-+					  &r->blr_layouts);
-+			if (!c) {
-+				status = -ENOMEM;
-+				break;
-+			}
-+			dprintk("    adding %Lu(f):%Lu(l):%Lu(s):%d\n",
-+				_2SECTS(c->bll_foff), _2SECTS(c->bll_len),
-+				_2SECTS(c->bll_soff), c->bll_es);
-+		}
-+		return status;
+ 	lock_page(page);
+ 	mapping = page->mapping;
+ 	if (mapping != dentry->d_inode->i_mapping)
+@@ -577,8 +598,8 @@ static int nfs_vm_page_mkwrite(struct vm
+ 		goto out_unlock;
+ 
+ 	ret = VM_FAULT_LOCKED;
+-	if (nfs_flush_incompatible(filp, page) == 0 &&
+-	    nfs_updatepage(filp, page, 0, pagelen) == 0)
++	if (nfs_flush_incompatible(filp, page, NULL) == 0 &&
++	    nfs_updatepage(filp, page, 0, pagelen, NULL, NULL) == 0)
+ 		goto out;
+ 
+ 	ret = VM_FAULT_SIGBUS;
+diff -up linux-2.6.37.noarch/fs/nfs/inode.c.orig linux-2.6.37.noarch/fs/nfs/inode.c
+--- linux-2.6.37.noarch/fs/nfs/inode.c.orig	2011-01-28 09:37:32.529980398 -0500
++++ linux-2.6.37.noarch/fs/nfs/inode.c	2011-01-28 09:43:53.317775328 -0500
+@@ -648,6 +648,7 @@ struct nfs_open_context *get_nfs_open_co
+ 		atomic_inc(&ctx->lock_context.count);
+ 	return ctx;
+ }
++EXPORT_SYMBOL(get_nfs_open_context);
+ 
+ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
+ {
+@@ -1003,6 +1004,7 @@ void nfs_fattr_init(struct nfs_fattr *fa
+ 	fattr->time_start = jiffies;
+ 	fattr->gencount = nfs_inc_attr_generation_counter();
+ }
++EXPORT_SYMBOL(nfs_fattr_init);
+ 
+ struct nfs_fattr *nfs_alloc_fattr(void)
+ {
+@@ -1212,6 +1214,14 @@ static int nfs_update_inode(struct inode
+ 		server->fsid = fattr->fsid;
+ 
+ 	/*
++	 * file needs layout commit, server attributes may be stale
++	 */
++	if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) {
++		dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n",
++			__func__, inode->i_sb->s_id, inode->i_ino);
++		return 0;
 +	}
-+	
-+	list_for_each_entry(b, h, bll_list) {
-+		BUG_ON(!b->bll_vol_id.devid);
-+		if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) {
-+			boolean_t found = False;
-+			list_for_each_entry(c, &r->blr_layouts, bll_list) {
-+				if ((b->bll_soff >= c->bll_soff) &&
-+				    (b->bll_soff < BLL_S_END(c)) &&
-+				    (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
-+					u64	u;
-+					
-+					if ((b->bll_foff < c->bll_foff) ||
-+					    (b->bll_foff > BLL_F_END(c)))
-+						BUG();
-+					
-+					u = BLL_S_END(b) - BLL_S_END(c);
-+					/*
-+					 * The updated cache entry has to be
-+					 * different than the current.
-+					 * Otherwise the cache state for 'b'
-+					 * should be BLOCK_LAYOUT_CACHE.
-+					 */
-+					BUG_ON(BLL_S_END(b) < BLL_S_END(c));
-+					
-+					dprintk("  "
-+						"updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n",
-+						_2SECTS(c->bll_foff),
-+						_2SECTS(c->bll_len),
-+						_2SECTS(c->bll_soff),
-+						_2SECTS(c->bll_len + u));
-+					c->bll_len += u;
-+					bll_collapse(r, c);
-+					found = True;
-+					break;
-+				}
-+			}
++	/*
+ 	 * Update the read time so we don't revalidate too often.
+ 	 */
+ 	nfsi->read_cache_jiffies = fattr->time_start;
+@@ -1410,9 +1420,10 @@ static int nfs_update_inode(struct inode
+  */
+ void nfs4_evict_inode(struct inode *inode)
+ {
+-	pnfs_destroy_layout(NFS_I(inode));
++	pnfs_return_layout(inode, NULL, true);
+ 	truncate_inode_pages(&inode->i_data, 0);
+ 	end_writeback(inode);
++	pnfs_destroy_layout(NFS_I(inode));
+ 	/* If we are holding a delegation, return it! */
+ 	nfs_inode_return_delegation_noreclaim(inode);
+ 	/* First call standard NFS clear_inode() code */
+@@ -1457,6 +1468,8 @@ static inline void nfs4_init_once(struct
+ 	nfsi->delegation = NULL;
+ 	nfsi->delegation_state = 0;
+ 	init_rwsem(&nfsi->rwsem);
++	rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layoutreturn");
++	rpc_init_wait_queue(&nfsi->lo_rpcwaitq_stateid, "pNFS Layoutstateid");
+ 	nfsi->layout = NULL;
+ #endif
+ }
+diff -up linux-2.6.37.noarch/fs/nfs/internal.h.orig linux-2.6.37.noarch/fs/nfs/internal.h
+--- linux-2.6.37.noarch/fs/nfs/internal.h.orig	2011-01-28 09:37:32.529980398 -0500
++++ linux-2.6.37.noarch/fs/nfs/internal.h	2011-01-28 09:43:53.318775148 -0500
+@@ -149,6 +149,16 @@ extern struct nfs_server *nfs_clone_serv
+ 					   struct nfs_fattr *);
+ extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
+ extern int nfs4_check_client_ready(struct nfs_client *clp);
++extern int nfs_sockaddr_cmp(const struct sockaddr *sa1,
++		const struct sockaddr *sa2);
++extern int nfs4_set_client(struct nfs_server *server,
++		const char *hostname,
++		const struct sockaddr *addr,
++		const size_t addrlen,
++		const char *ip_addr,
++		rpc_authflavor_t authflavour,
++		int proto, const struct rpc_timeout *timeparms,
++		u32 minorversion);
+ #ifdef CONFIG_PROC_FS
+ extern int __init nfs_fs_proc_init(void);
+ extern void nfs_fs_proc_exit(void);
+@@ -214,6 +224,8 @@ extern const u32 nfs41_maxwrite_overhead
+ extern struct rpc_procinfo nfs4_procedures[];
+ #endif
+ 
++extern int nfs4_recover_expired_lease(struct nfs_client *clp);
 +
-+			if (found == False) {
-+				dprintk("  ERROR Expected to find"
-+				    " %Lu(f):%Lu(l):%Lu(s), but didn't\n",
-+				    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
-+				    _2SECTS(b->bll_soff));
-+				list_for_each_entry(c, &r->blr_layouts, bll_list)
-+					print_bll(c, "Cached");
-+				BUG();
-+			}
-+		} else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
-+			
-+			c = list_first_entry(&r->blr_layouts,
-+			    struct pnfs_blocklayout_layout, bll_list);
-+			if (b->bll_foff < c->bll_foff) {
-+				/*
-+				 * Special case where new entry is before
-+				 * first cached entry.
-+				 */
-+				c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL);
-+				list_add(&c->bll_list, &r->blr_layouts);
-+				dprintk("  new entry at head of list at %Lu, "
-+					"len %Lu\n",
-+					_2SECTS(c->bll_foff), _2SECTS(c->bll_len));
-+			} else {
-+				list_for_each_entry(c, &r->blr_layouts,
-+				    bll_list) {
-+					n = list_entry(c->bll_list.next,
-+					    struct pnfs_blocklayout_layout,
-+					    bll_list);
-+					/*
-+					 * This is ugly, but can't think of
-+					 * another way to examine this case.
-+					 * Consider the following. Need to
-+					 * add an entry which starts at 40
-+					 * and the cache has the following
-+					 * entries:
-+					 * Start    Length
-+					 * 10       5
-+					 * 30       5
-+					 * 50       5
-+					 * So, need to look and see if the new
-+					 * entry starts after the current
-+					 * cache, but before the next one.
-+					 * There's a catch in that the next
-+					 * entry might not be valid as it's
-+					 * really just a pointer to the list
-+					 * head.
-+					 */
-+					if (((b->bll_foff >=
-+					      BLL_F_END(c)) &&
-+					     (c->bll_list.next == &r->blr_layouts)) ||
-+					    ((b->bll_foff >=
-+					      BLL_F_END(c)) &&
-+					     (b->bll_foff < n->bll_foff))) {
-+						
-+						n = bll_alloc_dup(b,
-+								  BLOCK_LAYOUT_CACHE, NULL);
-+						dprintk("  adding new %Lu:%Lu"
-+							" after %Lu:%Lu\n",
-+							_2SECTS(n->bll_foff),
-+							_2SECTS(n->bll_len),
-+							_2SECTS(c->bll_foff),
-+							_2SECTS(c->bll_len));
-+						list_add(&n->bll_list,
-+							 &c->bll_list);
-+						break;
-+					}
-+				}
-+			}
-+		}
-+	}
-+	dprintk("<-- %s\n", __func__);
-+	return status;
-+}
+ /* proc.c */
+ void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+ 
+@@ -263,10 +275,31 @@ extern int nfs4_get_rootfh(struct nfs_se
+ #endif
+ 
+ /* read.c */
++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
++			     const struct rpc_call_ops *call_ops);
++extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
++			     const struct rpc_call_ops *call_ops);
+ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
+ 
+ /* write.c */
++extern int nfs_initiate_write(struct nfs_write_data *data,
++			      struct rpc_clnt *clnt,
++			      const struct rpc_call_ops *call_ops,
++			      int how);
++extern int pnfs_initiate_write(struct nfs_write_data *data,
++			      struct rpc_clnt *clnt,
++			      const struct rpc_call_ops *call_ops,
++			      int how);
++extern int nfs_initiate_commit(struct nfs_write_data *data,
++			       struct rpc_clnt *clnt,
++			       const struct rpc_call_ops *call_ops,
++			       int how);
++extern int pnfs_initiate_commit(struct nfs_write_data *data,
++			       struct rpc_clnt *clnt,
++			       const struct rpc_call_ops *call_ops,
++				int how, int pnfs);
+ extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
++extern void nfs_mark_list_commit(struct list_head *head);
+ #ifdef CONFIG_MIGRATION
+ extern int nfs_migrate_page(struct address_space *,
+ 		struct page *, struct page *);
+diff -up linux-2.6.37.noarch/fs/nfs/Kconfig.orig linux-2.6.37.noarch/fs/nfs/Kconfig
+--- linux-2.6.37.noarch/fs/nfs/Kconfig.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfs/Kconfig	2011-01-28 09:43:53.304777898 -0500
+@@ -87,6 +87,34 @@ config NFS_V4_1
+ config PNFS_FILE_LAYOUT
+ 	tristate
+ 
++config PNFS_OBJLAYOUT
++	tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
++	depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
++	help
++	  Say M here if you want your pNFS client to support the Objects Layout Driver.
++	  Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
++	  upper level driver (SCSI_OSD_ULD).
 +
-+static void
-+layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in)
++	  If unsure, say N.
++
++config PNFS_PANLAYOUT
++	tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
++	depends on PNFS_OBJLAYOUT
++	help
++	  Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver.
++
++	  If unsure, say N.
++
++config PNFS_BLOCK
++	tristate "Provide a pNFS block client (EXPERIMENTAL)"
++	depends on NFS_FS && NFS_V4_1
++	select MD
++	select BLK_DEV_DM
++	help
++	  Say M or y here if you want your pNFS client to support the block protocol.
++
++	  If unsure, say N.
++
+ config ROOT_NFS
+ 	bool "Root file system on NFS"
+ 	depends on NFS_FS=y && IP_PNP
+diff -up linux-2.6.37.noarch/fs/nfs/Makefile.orig linux-2.6.37.noarch/fs/nfs/Makefile
+--- linux-2.6.37.noarch/fs/nfs/Makefile.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfs/Makefile	2011-01-28 09:43:53.305777685 -0500
+@@ -21,3 +21,6 @@ nfs-$(CONFIG_NFS_FSCACHE) += fscache.o f
+ 
+ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+ nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
++
++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
+diff -up linux-2.6.37.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.37.noarch/fs/nfs/nfs4filelayout.c
+--- linux-2.6.37.noarch/fs/nfs/nfs4filelayout.c.orig	2011-01-28 09:37:32.537980121 -0500
++++ linux-2.6.37.noarch/fs/nfs/nfs4filelayout.c	2011-01-28 09:43:53.320774796 -0500
+@@ -41,7 +41,7 @@ MODULE_AUTHOR("Dean Hildebrand <dhildebz
+ MODULE_DESCRIPTION("The NFSv4 file layout driver");
+ 
+ static int
+-filelayout_set_layoutdriver(struct nfs_server *nfss)
++filelayout_set_layoutdriver(struct nfs_server *nfss, const struct nfs_fh *mntfh)
+ {
+ 	int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
+ 						nfs4_fl_free_deviceid_callback);
+@@ -66,6 +66,200 @@ filelayout_clear_layoutdriver(struct nfs
+ 	return 0;
+ }
+ 
++/* This function is used by the layout driver to calculate the
++ * offset of the file on the dserver based on whether the
++ * layout type is STRIPE_DENSE or STRIPE_SPARSE
++ */
++static loff_t
++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 +{
-+	struct pnfs_blocklayout_layout	*b,
-+					*n;
-+	u64				len;
-+	struct nfsd4_layout_seg		seg = *seg_in;
-+	
-+	dprintk("--> %s\n", __func__);
-+	if (seg.length == NFS4_MAX_UINT64) {
-+		r->blr_recalled = 0;
-+		dprintk("  Fast return of all layouts\n");
-+		while (!list_empty(&r->blr_layouts)) {
-+			b = list_entry(r->blr_layouts.next,
-+				       struct pnfs_blocklayout_layout, bll_list);
-+			dprintk("    foff %Lu, len %Lu, soff %Lu\n",
-+				_2SECTS(b->bll_foff), _2SECTS(b->bll_len),
-+				_2SECTS(b->bll_soff));
-+			list_del(&b->bll_list);
-+			kfree(b);
-+		}
-+		dprintk("<-- %s\n", __func__);
-+		return;
-+	}
++	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
 +
-+restart:
-+	list_for_each_entry(b, &r->blr_layouts, bll_list) {
-+		if (seg.offset == b->bll_foff) {
-+			/*
-+			 * This handle the following three cases:
-+			 * (1) return layout matches entire cache layout
-+			 * (2) return layout matches beginning portion of cache
-+			 * (3) return layout matches entire cache layout and
-+			 *     into next entry. Varies from #1 in end case.
-+			 */
-+			dprintk("  match on offsets, %Lu:%Lu\n",
-+				_2SECTS(seg.offset), _2SECTS(seg.length));
-+			len = MIN(seg.length, b->bll_len);
-+			b->bll_foff	+= len;
-+			b->bll_soff	+= len;
-+			b->bll_len	-= len;
-+			seg.length	-= len;
-+			seg.offset	+= len;
-+			if (!b->bll_len) {
-+				list_del(&b->bll_list);
-+				kfree(b);
-+				dprintk("    removing cache line\n");
-+				if (!seg.length) {
-+					dprintk("    also finished\n");
-+					goto complete;
-+				}
-+				/*
-+				 * Since 'b' was freed we can't continue at the
-+				 * next entry which is referenced as
-+				 * b->bll_list.next by the list_for_each_entry
-+				 * macro. Need to restart the loop.
-+				 * TODO: Think about creating a dummy 'b' which
-+				 *       would keep list_for_each_entry() happy.
-+				 */
-+				goto restart;
-+			}
-+			if (!seg.length) {
-+				dprintk("    finished, but cache line not"
-+					"empty\n");
-+				goto complete;
-+			}
-+		} else if ((seg.offset >= b->bll_foff) &&
-+		    (seg.offset < BLL_F_END(b))) {
-+			/*
-+			 * layout being returned is within this cache line.
-+			 */
-+			dprintk("  layout %Lu:%Lu within cache line %Lu:%Lu\n",
-+				_2SECTS(seg.offset), _2SECTS(seg.length),
-+				_2SECTS(b->bll_foff), _2SECTS(b->bll_len));
-+			BUG_ON(!seg.length);
-+			if ((seg.offset + seg.length) >= BLL_F_END(b)) {
-+				/*
-+				 * Layout returned starts in the middle of
-+				 * cache entry and just need to trim back
-+				 * cache to shorter length.
-+				 */
-+				dprintk("    trim back cache line\n");
-+				len = seg.offset - b->bll_foff;
-+				seg.offset += b->bll_len - len;
-+				seg.length -= b->bll_len - len;
-+				b->bll_len = len;
-+				if (!seg.length)
-+					return;
-+			} else {
-+				/*
-+				 * Need to split current cache layout because
-+				 * chunk is being removed from the middle.
-+				 */
-+				dprintk("    split cache line\n");
-+				len = seg.offset + seg.length;
-+				n = bll_alloc(len,
-+					      (b->bll_foff + b->bll_len) - len,
-+					      BLOCK_LAYOUT_CACHE, NULL);
-+				n->bll_soff = b->bll_soff + len;
-+				list_add(&n->bll_list, &b->bll_list);
-+				b->bll_len = seg.offset - b->bll_foff;
-+				return;
-+			}
-+		}
++	switch (flseg->stripe_type) {
++	case STRIPE_SPARSE:
++		return offset;
++
++	case STRIPE_DENSE:
++	{
++		u32 stripe_width;
++		u64 tmp, off;
++		u32 unit = flseg->stripe_unit;
++
++		stripe_width = unit * flseg->dsaddr->stripe_count;
++		tmp = off = offset - flseg->pattern_offset;
++		do_div(tmp, stripe_width);
++		return tmp * unit + do_div(off, unit);
 +	}
-+complete:
-+	if (list_empty(&r->blr_layouts))
-+		r->blr_recalled = 0;
-+	dprintk("<-- %s\n", __func__);
++	default:
++		BUG();
++	}
++
++	/* We should never get here... just to stop the gcc warning */
++	return 0;
 +}
 +
 +/*
-+ * layout_cache_fill_from_list -- fills from cache list
-+ *
-+ * NOTE: This routine was only seperated out from layout_cache_file_from()
-+ * to reduce the indentation level which makes the code easier to read.
-+ */
-+static inline boolean_t
-+layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h,
-+    struct nfsd4_layout_seg *seg)
-+{
-+	pnfs_blocklayout_layout_t	*b,
-+					*n;
-+	enum pnfs_block_extent_state4	s;
-+	
-+	list_for_each_entry(b, &r->blr_layouts, bll_list) {
-+		if (seg->offset < b->bll_foff) {
-+			n = bll_alloc(seg->offset,
-+			    MIN(seg->length, b->bll_foff - seg->offset),
-+			    BLOCK_LAYOUT_NEW, NULL);
-+			if (!n)
-+				return False;
-+			
-+			list_add(&n->bll_list, h->prev);
-+			dprintk("  new: %Lu:%Lu, added before %Lu:%Lu\n",
-+			    _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
-+			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
-+			seg->offset += n->bll_len;
-+			seg->length -= n->bll_len;
-+			if (!seg->length)
-+				break;
-+		}
-+		
-+		if ((seg->offset >= b->bll_foff) &&
-+		    (seg->offset < BLL_F_END(b))) {
-+			if (layout_conflict(b, seg->iomode, &s) == False) {
-+				dprintk("  CONFLICT FOUND: "
-+				    "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n",
-+				    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
-+				    _2SECTS(b->bll_soff), b->bll_es,
-+				    seg->iomode);
-+				return False;
-+			}
-+			n = bll_alloc(seg->offset,
-+			    MIN(seg->length, BLL_F_END(b) - seg->offset),
-+			    BLOCK_LAYOUT_CACHE, h);
-+			dprintk("  CACHE hit: Found %Lu(f):%Lu(l): "
-+			    "in %Lu(f):%Lu(l):%Lu(s):%d\n",
-+			    _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
-+			    _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
-+			    _2SECTS(b->bll_soff), b->bll_es);
-+			if (!n)
-+				return False;
-+			
-+			n->bll_soff = b->bll_soff + seg->offset - b->bll_foff;
-+			n->bll_vol_id.sbid = 0;
-+			n->bll_vol_id.devid = b->bll_vol_id.devid;
-+			n->bll_es = s;
-+			seg->offset += n->bll_len;
-+			seg->length -= n->bll_len;
-+			if (!seg->length)
-+				break;
-+		}
++ * Call ops for the async read/write cases
++ * In the case of dense layouts, the offset needs to be reset to its
++ * original value.
++ */
++static void filelayout_read_call_done(struct rpc_task *task, void *data)
++{
++	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
++
++	if (rdata->fldata.orig_offset) {
++		dprintk("%s new off %llu orig offset %llu\n", __func__,
++			rdata->args.offset, rdata->fldata.orig_offset);
++		rdata->args.offset = rdata->fldata.orig_offset;
 +	}
-+	return True;
++
++	/* Note this may cause RPC to be resent */
++	rdata->pdata.call_ops->rpc_call_done(task, data);
 +}
 +
-+static u64
-+bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length,
-+    dev_t dev)
++static void filelayout_read_release(void *data)
 +{
-+	pnfs_blocklayout_layout_t	*n;
-+	
-+	n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates);
-+	if (!n)
-+		return 0;
-+	n->bll_es = PNFS_BLOCK_NONE_DATA;
-+	n->bll_vol_id.sbid = 0;
-+	n->bll_vol_id.devid = dev;
-+	
-+	return n->bll_len;
++	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
++
++	put_lseg(rdata->pdata.lseg);
++	rdata->pdata.lseg = NULL;
++	rdata->pdata.call_ops->rpc_release(data);
 +}
 +
-+static void
-+extents_setup(struct fiemap_extent_info *fei)
++static void filelayout_write_call_done(struct rpc_task *task, void *data)
 +{
-+	fei->fi_extents_start	= NULL;
++	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
++
++	if (wdata->fldata.orig_offset) {
++		dprintk("%s new off %llu orig offset %llu\n", __func__,
++			wdata->args.offset, wdata->fldata.orig_offset);
++		wdata->args.offset = wdata->fldata.orig_offset;
++	}
++
++	/* Note this may cause RPC to be resent */
++	wdata->pdata.call_ops->rpc_call_done(task, data);
 +}
 +
-+/*
-+ * extents_count -- Determine the number of extents for a given range.
-+ *
-+ * No need to call set_fs() here because the function
-+ * doesn't use copy_to_user() if it's only counting
-+ * the number of extents needed.
-+ */
-+static void
-+extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
++static void filelayout_write_release(void *data)
 +{
-+	dprintk("    Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len));
-+	fei->fi_flags		= FIEMAP_FLAG_SYNC;
-+	fei->fi_extents_max	= 0;
-+	fei->fi_extents_start	= NULL;
-+	fei->fi_extents_mapped	= 0;
-+	i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1);
++	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
++
++	put_lseg(wdata->pdata.lseg);
++	wdata->pdata.lseg = NULL;
++	wdata->pdata.call_ops->rpc_release(data);
 +}
 +
-+/*
-+ * extents_get -- Get list of extents for range
++struct rpc_call_ops filelayout_read_call_ops = {
++	.rpc_call_prepare = nfs_read_prepare,
++	.rpc_call_done = filelayout_read_call_done,
++	.rpc_release = filelayout_read_release,
++};
++
++struct rpc_call_ops filelayout_write_call_ops = {
++	.rpc_call_prepare = nfs_write_prepare,
++	.rpc_call_done = filelayout_write_call_done,
++	.rpc_release = filelayout_write_release,
++};
++
++/* Perform sync or async reads.
 + *
-+ * extents_count() must have been called before this routine such that
-+ * fi_extents_mapped is known.
++ * An optimization for the NFS file layout driver
++ * allows the original read/write data structs to be passed in the
++ * last argument.
++ *
++ * TODO: join with write_pagelist?
 + */
-+static boolean_t
-+extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
++static enum pnfs_try_status
++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages)
 +{
-+	int			m_space,
-+				rval;
-+	struct fiemap_extent	*fe;
-+	mm_segment_t		old_fs = get_fs();
-+	
++	struct pnfs_layout_segment *lseg = data->pdata.lseg;
++	struct nfs4_pnfs_ds *ds;
++	loff_t offset = data->args.offset;
++	u32 idx;
++	struct nfs_fh *fh;
++
++	dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n",
++		__func__, data->inode->i_ino, nr_pages,
++		data->args.pgbase, (size_t)data->args.count, offset);
++
++	/* Retrieve the correct rpc_client for the byte range */
++	idx = nfs4_fl_calc_ds_index(lseg, offset);
++	ds = nfs4_fl_prepare_ds(lseg, idx);
++	if (!ds) {
++		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
++		return PNFS_NOT_ATTEMPTED;
++	}
++	dprintk("%s USE DS:ip %x %hu\n", __func__,
++		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
++
++	/* Just try the first data server for the index. */
++	data->fldata.ds_nfs_client = ds->ds_clp;
++	fh = nfs4_fl_select_ds_fh(lseg, offset);
++	if (fh)
++		data->args.fh = fh;
++
 +	/*
-+	 * Now malloc the correct amount of space
-+	 * needed. It's possible for the file to have changed
-+	 * between calls which would require more space for
-+	 * the extents. If that occurs the last extent will
-+	 * not have FIEMAP_EXTENT_LAST set and the error will
-+	 * be caught in extents_process().
++	 * Now get the file offset on the dserver
++	 * Set the read offset to this offset, and
++	 * save the original offset in orig_offset
++	 * In the case of async reads, the offset will be reset in the
++	 * call_ops->rpc_call_done() routine.
 +	 */
-+	m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent);
-+	fe = kmalloc(m_space, GFP_KERNEL);
-+	if (!fe)
-+		return False;
-+	memset(fe, 0, m_space);
-+	
-+	fei->fi_extents_max	= fei->fi_extents_mapped;
-+	fei->fi_extents_mapped	= 0;
-+	fei->fi_extents_start	= fe;
-+	
-+	set_fs(KERNEL_DS);
-+	rval = i->i_op->fiemap(i, fei, foff, len +
-+	    (1 << i->i_sb->s_blocksize_bits) - 1);
-+	set_fs(old_fs);
-+	
-+	if (rval || !fei->fi_extents_mapped) {
-+		dprintk("    No extents. Wanted %d, got %d\n",
-+			fei->fi_extents_max, fei->fi_extents_mapped);
-+		kfree(fe);
-+		fei->fi_extents_start = NULL;
-+		return False;
-+	} else
-+		return True;
++	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
++	data->fldata.orig_offset = offset;
++
++	/* Perform an asynchronous read */
++	nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
++			  &filelayout_read_call_ops);
++
++	data->pdata.pnfs_error = 0;
++
++	return PNFS_ATTEMPTED;
 +}
 +
-+/*
-+ * extents_process -- runs through the extent returned from the file system and
-+ *	 creates block layout entries.
-+ */
-+static boolean_t
-+extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates,
-+    struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b)
++/* Perform async writes. */
++static enum pnfs_try_status
++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync)
 +{
-+	struct fiemap_extent		*fep,
-+					*fep_last	= NULL;
-+	int				i;
-+	pnfs_blocklayout_layout_t	*n;
-+	u64				last_end,
-+					rval;
-+	
-+	dprintk("--> %s\n", __func__);
-+	for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped;
-+	    i++, fep++) {
-+		
-+		BUG_ON(!fep->fe_physical);
-+		/*
-+		 * Deal with corner cases of hoel-y files.
-+		 */
-+		if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) !=
-+				 fep->fe_logical)) {
-+			
-+			/*
-+			 * If the last extent doesn't end logically
-+			 * at the beginning of the current we've got
-+			 * hole and need to create a pNFS extent.
-+			 */
-+			dprintk("    Got a hole at %Ld:%Ld \n", 
-+			    _2SECTS(fep_last->fe_logical),
-+			    _2SECTS(fep_last->fe_length));
-+			last_end = fep_last->fe_logical + fep_last->fe_length;
-+			rval = bll_alloc_holey(bl_candidates, last_end,
-+			    fep->fe_logical - last_end, dev);
-+			if (!rval)
-+				return False;
-+			seg->length += rval;
-+		}
-+		
-+		n = bll_alloc(fep->fe_logical, fep->fe_length,
-+		    BLOCK_LAYOUT_NEW, bl_candidates);
-+		if (unlikely(n == NULL)) {
-+			dprintk("%s: bll_alloc failed\n", __func__);
-+			return False;
-+		}
-+		
-+		n->bll_soff = fep->fe_physical;
-+		n->bll_es = seg->iomode == IOMODE_READ ?
-+		    PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA;
-+		n->bll_vol_id.sbid = 0;
-+		n->bll_vol_id.devid = dev;
-+		seg->length += fep->fe_length;
-+		print_bll(n, "New extent");
-+		fep_last = fep;
++	struct pnfs_layout_segment *lseg = data->pdata.lseg;
++	struct nfs4_pnfs_ds *ds;
++	loff_t offset = data->args.offset;
++	u32 idx;
++	struct nfs_fh *fh;
++
++	/* Retrieve the correct rpc_client for the byte range */
++	idx = nfs4_fl_calc_ds_index(lseg, offset);
++	ds = nfs4_fl_prepare_ds(lseg, idx);
++	if (!ds) {
++		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
++		return PNFS_NOT_ATTEMPTED;
 +	}
-+	dprintk("<-- %s (i=%d)\n", __func__, i);
-+	
-+	return True;
++	dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
++		data->inode->i_ino, sync, (size_t) data->args.count, offset,
++		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
++
++	data->fldata.ds_nfs_client = ds->ds_clp;
++	fh = nfs4_fl_select_ds_fh(lseg, offset);
++	if (fh)
++		data->args.fh = fh;
++	/*
++	 * Get the file offset on the dserver. Set the write offset to
++	 * this offset and save the original offset.
++	 */
++	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
++	data->fldata.orig_offset = offset;
++
++	/*
++	 * Perform an asynchronous write. The offset will be reset in the
++	 * call_ops->rpc_call_done() routine.
++	 */
++	nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
++			   &filelayout_write_call_ops, sync);
++
++	data->pdata.pnfs_error = 0;
++	return PNFS_ATTEMPTED;
 +}
 +
-+static void
-+extents_cleanup(struct fiemap_extent_info *fei)
+ /*
+  * filelayout_check_layout()
+  *
+@@ -82,7 +276,7 @@ filelayout_check_layout(struct pnfs_layo
+ {
+ 	struct nfs4_file_layout_dsaddr *dsaddr;
+ 	int status = -EINVAL;
+-	struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
++	struct nfs_server *nfss = NFS_SERVER(lo->inode);
+ 
+ 	dprintk("--> %s\n", __func__);
+ 
+@@ -101,7 +295,7 @@ filelayout_check_layout(struct pnfs_layo
+ 	/* find and reference the deviceid */
+ 	dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
+ 	if (dsaddr == NULL) {
+-		dsaddr = get_device_info(lo->plh_inode, id);
++		dsaddr = get_device_info(lo->inode, id);
+ 		if (dsaddr == NULL)
+ 			goto out;
+ 	}
+@@ -243,7 +437,7 @@ filelayout_alloc_lseg(struct pnfs_layout
+ static void
+ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+ {
+-	struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
++	struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
+ 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+ 
+ 	dprintk("--> %s\n", __func__);
+@@ -252,14 +446,229 @@ filelayout_free_lseg(struct pnfs_layout_
+ 	_filelayout_free_lseg(fl);
+ }
+ 
++/* Allocate a new nfs_write_data struct and initialize */
++static struct nfs_write_data *
++filelayout_clone_write_data(struct nfs_write_data *old)
 +{
-+	if (fei->fi_extents_start) {
-+		kfree(fei->fi_extents_start);
-+		fei->fi_extents_start = NULL;
-+	}
++	static struct nfs_write_data *new;
++
++	new = nfs_commitdata_alloc();
++	if (!new)
++		goto out;
++	kref_init(&new->refcount);
++	new->parent      = old;
++	kref_get(&old->refcount);
++	new->inode       = old->inode;
++	new->cred        = old->cred;
++	new->args.offset = 0;
++	new->args.count  = 0;
++	new->res.count   = 0;
++	new->res.fattr   = &new->fattr;
++	nfs_fattr_init(&new->fattr);
++	new->res.verf    = &new->verf;
++	new->args.context = get_nfs_open_context(old->args.context);
++	new->pdata.lseg = NULL;
++	new->pdata.call_ops = old->pdata.call_ops;
++	new->pdata.how = old->pdata.how;
++out:
++	return new;
 +}
 +
-+/*
-+ * device_slice -- check to see if device is a slice or DM
-+ */
-+static boolean_t
-+device_slice(dev_t devid)
++static void filelayout_commit_call_done(struct rpc_task *task, void *data)
 +{
-+	struct block_device	*bd	= open_by_devnum(devid, FMODE_READ);
-+	boolean_t		rval	= False;
-+	
-+	if (bd) {
-+		if (bd->bd_disk->minors > 1)
-+			rval = True;
-+		blkdev_put(bd, FMODE_READ);
-+	}
-+	return rval;
++	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
++
++	wdata->pdata.call_ops->rpc_call_done(task, data);
 +}
 +
++static struct rpc_call_ops filelayout_commit_call_ops = {
++	.rpc_call_prepare = nfs_write_prepare,
++	.rpc_call_done = filelayout_commit_call_done,
++	.rpc_release = filelayout_write_release,
++};
++
 +/*
-+ * device_dm -- check to see if device is a Device Mapper volume.
-+ *
-+ * Returns 1 for DM or 0 if not
++ * Execute a COMMIT op to the MDS or to each data server on which a page
++ * in 'pages' exists.
++ * Invoke the pnfs_commit_complete callback.
 + */
-+static boolean_t
-+device_dm(dev_t devid)
++enum pnfs_try_status
++filelayout_commit(struct nfs_write_data *data, int sync)
 +{
-+	boolean_t		rval = False;
-+	bl_comm_msg_t		msg;
-+	bl_comm_res_t		*res;
-+	
-+	msg.msg_type	= PNFS_UPCALL_MSG_DMCHK;
-+	msg.u.msg_dev	= devid;
-+	if (bl_upcall(bl_comm_global, &msg, &res)) {
-+		dprintk("Failed upcall to check on DM status\n");
-+	} else if (res->u.dm_vol) {
-+		rval = True;
-+		dprintk("Device is DM volume\n");
-+	} else
-+		dprintk("Device is not DM volume\n");
-+	kfree(res);
-+	
-+	return rval;
-+}
++	LIST_HEAD(head);
++	struct nfs_page *req;
++	loff_t file_offset = 0;
++	u16 idx, i;
++	struct list_head **ds_page_list = NULL;
++	u16 *indices_used;
++	int num_indices_seen = 0;
++	bool used_mds = false;
++	const struct rpc_call_ops *call_ops;
++	struct rpc_clnt *clnt;
++	struct nfs_write_data **clone_list = NULL;
++	struct nfs_write_data *dsdata;
++	struct nfs4_pnfs_ds *ds;
 +
-+static boolean_t
-+layout_inode_add(struct inode *i, bl_layout_rec_t **p)
-+{
-+	bl_layout_rec_t		*r	= NULL;
++	dprintk("%s data %p sync %d\n", __func__, data, sync);
 +
-+	if (!i->i_op->fiemap || !i->i_op->fallocate) {
-+		printk("pNFS: file system doesn't support required fiemap or"
-+		    "fallocate methods\n");
-+		return False;
++	/* Alloc room for both in one go */
++	ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) *
++			       (sizeof(u16) + sizeof(struct list_head *)),
++			       GFP_KERNEL);
++	if (!ds_page_list)
++		goto mem_error;
++	indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1);
++	/*
++	 * Sort pages based on which ds to send to.
++	 * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT.
++	 * Note we are assuming there is only a single lseg in play.
++	 * When that is not true, we could first sort on lseg, then
++	 * sort within each as we do here.
++	 */
++	while (!list_empty(&data->pages)) {
++		req = nfs_list_entry(data->pages.next);
++		nfs_list_remove_request(req);
++		if (!req->wb_lseg ||
++		    ((struct nfs4_filelayout_segment *)
++		     FILELAYOUT_LSEG(req->wb_lseg))->commit_through_mds)
++			idx = NFS4_PNFS_MAX_MULTI_CNT;
++		else {
++			file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
++			idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset);
++		}
++		if (ds_page_list[idx]) {
++			/* Already seen this idx */
++			list_add(&req->wb_list, ds_page_list[idx]);
++		} else {
++			/* New idx not seen so far */
++			list_add_tail(&req->wb_list, &head);
++			indices_used[num_indices_seen++] = idx;
++		}
++		ds_page_list[idx] = &req->wb_list;
 +	}
-+	
-+	r = kmalloc(sizeof (*r), GFP_KERNEL);
-+	if (!r)
-+		goto error;
-+
-+	r->blr_rdev	= i->i_sb->s_dev;
-+	r->blr_inode	= i;
-+	r->blr_orig_size = i->i_size;
-+	r->blr_ext_size	= 0;
-+	r->blr_recalled	= 0;
-+	INIT_LIST_HEAD(&r->blr_layouts);
-+	spin_lock_init(&r->blr_lock);
-+	spin_lock(&layout_hashtbl_lock);
-+	list_add_tail(&r->blr_hash, &layout_hash);
-+	spin_unlock(&layout_hashtbl_lock);
-+	*p = r;
-+	return True;
-+	
-+error:
-+	if (r)
-+		kfree(r);
-+	return False;
-+}
++	/* Once created, clone must be released via call_op */
++	clone_list = kzalloc(num_indices_seen *
++			     sizeof(struct nfs_write_data *), GFP_KERNEL);
++	if (!clone_list)
++		goto mem_error;
++	for (i = 0; i < num_indices_seen - 1; i++) {
++		if (indices_used[i] == NFS4_PNFS_MAX_MULTI_CNT) {
++			used_mds = true;
++			clone_list[i] = data;
++		} else {
++			clone_list[i] = filelayout_clone_write_data(data);
++			if (!clone_list[i])
++				goto mem_error;
++		}
++	}
++	if (used_mds) {
++		clone_list[i] = filelayout_clone_write_data(data);
++		if (!clone_list[i])
++			goto mem_error;
++	} else
++		clone_list[i] = data;
++	/*
++	 * Now send off the RPCs to each ds.  Note that it is important
++	 * that any RPC to the MDS be sent last (or at least after all
++	 * clones have been made.)
++	 */
++	for (i = 0; i < num_indices_seen; i++) {
++		dsdata = clone_list[i];
++		idx = indices_used[i];
++		list_cut_position(&dsdata->pages, &head, ds_page_list[idx]);
++		if (idx == NFS4_PNFS_MAX_MULTI_CNT) {
++			call_ops = data->pdata.call_ops;;
++			clnt = NFS_CLIENT(dsdata->inode);
++			ds = NULL;
++		} else {
++			struct nfs_fh *fh;
 +
-+static bl_layout_rec_t *
-+__layout_inode_find(struct inode *i)
-+{
-+	bl_layout_rec_t	*r;
-+	
-+	if (!list_empty(&layout_hash)) {
-+		list_for_each_entry(r, &layout_hash, blr_hash) {
-+			if ((r->blr_inode->i_ino == i->i_ino) &&
-+			    (r->blr_rdev == i->i_sb->s_dev)) {
-+				return r;
++			call_ops = &filelayout_commit_call_ops;
++			req = nfs_list_entry(dsdata->pages.next);
++			ds = nfs4_fl_prepare_ds(req->wb_lseg, idx);
++			if (!ds) {
++				/* Trigger retry of this chunk through MDS */
++				dsdata->task.tk_status = -EIO;
++				data->pdata.call_ops->rpc_release(dsdata);
++				continue;
 +			}
++			clnt = ds->ds_clp->cl_rpcclient;
++			dsdata->fldata.ds_nfs_client = ds->ds_clp;
++			file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
++			fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset);
++			if (fh)
++				dsdata->args.fh = fh;
++		}
++		dprintk("%s: Initiating commit: %llu USE DS:\n",
++			__func__, file_offset);
++		ifdebug(FACILITY)
++			print_ds(ds);
++
++		/* Send COMMIT to data server */
++		nfs_initiate_commit(dsdata, clnt, call_ops, sync);
++	}
++	kfree(clone_list);
++	kfree(ds_page_list);
++	data->pdata.pnfs_error = 0;
++	return PNFS_ATTEMPTED;
++
++ mem_error:
++	if (clone_list) {
++		for (i = 0; i < num_indices_seen - 1; i++) {
++			if (!clone_list[i])
++				break;
++			data->pdata.call_ops->rpc_release(clone_list[i]);
 +		}
++		kfree(clone_list);
 +	}
-+	return NULL;
++	kfree(ds_page_list);
++	/* One of these will be empty, but doesn't hurt to do both */
++	nfs_mark_list_commit(&head);
++	nfs_mark_list_commit(&data->pages);
++	data->pdata.call_ops->rpc_release(data);
++	return PNFS_ATTEMPTED;
 +}
 +
-+static bl_layout_rec_t *
-+layout_inode_find(struct inode *i)
++/*
++ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
++ *
++ * return 1 :  coalesce page
++ * return 0 :  don't coalesce page
++ *
++ * By the time this is called, we know req->wb_lseg == prev->wb_lseg
++ */
++int
++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
++		   struct nfs_page *req)
 +{
-+	bl_layout_rec_t	*r;
++	u64 p_stripe, r_stripe;
++	u32 stripe_unit;
 +
-+	spin_lock(&layout_hashtbl_lock);
-+	r = __layout_inode_find(i);
-+	spin_unlock(&layout_hashtbl_lock);
-+	
-+	return r;
++	if (!req->wb_lseg)
++		return 1;
++	p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
++	r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
++	stripe_unit = FILELAYOUT_LSEG(req->wb_lseg)->stripe_unit;
++
++	do_div(p_stripe, stripe_unit);
++	do_div(r_stripe, stripe_unit);
++
++	return (p_stripe == r_stripe);
 +}
 +
-+static void
-+layout_inode_del(struct inode *i)
+ static struct pnfs_layoutdriver_type filelayout_type = {
+ 	.id = LAYOUT_NFSV4_1_FILES,
+ 	.name = "LAYOUT_NFSV4_1_FILES",
+ 	.owner = THIS_MODULE,
++	.flags                   = PNFS_USE_RPC_CODE,
+ 	.set_layoutdriver = filelayout_set_layoutdriver,
+ 	.clear_layoutdriver = filelayout_clear_layoutdriver,
+ 	.alloc_lseg              = filelayout_alloc_lseg,
+ 	.free_lseg               = filelayout_free_lseg,
++	.pg_test                 = filelayout_pg_test,
++	.read_pagelist           = filelayout_read_pagelist,
++	.write_pagelist          = filelayout_write_pagelist,
++	.commit                  = filelayout_commit,
+ };
+ 
+ static int __init nfs4filelayout_init(void)
+diff -up linux-2.6.37.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.37.noarch/fs/nfs/nfs4filelayoutdev.c
+--- linux-2.6.37.noarch/fs/nfs/nfs4filelayoutdev.c.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfs/nfs4filelayoutdev.c	2011-01-28 09:43:53.321774623 -0500
+@@ -104,6 +104,109 @@ _data_server_lookup_locked(u32 ip_addr, 
+ 	return NULL;
+ }
+ 
++/* Create an rpc to the data server defined in 'dev_list' */
++static int
++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
 +{
-+	bl_layout_rec_t	*r;
-+	
-+	spin_lock(&layout_hashtbl_lock);
-+	r = __layout_inode_find(i);
-+	if (r) {
-+		spin_lock(&r->blr_lock);
-+		if (list_empty(&r->blr_layouts)) {
-+			list_del(&r->blr_hash);
-+			spin_unlock(&r->blr_lock);
-+			kfree(r);
++	struct nfs_server	*tmp;
++	struct sockaddr_in	sin;
++	struct rpc_clnt		*mds_clnt = mds_srv->client;
++	struct nfs_client	*clp = mds_srv->nfs_client;
++	struct sockaddr		*mds_addr;
++	int err = 0;
++
++	dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
++		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
++		mds_clnt->cl_auth->au_flavor);
++
++	sin.sin_family = AF_INET;
++	sin.sin_addr.s_addr = ds->ds_ip_addr;
++	sin.sin_port = ds->ds_port;
++
++	/*
++	 * If this DS is also the MDS, use the MDS session only if the
++	 * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role.
++	 */
++	mds_addr = (struct sockaddr *)&clp->cl_addr;
++	if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) {
++		if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
++			printk(KERN_INFO
++			       "ip:port %x:%hu is not a pNFS Data Server\n",
++			       ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
++			err = -ENODEV;
 +		} else {
-+			spin_unlock(&r->blr_lock);
++			atomic_inc(&clp->cl_count);
++			ds->ds_clp = clp;
++			dprintk("%s Using MDS Session for DS\n", __func__);
 +		}
-+	} else {
-+		dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n",
-+			__func__, i->i_sb->s_dev, i->i_ino);
++		goto out;
 +	}
-+	spin_unlock(&layout_hashtbl_lock);
-+}
 +
-+/*
-+ * map_state2name -- converts state in ascii string.
-+ *
-+ * Used for debug messages only.
-+ */
-+static char *
-+map_state2name(enum pnfs_block_extent_state4 s)
-+{
-+	switch (s) {
-+	case PNFS_BLOCK_READWRITE_DATA:	return "     RW";
-+	case PNFS_BLOCK_READ_DATA:	return "     RO";
-+	case PNFS_BLOCK_INVALID_DATA:	return "INVALID";
-+	case PNFS_BLOCK_NONE_DATA:	return "   NONE";
-+	default:
-+		BUG();
-+	}
-+}
++	/* Temporary server for nfs4_set_client */
++	tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
++	if (!tmp)
++		goto out;
 +
-+static pnfs_blocklayout_devinfo_t *
-+bld_alloc(struct list_head *volumes, int type)
-+{
-+	pnfs_blocklayout_devinfo_t *bld;
-+	
-+	bld = kmalloc(sizeof (*bld), GFP_KERNEL);
-+	if (!bld)
-+		return NULL;
++	/*
++	 * Set a retrans, timeout interval, and authflavor equal to the MDS
++	 * values. Use the MDS nfs_client cl_ipaddr field so as to use the
++	 * same co_ownerid as the MDS.
++	 */
++	err = nfs4_set_client(tmp,
++			      mds_srv->nfs_client->cl_hostname,
++			      (struct sockaddr *)&sin,
++			      sizeof(struct sockaddr),
++			      mds_srv->nfs_client->cl_ipaddr,
++			      mds_clnt->cl_auth->au_flavor,
++			      IPPROTO_TCP,
++			      mds_clnt->cl_xprt->timeout,
++			      1 /* minorversion */);
++	if (err < 0)
++		goto out_free;
 +
-+	memset(bld, 0, sizeof (*bld));
-+	bld->bld_type = type;
-+	list_add_tail(&bld->bld_list, volumes);
++	clp = tmp->nfs_client;
 +
-+	return bld;
-+}
++	/* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */
++	dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp);
++	clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS;
 +
-+static void
-+bld_free(pnfs_blocklayout_devinfo_t *bld)
-+{
-+	list_del(&bld->bld_list);
-+	kfree(bld);
-+}
++	err = nfs4_recover_expired_lease(clp);
++	if (!err)
++		err = nfs4_check_client_ready(clp);
++	if (err)
++		goto out_put;
 +
-+static void
-+print_bll(pnfs_blocklayout_layout_t *b, char *text)
-+{
-+	dprintk("    BLL: %s\n", text);
-+	dprintk("    foff %Lu, soff %Lu, len %Lu, state %s\n",
-+	    _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len),
-+	    map_state2name(b->bll_es));
++	if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
++		printk(KERN_INFO "ip:port %x:%hu is not a pNFS Data Server\n",
++		       ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
++		err = -ENODEV;
++		goto out_put;
++	}
++	/*
++	 * Set DS lease equal to the MDS lease, renewal is scheduled in
++	 * create_session
++	 */
++	spin_lock(&mds_srv->nfs_client->cl_lock);
++	clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
++	spin_unlock(&mds_srv->nfs_client->cl_lock);
++	clp->cl_last_renewal = jiffies;
++
++	clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
++	ds->ds_clp = clp;
++
++	dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__,
++				ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
++				clp->cl_rpcclient);
++out_free:
++	kfree(tmp);
++out:
++	dprintk("%s Returns %d\n", __func__, err);
++	return err;
++out_put:
++	nfs_put_client(clp);
++	goto out_free;
 +}
 +
-+static inline void
-+bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c)
+ static void
+ destroy_ds(struct nfs4_pnfs_ds *ds)
+ {
+@@ -446,3 +549,72 @@ nfs4_fl_find_get_deviceid(struct nfs_cli
+ 	return (d == NULL) ? NULL :
+ 		container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+ }
++
++/*
++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
++ * Then: ((res + fsi) % dsaddr->stripe_count)
++ */
++static u32
++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
 +{
-+	pnfs_blocklayout_layout_t	*n;
-+	int				dbg_count	= 0;
-+	u64				endpoint;
-+	
-+	BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA);
-+	while (c->bll_list.next != &r->blr_layouts) {
-+		n = list_entry(c->bll_list.next,
-+			       struct pnfs_blocklayout_layout, bll_list);
-+		endpoint = BLL_S_END(c);
-+		if ((n->bll_soff >= c->bll_soff) &&
-+		    (n->bll_soff < endpoint)) {
-+			if (endpoint < BLL_S_END(n)) {
-+				/*
-+				 * The following is possible.
-+				 *
-+				 * 
-+				 * Existing: +---+                 +---+
-+				 *      New: +-----------------------+
-+				 * The client request merge entries together
-+				 * but didn't require picking up all of the
-+				 * last entry. So, we still need to delete
-+				 * the last entry and add the remaining space
-+				 * to the new entry.
-+				 */
-+				c->bll_len += BLL_S_END(n) - endpoint;
-+			}
-+			dbg_count++;
-+			list_del(&n->bll_list);
-+			kfree(n);
-+		} else {
-+			break;
-+		}
-+	}
-+	/* ---- Debug only, remove before integration ---- */
-+	if (dbg_count)
-+		dprintk("  Collapsed %d cache entries between %Lu(s) and %Lu(s)\n",
-+			dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c)));
++	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
++	u64 tmp;
++
++	tmp = offset - flseg->pattern_offset;
++	do_div(tmp, flseg->stripe_unit);
++	tmp += flseg->first_stripe_index;
++	return do_div(tmp, flseg->dsaddr->stripe_count);
 +}
 +
-+static pnfs_blocklayout_layout_t *
-+bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h)
++u32
++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset)
 +{
-+	pnfs_blocklayout_layout_t	*n	= NULL;
-+	
-+	n = kmalloc(sizeof (*n), GFP_KERNEL);
-+	if (n) {
-+		memset(n, 0, sizeof (*n));
-+		n->bll_foff		= offset;
-+		n->bll_len		= len;
-+		n->bll_cache_state	= state;
-+		if (h)
-+			list_add_tail(&n->bll_list, h);
-+	}
-+	return n;
++	u32 j;
++
++	j = _nfs4_fl_calc_j_index(lseg, offset);
++	return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
 +}
 +
-+static pnfs_blocklayout_layout_t *
-+bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c,
-+	      struct list_head *h)
++struct nfs_fh *
++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset)
 +{
-+	pnfs_blocklayout_layout_t	*n	= NULL;
-+	
-+	n = bll_alloc(b->bll_foff, b->bll_len, c, h);
-+	if (n) {
-+		n->bll_es			= b->bll_es;
-+		n->bll_soff			= b->bll_soff;
-+		n->bll_vol_id.devid		= b->bll_vol_id.devid;
-+	}
-+	return n;
++	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
++	u32 i;
++
++	if (flseg->stripe_type == STRIPE_SPARSE) {
++		if (flseg->num_fh == 1)
++			i = 0;
++		else if (flseg->num_fh == 0)
++			return NULL;
++		else
++			i = nfs4_fl_calc_ds_index(lseg, offset);
++	} else
++		i = _nfs4_fl_calc_j_index(lseg, offset);
++	return flseg->fh_array[i];
 +}
 +
-+static inline boolean_t
-+layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode,
-+		enum pnfs_block_extent_state4 *s)
++struct nfs4_pnfs_ds *
++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 +{
-+	/* ---- Normal case ---- */
-+	*s = b->bll_es;
-+	
-+	switch (b->bll_es) {
-+	case PNFS_BLOCK_READWRITE_DATA:
-+		if (iomode == IOMODE_READ)
-+			*s = PNFS_BLOCK_READ_DATA;
-+		/* ---- Any use is permitted. ---- */
-+		break;
-+	case PNFS_BLOCK_READ_DATA:
-+		/* ---- Committed as read only data. ---- */
-+		if (iomode == IOMODE_RW)
-+			return False;
-+		break;
-+	case PNFS_BLOCK_INVALID_DATA:
-+		/* ---- Blocks have been allocated, but not initialized ---- */
-+		if (iomode == IOMODE_READ)
-+			*s = PNFS_BLOCK_NONE_DATA;
-+		break;
-+	case PNFS_BLOCK_NONE_DATA:
-+		/* ---- Hole-y file. No backing store avail. ---- */
-+		if (iomode != IOMODE_READ)
-+			return False;
-+		break;
-+	default:
-+		BUG();
++	struct nfs4_file_layout_dsaddr *dsaddr;
++
++	dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
++	if (dsaddr->ds_list[ds_idx] == NULL) {
++		printk(KERN_ERR "%s: No data server for device id!\n",
++			__func__);
++		return NULL;
 +	}
-+	return True;
++
++	if (!dsaddr->ds_list[ds_idx]->ds_clp) {
++		int err;
++
++		err = nfs4_pnfs_ds_create(NFS_SERVER(lseg->layout->inode),
++					  dsaddr->ds_list[ds_idx]);
++		if (err) {
++			printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n",
++			       __func__, err);
++			return NULL;
++		}
++	}
++	return dsaddr->ds_list[ds_idx];
++}
+diff -up linux-2.6.37.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.37.noarch/fs/nfs/nfs4filelayout.h
+--- linux-2.6.37.noarch/fs/nfs/nfs4filelayout.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfs/nfs4filelayout.h	2011-01-28 09:43:53.321774623 -0500
+@@ -83,9 +83,15 @@ FILELAYOUT_LSEG(struct pnfs_layout_segme
+ 			    generic_hdr);
+ }
+ 
++extern struct nfs_fh *
++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset);
++
+ extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
+ extern void print_ds(struct nfs4_pnfs_ds *ds);
+ extern void print_deviceid(struct nfs4_deviceid *dev_id);
++u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset);
++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
++					u32 ds_idx);
+ extern struct nfs4_file_layout_dsaddr *
+ nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+ struct nfs4_file_layout_dsaddr *
+diff -up linux-2.6.37.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.37.noarch/fs/nfs/nfs4_fs.h
+--- linux-2.6.37.noarch/fs/nfs/nfs4_fs.h.orig	2011-01-28 09:37:32.536980156 -0500
++++ linux-2.6.37.noarch/fs/nfs/nfs4_fs.h	2011-01-28 09:43:53.319774971 -0500
+@@ -44,9 +44,9 @@ enum nfs4_client_state {
+ 	NFS4CLNT_RECLAIM_REBOOT,
+ 	NFS4CLNT_RECLAIM_NOGRACE,
+ 	NFS4CLNT_DELEGRETURN,
+-	NFS4CLNT_LAYOUTRECALL,
+ 	NFS4CLNT_SESSION_RESET,
+ 	NFS4CLNT_RECALL_SLOT,
++	NFS4CLNT_LAYOUT_RECALL,
+ };
+ 
+ enum nfs4_session_state {
+@@ -236,7 +236,7 @@ extern int nfs4_proc_async_renew(struct 
+ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
+ extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
+ extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
+-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
++extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
+ extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
+ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
+ 		struct nfs4_fs_locations *fs_locations, struct page *page);
+@@ -250,10 +250,12 @@ static inline struct nfs4_session *nfs4_
+ }
+ 
+ extern int nfs4_setup_sequence(const struct nfs_server *server,
++		struct nfs4_session *ds_session,
+ 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+ 		int cache_reply, struct rpc_task *task);
+ extern void nfs4_destroy_session(struct nfs4_session *session);
+ extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *);
+ extern int nfs4_proc_create_session(struct nfs_client *);
+ extern int nfs4_proc_destroy_session(struct nfs4_session *);
+ extern int nfs4_init_session(struct nfs_server *server);
+@@ -266,6 +268,7 @@ static inline struct nfs4_session *nfs4_
+ }
+ 
+ static inline int nfs4_setup_sequence(const struct nfs_server *server,
++		struct nfs4_session *ds_session,
+ 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+ 		int cache_reply, struct rpc_task *task)
+ {
+@@ -283,7 +286,7 @@ extern const struct nfs4_minor_version_o
+ extern const u32 nfs4_fattr_bitmap[2];
+ extern const u32 nfs4_statfs_bitmap[2];
+ extern const u32 nfs4_pathconf_bitmap[2];
+-extern const u32 nfs4_fsinfo_bitmap[2];
++extern const u32 nfs4_fsinfo_bitmap[3];
+ extern const u32 nfs4_fs_locations_bitmap[2];
+ 
+ /* nfs4renewd.c */
+@@ -293,13 +296,24 @@ extern void nfs4_kill_renewd(struct nfs_
+ extern void nfs4_renew_state(struct work_struct *);
+ 
+ /* nfs4state.c */
++struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
+ struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
+ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
+ #if defined(CONFIG_NFS_V4_1)
+-struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
+ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
+ #endif /* CONFIG_NFS_V4_1 */
+ 
++static inline struct rpc_cred *
++nfs4_get_machine_cred(struct nfs_client *clp)
++{
++	struct rpc_cred *cred;
++
++	spin_lock(&clp->cl_lock);
++	cred = nfs4_get_machine_cred_locked(clp);
++	spin_unlock(&clp->cl_lock);
++	return cred;
 +}
 +
-+#endif /* CONFIG_SPNFS_BLOCK */
-diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
-index c2a4f71..2e025c2 100644
---- a/fs/nfsd/export.c
-+++ b/fs/nfsd/export.c
-@@ -17,11 +17,19 @@
- #include <linux/module.h>
- #include <linux/exportfs.h>
+ extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
+ extern void nfs4_put_state_owner(struct nfs4_state_owner *);
+ extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
+diff -up linux-2.6.37.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.37.noarch/fs/nfs/nfs4proc.c
+--- linux-2.6.37.noarch/fs/nfs/nfs4proc.c.orig	2011-01-28 09:37:32.539980051 -0500
++++ linux-2.6.37.noarch/fs/nfs/nfs4proc.c	2011-01-28 09:43:53.324774117 -0500
+@@ -69,7 +69,7 @@ struct nfs4_opendata;
+ static int _nfs4_proc_open(struct nfs4_opendata *data);
+ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
+ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
+-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *);
+ static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+@@ -127,12 +127,13 @@ const u32 nfs4_pathconf_bitmap[2] = {
+ 	0
+ };
+ 
+-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
+ 			| FATTR4_WORD0_MAXREAD
+ 			| FATTR4_WORD0_MAXWRITE
+ 			| FATTR4_WORD0_LEASE_TIME,
+ 			FATTR4_WORD1_TIME_DELTA
+-			| FATTR4_WORD1_FS_LAYOUT_TYPES
++			| FATTR4_WORD1_FS_LAYOUT_TYPES,
++			FATTR4_WORD2_LAYOUT_BLKSIZE
+ };
+ 
+ const u32 nfs4_fs_locations_bitmap[2] = {
+@@ -572,6 +573,7 @@ static int nfs41_setup_sequence(struct n
+ }
+ 
+ int nfs4_setup_sequence(const struct nfs_server *server,
++		struct nfs4_session *ds_session,
+ 			struct nfs4_sequence_args *args,
+ 			struct nfs4_sequence_res *res,
+ 			int cache_reply,
+@@ -580,6 +582,8 @@ int nfs4_setup_sequence(const struct nfs
+ 	struct nfs4_session *session = nfs4_get_session(server);
+ 	int ret = 0;
+ 
++	if (ds_session)
++		session = ds_session;
+ 	if (session == NULL) {
+ 		args->sa_session = NULL;
+ 		res->sr_session = NULL;
+@@ -610,7 +614,7 @@ static void nfs41_call_sync_prepare(stru
+ 
+ 	dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
+ 
+-	if (nfs4_setup_sequence(data->seq_server, data->seq_args,
++	if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args,
+ 				data->seq_res, data->cache_reply, task))
+ 		return;
+ 	rpc_call_start(task);
+@@ -1398,7 +1402,7 @@ static void nfs4_open_prepare(struct rpc
+ 		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
+ 	}
+ 	data->timestamp = jiffies;
+-	if (nfs4_setup_sequence(data->o_arg.server,
++	if (nfs4_setup_sequence(data->o_arg.server, NULL,
+ 				&data->o_arg.seq_args,
+ 				&data->o_res.seq_res, 1, task))
+ 		return;
+@@ -1573,9 +1577,8 @@ static int _nfs4_proc_open(struct nfs4_o
+ 	return 0;
+ }
+ 
+-static int nfs4_recover_expired_lease(struct nfs_server *server)
++int nfs4_recover_expired_lease(struct nfs_client *clp)
+ {
+-	struct nfs_client *clp = server->nfs_client;
+ 	unsigned int loop;
+ 	int ret;
+ 
+@@ -1591,6 +1594,7 @@ static int nfs4_recover_expired_lease(st
+ 	}
+ 	return ret;
+ }
++EXPORT_SYMBOL(nfs4_recover_expired_lease);
+ 
+ /*
+  * OPEN_EXPIRED:
+@@ -1679,7 +1683,7 @@ static int _nfs4_do_open(struct inode *d
+ 		dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
+ 		goto out_err;
+ 	}
+-	status = nfs4_recover_expired_lease(server);
++	status = nfs4_recover_expired_lease(server->nfs_client);
+ 	if (status != 0)
+ 		goto err_put_state_owner;
+ 	if (path->dentry->d_inode != NULL)
+@@ -1839,8 +1843,6 @@ struct nfs4_closedata {
+ 	struct nfs_closeres res;
+ 	struct nfs_fattr fattr;
+ 	unsigned long timestamp;
+-	bool roc;
+-	u32 roc_barrier;
+ };
+ 
+ static void nfs4_free_closedata(void *data)
+@@ -1848,8 +1850,6 @@ static void nfs4_free_closedata(void *da
+ 	struct nfs4_closedata *calldata = data;
+ 	struct nfs4_state_owner *sp = calldata->state->owner;
+ 
+-	if (calldata->roc)
+-		pnfs_roc_release(calldata->state->inode);
+ 	nfs4_put_open_state(calldata->state);
+ 	nfs_free_seqid(calldata->arg.seqid);
+ 	nfs4_put_state_owner(sp);
+@@ -1882,9 +1882,6 @@ static void nfs4_close_done(struct rpc_t
+ 	 */
+ 	switch (task->tk_status) {
+ 		case 0:
+-			if (calldata->roc)
+-				pnfs_roc_set_barrier(state->inode,
+-						     calldata->roc_barrier);
+ 			nfs_set_open_stateid(state, &calldata->res.stateid, 0);
+ 			renew_lease(server, calldata->timestamp);
+ 			nfs4_close_clear_stateid_flags(state,
+@@ -1897,7 +1894,7 @@ static void nfs4_close_done(struct rpc_t
+ 			if (calldata->arg.fmode == 0)
+ 				break;
+ 		default:
+-			if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
++			if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
+ 				rpc_restart_call_prepare(task);
+ 	}
+ 	nfs_release_seqid(calldata->arg.seqid);
+@@ -1937,19 +1934,12 @@ static void nfs4_close_prepare(struct rp
+ 		return;
+ 	}
+ 
+-	if (calldata->arg.fmode == 0) {
++	if (calldata->arg.fmode == 0)
+ 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+-		if (calldata->roc &&
+-		    pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
+-			rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
+-				     task, NULL);
+-			return;
+-		}
+-	}
+ 
+ 	nfs_fattr_init(calldata->res.fattr);
+ 	calldata->timestamp = jiffies;
+-	if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
++	if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL,
+ 				&calldata->arg.seq_args, &calldata->res.seq_res,
+ 				1, task))
+ 		return;
+@@ -1973,7 +1963,7 @@ static const struct rpc_call_ops nfs4_cl
+  *
+  * NOTE: Caller must be holding the sp->so_owner semaphore!
+  */
+-int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
++int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
+ {
+ 	struct nfs_server *server = NFS_SERVER(state->inode);
+ 	struct nfs4_closedata *calldata;
+@@ -2008,7 +1998,6 @@ int nfs4_do_close(struct path *path, str
+ 	calldata->res.fattr = &calldata->fattr;
+ 	calldata->res.seqid = calldata->arg.seqid;
+ 	calldata->res.server = server;
+-	calldata->roc = roc;
+ 	path_get(path);
+ 	calldata->path = *path;
+ 
+@@ -2026,8 +2015,6 @@ int nfs4_do_close(struct path *path, str
+ out_free_calldata:
+ 	kfree(calldata);
+ out:
+-	if (roc)
+-		pnfs_roc_release(state->inode);
+ 	nfs4_put_open_state(state);
+ 	nfs4_put_state_owner(sp);
+ 	return status;
+@@ -2269,6 +2256,9 @@ nfs4_proc_setattr(struct dentry *dentry,
+ 	struct nfs4_state *state = NULL;
+ 	int status;
+ 
++	if (pnfs_ld_layoutret_on_setattr(inode))
++		pnfs_return_layout(inode, NULL, true);
++
+ 	nfs_fattr_init(fattr);
+ 	
+ 	/* Search for an existing open(O_WRITE) file */
+@@ -2596,7 +2586,7 @@ static int nfs4_proc_unlink_done(struct 
  
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+#if defined(CONFIG_SPNFS)
-+#include <linux/nfsd4_spnfs.h>
-+#if defined(CONFIG_SPNFS_BLOCK)
-+#include <linux/nfsd4_block.h>
-+#endif
-+#endif
- #include <linux/nfsd/syscall.h>
- #include <net/ipv6.h>
+ 	if (!nfs4_sequence_done(task, &res->seq_res))
+ 		return 0;
+-	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
++	if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN)
+ 		return 0;
+ 	update_changeattr(dir, &res->cinfo);
+ 	nfs_post_op_update_inode(dir, res->dir_attr);
+@@ -2621,7 +2611,7 @@ static int nfs4_proc_rename_done(struct 
  
- #include "nfsd.h"
- #include "nfsfh.h"
-+#include "pnfsd.h"
+ 	if (!nfs4_sequence_done(task, &res->seq_res))
+ 		return 0;
+-	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
++	if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN)
+ 		return 0;
  
- #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
+ 	update_changeattr(old_dir, &res->old_cinfo);
+@@ -3072,19 +3062,31 @@ static int nfs4_proc_pathconf(struct nfs
+ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+ {
+ 	struct nfs_server *server = NFS_SERVER(data->inode);
++	struct nfs_client *client = server->nfs_client;
  
-@@ -352,10 +360,84 @@ static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
- 	return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
- }
+ 	dprintk("--> %s\n", __func__);
  
-+#if defined(CONFIG_PNFSD)
-+static struct pnfsd_cb_operations pnfsd_cb_op = {
-+	.cb_layout_recall = nfsd_layout_recall_cb,
-+	.cb_device_notify = nfsd_device_notify_cb,
-+
-+	.cb_get_state = nfs4_pnfs_cb_get_state,
-+	.cb_change_state = nfs4_pnfs_cb_change_state,
-+};
-+
-+#if defined(CONFIG_SPNFS)
-+static struct pnfs_export_operations spnfs_export_ops = {
-+	.layout_type = spnfs_layout_type,
-+	.get_device_info = spnfs_getdeviceinfo,
-+	.get_device_iter = spnfs_getdeviceiter,
-+	.layout_get = spnfs_layoutget,
-+	.layout_return = spnfs_layoutreturn,
-+};
-+
-+static struct pnfs_export_operations spnfs_ds_export_ops = {
-+	.get_state = spnfs_get_state,
-+};
++#ifdef CONFIG_NFS_V4_1
++	if (data->pdata.pnfsflags & PNFS_NO_RPC)
++		return 0;
 +
-+#if defined(CONFIG_SPNFS_BLOCK)
-+static struct pnfs_export_operations bl_export_ops = {
-+	.layout_type = bl_layout_type,
-+	.get_device_info = bl_getdeviceinfo,
-+	.get_device_iter = bl_getdeviceiter,
-+	.layout_get = bl_layoutget,
-+	.layout_return = bl_layoutreturn,
-+};
-+#endif /* CONFIG_SPNFS_BLOCK */
-+#endif /* CONFIG_SPNFS */
-+#endif /* CONFIG_PNFSD */
++	/* Is this a DS session */
++	if (data->fldata.ds_nfs_client) {
++		dprintk("%s DS read\n", __func__);
++		client = data->fldata.ds_nfs_client;
++	}
++#endif /* CONFIG_NFS_V4_1 */
 +
- static struct svc_export *svc_export_update(struct svc_export *new,
- 					    struct svc_export *old);
- static struct svc_export *svc_export_lookup(struct svc_export *);
+ 	if (!nfs4_sequence_done(task, &data->res.seq_res))
+ 		return -EAGAIN;
  
-+static int pnfsd_check_export(struct inode *inode, int *flags)
+-	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
+-		nfs_restart_rpc(task, server->nfs_client);
++	if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
++		nfs_restart_rpc(task, client);
+ 		return -EAGAIN;
+ 	}
+ 
+ 	nfs_invalidate_atime(data->inode);
+-	if (task->tk_status > 0)
++	if (task->tk_status > 0 && client == server->nfs_client)
+ 		renew_lease(server, data->timestamp);
+ 	return 0;
+ }
+@@ -3095,20 +3097,56 @@ static void nfs4_proc_read_setup(struct 
+ 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+ }
+ 
++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data)
 +{
-+#if defined(CONFIG_PNFSD)
-+
-+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
-+	if (!inode->i_sb->s_pnfs_op)
-+		pnfsd_lexp_init(inode);
-+	return 0;
-+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
++#ifdef CONFIG_NFS_V4_1
++	pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
++	pnfs_need_layoutcommit(nfsi, data->args.context);
++#endif /* CONFIG_NFS_V4_1 */
++}
 +
-+#if defined(CONFIG_SPNFS)
-+#if defined(CONFIG_SPNFS_BLOCK)
-+	if (pnfs_block_enabled(inode, *flags)) {
-+		dprintk("set pnfs block export structure... \n");
-+		inode->i_sb->s_pnfs_op = &bl_export_ops;
-+	} else
-+#endif /* CONFIG_SPNFS_BLOCK */
-+	/*
-+	 * spnfs_enabled() indicates we're an MDS.
-+	 * XXX Better to check an export time option as well.
-+	 */
-+	if (spnfs_enabled()) {
-+		dprintk("set spnfs export structure...\n");
-+		inode->i_sb->s_pnfs_op = &spnfs_export_ops;
-+	} else {
-+		dprintk("%s spnfs not in use\n", __func__);
+ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+ {
+ 	struct inode *inode = data->inode;
+-	
++	struct nfs_server *server = NFS_SERVER(inode);
++	struct nfs_client *client = server->nfs_client;
 +
-+		/*
-+		 * get_state is needed if we're a DS using spnfs.
-+		 * XXX Better to check an export time option instead.
-+		 */
-+		inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops;
+ 	if (!nfs4_sequence_done(task, &data->res.seq_res))
+ 		return -EAGAIN;
+ 
+-	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
+-		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
++#ifdef CONFIG_NFS_V4_1
++	/* restore original count after retry? */
++	if (data->pdata.orig_count) {
++		dprintk("%s: restoring original count %u\n", __func__,
++			data->pdata.orig_count);
++		data->args.count = data->pdata.orig_count;
 +	}
-+#endif /* CONFIG_SPNFS */
 +
-+#endif /* CONFIG_PNFSD */
++	if (data->pdata.pnfsflags & PNFS_NO_RPC)
++		return 0;
 +
-+	return 0;
-+}
++	/* Is this a DS session */
++	if (data->fldata.ds_nfs_client) {
++		dprintk("%s DS write\n", __func__);
++		client = data->fldata.ds_nfs_client;
++	}
++#endif /* CONFIG_NFS_V4_1 */
 +
- static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
- {
- 
-@@ -395,8 +477,17 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
- 		return -EINVAL;
++	if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
++		nfs_restart_rpc(task, client);
+ 		return -EAGAIN;
  	}
- 
--	return 0;
-+#if !defined(CONFIG_SPNFS)
-+	if (inode->i_sb->s_pnfs_op &&
-+	    (!inode->i_sb->s_pnfs_op->layout_type ||
-+	     !inode->i_sb->s_pnfs_op->get_device_info ||
-+	     !inode->i_sb->s_pnfs_op->layout_get)) {
-+		dprintk("exp_export: export of invalid fs pnfs export ops.\n");
-+		return -EINVAL;
-+	}
-+#endif /* !CONFIG_SPNFS */
- 
-+	return pnfsd_check_export(inode, flags);
++
++	/*
++	 * MDS write: renew lease
++	 * DS write: update lastbyte written, mark for layout commit
++	 */
+ 	if (task->tk_status >= 0) {
+-		renew_lease(NFS_SERVER(inode), data->timestamp);
+-		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
++		if (client == server->nfs_client) {
++			renew_lease(server, data->timestamp);
++			nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
++		} else
++			pnfs4_update_write_done(NFS_I(inode), data);
+ 	}
+ 	return 0;
  }
+@@ -3121,21 +3159,42 @@ static void nfs4_proc_write_setup(struct
+ 	data->res.server = server;
+ 	data->timestamp   = jiffies;
  
- #ifdef CONFIG_NFSD_V4
-@@ -586,6 +677,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
- 					if (exp.ex_uuid == NULL)
- 						err = -ENOMEM;
- 				}
-+			} else if (strcmp(buf, "pnfs") == 0) {
-+				exp.ex_pnfs = 1;
- 			} else if (strcmp(buf, "secinfo") == 0)
- 				err = secinfo_parse(&mesg, buf, &exp);
- 			else
-@@ -660,6 +753,8 @@ static int svc_export_show(struct seq_file *m,
- 				seq_printf(m, "%02x", exp->ex_uuid[i]);
- 			}
- 		}
-+		if (exp->ex_pnfs)
-+			seq_puts(m, ",pnfs");
- 		show_secinfo(m, exp);
- 	}
- 	seq_puts(m, ")\n");
-@@ -687,6 +782,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
- 	new->ex_fslocs.locations = NULL;
- 	new->ex_fslocs.locations_count = 0;
- 	new->ex_fslocs.migrated = 0;
-+	new->ex_pnfs = 0;
++#ifdef CONFIG_NFS_V4_1
++	/* writes to DS use pnfs vector */
++	if (data->fldata.ds_nfs_client) {
++		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE];
++		return;
++	}
++#endif /* CONFIG_NFS_V4_1 */
+ 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
  }
  
- static void export_update(struct cache_head *cnew, struct cache_head *citem)
-@@ -699,6 +795,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
- 	new->ex_anon_uid = item->ex_anon_uid;
- 	new->ex_anon_gid = item->ex_anon_gid;
- 	new->ex_fsid = item->ex_fsid;
-+	new->ex_pnfs = item->ex_pnfs;
- 	new->ex_uuid = item->ex_uuid;
- 	item->ex_uuid = NULL;
- 	new->ex_pathname = item->ex_pathname;
-@@ -1635,8 +1732,17 @@ nfsd_export_init(void)
- 	if (rv)
- 		return rv;
- 	rv = cache_register(&svc_expkey_cache);
--	if (rv)
-+	if (rv) {
- 		cache_unregister(&svc_export_cache);
-+		goto out;
+ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
+ {
+ 	struct inode *inode = data->inode;
+-	
++	struct nfs_server *server = NFS_SERVER(data->inode);
++	struct nfs_client *client = server->nfs_client;
++
++#ifdef CONFIG_NFS_V4_1
++	if (data->pdata.pnfsflags & PNFS_NO_RPC)
++		return 0;
++
++	/* Is this a DS session */
++	if (data->fldata.ds_nfs_client) {
++		dprintk("%s DS commit\n", __func__);
++		client = data->fldata.ds_nfs_client;
 +	}
-+#if defined(CONFIG_PNFSD)
-+	spin_lock(&pnfsd_cb_ctl.lock);
-+	pnfsd_cb_ctl.module = THIS_MODULE;
-+	pnfsd_cb_ctl.cb_op = &pnfsd_cb_op;
-+	spin_unlock(&pnfsd_cb_ctl.lock);
-+#endif /* CONFIG_PNFSD */
-+out:
- 	return rv;
++#endif /* CONFIG_NFS_V4_1 */
++
+ 	if (!nfs4_sequence_done(task, &data->res.seq_res))
+ 		return -EAGAIN;
  
+-	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
++	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) {
+ 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+ 		return -EAGAIN;
+ 	}
+-	nfs_refresh_inode(inode, data->res.fattr);
++	if (client == server->nfs_client)
++		nfs_refresh_inode(inode, data->res.fattr);
+ 	return 0;
  }
-@@ -1664,6 +1770,12 @@ nfsd_export_shutdown(void)
  
- 	exp_writelock();
+@@ -3145,6 +3204,12 @@ static void nfs4_proc_commit_setup(struc
+ 	
+ 	data->args.bitmask = server->cache_consistency_bitmask;
+ 	data->res.server = server;
++#if defined(CONFIG_NFS_V4_1)
++	if (data->fldata.ds_nfs_client) {
++		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT];
++		return;
++	}
++#endif /* CONFIG_NFS_V4_1 */
+ 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
+ }
  
-+#if defined(CONFIG_PNFSD)
-+	spin_lock(&pnfsd_cb_ctl.lock);
-+	pnfsd_cb_ctl.module = NULL;
-+	pnfsd_cb_ctl.cb_op = NULL;
-+	spin_unlock(&pnfsd_cb_ctl.lock);
-+#endif /* CONFIG_PNFSD */
- 	cache_unregister(&svc_expkey_cache);
- 	cache_unregister(&svc_export_cache);
- 	svcauth_unix_purge();
-diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
-index 988cbb3..fc8f498 100644
---- a/fs/nfsd/nfs4callback.c
-+++ b/fs/nfsd/nfs4callback.c
-@@ -41,7 +41,6 @@
+@@ -3451,9 +3516,10 @@ static int nfs4_proc_set_acl(struct inod
+ }
  
- #define NFSPROC4_CB_NULL 0
- #define NFSPROC4_CB_COMPOUND 1
--#define NFS4_STATEID_SIZE 16
+ static int
+-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp)
+ {
+-	struct nfs_client *clp = server->nfs_client;
++	if (!clp)
++		clp = server->nfs_client;
  
- /* Index of predefined Linux callback client operations */
+ 	if (task->tk_status >= 0)
+ 		return 0;
+@@ -3477,14 +3543,16 @@ nfs4_async_handle_error(struct rpc_task 
+ 		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ 		case -NFS4ERR_SEQ_FALSE_RETRY:
+ 		case -NFS4ERR_SEQ_MISORDERED:
+-			dprintk("%s ERROR %d, Reset session\n", __func__,
+-				task->tk_status);
++			dprintk("%s ERROR %d, Reset session. Exchangeid "
++				"flags 0x%x\n", __func__, task->tk_status,
++				clp->cl_exchange_flags);
+ 			nfs4_schedule_state_recovery(clp);
+ 			task->tk_status = 0;
+ 			return -EAGAIN;
+ #endif /* CONFIG_NFS_V4_1 */
+ 		case -NFS4ERR_DELAY:
+-			nfs_inc_server_stats(server, NFSIOS_DELAY);
++			if (server)
++				nfs_inc_server_stats(server, NFSIOS_DELAY);
+ 		case -NFS4ERR_GRACE:
+ 		case -EKEYEXPIRED:
+ 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
+@@ -3497,6 +3565,8 @@ nfs4_async_handle_error(struct rpc_task 
+ 	task->tk_status = nfs4_map_errors(task->tk_status);
+ 	return 0;
+ do_state_recovery:
++	if (is_ds_only_client(clp))
++		return 0;
+ 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+ 	nfs4_schedule_state_recovery(clp);
+ 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+@@ -3630,8 +3700,8 @@ static void nfs4_delegreturn_done(struct
+ 		renew_lease(data->res.server, data->timestamp);
+ 		break;
+ 	default:
+-		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
+-				-EAGAIN) {
++		if (nfs4_async_handle_error(task, data->res.server, NULL, NULL)
++				== -EAGAIN) {
+ 			nfs_restart_rpc(task, data->res.server->nfs_client);
+ 			return;
+ 		}
+@@ -3651,7 +3721,7 @@ static void nfs4_delegreturn_prepare(str
  
-@@ -49,11 +48,17 @@ enum {
- 	NFSPROC4_CLNT_CB_NULL = 0,
- 	NFSPROC4_CLNT_CB_RECALL,
- 	NFSPROC4_CLNT_CB_SEQUENCE,
-+#if defined(CONFIG_PNFSD)
-+	NFSPROC4_CLNT_CB_LAYOUT,
-+	NFSPROC4_CLNT_CB_DEVICE,
-+#endif
- };
+ 	d_data = (struct nfs4_delegreturndata *)data;
  
- enum nfs_cb_opnum4 {
- 	OP_CB_RECALL            = 4,
-+	OP_CB_LAYOUT            = 5,
- 	OP_CB_SEQUENCE          = 11,
-+	OP_CB_DEVICE            = 14,
- };
+-	if (nfs4_setup_sequence(d_data->res.server,
++	if (nfs4_setup_sequence(d_data->res.server, NULL,
+ 				&d_data->args.seq_args,
+ 				&d_data->res.seq_res, 1, task))
+ 		return;
+@@ -3885,7 +3955,7 @@ static void nfs4_locku_done(struct rpc_t
+ 		case -NFS4ERR_EXPIRED:
+ 			break;
+ 		default:
+-			if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
++			if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN)
+ 				nfs_restart_rpc(task,
+ 						 calldata->server->nfs_client);
+ 	}
+@@ -3903,7 +3973,7 @@ static void nfs4_locku_prepare(struct rp
+ 		return;
+ 	}
+ 	calldata->timestamp = jiffies;
+-	if (nfs4_setup_sequence(calldata->server,
++	if (nfs4_setup_sequence(calldata->server, NULL,
+ 				&calldata->arg.seq_args,
+ 				&calldata->res.seq_res, 1, task))
+ 		return;
+@@ -4058,7 +4128,7 @@ static void nfs4_lock_prepare(struct rpc
+ 	} else
+ 		data->arg.new_lock_owner = 0;
+ 	data->timestamp = jiffies;
+-	if (nfs4_setup_sequence(data->server,
++	if (nfs4_setup_sequence(data->server, NULL,
+ 				&data->arg.seq_args,
+ 				&data->res.seq_res, 1, task))
+ 		return;
+@@ -5077,7 +5147,7 @@ int nfs4_init_session(struct nfs_server 
+ 	session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
+ 	session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
  
- #define NFS4_MAXTAGLEN		20
-@@ -79,6 +84,19 @@ enum nfs_cb_opnum4 {
- #define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
- 					cb_sequence_dec_sz +            \
- 					op_dec_sz)
-+#define NFS4_enc_cb_layout_sz		(cb_compound_enc_hdr_sz +       \
-+					cb_sequence_enc_sz +            \
-+					1 + 3 +                         \
-+					enc_nfs4_fh_sz + 4)
-+#define NFS4_dec_cb_layout_sz		(cb_compound_dec_hdr_sz  +      \
-+					cb_sequence_dec_sz +            \
-+					op_dec_sz)
-+#define NFS4_enc_cb_device_sz		(cb_compound_enc_hdr_sz +       \
-+					cb_sequence_enc_sz +            \
-+					1 + 6)
-+#define NFS4_dec_cb_device_sz		(cb_compound_dec_hdr_sz  +      \
-+					cb_sequence_dec_sz +            \
-+					op_dec_sz)
+-	ret = nfs4_recover_expired_lease(server);
++	ret = nfs4_recover_expired_lease(server->nfs_client);
+ 	if (!ret)
+ 		ret = nfs4_check_client_ready(clp);
+ 	return ret;
+@@ -5330,24 +5400,53 @@ static void
+ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+ {
+ 	struct nfs4_layoutget *lgp = calldata;
+-	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
++	struct inode *ino = lgp->args.inode;
++	struct nfs_inode *nfsi = NFS_I(ino);
++	struct nfs_server *server = NFS_SERVER(ino);
++	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
  
- /*
- * Generic encode routines from fs/nfs/nfs4xdr.c
-@@ -95,6 +113,10 @@ xdr_writemem(__be32 *p, const void *ptr, int nbytes)
+ 	dprintk("--> %s\n", __func__);
++	spin_lock(&clp->cl_lock);
++	if (matches_outstanding_recall(ino, &lgp->args.range)) {
++		rpc_sleep_on(&clp->cl_rpcwaitq_recall, task, NULL);
++		spin_unlock(&clp->cl_lock);
++		return;
++	}
++	spin_unlock(&clp->cl_lock);
+ 	/* Note the is a race here, where a CB_LAYOUTRECALL can come in
+ 	 * right now covering the LAYOUTGET we are about to send.
+ 	 * However, that is not so catastrophic, and there seems
+ 	 * to be no way to prevent it completely.
+ 	 */
+-	if (nfs4_setup_sequence(server, &lgp->args.seq_args,
+-				&lgp->res.seq_res, 0, task))
++	spin_lock(&ino->i_lock);
++	if (pnfs_layoutgets_blocked(nfsi->layout, NULL)) {
++		rpc_sleep_on(&nfsi->lo_rpcwaitq_stateid, task, NULL);
++		spin_unlock(&ino->i_lock);
+ 		return;
++	}
++	/* This needs after above check but atomic with it in order to properly
++	 * serialize openstateid LAYOUTGETs.
++	 */
++	atomic_inc(&nfsi->layout->plh_outstanding);
+ 	if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+ 					  NFS_I(lgp->args.inode)->layout,
+ 					  lgp->args.ctx->state)) {
+ 		rpc_exit(task, NFS4_OK);
+-		return;
++		goto err_out_locked;
++	}
++	spin_unlock(&ino->i_lock);
++
++	if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
++				&lgp->res.seq_res, 0, task)) {
++		goto err_out;
+ 	}
+ 	rpc_call_start(task);
++	return;
++err_out:
++	spin_lock(&ino->i_lock);
++err_out_locked:
++	atomic_dec(&nfsi->layout->plh_outstanding);
++	spin_unlock(&ino->i_lock);
  }
  
- #define WRITE32(n)               *p++ = htonl(n)
-+#define WRITE64(n)               do {				\
-+	*p++ = htonl((u32)((n) >> 32));				\
-+	*p++ = htonl((u32)(n));					\
-+} while (0)
- #define WRITEMEM(ptr,nbytes)     do {                           \
- 	p = xdr_writemem(p, ptr, nbytes);                       \
- } while (0)
-@@ -268,6 +290,111 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
- 	hdr->nops++;
- }
+ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+@@ -5357,9 +5456,14 @@ static void nfs4_layoutget_done(struct r
  
-+#if defined(CONFIG_PNFSD)
-+
-+#include "pnfsd.h"
-+
-+static void
-+encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr,
-+		 struct nfs4_cb_compound_hdr *hdr)
-+{
-+	u32 *p;
-+
-+	BUG_ON(hdr->minorversion == 0);
-+
-+	RESERVE_SPACE(20);
-+	WRITE32(OP_CB_LAYOUT);
-+	WRITE32(clr->cb.cbl_seg.layout_type);
-+	WRITE32(clr->cb.cbl_seg.iomode);
-+	WRITE32(clr->cb.cbl_layoutchanged);
-+	WRITE32(clr->cb.cbl_recall_type);
-+	if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) {
-+		struct nfs4_fsid fsid = clr->cb.cbl_fsid;
-+
-+		RESERVE_SPACE(16);
-+		WRITE64(fsid.major);
-+		WRITE64(fsid.minor);
-+		dprintk("%s: type %x iomode %d changed %d recall_type %d "
-+			"fsid 0x%llx-0x%llx\n",
-+			__func__, clr->cb.cbl_seg.layout_type,
-+			clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
-+			clr->cb.cbl_recall_type, fsid.major, fsid.minor);
-+	} else if (clr->cb.cbl_recall_type == RETURN_FILE) {
-+		int len = clr->clr_file->fi_fhlen;
-+		stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid;
+ 	dprintk("--> %s\n", __func__);
+ 
+-	if (!nfs4_sequence_done(task, &lgp->res.seq_res))
++	if (!nfs4_sequence_done(task, &lgp->res.seq_res)) {
++		/* layout code relies on fact that in this case
++		 * code falls back to tk_action=call_start, but not
++		 * back to rpc_prepare_task, to keep plh_outstanding
++		 * correct.
++		 */
+ 		return;
+-
++	}
+ 	switch (task->tk_status) {
+ 	case 0:
+ 		break;
+@@ -5368,7 +5472,12 @@ static void nfs4_layoutget_done(struct r
+ 		task->tk_status = -NFS4ERR_DELAY;
+ 		/* Fall through */
+ 	default:
+-		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
++		if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
++			struct inode *ino = lgp->args.inode;
 +
-+		RESERVE_SPACE(20 + len);
-+		WRITE32(len);
-+		WRITEMEM(clr->clr_file->fi_fhval, len);
-+		WRITE64(clr->cb.cbl_seg.offset);
-+		WRITE64(clr->cb.cbl_seg.length);
-+		encode_stateid(xdr, cbl_sid);
-+		dprintk("%s: type %x iomode %d changed %d recall_type %d "
-+			"offset %lld length %lld stateid " STATEID_FMT "\n",
-+			__func__, clr->cb.cbl_seg.layout_type,
-+			clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
-+			clr->cb.cbl_recall_type,
-+			clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length,
-+			STATEID_VAL(cbl_sid));
-+	} else {
-+		dprintk("%s: type %x iomode %d changed %d recall_type %d\n",
-+			__func__, clr->cb.cbl_seg.layout_type,
-+			clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
-+			clr->cb.cbl_recall_type);
++			spin_lock(&ino->i_lock);
++			atomic_dec(&NFS_I(ino)->layout->plh_outstanding);
++			spin_unlock(&ino->i_lock);
+ 			rpc_restart_call_prepare(task);
+ 			return;
+ 		}
+@@ -5381,6 +5490,7 @@ static void nfs4_layoutget_release(void 
+ 	struct nfs4_layoutget *lgp = calldata;
+ 
+ 	dprintk("--> %s\n", __func__);
++	put_layout_hdr(NFS_I(lgp->args.inode)->layout);
+ 	if (lgp->res.layout.buf != NULL)
+ 		free_page((unsigned long) lgp->res.layout.buf);
+ 	put_nfs_open_context(lgp->args.ctx);
+@@ -5429,11 +5539,279 @@ int nfs4_proc_layoutget(struct nfs4_layo
+ 		status = task->tk_status;
+ 	if (status == 0)
+ 		status = pnfs_layout_process(lgp);
++	else {
++		struct inode *ino = lgp->args.inode;
++		struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
++
++		spin_lock(&ino->i_lock);
++		atomic_dec(&lo->plh_outstanding);
++		if (!pnfs_layoutgets_blocked(lo, NULL))
++			rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
++		spin_unlock(&ino->i_lock);
 +	}
-+	hdr->nops++;
++	rpc_put_task(task);
++	dprintk("<-- %s status=%d\n", __func__, status);
++	return status;
 +}
 +
-+static void
-+encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd,
-+		 struct nfs4_cb_compound_hdr *hdr)
++static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data)
 +{
-+	u32 *p;
-+	int i;
-+	int len					= nd->nd_list->cbd_len;
-+	struct nfsd4_pnfs_cb_dev_item *cbd	= nd->nd_list->cbd_list;
-+
-+	dprintk("NFSD %s: --> num %d\n", __func__, len);
-+
-+	BUG_ON(hdr->minorversion == 0);
++	struct nfs4_layoutcommit_data *ldata =
++		(struct nfs4_layoutcommit_data *)data;
++	struct nfs_server *server = NFS_SERVER(ldata->args.inode);
 +
-+	RESERVE_SPACE(8);
-+	WRITE32(OP_CB_DEVICE);
++	if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args,
++				&ldata->res.seq_res, 1, task))
++		return;
++	ldata->res.status = -1;
++	rpc_call_start(task);
++}
 +
-+	/* notify4 cnda_changes<>; */
-+	WRITE32(len);
-+	for (i = 0; i < len; i++) {
-+		dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n",
-+			__func__, cbd[i].cbd_notify_type,
-+			cbd[i].cbd_layout_type,
-+			cbd[i].cbd_devid.sbid,
-+			cbd[i].cbd_devid.devid,
-+			cbd[i].cbd_immediate, i);
++static void
++nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
++{
++	struct nfs4_layoutcommit_data *data =
++		(struct nfs4_layoutcommit_data *)calldata;
++	struct nfs_server *server = NFS_SERVER(data->args.inode);
 +
-+		BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
-+		       cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE);
-+		RESERVE_SPACE(32);
-+		/* bitmap4         notify_mask; */
-+		WRITE32(1);
-+		WRITE32(cbd[i].cbd_notify_type);
-+		/* opaque     notify_vals<>; */
-+		if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
-+			WRITE32(24);
-+		else
-+			WRITE32(20);
-+		WRITE32(cbd[i].cbd_layout_type);
-+		WRITE64(cbd[i].cbd_devid.sbid);
-+		WRITE64(cbd[i].cbd_devid.devid);
++	if (!nfs4_sequence_done(task, &data->res.seq_res))
++		return;
 +
-+		if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
-+			RESERVE_SPACE(4);
-+			WRITE32(cbd[i].cbd_immediate);
-+		}
-+	}
-+	hdr->nops++;
++	if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
++		nfs_restart_rpc(task, server->nfs_client);
 +}
-+#endif /* CONFIG_PNFSD */
 +
- static int
- nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
- {
-@@ -297,6 +424,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
- 	return 0;
- }
- 
-+#if defined(CONFIG_PNFSD)
-+static int
-+nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p,
-+		       struct nfs4_rpc_args *rpc_args)
++static void nfs4_layoutcommit_release(void *lcdata)
 +{
-+	struct xdr_stream xdr;
-+	struct nfs4_layoutrecall *args = rpc_args->args_op;
-+	struct nfs4_cb_compound_hdr hdr = {
-+		.ident = 0,
-+		.minorversion = rpc_args->args_seq.cbs_minorversion,
-+	};
++	struct nfs4_layoutcommit_data *data =
++		(struct nfs4_layoutcommit_data *)lcdata;
 +
-+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+	encode_cb_compound_hdr(&xdr, &hdr);
-+	encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
-+	encode_cb_layout(&xdr, args, &hdr);
-+	encode_cb_nops(&hdr);
-+	return 0;
++	pnfs_cleanup_layoutcommit(data->args.inode, data);
++	/* Matched by get_layout in pnfs_layoutcommit_inode */
++	put_layout_hdr(NFS_I(data->args.inode)->layout);
++	put_rpccred(data->cred);
++	kfree(lcdata);
 +}
 +
-+static int
-+nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p,
-+		       struct nfs4_rpc_args *rpc_args)
++static const struct rpc_call_ops nfs4_layoutcommit_ops = {
++	.rpc_call_prepare = nfs4_layoutcommit_prepare,
++	.rpc_call_done = nfs4_layoutcommit_done,
++	.rpc_release = nfs4_layoutcommit_release,
++};
++
++/* Execute a layoutcommit to the server */
++int
++nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
 +{
-+	struct xdr_stream xdr;
-+	struct nfs4_notify_device *args = rpc_args->args_op;
-+	struct nfs4_cb_compound_hdr hdr = {
-+		.ident = 0,
-+		.minorversion = rpc_args->args_seq.cbs_minorversion,
++	struct rpc_message msg = {
++		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
++		.rpc_argp = &data->args,
++		.rpc_resp = &data->res,
++		.rpc_cred = data->cred,
++	};
++	struct rpc_task_setup task_setup_data = {
++		.task = &data->task,
++		.rpc_client = NFS_CLIENT(data->args.inode),
++		.rpc_message = &msg,
++		.callback_ops = &nfs4_layoutcommit_ops,
++		.callback_data = data,
++		.flags = RPC_TASK_ASYNC,
 +	};
++	struct rpc_task *task;
++	int status = 0;
 +
-+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+	encode_cb_compound_hdr(&xdr, &hdr);
-+	encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
-+	encode_cb_device(&xdr, args, &hdr);
-+	encode_cb_nops(&hdr);
-+	return 0;
-+}
-+#endif /* CONFIG_PNFSD */
- 
- static int
- decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
-@@ -413,6 +579,48 @@ out:
- 	return status;
- }
- 
-+#if defined(CONFIG_PNFSD)
-+static int
-+nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p,
-+		       struct nfsd4_cb_sequence *seq)
-+{
-+	struct xdr_stream xdr;
-+	struct nfs4_cb_compound_hdr hdr;
-+	int status;
++	dprintk("NFS: %4d initiating layoutcommit call. %llu@%llu lbw: %llu "
++		"type: %d issync %d\n",
++		data->task.tk_pid,
++		data->args.range.length,
++		data->args.range.offset,
++		data->args.lastbytewritten,
++		data->args.layout_type, issync);
 +
-+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+	status = decode_cb_compound_hdr(&xdr, &hdr);
-+	if (status)
++	task = rpc_run_task(&task_setup_data);
++	if (IS_ERR(task))
++		return PTR_ERR(task);
++	if (!issync)
 +		goto out;
-+	status = decode_cb_sequence(&xdr, seq, rqstp);
-+	if (status)
++	status = nfs4_wait_for_completion_rpc_task(task);
++	if (status != 0)
 +		goto out;
-+	status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT);
++	status = task->tk_status;
 +out:
++	dprintk("%s: status %d\n", __func__, status);
++	rpc_put_task(task);
 +	return status;
 +}
 +
-+static int
-+nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p,
-+		       struct nfsd4_cb_sequence *seq)
++static void
++nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
 +{
-+	struct xdr_stream xdr;
-+	struct nfs4_cb_compound_hdr hdr;
-+	int status;
-+
-+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+	status = decode_cb_compound_hdr(&xdr, &hdr);
-+	if (status)
-+		goto out;
-+	status = decode_cb_sequence(&xdr, seq, rqstp);
-+	if (status)
-+		goto out;
-+	status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE);
-+out:
-+	return status;
-+}
-+#endif /* CONFIG_PNFSD */
++	struct nfs4_layoutreturn *lrp = calldata;
 +
- /*
-  * RPC procedure tables
-  */
-@@ -430,6 +638,10 @@ out:
- static struct rpc_procinfo     nfs4_cb_procedures[] = {
-     PROC(CB_NULL,      NULL,     enc_cb_null,     dec_cb_null),
-     PROC(CB_RECALL,    COMPOUND,   enc_cb_recall,      dec_cb_recall),
-+#if defined(CONFIG_PNFSD)
-+    PROC(CB_LAYOUT,    COMPOUND,   enc_cb_layout,      dec_cb_layout),
-+    PROC(CB_DEVICE,    COMPOUND,   enc_cb_device,      dec_cb_device),
-+#endif
- };
- 
- static struct rpc_version       nfs_cb_version4 = {
-@@ -615,10 +827,9 @@ out:
-  * TODO: cb_sequence should support referring call lists, cachethis, multiple
-  * slots, and mark callback channel down on communication errors.
-  */
--static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
-+static void nfsd4_cb_prepare_sequence(struct rpc_task *task,
-+				      struct nfs4_client *clp)
- {
--	struct nfs4_delegation *dp = calldata;
--	struct nfs4_client *clp = dp->dl_client;
- 	struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
- 	u32 minorversion = clp->cl_cb_conn.cb_minorversion;
- 	int status = 0;
-@@ -638,11 +849,15 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
- 	rpc_call_start(task);
- }
- 
--static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
-+static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata)
- {
- 	struct nfs4_delegation *dp = calldata;
--	struct nfs4_client *clp = dp->dl_client;
-+	nfsd4_cb_prepare_sequence(task, dp->dl_client);
-+}
- 
-+static void nfsd4_cb_done_sequence(struct rpc_task *task,
-+				   struct nfs4_client *clp)
-+{
- 	dprintk("%s: minorversion=%d\n", __func__,
- 		clp->cl_cb_conn.cb_minorversion);
- 
-@@ -666,7 +881,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
- 	struct nfs4_client *clp = dp->dl_client;
- 	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
- 
--	nfsd4_cb_done(task, calldata);
-+	nfsd4_cb_done_sequence(task, clp);
- 
- 	if (current_rpc_client == NULL) {
- 		/* We're shutting down; give up. */
-@@ -713,7 +928,7 @@ static void nfsd4_cb_recall_release(void *calldata)
- }
- 
- static const struct rpc_call_ops nfsd4_cb_recall_ops = {
--	.rpc_call_prepare = nfsd4_cb_prepare,
-+	.rpc_call_prepare = nfsd4_cb_recall_prepare,
- 	.rpc_call_done = nfsd4_cb_recall_done,
- 	.rpc_release = nfsd4_cb_recall_release,
- };
-@@ -788,3 +1003,173 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
- {
- 	queue_work(callback_wq, &dp->dl_recall.cb_work);
- }
++	dprintk("--> %s\n", __func__);
++	if (lrp->args.return_type == RETURN_FILE) {
++		struct nfs_inode *nfsi = NFS_I(lrp->args.inode);
 +
-+#if defined(CONFIG_PNFSD)
-+static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata)
-+{
-+	struct nfs4_layoutrecall *clr = calldata;
-+	nfsd4_cb_prepare_sequence(task, clr->clr_client);
++		if (pnfs_return_layout_barrier(nfsi, &lrp->args.range)) {
++			dprintk("%s: waiting on barrier\n", __func__);
++			rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
++			return;
++		}
++	}
++	if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
++				&lrp->res.seq_res, 0, task))
++		return;
++	rpc_call_start(task);
 +}
 +
-+static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata)
++static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 +{
-+	struct nfs4_layoutrecall *clr = calldata;
-+	struct nfs4_client *clp = clr->clr_client;
++	struct nfs4_layoutreturn *lrp = calldata;
++	struct nfs_server *server;
 +
-+	nfsd4_cb_done_sequence(task, clp);
++	dprintk("--> %s\n", __func__);
 +
-+	if (!task->tk_status)
++	if (!nfs4_sequence_done(task, &lrp->res.seq_res))
 +		return;
 +
-+	printk("%s: clp %p cb_client %p fp %p failed with status %d\n",
-+	       __func__,
-+	       clp,
-+	       clp->cl_cb_client,
-+	       clr->clr_file,
-+	       task->tk_status);
++	if (lrp->args.return_type == RETURN_FILE)
++		server = NFS_SERVER(lrp->args.inode);
++	else
++		server = NULL;
++	if (nfs4_async_handle_error(task, server, NULL, lrp->clp) == -EAGAIN) {
++		nfs_restart_rpc(task, lrp->clp);
++		return;
++	}
++	if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) {
++		struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
 +
-+	switch (task->tk_status) {
-+	case -EIO:
-+		/* Network partition? */
-+		atomic_set(&clp->cl_cb_set, 0);
-+		warn_no_callback_path(clp, task->tk_status);
-+		/* FIXME:
-+		 * The pnfs standard states that we need to only expire
-+		 * the client after at-least "lease time" .eg lease-time * 2
-+		 * when failing to communicate a recall
-+		 */
-+		break;
-+	case -NFS4ERR_DELAY:
-+		/* Poll the client until it's done with the layout */
-+		rpc_delay(task, HZ/100); /* 10 mili-seconds */
-+		task->tk_status = 0;
-+		rpc_restart_call_prepare(task);
-+		break;
-+	case -NFS4ERR_NOMATCHING_LAYOUT:
-+		task->tk_status = 0;
-+		nomatching_layout(clr);
++		spin_lock(&lo->inode->i_lock);
++		if (lrp->res.lrs_present)
++			pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
++		else
++			BUG_ON(!list_empty(&lo->segs));
++		spin_unlock(&lo->inode->i_lock);
 +	}
++	dprintk("<-- %s\n", __func__);
 +}
 +
-+static void nfsd4_cb_layout_release(void *calldata)
++static void nfs4_layoutreturn_release(void *calldata)
 +{
-+	struct nfs4_layoutrecall *clr = calldata;
-+	kfree(clr->clr_args);
-+	clr->clr_args = NULL;
-+	put_layoutrecall(clr);
++	struct nfs4_layoutreturn *lrp = calldata;
++
++	dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type);
++	if (lrp->args.return_type == RETURN_FILE) {
++		struct inode *ino = lrp->args.inode;
++		struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
++
++		spin_lock(&ino->i_lock);
++		lo->plh_block_lgets--;
++		atomic_dec(&lo->plh_outstanding);
++		if (!pnfs_layoutgets_blocked(lo, NULL))
++			rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
++		spin_unlock(&ino->i_lock);
++		put_layout_hdr(lo);
++	}
++	kfree(calldata);
++	dprintk("<-- %s\n", __func__);
 +}
 +
-+static const struct rpc_call_ops nfsd4_cb_layout_ops = {
-+	.rpc_call_prepare = nfsd4_cb_layout_prepare,
-+	.rpc_call_done = nfsd4_cb_layout_done,
-+	.rpc_release = nfsd4_cb_layout_release,
++static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
++	.rpc_call_prepare = nfs4_layoutreturn_prepare,
++	.rpc_call_done = nfs4_layoutreturn_done,
++	.rpc_release = nfs4_layoutreturn_release,
 +};
 +
-+/*
-+ * Called with state lock.
-+ */
-+int
-+nfsd4_cb_layout(struct nfs4_layoutrecall *clr)
++int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
 +{
-+	struct nfs4_client *clp = clr->clr_client;
-+	struct rpc_clnt *clnt = clp->cl_cb_client;
-+	struct nfs4_rpc_args *args;
++	struct rpc_task *task;
 +	struct rpc_message msg = {
-+		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT],
-+		.rpc_cred = callback_cred
++		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
++		.rpc_argp = &lrp->args,
++		.rpc_resp = &lrp->res,
 +	};
-+	int status;
++	struct rpc_task_setup task_setup_data = {
++		.rpc_client = lrp->clp->cl_rpcclient,
++		.rpc_message = &msg,
++		.callback_ops = &nfs4_layoutreturn_call_ops,
++		.callback_data = lrp,
++		.flags = RPC_TASK_ASYNC,
++	};
++	int status = 0;
 +
-+	args = kzalloc(sizeof(*args), GFP_KERNEL);
-+	if (!args) {
-+		status = -ENOMEM;
-+		goto out;
++	dprintk("--> %s\n", __func__);
++	if (lrp->args.return_type == RETURN_FILE) {
++		struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
++		/* FIXME we should test for BULK here */
++		spin_lock(&lo->inode->i_lock);
++		BUG_ON(lo->plh_block_lgets == 0);
++		atomic_inc(&lo->plh_outstanding);
++		spin_unlock(&lo->inode->i_lock);
 +	}
-+	clr->clr_args = args;
-+	args->args_op = clr;
-+	msg.rpc_argp = args;
-+	status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
-+				&nfsd4_cb_layout_ops, clr);
++	task = rpc_run_task(&task_setup_data);
++	if (IS_ERR(task))
++		return PTR_ERR(task);
++	if (!issync)
++		goto out;
++	status = nfs4_wait_for_completion_rpc_task(task);
++	if (status != 0)
++		goto out;
++	status = task->tk_status;
 +out:
-+	if (status) {
-+		kfree(args);
-+		put_layoutrecall(clr);
-+	}
-+	dprintk("NFSD: nfsd4_cb_layout: status %d\n", status);
++	dprintk("<-- %s\n", __func__);
+ 	rpc_put_task(task);
 +	return status;
 +}
 +
-+static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata)
++/*
++ * Retrieve the list of Data Server devices from the MDS.
++ */
++static int _nfs4_getdevicelist(struct nfs_server *server,
++				    const struct nfs_fh *fh,
++				    struct pnfs_devicelist *devlist)
 +{
-+	struct nfs4_notify_device *cbnd = calldata;
-+	nfsd4_cb_prepare_sequence(task, cbnd->nd_client);
-+}
++	struct nfs4_getdevicelist_args args = {
++		.fh = fh,
++		.layoutclass = server->pnfs_curr_ld->id,
++	};
++	struct nfs4_getdevicelist_res res = {
++		.devlist = devlist,
++	};
++	struct rpc_message msg = {
++		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
++		.rpc_argp = &args,
++		.rpc_resp = &res,
++		.rpc_cred = nfs4_get_machine_cred(server->nfs_client),
++	};
++	int status;
 +
-+static void nfsd4_cb_device_done(struct rpc_task *task, void *calldata)
++	dprintk("--> %s\n", __func__);
++	status = nfs4_call_sync(server, &msg, &args, &res, 0);
++	put_rpccred(msg.rpc_cred);
+ 	dprintk("<-- %s status=%d\n", __func__, status);
+ 	return status;
+ }
+ 
++int nfs4_proc_getdevicelist(struct nfs_server *server,
++			    const struct nfs_fh *fh,
++			    struct pnfs_devicelist *devlist)
 +{
-+	struct nfs4_notify_device *cbnd = calldata;
-+	struct nfs4_client *clp = cbnd->nd_client;
++	struct nfs4_exception exception = { };
++	int err;
 +
-+	nfsd4_cb_done_sequence(task, clp);
++	do {
++		err = nfs4_handle_exception(server,
++				_nfs4_getdevicelist(server, fh, devlist),
++				&exception);
++	} while (exception.retry);
 +
-+	dprintk("%s: clp %p cb_client %p: status %d\n",
-+	       __func__,
-+	       clp,
-+	       clp->cl_cb_client,
-+	       task->tk_status);
++	dprintk("%s: err=%d, num_devs=%u\n", __func__,
++		err, devlist->num_devs);
 +
-+	if (task->tk_status == -EIO) {
-+		/* Network partition? */
-+		atomic_set(&clp->cl_cb_set, 0);
-+		warn_no_callback_path(clp, task->tk_status);
-+	}
++	return err;
 +}
++EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
 +
-+static void nfsd4_cb_device_release(void *calldata)
-+{
-+	struct nfs4_notify_device *cbnd = calldata;
-+	kfree(cbnd->nd_args);
-+	cbnd->nd_args = NULL;
-+	kfree(cbnd);
-+}
+ static int
+ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+ {
+@@ -5447,11 +5825,13 @@ _nfs4_proc_getdeviceinfo(struct nfs_serv
+ 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+ 		.rpc_argp = &args,
+ 		.rpc_resp = &res,
++		.rpc_cred = nfs4_get_machine_cred(server->nfs_client),
+ 	};
+ 	int status;
+ 
+ 	dprintk("--> %s\n", __func__);
+ 	status = nfs4_call_sync(server, &msg, &args, &res, 0);
++	put_rpccred(msg.rpc_cred);
+ 	dprintk("<-- %s status=%d\n", __func__, status);
+ 
+ 	return status;
+diff -up linux-2.6.37.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.37.noarch/fs/nfs/nfs4renewd.c
+--- linux-2.6.37.noarch/fs/nfs/nfs4renewd.c.orig	2011-01-28 09:37:32.540980017 -0500
++++ linux-2.6.37.noarch/fs/nfs/nfs4renewd.c	2011-01-28 09:43:53.326773791 -0500
+@@ -65,7 +65,7 @@ nfs4_renew_state(struct work_struct *wor
+ 	dprintk("%s: start\n", __func__);
+ 
+ 	rcu_read_lock();
+-	if (list_empty(&clp->cl_superblocks)) {
++	if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) {
+ 		rcu_read_unlock();
+ 		goto out;
+ 	}
+diff -up linux-2.6.37.noarch/fs/nfs/nfs4state.c.orig linux-2.6.37.noarch/fs/nfs/nfs4state.c
+--- linux-2.6.37.noarch/fs/nfs/nfs4state.c.orig	2011-01-28 09:37:32.542979947 -0500
++++ linux-2.6.37.noarch/fs/nfs/nfs4state.c	2011-01-28 09:43:53.327773630 -0500
+@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(str
+ 	int status;
+ 	struct nfs_fsinfo fsinfo;
+ 
++	if (is_ds_only_client(clp)) {
++		nfs4_schedule_state_renewal(clp);
++		return 0;
++	}
 +
-+static const struct rpc_call_ops nfsd4_cb_device_ops = {
-+	.rpc_call_prepare = nfsd4_cb_device_prepare,
-+	.rpc_call_done = nfsd4_cb_device_done,
-+	.rpc_release = nfsd4_cb_device_release,
-+};
+ 	status = nfs4_proc_get_lease_time(clp, &fsinfo);
+ 	if (status == 0) {
+ 		/* Update lease time and schedule renewal */
+@@ -224,6 +229,7 @@ static int nfs4_begin_drain_session(stru
+ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
+ {
+ 	int status;
++	u32 req_exchange_flags = clp->cl_exchange_flags;
+ 
+ 	nfs4_begin_drain_session(clp);
+ 	status = nfs4_proc_exchange_id(clp, cred);
+@@ -238,6 +244,16 @@ int nfs41_init_clientid(struct nfs_clien
+ 		nfs_callback_down(1);
+ 		status = 0;
+ 	}
++	if (is_ds_only_session(req_exchange_flags)) {
++		clp->cl_exchange_flags &=
++		     ~(EXCHGID4_FLAG_USE_PNFS_MDS | EXCHGID4_FLAG_USE_NON_PNFS);
++		if (!is_ds_only_session(clp->cl_exchange_flags)) {
++			nfs4_destroy_session(clp->cl_session);
++			clp->cl_session = NULL;
++			status = -ENOTSUPP;
++			goto out;
++		}
++	}
+ 	nfs41_setup_state_renewal(clp);
+ 	nfs_mark_client_ready(clp, NFS_CS_READY);
+ out:
+@@ -669,9 +685,22 @@ static void __nfs4_close(struct path *pa
+ 		nfs4_put_open_state(state);
+ 		nfs4_put_state_owner(owner);
+ 	} else {
+-		bool roc = pnfs_roc(state->inode);
++		u32 roc_iomode;
++		struct nfs_inode *nfsi = NFS_I(state->inode);
 +
-+/*
-+ * Called with state lock.
++		/* FIXME: should return the layout only on last close */
++		if (has_layout(nfsi) &&
++		    (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) {
++			struct pnfs_layout_range range = {
++				.iomode = roc_iomode,
++				.offset = 0,
++				.length = NFS4_MAX_UINT64,
++			};
++
++			pnfs_return_layout(state->inode, &range, wait);
++		}
+ 
+-		nfs4_do_close(path, state, gfp_mask, wait, roc);
++		nfs4_do_close(path, state, gfp_mask, wait);
+ 	}
+ }
+ 
+@@ -1661,6 +1690,10 @@ static void nfs4_state_manager(struct nf
+ 			nfs_client_return_marked_delegations(clp);
+ 			continue;
+ 		}
++		if (test_and_clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) {
++			nfs_client_return_layouts(clp);
++			continue;
++		}
+ 		/* Recall session slots */
+ 		if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
+ 		   && nfs4_has_session(clp)) {
+diff -up linux-2.6.37.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.37.noarch/fs/nfs/nfs4xdr.c
+--- linux-2.6.37.noarch/fs/nfs/nfs4xdr.c.orig	2011-01-28 09:37:32.546979809 -0500
++++ linux-2.6.37.noarch/fs/nfs/nfs4xdr.c	2011-01-28 09:43:53.330773162 -0500
+@@ -90,7 +90,7 @@ static int nfs4_stat_to_errno(int);
+ #define encode_getfh_maxsz      (op_encode_hdr_maxsz)
+ #define decode_getfh_maxsz      (op_decode_hdr_maxsz + 1 + \
+ 				((3+NFS4_FHSIZE) >> 2))
+-#define nfs4_fattr_bitmap_maxsz 3
++#define nfs4_fattr_bitmap_maxsz 4
+ #define encode_getattr_maxsz    (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+ #define nfs4_name_maxsz		(1 + ((3 + NFS4_MAXNAMLEN) >> 2))
+ #define nfs4_path_maxsz		(1 + ((3 + NFS4_MAXPATHLEN) >> 2))
+@@ -112,7 +112,11 @@ static int nfs4_stat_to_errno(int);
+ #define encode_restorefh_maxsz  (op_encode_hdr_maxsz)
+ #define decode_restorefh_maxsz  (op_decode_hdr_maxsz)
+ #define encode_fsinfo_maxsz	(encode_getattr_maxsz)
+-#define decode_fsinfo_maxsz	(op_decode_hdr_maxsz + 11)
++/* The 5 accounts for the PNFS attributes, and assumes that at most three
++ * layout types will be returned.
 + */
-+int
-+nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd)
++#define decode_fsinfo_maxsz	(op_decode_hdr_maxsz + \
++				 nfs4_fattr_bitmap_maxsz + 8 + 5)
+ #define encode_renew_maxsz	(op_encode_hdr_maxsz + 3)
+ #define decode_renew_maxsz	(op_decode_hdr_maxsz)
+ #define encode_setclientid_maxsz \
+@@ -311,6 +315,17 @@ static int nfs4_stat_to_errno(int);
+ 				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
+ #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
+ #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
++				encode_verifier_maxsz)
++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
++				2 /* nfs_cookie4 gdlr_cookie */ + \
++				decode_verifier_maxsz \
++				  /* verifier4 gdlr_verifier */ + \
++				1 /* gdlr_deviceid_list count */ + \
++				XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
++					    NFS4_DEVICEID4_SIZE) \
++				  /* gdlr_deviceid_list */ + \
++				1 /* bool gdlr_eof */)
+ #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
+ 				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+ #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+@@ -324,6 +339,17 @@ static int nfs4_stat_to_errno(int);
+ #define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
+ 				decode_stateid_maxsz + \
+ 				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
++#define encode_layoutcommit_maxsz (18 +                           \
++				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \
++				op_encode_hdr_maxsz +          \
++				encode_stateid_maxsz)
++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz)
++#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
++				encode_stateid_maxsz + \
++				1 /* FIXME: opaque lrf_body always empty at
++				   *the moment */)
++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
++				1 + decode_stateid_maxsz)
+ #else /* CONFIG_NFS_V4_1 */
+ #define encode_sequence_maxsz	0
+ #define decode_sequence_maxsz	0
+@@ -713,6 +739,14 @@ static int nfs4_stat_to_errno(int);
+ #define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
+ 					 decode_sequence_maxsz + \
+ 					 decode_reclaim_complete_maxsz)
++#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
++				encode_sequence_maxsz + \
++				encode_putfh_maxsz + \
++				encode_getdevicelist_maxsz)
++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
++				decode_sequence_maxsz + \
++				decode_putfh_maxsz + \
++				decode_getdevicelist_maxsz)
+ #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
+ 				encode_sequence_maxsz +\
+ 				encode_getdeviceinfo_maxsz)
+@@ -727,6 +761,38 @@ static int nfs4_stat_to_errno(int);
+ 				decode_sequence_maxsz + \
+ 				decode_putfh_maxsz +        \
+ 				decode_layoutget_maxsz)
++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
++				encode_sequence_maxsz +\
++				encode_putfh_maxsz + \
++				encode_layoutcommit_maxsz + \
++				encode_getattr_maxsz)
++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
++				decode_sequence_maxsz + \
++				decode_putfh_maxsz + \
++				decode_layoutcommit_maxsz + \
++				decode_getattr_maxsz)
++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
++				encode_sequence_maxsz + \
++				encode_putfh_maxsz + \
++				encode_layoutreturn_maxsz)
++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
++				decode_sequence_maxsz + \
++				decode_putfh_maxsz + \
++				decode_layoutreturn_maxsz)
++#define NFS4_enc_dswrite_sz	(compound_encode_hdr_maxsz + \
++				encode_sequence_maxsz +\
++				encode_putfh_maxsz + \
++				encode_write_maxsz)
++#define NFS4_dec_dswrite_sz	(compound_decode_hdr_maxsz + \
++				decode_sequence_maxsz + \
++				decode_putfh_maxsz + \
++				decode_write_maxsz)
++#define NFS4_enc_dscommit_sz	(compound_encode_hdr_maxsz + \
++				encode_putfh_maxsz + \
++				encode_commit_maxsz)
++#define NFS4_dec_dscommit_sz	(compound_decode_hdr_maxsz + \
++				decode_putfh_maxsz + \
++				decode_commit_maxsz)
+ 
+ const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ 				      compound_encode_hdr_maxsz +
+@@ -1031,6 +1097,35 @@ static void encode_getattr_two(struct xd
+ 	hdr->replen += decode_getattr_maxsz;
+ }
+ 
++static void
++encode_getattr_three(struct xdr_stream *xdr,
++		     uint32_t bm0, uint32_t bm1, uint32_t bm2,
++		     struct compound_hdr *hdr)
 +{
-+	struct nfs4_client *clp = cbnd->nd_client;
-+	struct rpc_clnt *clnt = clp->cl_cb_client;
-+	struct nfs4_rpc_args *args;
-+	struct rpc_message msg = {
-+		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE],
-+		.rpc_cred = callback_cred
-+	};
-+	int status = -EIO;
-+
-+	dprintk("%s: clp %p\n", __func__, clp);
++	__be32 *p;
 +
-+	args = kzalloc(sizeof(*args), GFP_KERNEL);
-+	if (!args) {
-+		status = -ENOMEM;
-+		goto out;
++	p = reserve_space(xdr, 4);
++	*p = cpu_to_be32(OP_GETATTR);
++	if (bm2) {
++		p = reserve_space(xdr, 16);
++		*p++ = cpu_to_be32(3);
++		*p++ = cpu_to_be32(bm0);
++		*p++ = cpu_to_be32(bm1);
++		*p = cpu_to_be32(bm2);
++	} else if (bm1) {
++		p = reserve_space(xdr, 12);
++		*p++ = cpu_to_be32(2);
++		*p++ = cpu_to_be32(bm0);
++		*p = cpu_to_be32(bm1);
++	} else {
++		p = reserve_space(xdr, 8);
++		*p++ = cpu_to_be32(1);
++		*p = cpu_to_be32(bm0);
 +	}
-+	args->args_op = cbnd;
-+	msg.rpc_argp = args;
-+
-+	status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
-+				&nfsd4_cb_device_ops, cbnd);
-+out:
-+	if (status)
-+		kfree(args);
-+	dprintk("%s: status %d\n", __func__, status);
-+	return status;
++	hdr->nops++;
++	hdr->replen += decode_getattr_maxsz;
 +}
-+#endif /* CONFIG_PNFSD */
-diff --git a/fs/nfsd/nfs4pnfsd.c b/fs/nfsd/nfs4pnfsd.c
-new file mode 100644
-index 0000000..8e8bae3
---- /dev/null
-+++ b/fs/nfsd/nfs4pnfsd.c
-@@ -0,0 +1,1688 @@
-+/******************************************************************************
-+ *
-+ * (c) 2007 Network Appliance, Inc.  All Rights Reserved.
-+ * (c) 2009 NetApp.  All Rights Reserved.
-+ *
-+ * NetApp provides this source code under the GPL v2 License.
-+ * The GPL v2 license is available at
-+ * http://opensource.org/licenses/gpl-license.php.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
-+ *****************************************************************************/
-+
-+#include "pnfsd.h"
-+
-+#define NFSDDBG_FACILITY                NFSDDBG_PROC
-+
-+/* Globals */
-+static u32 current_layoutid = 1;
-+
-+/*
-+ * Currently used for manipulating the layout state.
-+ */
-+static DEFINE_SPINLOCK(layout_lock);
-+
-+#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP)
-+#  define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock))
-+#else
-+#  define BUG_ON_UNLOCKED_LAYOUT()
-+#endif
-+
-+/*
-+ * Layout state - NFSv4.1 pNFS
-+ */
-+static struct kmem_cache *pnfs_layout_slab;
-+static struct kmem_cache *pnfs_layoutrecall_slab;
-+
-+/* hash table for nfsd4_pnfs_deviceid.sbid */
-+#define SBID_HASH_BITS	8
-+#define SBID_HASH_SIZE	(1 << SBID_HASH_BITS)
-+#define SBID_HASH_MASK	(SBID_HASH_SIZE - 1)
-+
-+struct sbid_tracker {
-+	u64 id;
-+	struct super_block *sb;
-+	struct list_head hash;
-+};
-+
-+static u64 current_sbid;
-+static struct list_head sbid_hashtbl[SBID_HASH_SIZE];
 +
-+static inline unsigned long
-+sbid_hashval(struct super_block *sb)
+ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+ {
+ 	encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+@@ -1039,8 +1134,11 @@ static void encode_getfattr(struct xdr_s
+ 
+ static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+ {
+-	encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
+-			   bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
++	encode_getattr_three(xdr,
++			     bitmask[0] & nfs4_fsinfo_bitmap[0],
++			     bitmask[1] & nfs4_fsinfo_bitmap[1],
++			     bitmask[2] & nfs4_fsinfo_bitmap[2],
++			     hdr);
+ }
+ 
+ static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+@@ -1767,6 +1865,26 @@ static void encode_sequence(struct xdr_s
+ 
+ #ifdef CONFIG_NFS_V4_1
+ static void
++encode_getdevicelist(struct xdr_stream *xdr,
++		     const struct nfs4_getdevicelist_args *args,
++		     struct compound_hdr *hdr)
 +{
-+	return hash_ptr(sb, SBID_HASH_BITS);
-+}
++	__be32 *p;
++	nfs4_verifier dummy = {
++		.data = "dummmmmy",
++	};
 +
-+static inline struct sbid_tracker *
-+alloc_sbid(void)
-+{
-+	return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL);
++	p = reserve_space(xdr, 20);
++	*p++ = cpu_to_be32(OP_GETDEVICELIST);
++	*p++ = cpu_to_be32(args->layoutclass);
++	*p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
++	xdr_encode_hyper(p, 0ULL);                          /* cookie */
++	encode_nfs4_verifier(xdr, &dummy);
++	hdr->nops++;
++	hdr->replen += decode_getdevicelist_maxsz;
 +}
 +
 +static void
-+destroy_sbid(struct sbid_tracker *sbid)
-+{
-+	spin_lock(&layout_lock);
-+	list_del(&sbid->hash);
-+	spin_unlock(&layout_lock);
-+	kfree(sbid);
-+}
+ encode_getdeviceinfo(struct xdr_stream *xdr,
+ 		     const struct nfs4_getdeviceinfo_args *args,
+ 		     struct compound_hdr *hdr)
+@@ -1812,6 +1930,102 @@ encode_layoutget(struct xdr_stream *xdr,
+ 	hdr->nops++;
+ 	hdr->replen += decode_layoutget_maxsz;
+ }
 +
-+void
-+nfsd4_free_pnfs_slabs(void)
++static void
++encode_layoutcommit(struct xdr_stream *xdr,
++		    struct inode *inode,
++		    const struct nfs4_layoutcommit_args *args,
++		    struct compound_hdr *hdr)
 +{
-+	int i;
-+	struct sbid_tracker *sbid;
++	__be32 *p;
 +
-+	nfsd4_free_slab(&pnfs_layout_slab);
-+	nfsd4_free_slab(&pnfs_layoutrecall_slab);
++	dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__,
++		args->range.length, args->range.offset, args->lastbytewritten,
++		args->layout_type);
 +
-+	for (i = 0; i < SBID_HASH_SIZE; i++) {
-+		while (!list_empty(&sbid_hashtbl[i])) {
-+			sbid = list_first_entry(&sbid_hashtbl[i],
-+						struct sbid_tracker,
-+						hash);
-+			destroy_sbid(sbid);
-+		}
++	p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE);
++	*p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
++	p = xdr_encode_hyper(p, args->range.offset);
++	p = xdr_encode_hyper(p, args->range.length);
++	*p++ = cpu_to_be32(0);     /* reclaim */
++	p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
++	*p++ = cpu_to_be32(1);     /* newoffset = TRUE */
++	p = xdr_encode_hyper(p, args->lastbytewritten);
++	*p = cpu_to_be32(args->time_modify_changed != 0);
++	if (args->time_modify_changed) {
++		p = reserve_space(xdr, 12);
++		*p++ = cpu_to_be32(0);
++		*p++ = cpu_to_be32(args->time_modify.tv_sec);
++		*p = cpu_to_be32(args->time_modify.tv_nsec);
 +	}
-+}
-+
-+int
-+nfsd4_init_pnfs_slabs(void)
-+{
-+	int i;
 +
-+	pnfs_layout_slab = kmem_cache_create("pnfs_layouts",
-+			sizeof(struct nfs4_layout), 0, 0, NULL);
-+	if (pnfs_layout_slab == NULL)
-+		return -ENOMEM;
-+	pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls",
-+			sizeof(struct nfs4_layoutrecall), 0, 0, NULL);
-+	if (pnfs_layoutrecall_slab == NULL)
-+		return -ENOMEM;
++	p = reserve_space(xdr, 4);
++	*p = cpu_to_be32(args->layout_type);
 +
-+	for (i = 0; i < SBID_HASH_SIZE; i++) {
-+		INIT_LIST_HEAD(&sbid_hashtbl[i]);
++	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
++		NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
++			NFS_I(inode)->layout, xdr, args);
++	} else {
++		p = reserve_space(xdr, 4);
++		xdr_encode_opaque(p, NULL, 0);
 +	}
 +
-+	return 0;
++	hdr->nops++;
++	hdr->replen += decode_layoutcommit_maxsz;
 +}
 +
-+/* XXX: Need to implement the notify types and track which
-+ * clients have which devices. */
-+void pnfs_set_device_notify(clientid_t *clid, unsigned int types)
++static void
++encode_layoutreturn(struct xdr_stream *xdr,
++		    const struct nfs4_layoutreturn_args *args,
++		    struct compound_hdr *hdr)
 +{
-+	struct nfs4_client *clp;
-+	dprintk("%s: -->\n", __func__);
++	nfs4_stateid stateid;
++	__be32 *p;
 +
-+	nfs4_lock_state();
-+	/* Indicate that client has a device so we can only notify
-+	 * the correct clients */
-+	clp = find_confirmed_client(clid);
-+	if (clp) {
-+		atomic_inc(&clp->cl_deviceref);
-+		dprintk("%s: Incr device count (clnt %p) to %d\n",
-+			__func__, clp, atomic_read(&clp->cl_deviceref));
++	p = reserve_space(xdr, 20);
++	*p++ = cpu_to_be32(OP_LAYOUTRETURN);
++	*p++ = cpu_to_be32(args->reclaim);
++	*p++ = cpu_to_be32(args->layout_type);
++	*p++ = cpu_to_be32(args->range.iomode);
++	*p = cpu_to_be32(args->return_type);
++	if (args->return_type == RETURN_FILE) {
++		p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
++		p = xdr_encode_hyper(p, args->range.offset);
++		p = xdr_encode_hyper(p, args->range.length);
++		spin_lock(&args->inode->i_lock);
++		memcpy(stateid.data, NFS_I(args->inode)->layout->stateid.data,
++		       NFS4_STATEID_SIZE);
++		spin_unlock(&args->inode->i_lock);
++		p = xdr_encode_opaque_fixed(p, &stateid.data,
++					    NFS4_STATEID_SIZE);
++		if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
++			NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
++				NFS_I(args->inode)->layout, xdr, args);
++		} else {
++			p = reserve_space(xdr, 4);
++			*p = cpu_to_be32(0);
++		}
 +	}
-+	nfs4_unlock_state();
++	hdr->nops++;
++	hdr->replen += decode_layoutreturn_maxsz;
 +}
-+
-+/* Clear notifications for this client
-+ * XXX: Do we need to loop through a clean up all
-+ *      krefs when nfsd cleans up the client? */
-+void pnfs_clear_device_notify(struct nfs4_client *clp)
++#else /* CONFIG_NFS_V4_1 */
++static int
++encode_layoutcommit(struct xdr_stream *xdr,
++		    struct inode *inode,
++		    const struct nfs4_layoutcommit_args *args,
++		    struct compound_hdr *hdr)
 +{
-+	atomic_dec(&clp->cl_deviceref);
-+	dprintk("%s: Decr device count (clnt %p) to %d\n",
-+		__func__, clp, atomic_read(&clp->cl_deviceref));
++	return 0;
 +}
 +
-+static struct nfs4_layout_state *
-+alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp,
-+			stateid_t *stateid)
++static void
++encode_layoutreturn(struct xdr_stream *xdr,
++		    const struct nfs4_layoutreturn_args *args,
++		    struct compound_hdr *hdr)
 +{
-+	struct nfs4_layout_state *new;
-+
-+	/* FIXME: use a kmem_cache */
-+	new = kzalloc(sizeof(*new), GFP_KERNEL);
-+	if (!new)
-+		return new;
-+	get_nfs4_file(fp);
-+	INIT_LIST_HEAD(&new->ls_perfile);
-+	INIT_LIST_HEAD(&new->ls_layouts);
-+	kref_init(&new->ls_ref);
-+	new->ls_client = clp;
-+	new->ls_file = fp;
-+	new->ls_stateid.si_boot = stateid->si_boot;
-+	new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */
-+	new->ls_stateid.si_generation = 1;
-+	spin_lock(&layout_lock);
-+	new->ls_stateid.si_fileid = current_layoutid++;
-+	list_add(&new->ls_perfile, &fp->fi_layout_states);
-+	spin_unlock(&layout_lock);
-+	return new;
 +}
 +
-+static inline void
-+get_layout_state(struct nfs4_layout_state *ls)
+ #endif /* CONFIG_NFS_V4_1 */
+ 
+ /*
+@@ -2408,7 +2622,7 @@ static void nfs4_xdr_enc_setclientid_con
+ 	struct compound_hdr hdr = {
+ 		.nops	= 0,
+ 	};
+-	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
++	const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
+ 
+ 	encode_compound_hdr(xdr, req, &hdr);
+ 	encode_setclientid_confirm(xdr, arg, &hdr);
+@@ -2534,7 +2748,7 @@ static void nfs4_xdr_enc_get_lease_time(
+ 	struct compound_hdr hdr = {
+ 		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
+ 	};
+-	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
++	const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
+ 
+ 	encode_compound_hdr(xdr, req, &hdr);
+ 	encode_sequence(xdr, &args->la_seq_args, &hdr);
+@@ -2561,6 +2775,24 @@ static void nfs4_xdr_enc_reclaim_complet
+ }
+ 
+ /*
++ * Encode GETDEVICELIST request
++ */
++static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
++				       struct xdr_stream *xdr,
++				       struct nfs4_getdevicelist_args *args)
 +{
-+	kref_get(&ls->ls_ref);
++	struct compound_hdr hdr = {
++		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
++	};
++
++	encode_compound_hdr(xdr, req, &hdr);
++	encode_sequence(xdr, &args->seq_args, &hdr);
++	encode_putfh(xdr, args->fh, &hdr);
++	encode_getdevicelist(xdr, args, &hdr);
++	encode_nops(&hdr);
 +}
 +
-+static void
-+destroy_layout_state_common(struct nfs4_layout_state *ls)
++/*
+  * Encode GETDEVICEINFO request
+  */
+ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
+@@ -2601,6 +2833,81 @@ static void nfs4_xdr_enc_layoutget(struc
+ 	encode_layoutget(xdr, args, &hdr);
+ 	encode_nops(&hdr);
+ }
++
++/*
++ *  Encode LAYOUTCOMMIT request
++ */
++static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
++				      struct xdr_stream *xdr,
++				      struct nfs4_layoutcommit_args *args)
 +{
-+	struct nfs4_file *fp = ls->ls_file;
++	struct nfs4_layoutcommit_data *data =
++		container_of(args, struct nfs4_layoutcommit_data, args);
++	struct compound_hdr hdr = {
++		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
++	};
 +
-+	dprintk("pNFS %s: ls %p fp %p clp %p\n", __func__, ls, fp,
-+		ls->ls_client);
-+	BUG_ON(!list_empty(&ls->ls_layouts));
-+	kfree(ls);
-+	put_nfs4_file(fp);
++	encode_compound_hdr(xdr, req, &hdr);
++	encode_sequence(xdr, &args->seq_args, &hdr);
++	encode_putfh(xdr, args->fh, &hdr);
++	encode_layoutcommit(xdr, data->args.inode, args, &hdr);
++	encode_getfattr(xdr, args->bitmask, &hdr);
++	encode_nops(&hdr);
 +}
 +
-+static void
-+destroy_layout_state(struct kref *kref)
++/*
++ * Encode LAYOUTRETURN request
++ */
++static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
++				      struct xdr_stream *xdr,
++				      struct nfs4_layoutreturn_args *args)
 +{
-+	struct nfs4_layout_state *ls =
-+			container_of(kref, struct nfs4_layout_state, ls_ref);
++	struct compound_hdr hdr = {
++		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
++	};
 +
-+	spin_lock(&layout_lock);
-+	list_del(&ls->ls_perfile);
-+	spin_unlock(&layout_lock);
-+	destroy_layout_state_common(ls);
++	encode_compound_hdr(xdr, req, &hdr);
++	encode_sequence(xdr, &args->seq_args, &hdr);
++	encode_putfh(xdr, NFS_FH(args->inode), &hdr);
++	encode_layoutreturn(xdr, args, &hdr);
++	encode_nops(&hdr);
 +}
 +
-+static void
-+destroy_layout_state_locked(struct kref *kref)
++/*
++ * Encode a pNFS File Layout Data Server WRITE request
++ */
++static void nfs4_xdr_enc_dswrite(struct rpc_rqst *req,
++				 struct xdr_stream *xdr,
++				 struct nfs_writeargs *args)
 +{
-+	struct nfs4_layout_state *ls =
-+			container_of(kref, struct nfs4_layout_state, ls_ref);
++	struct compound_hdr hdr = {
++		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
++	};
 +
-+	list_del(&ls->ls_perfile);
-+	destroy_layout_state_common(ls);
++	encode_compound_hdr(xdr, req, &hdr);
++	encode_sequence(xdr, &args->seq_args, &hdr);
++	encode_putfh(xdr, args->fh, &hdr);
++	encode_write(xdr, args, &hdr);
++	encode_nops(&hdr);
 +}
 +
-+static inline void
-+put_layout_state(struct nfs4_layout_state *ls)
++/*
++ * Encode a pNFS File Layout Data Server COMMIT request
++ */
++static void nfs4_xdr_enc_dscommit(struct rpc_rqst *req,
++				  struct xdr_stream *xdr,
++				  struct nfs_writeargs *args)
 +{
-+	dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
-+		atomic_read(&ls->ls_ref.refcount));
-+	kref_put(&ls->ls_ref, destroy_layout_state);
-+}
++	struct compound_hdr hdr = {
++		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
++	};
 +
-+static inline void
-+put_layout_state_locked(struct nfs4_layout_state *ls)
-+{
-+	dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
-+		atomic_read(&ls->ls_ref.refcount));
-+	kref_put(&ls->ls_ref, destroy_layout_state_locked);
++	encode_compound_hdr(xdr, req, &hdr);
++	encode_sequence(xdr, &args->seq_args, &hdr);
++	encode_putfh(xdr, args->fh, &hdr);
++	encode_commit(xdr, args, &hdr);
++	encode_nops(&hdr);
 +}
-+
+ #endif /* CONFIG_NFS_V4_1 */
+ 
+ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+@@ -2701,14 +3008,17 @@ static int decode_attr_bitmap(struct xdr
+ 		goto out_overflow;
+ 	bmlen = be32_to_cpup(p);
+ 
+-	bitmap[0] = bitmap[1] = 0;
++	bitmap[0] = bitmap[1] = bitmap[2] = 0;
+ 	p = xdr_inline_decode(xdr, (bmlen << 2));
+ 	if (unlikely(!p))
+ 		goto out_overflow;
+ 	if (bmlen > 0) {
+ 		bitmap[0] = be32_to_cpup(p++);
+-		if (bmlen > 1)
+-			bitmap[1] = be32_to_cpup(p);
++		if (bmlen > 1) {
++			bitmap[1] = be32_to_cpup(p++);
++			if (bmlen > 2)
++				bitmap[2] = be32_to_cpup(p);
++		}
+ 	}
+ 	return 0;
+ out_overflow:
+@@ -2740,8 +3050,9 @@ static int decode_attr_supported(struct 
+ 			return ret;
+ 		bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
+ 	} else
+-		bitmask[0] = bitmask[1] = 0;
+-	dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]);
++		bitmask[0] = bitmask[1] = bitmask[2] = 0;
++	dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
++		bitmask[0], bitmask[1], bitmask[2]);
+ 	return 0;
+ }
+ 
+@@ -3794,7 +4105,7 @@ out_overflow:
+ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
+ {
+ 	__be32 *savep;
+-	uint32_t attrlen, bitmap[2] = {0};
++	uint32_t attrlen, bitmap[3] = {0};
+ 	int status;
+ 
+ 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+@@ -3820,7 +4131,7 @@ xdr_error:
+ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
+ {
+ 	__be32 *savep;
+-	uint32_t attrlen, bitmap[2] = {0};
++	uint32_t attrlen, bitmap[3] = {0};
+ 	int status;
+ 
+ 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+@@ -3852,7 +4163,7 @@ xdr_error:
+ static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
+ {
+ 	__be32 *savep;
+-	uint32_t attrlen, bitmap[2] = {0};
++	uint32_t attrlen, bitmap[3] = {0};
+ 	int status;
+ 
+ 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+@@ -3994,7 +4305,7 @@ static int decode_getfattr_generic(struc
+ {
+ 	__be32 *savep;
+ 	uint32_t attrlen,
+-		 bitmap[2] = {0};
++		 bitmap[3] = {0};
+ 	int status;
+ 
+ 	status = decode_op_hdr(xdr, OP_GETATTR);
+@@ -4080,10 +4391,32 @@ static int decode_attr_pnfstype(struct x
+ 	return status;
+ }
+ 
 +/*
-+ * Search the fp->fi_layout_state list for a layout state with the clientid.
-+ * If not found, then this is a 'first open/delegation/lock stateid' from
-+ * the client for this file.
-+ * Called under the layout_lock.
++ * The prefered block size for layout directed io
 + */
-+static struct nfs4_layout_state *
-+find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp)
++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
++				      uint32_t *res)
 +{
-+	struct nfs4_layout_state *ls;
++	__be32 *p;
 +
-+	BUG_ON_UNLOCKED_LAYOUT();
-+	list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) {
-+		if (ls->ls_client == clp) {
-+			dprintk("pNFS %s: before GET ls %p ls_ref %d\n",
-+				__func__, ls,
-+				atomic_read(&ls->ls_ref.refcount));
-+			get_layout_state(ls);
-+			return ls;
++	dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
++	*res = 0;
++	if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
++		p = xdr_inline_decode(xdr, 4);
++		if (unlikely(!p)) {
++			print_overflow_msg(__func__, xdr);
++			return -EIO;
 +		}
++		*res = be32_to_cpup(p);
++		bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
 +	}
-+	return NULL;
-+}
-+
-+static __be32
-+verify_stateid(struct nfs4_file *fp, stateid_t *stateid)
-+{
-+	struct nfs4_stateid *local = NULL;
-+	struct nfs4_delegation *temp = NULL;
-+
-+	/* check if open or lock stateid */
-+	local = find_stateid(stateid, RD_STATE);
-+	if (local)
-+		return 0;
-+	temp = find_delegation_stateid(fp->fi_inode, stateid);
-+	if (temp)
-+		return 0;
-+	return nfserr_bad_stateid;
++	return 0;
 +}
 +
+ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
+ {
+ 	__be32 *savep;
+-	uint32_t attrlen, bitmap[2];
++	uint32_t attrlen, bitmap[3];
+ 	int status;
+ 
+ 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+@@ -4111,6 +4444,9 @@ static int decode_fsinfo(struct xdr_stre
+ 	status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
+ 	if (status != 0)
+ 		goto xdr_error;
++	status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
++	if (status)
++		goto xdr_error;
+ 
+ 	status = verify_attr_len(xdr, savep, attrlen);
+ xdr_error:
+@@ -4530,7 +4866,7 @@ static int decode_getacl(struct xdr_stre
+ {
+ 	__be32 *savep;
+ 	uint32_t attrlen,
+-		 bitmap[2] = {0};
++		 bitmap[3] = {0};
+ 	struct kvec *iov = req->rq_rcv_buf.head;
+ 	int status;
+ 
+@@ -4878,6 +5214,50 @@ out_overflow:
+ }
+ 
+ #if defined(CONFIG_NFS_V4_1)
 +/*
-+ * nfs4_preocess_layout_stateid ()
-+ *
-+ * We have looked up the nfs4_file corresponding to the current_fh, and
-+ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op()
-+ * that make sense with a layout stateid.
-+ *
-+ * Called with the state_lock held
-+ * Returns zero and stateid is updated, or error.
-+ *
-+ * Note: the struct nfs4_layout_state pointer is only set by layoutget.
++ * TODO: Need to handle case when EOF != true;
 + */
-+static __be32
-+nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp,
-+			    stateid_t *stateid, struct nfs4_layout_state **lsp)
++static int decode_getdevicelist(struct xdr_stream *xdr,
++				struct pnfs_devicelist *res)
 +{
-+	struct nfs4_layout_state *ls = NULL;
-+	__be32 status = 0;
-+
-+	dprintk("--> %s clp %p fp %p \n", __func__, clp, fp);
-+
-+	dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__,
-+		STATEID_VAL(stateid));
++	__be32 *p;
++	int status, i;
++	struct nfs_writeverf verftemp;
 +
-+	status = nfs4_check_stateid(stateid);
++	status = decode_op_hdr(xdr, OP_GETDEVICELIST);
 +	if (status)
-+		goto out;
-+
-+	/* Is this the first use of this layout ? */
-+	spin_lock(&layout_lock);
-+	ls = find_get_layout_state(clp, fp);
-+	spin_unlock(&layout_lock);
-+	if (!ls) {
-+		/* Only alloc layout state on layoutget (which sets lsp). */
-+		if (!lsp) {
-+			dprintk("%s ERROR: Not layoutget & no layout stateid\n",
-+				__func__);
-+			status = nfserr_bad_stateid;
-+			goto out;
-+		}
-+		dprintk("%s Initial stateid for layout: file %p client %p\n",
-+			__func__, fp, clp);
-+
-+		/* verify input stateid */
-+		status = verify_stateid(fp, stateid);
-+		if (status) {
-+			dprintk("%s ERROR: invalid open/deleg/lock stateid\n",
-+				__func__);
-+			goto out;
-+		}
-+		ls = alloc_init_layout_state(clp, fp, stateid);
-+		if (!ls) {
-+			dprintk("%s pNFS ERROR: no memory for layout state\n",
-+				__func__);
-+			status = nfserr_resource;
-+			goto out;
-+		}
-+	} else {
-+		dprintk("%s Not initial stateid. Layout state %p file %p\n",
-+			__func__, ls, fp);
-+
-+		/* BAD STATEID */
-+		status = nfserr_bad_stateid;
-+		if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque,
-+			sizeof(stateid_opaque_t)) != 0) {
-+
-+			/* if a LAYOUTGET operation and stateid is a valid
-+			 * open/deleg/lock stateid, accept it as a parallel
-+			 * initial layout stateid
-+			 */
-+			if (lsp && ((verify_stateid(fp, stateid)) == 0)) {
-+				dprintk("%s parallel initial layout state\n",
-+					__func__);
-+				goto verified;
-+			}
++		return status;
 +
-+			dprintk("%s ERROR bad opaque in stateid 1\n", __func__);
-+			goto out_put;
-+		}
++	p = xdr_inline_decode(xdr, 8 + 8 + 4);
++	if (unlikely(!p))
++		goto out_overflow;
 +
-+		/* stateid is a valid layout stateid for this file. */
-+		if (stateid->si_generation > ls->ls_stateid.si_generation) {
-+			dprintk("%s bad stateid 1\n", __func__);
-+			goto out_put;
-+		}
-+	}
-+verified:
-+	status = 0;
++	/* TODO: Skip cookie for now */
++	p += 2;
 +
-+	/* Return the layout state if requested */
-+	if (lsp) {
-+		get_layout_state(ls);
-+		*lsp = ls;
-+	}
-+	dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__,
-+		STATEID_VAL(&ls->ls_stateid));
-+out_put:
-+	dprintk("%s PUT LO STATE:\n", __func__);
-+	put_layout_state(ls);
-+out:
-+	dprintk("<-- %s status %d\n", __func__, htonl(status));
++	/* Read verifier */
++	p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
 +
-+	return status;
-+}
++	res->num_devs = be32_to_cpup(p);
 +
-+static inline struct nfs4_layout *
-+alloc_layout(void)
-+{
-+	return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL);
-+}
++	dprintk("%s: num_dev %d\n", __func__, res->num_devs);
 +
-+static inline void
-+free_layout(struct nfs4_layout *lp)
-+{
-+	kmem_cache_free(pnfs_layout_slab, lp);
-+}
++	if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM)
++		return -NFS4ERR_REP_TOO_BIG;
 +
-+#define update_layout_stateid(ls, sid) { \
-+	update_stateid(&(ls)->ls_stateid); \
-+	dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", \
-+		__func__, (ls)->ls_stateid.si_generation, (ls)); \
-+	memcpy((sid), &(ls)->ls_stateid, sizeof(stateid_t)); \
++	p = xdr_inline_decode(xdr,
++			      res->num_devs * NFS4_DEVICEID4_SIZE + 4);
++	if (unlikely(!p))
++		goto out_overflow;
++	for (i = 0; i < res->num_devs; i++)
++		p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
++					    NFS4_DEVICEID4_SIZE);
++	res->eof = be32_to_cpup(p);
++	return 0;
++out_overflow:
++	print_overflow_msg(__func__, xdr);
++	return -EIO;
 +}
+ 
+ static int decode_getdeviceinfo(struct xdr_stream *xdr,
+ 				struct pnfs_device *pdev)
+@@ -5003,6 +5383,56 @@ out_overflow:
+ 	print_overflow_msg(__func__, xdr);
+ 	return -EIO;
+ }
 +
-+static void
-+init_layout(struct nfs4_layout_state *ls,
-+	    struct nfs4_layout *lp,
-+	    struct nfs4_file *fp,
-+	    struct nfs4_client *clp,
-+	    struct svc_fh *current_fh,
-+	    struct nfsd4_layout_seg *seg,
-+	    stateid_t *stateid)
++static int decode_layoutreturn(struct xdr_stream *xdr,
++			       struct nfs4_layoutreturn_res *res)
 +{
-+	dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__,
-+		ls, lp, clp, fp, fp->fi_inode);
++	__be32 *p;
++	int status;
 +
-+	get_nfs4_file(fp);
-+	lp->lo_client = clp;
-+	lp->lo_file = fp;
-+	get_layout_state(ls);
-+	lp->lo_state = ls;
-+	memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg));
-+	spin_lock(&layout_lock);
-+	update_layout_stateid(ls, stateid);
-+	list_add_tail(&lp->lo_perstate, &ls->ls_layouts);
-+	list_add_tail(&lp->lo_perclnt, &clp->cl_layouts);
-+	list_add_tail(&lp->lo_perfile, &fp->fi_layouts);
-+	spin_unlock(&layout_lock);
-+	dprintk("pNFS %s end\n", __func__);
++	status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
++	if (status)
++		return status;
++	p = xdr_inline_decode(xdr, 4);
++	if (unlikely(!p))
++		goto out_overflow;
++	res->lrs_present = be32_to_cpup(p);
++	if (res->lrs_present)
++		status = decode_stateid(xdr, &res->stateid);
++	return status;
++out_overflow:
++	print_overflow_msg(__func__, xdr);
++	return -EIO;
 +}
 +
-+static void
-+dequeue_layout(struct nfs4_layout *lp)
++static int decode_layoutcommit(struct xdr_stream *xdr,
++			       struct rpc_rqst *req,
++			       struct nfs4_layoutcommit_res *res)
 +{
-+	BUG_ON_UNLOCKED_LAYOUT();
-+	list_del(&lp->lo_perclnt);
-+	list_del(&lp->lo_perfile);
-+	list_del(&lp->lo_perstate);
-+}
++	__be32 *p;
++	int status;
 +
-+static void
-+destroy_layout(struct nfs4_layout *lp)
-+{
-+	struct nfs4_client *clp;
-+	struct nfs4_file *fp;
-+	struct nfs4_layout_state *ls;
++	status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
++	res->status = status;
++	if (status)
++		return status;
 +
-+	BUG_ON_UNLOCKED_LAYOUT();
-+	clp = lp->lo_client;
-+	fp = lp->lo_file;
-+	ls = lp->lo_state;
-+	dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n",
-+		__func__, lp, clp, fp, fp->fi_inode,
-+		list_empty(&ls->ls_layouts));
++	p = xdr_inline_decode(xdr, 4);
++	if (unlikely(!p))
++		goto out_overflow;
++	res->sizechanged = be32_to_cpup(p);
 +
-+	kmem_cache_free(pnfs_layout_slab, lp);
-+	/* release references taken by init_layout */
-+	put_layout_state_locked(ls);
-+	put_nfs4_file(fp);
++	if (res->sizechanged) {
++		p = xdr_inline_decode(xdr, 8);
++		if (unlikely(!p))
++			goto out_overflow;
++		xdr_decode_hyper(p, &res->newsize);
++	}
++	return 0;
++out_overflow:
++	print_overflow_msg(__func__, xdr);
++	return -EIO;
 +}
-+
-+void fs_layout_return(struct super_block *sb, struct inode *ino,
-+		      struct nfsd4_pnfs_layoutreturn *lrp, int flags,
-+		      void *recall_cookie)
+ #endif /* CONFIG_NFS_V4_1 */
+ 
+ /*
+@@ -6019,6 +6449,32 @@ static int nfs4_xdr_dec_reclaim_complete
+ }
+ 
+ /*
++ * Decode GETDEVICELIST response
++ */
++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
++				      struct xdr_stream *xdr,
++				      struct nfs4_getdevicelist_res *res)
 +{
-+	int ret;
-+
-+	if (unlikely(!sb->s_pnfs_op->layout_return))
-+		return;
-+
-+	lrp->lr_flags = flags;
-+	lrp->args.lr_cookie = recall_cookie;
++	struct compound_hdr hdr;
++	int status;
 +
-+	if (!ino) /* FSID or ALL */
-+		ino = sb->s_root->d_inode;
++	dprintk("encoding getdevicelist!\n");
 +
-+	ret = sb->s_pnfs_op->layout_return(ino, &lrp->args);
-+	dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx "
-+		"cookie = %p flags 0x%x status=%d\n",
-+		__func__, ino->i_ino, lrp->args.lr_seg.iomode,
-+		lrp->args.lr_seg.offset, lrp->args.lr_seg.length,
-+		recall_cookie, flags, ret);
++	status = decode_compound_hdr(xdr, &hdr);
++	if (status != 0)
++		goto out;
++	status = decode_sequence(xdr, &res->seq_res, rqstp);
++	if (status != 0)
++		goto out;
++	status = decode_putfh(xdr);
++	if (status != 0)
++		goto out;
++	status = decode_getdevicelist(xdr, res->devlist);
++out:
++	return status;
 +}
 +
-+static u64
-+alloc_init_sbid(struct super_block *sb)
-+{
-+	struct sbid_tracker *sbid;
-+	struct sbid_tracker *new = alloc_sbid();
-+	unsigned long hash_idx = sbid_hashval(sb);
-+	u64 id = 0;
++/*
+  * Decode GETDEVINFO response
+  */
+ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
+@@ -6062,6 +6518,108 @@ static int nfs4_xdr_dec_layoutget(struct
+ out:
+ 	return status;
+ }
 +
-+	if (likely(new)) {
-+		spin_lock(&layout_lock);
-+		id = ++current_sbid;
-+		new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK);
-+		id = new->id;
-+		BUG_ON(id == 0);
-+		new->sb = sb;
++/*
++ * Decode LAYOUTRETURN response
++ */
++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
++				     struct xdr_stream *xdr,
++				     struct nfs4_layoutreturn_res *res)
++{
++	struct compound_hdr hdr;
++	int status;
 +
-+		list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash)
-+			if (sbid->sb == sb) {
-+				kfree(new);
-+				id = sbid->id;
-+				spin_unlock(&layout_lock);
-+				return id;
-+			}
-+		list_add(&new->hash, &sbid_hashtbl[hash_idx]);
-+		spin_unlock(&layout_lock);
-+	}
-+	return id;
++	status = decode_compound_hdr(xdr, &hdr);
++	if (status)
++		goto out;
++	status = decode_sequence(xdr, &res->seq_res, rqstp);
++	if (status)
++		goto out;
++	status = decode_putfh(xdr);
++	if (status)
++		goto out;
++	status = decode_layoutreturn(xdr, res);
++out:
++	return status;
 +}
 +
-+struct super_block *
-+find_sbid_id(u64 id)
++/*
++ * Decode LAYOUTCOMMIT response
++ */
++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
++				     struct xdr_stream *xdr,
++				     struct nfs4_layoutcommit_res *res)
 +{
-+	struct sbid_tracker *sbid;
-+	struct super_block *sb = NULL;
-+	unsigned long hash_idx = id & SBID_HASH_MASK;
-+	int pos = 0;
++	struct compound_hdr hdr;
++	int status;
 +
-+	spin_lock(&layout_lock);
-+	list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
-+		pos++;
-+		if (sbid->id != id)
-+			continue;
-+		if (pos > 1)
-+			list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
-+		sb = sbid->sb;
-+		break;
-+	}
-+	spin_unlock(&layout_lock);
-+	return sb;
++	status = decode_compound_hdr(xdr, &hdr);
++	if (status)
++		goto out;
++	status = decode_sequence(xdr, &res->seq_res, rqstp);
++	if (status)
++		goto out;
++	status = decode_putfh(xdr);
++	if (status)
++		goto out;
++	status = decode_layoutcommit(xdr, rqstp, res);
++	if (status)
++		goto out;
++	decode_getfattr(xdr, res->fattr, res->server,
++			!RPC_IS_ASYNC(rqstp->rq_task));
++out:
++	return status;
 +}
 +
-+u64
-+find_create_sbid(struct super_block *sb)
++/*
++ * Decode pNFS File Layout Data Server WRITE response
++ */
++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp,
++				struct xdr_stream *xdr,
++				struct nfs_writeres *res)
 +{
-+	struct sbid_tracker *sbid;
-+	unsigned long hash_idx = sbid_hashval(sb);
-+	int pos = 0;
-+	u64 id = 0;
++	struct compound_hdr hdr;
++	int status;
 +
-+	spin_lock(&layout_lock);
-+	list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
-+		pos++;
-+		if (sbid->sb != sb)
-+			continue;
-+		if (pos > 1)
-+			list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
-+		id = sbid->id;
-+		break;
-+	}
-+	spin_unlock(&layout_lock);
++	status = decode_compound_hdr(xdr, &hdr);
++	if (status)
++		goto out;
++	status = decode_sequence(xdr, &res->seq_res, rqstp);
++	if (status)
++		goto out;
++	status = decode_putfh(xdr);
++	if (status)
++		goto out;
++	status = decode_write(xdr, res);
++	if (!status)
++		return res->count;
++out:
++	return status;
++}
 +
-+	if (!id)
-+		id = alloc_init_sbid(sb);
++/*
++ * Decode pNFS File Layout Data Server COMMIT response
++ */
++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp,
++				 struct xdr_stream *xdr,
++				 struct nfs_writeres *res)
++{
++	struct compound_hdr hdr;
++	int status;
 +
-+	return id;
++	status = decode_compound_hdr(xdr, &hdr);
++	if (status)
++		goto out;
++	status = decode_sequence(xdr, &res->seq_res, rqstp);
++	if (status)
++		goto out;
++	status = decode_putfh(xdr);
++	if (status)
++		goto out;
++	status = decode_commit(xdr, res);
++out:
++	return status;
 +}
+ #endif /* CONFIG_NFS_V4_1 */
+ 
+ /**
+@@ -6081,7 +6639,7 @@ out:
+ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
+ 		       int plus)
+ {
+-	uint32_t bitmap[2] = {0};
++	uint32_t bitmap[3] = {0};
+ 	uint32_t len;
+ 	__be32 *p = xdr_inline_decode(xdr, 4);
+ 	if (unlikely(!p))
+@@ -6266,8 +6824,13 @@ struct rpc_procinfo	nfs4_procedures[] = 
+ 	PROC(SEQUENCE,		enc_sequence,		dec_sequence),
+ 	PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time),
+ 	PROC(RECLAIM_COMPLETE,	enc_reclaim_complete,	dec_reclaim_complete),
++	PROC(GETDEVICELIST,	enc_getdevicelist,	dec_getdevicelist),
+ 	PROC(GETDEVICEINFO,	enc_getdeviceinfo,	dec_getdeviceinfo),
+ 	PROC(LAYOUTGET,		enc_layoutget,		dec_layoutget),
++	PROC(LAYOUTCOMMIT,	enc_layoutcommit,	dec_layoutcommit),
++	PROC(LAYOUTRETURN,	enc_layoutreturn,	dec_layoutreturn),
++	PROC(PNFS_WRITE,	enc_dswrite,		dec_dswrite),
++	PROC(PNFS_COMMIT,	enc_dscommit,		dec_dscommit),
+ #endif /* CONFIG_NFS_V4_1 */
+ };
+ 
+diff -up linux-2.6.37.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.37.noarch/fs/nfs/objlayout/Kbuild
+--- linux-2.6.37.noarch/fs/nfs/objlayout/Kbuild.orig	2011-01-28 09:43:53.331773009 -0500
++++ linux-2.6.37.noarch/fs/nfs/objlayout/Kbuild	2011-01-28 09:43:53.331773009 -0500
+@@ -0,0 +1,11 @@
++#
++# Makefile for the pNFS Objects Layout Driver kernel module
++#
++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o
++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
 +
++#
++# Panasas pNFS Layout Driver kernel module
++#
++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o
++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o
+diff -up linux-2.6.37.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.37.noarch/fs/nfs/objlayout/objio_osd.c
+--- linux-2.6.37.noarch/fs/nfs/objlayout/objio_osd.c.orig	2011-01-28 09:43:53.333772709 -0500
++++ linux-2.6.37.noarch/fs/nfs/objlayout/objio_osd.c	2011-01-28 09:43:53.333772709 -0500
+@@ -0,0 +1,1060 @@
 +/*
-+ * Create a layoutrecall structure
-+ * An optional layoutrecall can be cloned (except for the layoutrecall lists)
++ *  objio_osd.c
++ *
++ *  pNFS Objects layout implementation over open-osd initiator library
++ *
++ *  Copyright (C) 2009 Panasas Inc.
++ *  All rights reserved.
++ *
++ *  Benny Halevy <bharrosh at panasas.com>
++ *  Boaz Harrosh <bharrosh at panasas.com>
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License version 2
++ *  See the file COPYING included with this distribution for more details.
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the Panasas company nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 + */
-+static struct nfs4_layoutrecall *
-+alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl,
-+			struct nfs4_client *clp,
-+			struct nfs4_file *lrfile)
-+{
-+	struct nfs4_layoutrecall *clr;
 +
-+	dprintk("NFSD %s\n", __func__);
-+	clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL);
-+	if (clr == NULL)
-+		return clr;
++#include <linux/module.h>
++#include <scsi/scsi_device.h>
++#include <scsi/osd_attributes.h>
++#include <scsi/osd_initiator.h>
++#include <scsi/osd_sec.h>
++#include <scsi/osd_sense.h>
 +
-+	dprintk("NFSD %s -->\n", __func__);
++#include "objlayout.h"
 +
-+	memset(clr, 0, sizeof(*clr));
-+	if (lrfile)
-+		get_nfs4_file(lrfile);
-+	clr->clr_client = clp;
-+	clr->clr_file = lrfile;
-+	clr->cb = *cbl;
++#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
++
++#define _LLU(x) ((unsigned long long)x)
 +
-+	kref_init(&clr->clr_ref);
-+	INIT_LIST_HEAD(&clr->clr_perclnt);
++enum { BIO_MAX_PAGES_KMALLOC =
++		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
++};
 +
-+	dprintk("NFSD %s return %p\n", __func__, clr);
-+	return clr;
-+}
++/* A per mountpoint struct currently for device cache */
++struct objio_mount_type {
++	struct list_head dev_list;
++	spinlock_t dev_list_lock;
++};
 +
-+static void
-+get_layoutrecall(struct nfs4_layoutrecall *clr)
-+{
-+	dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
-+		atomic_read(&clr->clr_ref.refcount));
-+	kref_get(&clr->clr_ref);
-+}
++struct _dev_ent {
++	struct list_head list;
++	struct nfs4_deviceid d_id;
++	struct osd_dev *od;
++};
 +
-+static void
-+destroy_layoutrecall(struct kref *kref)
++static void _dev_list_remove_all(struct objio_mount_type *omt)
 +{
-+	struct nfs4_layoutrecall *clr =
-+			container_of(kref, struct nfs4_layoutrecall, clr_ref);
-+	dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr,
-+		clr->clr_file, clr->clr_client);
-+	BUG_ON(!list_empty(&clr->clr_perclnt));
-+	if (clr->clr_file)
-+		put_nfs4_file(clr->clr_file);
-+	kmem_cache_free(pnfs_layoutrecall_slab, clr);
-+}
++	spin_lock(&omt->dev_list_lock);
 +
-+int
-+put_layoutrecall(struct nfs4_layoutrecall *clr)
-+{
-+	dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
-+		atomic_read(&clr->clr_ref.refcount));
-+	return kref_put(&clr->clr_ref, destroy_layoutrecall);
++	while (!list_empty(&omt->dev_list)) {
++		struct _dev_ent *de = list_entry(omt->dev_list.next,
++				 struct _dev_ent, list);
++
++		list_del_init(&de->list);
++		osduld_put_device(de->od);
++		kfree(de);
++	}
++
++	spin_unlock(&omt->dev_list_lock);
 +}
 +
-+void *
-+layoutrecall_done(struct nfs4_layoutrecall *clr)
++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt,
++	struct nfs4_deviceid *d_id)
 +{
-+	void *recall_cookie = clr->cb.cbl_cookie;
-+	struct nfs4_layoutrecall *parent = clr->parent;
++	struct list_head *le;
 +
-+	dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
-+		atomic_read(&clr->clr_ref.refcount));
-+	BUG_ON_UNLOCKED_LAYOUT();
-+	list_del_init(&clr->clr_perclnt);
-+	put_layoutrecall(clr);
++	list_for_each(le, &omt->dev_list) {
++		struct _dev_ent *de = list_entry(le, struct _dev_ent, list);
 +
-+	if (parent && !put_layoutrecall(parent))
-+		recall_cookie = NULL;
++		if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id)))
++			return de->od;
++	}
 +
-+	return recall_cookie;
++	return NULL;
 +}
 +
-+/*
-+ * get_state() and cb_get_state() are
-+ */
-+void
-+release_pnfs_ds_dev_list(struct nfs4_stateid *stp)
++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt,
++	struct nfs4_deviceid *d_id)
 +{
-+	struct pnfs_ds_dev_entry *ddp;
++	struct osd_dev *od;
 +
-+	while (!list_empty(&stp->st_pnfs_ds_id)) {
-+		ddp = list_entry(stp->st_pnfs_ds_id.next,
-+				 struct pnfs_ds_dev_entry, dd_dev_entry);
-+		list_del(&ddp->dd_dev_entry);
-+		kfree(ddp);
-+	}
++	spin_lock(&omt->dev_list_lock);
++	od = ___dev_list_find(omt, d_id);
++	spin_unlock(&omt->dev_list_lock);
++	return od;
 +}
 +
-+static int
-+nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid)
++static int _dev_list_add(struct objio_mount_type *omt,
++	struct nfs4_deviceid *d_id, struct osd_dev *od)
 +{
-+	struct pnfs_ds_dev_entry *ddp;
++	struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL);
 +
-+	ddp = kmalloc(sizeof(*ddp), GFP_KERNEL);
-+	if (!ddp)
++	if (!de)
 +		return -ENOMEM;
 +
-+	INIT_LIST_HEAD(&ddp->dd_dev_entry);
-+	list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id);
-+	ddp->dd_dsid = dsid;
++	spin_lock(&omt->dev_list_lock);
++
++	if (___dev_list_find(omt, d_id)) {
++		kfree(de);
++		goto out;
++	}
++
++	de->d_id = *d_id;
++	de->od = od;
++	list_add(&de->list, &omt->dev_list);
++
++out:
++	spin_unlock(&omt->dev_list_lock);
 +	return 0;
 +}
 +
-+/*
-+ * are two octet ranges overlapping?
-+ * start1            last1
-+ *   |-----------------|
-+ *                start2            last2
-+ *                  |----------------|
-+ */
-+static inline int
-+lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
-+{
-+	u64 start1 = l1->offset;
-+	u64 last1 = last_byte_offset(start1, l1->length);
-+	u64 start2 = l2->offset;
-+	u64 last2 = last_byte_offset(start2, l2->length);
-+	int ret;
++struct objio_segment {
++	struct pnfs_osd_layout *layout;
 +
-+	/* if last1 == start2 there's a single byte overlap */
-+	ret = (last2 >= start1) && (last1 >= start2);
-+	dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__,
-+		l1->offset, l1->length, l2->offset, l2->length, ret);
-+	return ret;
-+}
++	unsigned mirrors_p1;
++	unsigned stripe_unit;
++	unsigned group_width;	/* Data stripe_units without integrity comps */
++	u64 group_depth;
++	unsigned group_count;
 +
-+static inline int
-+same_fsid_major(struct nfs4_fsid *fsid, u64 major)
-+{
-+	return fsid->major == major;
-+}
++	unsigned num_comps;
++	/* variable length */
++	struct osd_dev	*ods[1];
++};
 +
-+static inline int
-+same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh)
-+{
-+	return same_fsid_major(fsid, current_fh->fh_export->ex_fsid);
-+}
++struct objio_state;
++typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
 +
-+/*
-+ * find a layout recall conflicting with the specified layoutget
-+ */
-+static int
-+is_layout_recalled(struct nfs4_client *clp,
-+		   struct svc_fh *current_fh,
-+		   struct nfsd4_layout_seg *seg)
-+{
-+	struct nfs4_layoutrecall *clr;
++struct objio_state {
++	/* Generic layer */
++	struct objlayout_io_state ol_state;
 +
-+	spin_lock(&layout_lock);
-+	list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) {
-+		if (clr->cb.cbl_seg.layout_type != seg->layout_type)
-+			continue;
-+		if (clr->cb.cbl_recall_type == RETURN_ALL)
-+			goto found;
-+		if (clr->cb.cbl_recall_type == RETURN_FSID) {
-+			if (same_fsid(&clr->cb.cbl_fsid, current_fh))
-+				goto found;
-+			else
-+				continue;
-+		}
-+		BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE);
-+		if (clr->cb.cbl_seg.clientid == seg->clientid &&
-+		    lo_seg_overlapping(&clr->cb.cbl_seg, seg))
-+			goto found;
-+	}
-+	spin_unlock(&layout_lock);
-+	return 0;
-+found:
-+	spin_unlock(&layout_lock);
-+	return 1;
-+}
++	struct objio_segment *objio_seg;
 +
-+/*
-+ * are two octet ranges overlapping or adjacent?
-+ */
-+static inline int
-+lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
-+{
-+	u64 start1 = l1->offset;
-+	u64 end1 = end_offset(start1, l1->length);
-+	u64 start2 = l2->offset;
-+	u64 end2 = end_offset(start2, l2->length);
++	struct kref kref;
++	objio_done_fn done;
++	void *private;
 +
-+	/* is end1 == start2 ranges are adjacent */
-+	return (end2 >= start1) && (end1 >= start2);
-+}
++	unsigned long length;
++	unsigned numdevs; /* Actually used devs in this IO */
++	/* A per-device variable array of size numdevs */
++	struct _objio_per_comp {
++		struct bio *bio;
++		struct osd_request *or;
++		unsigned long length;
++		u64 offset;
++		unsigned dev;
++	} per_dev[];
++};
 +
-+static void
-+extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg)
++/* Send and wait for a get_device_info of devices in the layout,
++   then look them up with the osd_initiator library */
++static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay,
++			       struct objio_segment *objio_seg, unsigned comp)
 +{
-+	u64 lo_start = lo->offset;
-+	u64 lo_end = end_offset(lo_start, lo->length);
-+	u64 lg_start = lg->offset;
-+	u64 lg_end = end_offset(lg_start, lg->length);
++	struct pnfs_osd_layout *layout = objio_seg->layout;
++	struct pnfs_osd_deviceaddr *deviceaddr;
++	struct nfs4_deviceid *d_id;
++	struct osd_dev *od;
++	struct osd_dev_info odi;
++	struct objio_mount_type *omt = NFS_SERVER(pnfslay->inode)->pnfs_ld_data;
++	int err;
++
++	d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id;
++
++	od = _dev_list_find(omt, d_id);
++	if (od)
++		return od;
++
++	err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr);
++	if (unlikely(err)) {
++		dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err);
++		return ERR_PTR(err);
++	}
 +
-+	/* lo already covers lg? */
-+	if (lo_start <= lg_start && lg_end <= lo_end)
-+		return;
++	odi.systemid_len = deviceaddr->oda_systemid.len;
++	if (odi.systemid_len > sizeof(odi.systemid)) {
++		err = -EINVAL;
++		goto out;
++	} else if (odi.systemid_len)
++		memcpy(odi.systemid, deviceaddr->oda_systemid.data,
++		       odi.systemid_len);
++	odi.osdname_len	 = deviceaddr->oda_osdname.len;
++	odi.osdname	 = (u8 *)deviceaddr->oda_osdname.data;
 +
-+	/* extend start offset */
-+	if (lo_start > lg_start)
-+		lo_start = lg_start;
++	if (!odi.osdname_len && !odi.systemid_len) {
++		dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
++			__func__);
++		err = -ENODEV;
++		goto out;
++	}
 +
-+	/* extend end offset */
-+	if (lo_end < lg_end)
-+		lo_end = lg_end;
++	od = osduld_info_lookup(&odi);
++	if (unlikely(IS_ERR(od))) {
++		err = PTR_ERR(od);
++		dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
++		goto out;
++	}
 +
-+	lo->offset = lo_start;
-+	lo->length = (lo_end == NFS4_MAX_UINT64) ?
-+		      lo_end : lo_end - lo_start;
++	_dev_list_add(omt, d_id, od);
++
++out:
++	dprintk("%s: return=%d\n", __func__, err);
++	objlayout_put_deviceinfo(deviceaddr);
++	return err ? ERR_PTR(err) : od;
 +}
 +
-+static struct nfs4_layout *
-+merge_layout(struct nfs4_file *fp,
-+	     struct nfs4_client *clp,
-+	     struct nfsd4_layout_seg *seg)
++static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
++	struct objio_segment *objio_seg)
 +{
-+	struct nfs4_layout *lp = NULL;
++	struct pnfs_osd_layout *layout = objio_seg->layout;
++	unsigned i, num_comps = layout->olo_num_comps;
++	int err;
 +
-+	spin_lock(&layout_lock);
-+	list_for_each_entry (lp, &fp->fi_layouts, lo_perfile)
-+		if (lp->lo_seg.layout_type == seg->layout_type &&
-+		    lp->lo_seg.clientid == seg->clientid &&
-+		    lp->lo_seg.iomode == seg->iomode &&
-+		    lo_seg_mergeable(&lp->lo_seg, seg)) {
-+			extend_layout(&lp->lo_seg, seg);
-+			break;
++	/* lookup all devices */
++	for (i = 0; i < num_comps; i++) {
++		struct osd_dev *od;
++
++		od = _device_lookup(pnfslay, objio_seg, i);
++		if (unlikely(IS_ERR(od))) {
++			err = PTR_ERR(od);
++			goto out;
 +		}
-+	spin_unlock(&layout_lock);
++		objio_seg->ods[i] = od;
++	}
++	objio_seg->num_comps = num_comps;
++	err = 0;
 +
-+	return lp;
++out:
++	dprintk("%s: return=%d\n", __func__, err);
++	return err;
 +}
 +
-+__be32
-+nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp,
-+		     struct exp_xdr_stream *xdr)
++static int _verify_data_map(struct pnfs_osd_layout *layout)
 +{
-+	u32 status;
-+	__be32 nfserr;
-+	struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode;
-+	struct super_block *sb = ino->i_sb;
-+	int can_merge;
-+	struct nfs4_file *fp;
-+	struct nfs4_client *clp;
-+	struct nfs4_layout *lp = NULL;
-+	struct nfs4_layout_state *ls = NULL;
-+	struct nfsd4_pnfs_layoutget_arg args = {
-+		.lg_minlength = lgp->lg_minlength,
-+		.lg_fh = &lgp->lg_fhp->fh_handle,
-+	};
-+	struct nfsd4_pnfs_layoutget_res res = {
-+		.lg_seg = lgp->lg_seg,
-+	};
-+
-+	dprintk("NFSD: %s Begin\n", __func__);
++	struct pnfs_osd_data_map *data_map = &layout->olo_map;
++	u64 stripe_length;
++	u32 group_width;
 +
-+	args.lg_sbid = find_create_sbid(sb);
-+	if (!args.lg_sbid) {
-+		nfserr = nfserr_layouttrylater;
-+		goto out;
++/* FIXME: Only raid0 for now. if not go through MDS */
++	if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
++		printk(KERN_ERR "Only RAID_0 for now\n");
++		return -ENOTSUPP;
 +	}
-+
-+	can_merge = sb->s_pnfs_op->can_merge_layouts != NULL &&
-+		    sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type);
-+
-+	nfs4_lock_state();
-+	fp = find_alloc_file(ino, lgp->lg_fhp);
-+	clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid);
-+	dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp);
-+	if (!fp || !clp) {
-+		nfserr = nfserr_inval;
-+		goto out_unlock;
++	if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
++		printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
++			  data_map->odm_num_comps, data_map->odm_mirror_cnt);
++		return -EINVAL;
 +	}
 +
-+	/* Check decoded layout stateid */
-+	nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls);
-+	if (nfserr)
-+		goto out_unlock;
++	if (data_map->odm_group_width)
++		group_width = data_map->odm_group_width;
++	else
++		group_width = data_map->odm_num_comps /
++						(data_map->odm_mirror_cnt + 1);
 +
-+	if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) {
-+		nfserr = nfserr_recallconflict;
-+		goto out;
++	stripe_length = (u64)data_map->odm_stripe_unit * group_width;
++	if (stripe_length >= (1ULL << 32)) {
++		printk(KERN_ERR "Total Stripe length(0x%llx)"
++			  " >= 32bit is not supported\n", _LLU(stripe_length));
++		return -ENOTSUPP;
 +	}
 +
-+	/* pre-alloc layout in case we can't merge after we call
-+	 * the file system
-+	 */
-+	lp = alloc_layout();
-+	if (!lp) {
-+		nfserr = nfserr_layouttrylater;
-+		goto out_unlock;
++	if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
++		printk(KERN_ERR "Stripe Unit(0x%llx)"
++			  " must be Multples of PAGE_SIZE(0x%lx)\n",
++			  _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
++		return -ENOTSUPP;
 +	}
 +
-+	dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd "
-+		"iomode %u offset %llu length %llu\n",
-+		__func__, lgp->lg_seg.layout_type,
-+		exp_xdr_qbytes(xdr->end - xdr->p),
-+		lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length);
++	return 0;
++}
 +
-+	/* FIXME: need to eliminate the use of the state lock */
-+	nfs4_unlock_state();
-+	status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res);
-+	nfs4_lock_state();
++int objio_alloc_lseg(void **outp,	/* out: receives the new objio_segment on success */
++	struct pnfs_layout_hdr *pnfslay,	/* handed to objio_devices_lookup() */
++	struct pnfs_layout_segment *lseg,	/* not referenced in this function body */
++	struct pnfs_osd_layout *layout)	/* decoded layout; the segment keeps this pointer */
++{
++	struct objio_segment *objio_seg;
++	int err;
 +
-+	dprintk("pNFS %s: post-export status %u "
-+		"iomode %u offset %llu length %llu\n",
-+		__func__, status, res.lg_seg.iomode,
-+		res.lg_seg.offset, res.lg_seg.length);
++	err = _verify_data_map(layout);	/* validate the map before allocating anything */
++	if (unlikely(err))
++		return err;	/* NOTE: *outp is left untouched on this path */
 +
-+	/*
-+	 * The allowable error codes for the layout_get pNFS export
-+	 * operations vector function (from the file system) can be
-+	 * expanded as needed to include other errors defined for
-+	 * the RFC 5561 LAYOUTGET operation.
-+	 */
-+	switch (status) {
-+	case 0:
-+		nfserr = NFS4_OK;
-+		break;
-+	case NFS4ERR_ACCESS:
-+	case NFS4ERR_BADIOMODE:
-+		/* No support for LAYOUTIOMODE4_RW layouts */
-+	case NFS4ERR_BADLAYOUT:
-+		/* No layout matching loga_minlength rules */
-+	case NFS4ERR_INVAL:
-+	case NFS4ERR_IO:
-+	case NFS4ERR_LAYOUTTRYLATER:
-+	case NFS4ERR_LAYOUTUNAVAILABLE:
-+	case NFS4ERR_LOCKED:
-+	case NFS4ERR_NOSPC:
-+	case NFS4ERR_RECALLCONFLICT:
-+	case NFS4ERR_SERVERFAULT:
-+	case NFS4ERR_TOOSMALL:
-+		/* Requested layout too big for loga_maxcount */
-+	case NFS4ERR_WRONG_TYPE:
-+		/* Not a regular file */
-+		nfserr = cpu_to_be32(status);
-+		goto out_freelayout;
-+	default:
-+		BUG();
-+		nfserr = nfserr_serverfault;
++	objio_seg = kzalloc(sizeof(*objio_seg) +	/* header plus trailing ods[] entries */
++			(layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]),	/* presumably ods[] is declared with one element - TODO confirm */
++			GFP_KERNEL);
++	if (!objio_seg)
++		return -ENOMEM;	/* NOTE: *outp is left untouched on this path */
++
++	objio_seg->layout = layout;
++	err = objio_devices_lookup(pnfslay, objio_seg);
++	if (err)
++		goto free_seg;
++
++	objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1;	/* mirror count plus one */
++	objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit;
++	if (layout->olo_map.odm_group_width) {	/* grouped layout: take width/depth from the map */
++		objio_seg->group_width = layout->olo_map.odm_group_width;
++		objio_seg->group_depth = layout->olo_map.odm_group_depth;
++		objio_seg->group_count = layout->olo_map.odm_num_comps /
++						objio_seg->mirrors_p1 /
++						objio_seg->group_width;
++	} else {	/* no group width given: a single group spanning all components */
++		objio_seg->group_width = layout->olo_map.odm_num_comps /
++						objio_seg->mirrors_p1;
++		objio_seg->group_depth = -1;
++		objio_seg->group_count = 1;
 +	}
 +
-+	lgp->lg_seg = res.lg_seg;
-+	lgp->lg_roc = res.lg_return_on_close;
++	*outp = objio_seg;
++	return 0;
++
++free_seg:	/* device lookup failed: release the segment and report the error */
++	dprintk("%s: Error: return %d\n", __func__, err);
++	kfree(objio_seg);
++	*outp = NULL;
++	return err;
++}
++
++void objio_free_lseg(void *p)	/* p: the objio_segment stored in *outp by objio_alloc_lseg() */
++{
++	struct objio_segment *objio_seg = p;
++
++	kfree(objio_seg);	/* single kfree; the segment was one kzalloc in objio_alloc_lseg() */
++}
++
++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)	/* seg: objio_segment; *outp: embedded ol_state of the new objio_state */
++{
++	struct objio_segment *objio_seg = seg;
++	struct objio_state *ios;
++	const unsigned first_size = sizeof(*ios) +	/* objio_state plus one per_dev[] slot per component */
++				objio_seg->num_comps * sizeof(ios->per_dev[0]);
++	const unsigned sec_size = objio_seg->num_comps *	/* ioerrs[] array, one entry per component */
++						sizeof(ios->ol_state.ioerrs[0]);
++
++	dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps);
++	ios = kzalloc(first_size + sec_size, GFP_KERNEL);	/* both parts live in one zeroed allocation */
++	if (unlikely(!ios))
++		return -ENOMEM;
 +
-+	/* SUCCESS!
-+	 * Can the new layout be merged into an existing one?
-+	 * If so, free unused layout struct
-+	 */
-+	if (can_merge && merge_layout(fp, clp, &res.lg_seg))
-+		goto out_freelayout;
++	ios->objio_seg = objio_seg;
++	ios->ol_state.ioerrs = ((void *)ios) + first_size;	/* ioerrs[] starts right after the per_dev[] area */
++	ios->ol_state.num_comps = objio_seg->num_comps;
 +
-+	/* Can't merge, so let's initialize this new layout */
-+	init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg, &lgp->lg_sid);
-+out_unlock:
-+	if (ls)
-+		put_layout_state(ls);
-+	if (fp)
-+		put_nfs4_file(fp);
-+	nfs4_unlock_state();
-+out:
-+	dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp,
-+		be32_to_cpu(nfserr));
-+	return nfserr;
-+out_freelayout:
-+	free_layout(lp);
-+	goto out_unlock;
++	*outp = &ios->ol_state;	/* callers get the embedded state; container_of() recovers ios */
++	return 0;
 +}
 +
-+static void
-+trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr)
++void objio_free_io_state(struct objlayout_io_state *ol_state)	/* ol_state: pointer handed out by objio_alloc_io_state() */
 +{
-+	u64 lo_start = lo->offset;
-+	u64 lo_end = end_offset(lo_start, lo->length);
-+	u64 lr_start = lr->offset;
-+	u64 lr_end = end_offset(lr_start, lr->length);
++	struct objio_state *ios = container_of(ol_state, struct objio_state,	/* map the embedded member back to its objio_state */
++					       ol_state);
 +
-+	dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__,
-+		lo->offset, lo->length, lr->offset, lr->length);
++	kfree(ios);	/* frees the objio_state together with its per_dev[] and ioerrs[] tail */
++}
 +
-+	/* lr fully covers lo? */
-+	if (lr_start <= lo_start && lo_end <= lr_end) {
-+		lo->length = 0;
-+		goto out;
-+	}
++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)	/* translate an OSD error priority into a pnfs_osd_errno */
++{
++	switch (oep) {
++	case OSD_ERR_PRI_NO_ERROR:
++		return (enum pnfs_osd_errno)0;	/* no error maps to 0 */
 +
-+	/*
-+	 * split not supported yet. retain layout segment.
-+	 * remains must be returned by the client
-+	 * on the final layout return.
-+	 */
-+	if (lo_start < lr_start && lr_end < lo_end) {
-+		dprintk("%s: split not supported\n", __func__);
-+		goto out;
++	case OSD_ERR_PRI_CLEAR_PAGES:
++		BUG_ON(1);	/* never expected here; _io_check() handles CLEAR_PAGES before calling this */
++		return 0;
++
++	case OSD_ERR_PRI_RESOURCE:
++		return PNFS_OSD_ERR_RESOURCE;
++	case OSD_ERR_PRI_BAD_CRED:
++		return PNFS_OSD_ERR_BAD_CRED;
++	case OSD_ERR_PRI_NO_ACCESS:
++		return PNFS_OSD_ERR_NO_ACCESS;
++	case OSD_ERR_PRI_UNREACHABLE:
++		return PNFS_OSD_ERR_UNREACHABLE;
++	case OSD_ERR_PRI_NOT_FOUND:
++		return PNFS_OSD_ERR_NOT_FOUND;
++	case OSD_ERR_PRI_NO_SPACE:
++		return PNFS_OSD_ERR_NO_SPACE;
++	default:	/* unknown priority: warn, then report it as a generic I/O error */
++		WARN_ON(1);
++		/* fallthrough */
++	case OSD_ERR_PRI_EIO:
++		return PNFS_OSD_ERR_EIO;
 +	}
++}
 +
-+	if (lo_start < lr_start)
-+		lo_end = lr_start - 1;
-+	else /* lr_end < lo_end */
-+		lo_start = lr_end + 1;
++static void _clear_bio(struct bio *bio)	/* zero the data area of every segment in bio */
++{
++	struct bio_vec *bv;
++	unsigned i;
 +
-+	lo->offset = lo_start;
-+	lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start;
-+out:
-+	dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length);
++	__bio_for_each_segment(bv, bio, i, 0) {	/* walk all segments starting from index 0 */
++		unsigned this_count = bv->bv_len;
++
++		if (likely(PAGE_SIZE == this_count))	/* segment covers a whole page: clear the page */
++			clear_highpage(bv->bv_page);
++		else	/* partial page: zero only bv_len bytes at bv_offset */
++			zero_user(bv->bv_page, bv->bv_offset, this_count);
++	}
 +}
 +
-+static int
-+pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp,
-+			 struct nfsd4_pnfs_layoutreturn *lrp,
-+			 struct nfs4_layout_state *ls)
++static int _io_check(struct objio_state *ios, bool is_write)	/* scan completed requests; returns 0 or the error of the highest-priority failure */
 +{
-+	int layouts_found = 0;
-+	struct nfs4_layout *lp, *nextlp;
++	enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;	/* highest error priority seen so far */
++	int lin_ret = 0;	/* return code that goes with oep */
++	int i;
 +
-+	dprintk("%s: clp %p fp %p\n", __func__, clp, fp);
-+	spin_lock(&layout_lock);
-+	list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) {
-+		dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n",
-+			__func__, lp,
-+			lp->lo_client, clp,
-+			lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type,
-+			lp->lo_seg.iomode, lrp->args.lr_seg.iomode);
-+		if (lp->lo_client != clp ||
-+		    lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type ||
-+		    (lp->lo_seg.iomode != lrp->args.lr_seg.iomode &&
-+		     lrp->args.lr_seg.iomode != IOMODE_ANY) ||
-+		     !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg))
++	for (i = 0; i <  ios->numdevs; i++) {
++		struct osd_sense_info osi;
++		struct osd_request *or = ios->per_dev[i].or;
++		int ret;
++
++		if (!or)	/* no request was issued for this slot */
 +			continue;
-+		layouts_found++;
-+		trim_layout(&lp->lo_seg, &lrp->args.lr_seg);
-+		if (!lp->lo_seg.length) {
-+			lrp->lrs_present = 0;
-+			dequeue_layout(lp);
-+			destroy_layout(lp);
++
++		ret = osd_req_decode_sense(or, &osi);
++		if (likely(!ret))	/* request completed without error */
++			continue;
++
++		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
++			/* start read offset past end of file */
++			BUG_ON(is_write);	/* CLEAR_PAGES is only handled for reads */
++			_clear_bio(ios->per_dev[i].bio);	/* hand back zeroed pages instead of an error */
++			dprintk("%s: start read offset passed end of file "
++				"offset=0x%llx, length=0x%lx\n", __func__,
++				_LLU(ios->per_dev[i].offset),
++				ios->per_dev[i].length);
++
++			continue; /* we recovered */
++		}
++		objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev,	/* record this per-device error in the layout io state */
++					osd_pri_2_pnfs_err(osi.osd_err_pri),
++					ios->per_dev[i].offset,
++					ios->per_dev[i].length,
++					is_write);
++
++		if (osi.osd_err_pri >= oep) {	/* keep the highest priority; on a tie the later device wins */
++			oep = osi.osd_err_pri;
++			lin_ret = ret;
 +		}
 +	}
-+	if (ls && layouts_found && lrp->lrs_present)
-+		update_layout_stateid(ls, &lrp->lr_sid);
-+	spin_unlock(&layout_lock);
 +
-+	return layouts_found;
++	return lin_ret;
 +}
 +
-+static int
-+pnfs_return_client_layouts(struct nfs4_client *clp,
-+			   struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid)
++/*
++ * Common IO state helpers.
++ */
++static void _io_free(struct objio_state *ios)	/* release the osd request and bio of every used per_dev slot */
 +{
-+	int layouts_found = 0;
-+	struct nfs4_layout *lp, *nextlp;
++	unsigned i;
 +
-+	spin_lock(&layout_lock);
-+	list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) {
-+		if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type ||
-+		   (lrp->args.lr_seg.iomode != lp->lo_seg.iomode &&
-+		    lrp->args.lr_seg.iomode != IOMODE_ANY))
-+			continue;
++	for (i = 0; i < ios->numdevs; i++) {
++		struct _objio_per_comp *per_dev = &ios->per_dev[i];
 +
-+		if (lrp->args.lr_return_type == RETURN_FSID &&
-+		    !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid))
-+			continue;
++		if (per_dev->or) {
++			osd_end_request(per_dev->or);
++			per_dev->or = NULL;	/* cleared so a repeated call does not end it twice */
++		}
 +
-+		layouts_found++;
-+		dequeue_layout(lp);
-+		destroy_layout(lp);
++		if (per_dev->bio) {
++			bio_put(per_dev->bio);
++			per_dev->bio = NULL;	/* cleared so a repeated call does not put it twice */
++		}
 +	}
-+	spin_unlock(&layout_lock);
-+
-+	return layouts_found;
 +}
 +
-+static int
-+recall_return_perfect_match(struct nfs4_layoutrecall *clr,
-+			    struct nfsd4_pnfs_layoutreturn *lrp,
-+			    struct nfs4_file *fp,
-+			    struct svc_fh *current_fh)
++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev)	/* return the osd_dev for layout-wide device index dev */
 +{
-+	if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode ||
-+	    clr->cb.cbl_recall_type != lrp->args.lr_return_type)
-+		return 0;
++	unsigned min_dev = ios->objio_seg->layout->olo_comps_index;	/* first device index covered by this segment */
++	unsigned max_dev = min_dev + ios->ol_state.num_comps;	/* one past the last covered index */
 +
-+	return (clr->cb.cbl_recall_type == RETURN_FILE &&
-+		clr->clr_file == fp &&
-+		clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset &&
-+		clr->cb.cbl_seg.length == lrp->args.lr_seg.length) ||
++	BUG_ON(dev < min_dev || max_dev <= dev);	/* dev must lie in [min_dev, max_dev) */
++	return ios->objio_seg->ods[dev - min_dev];	/* ods[] is indexed relative to olo_comps_index */
++}
 +
-+		(clr->cb.cbl_recall_type == RETURN_FSID &&
-+		 same_fsid(&clr->cb.cbl_fsid, current_fh)) ||
++struct _striping_info {	/* striping position computed by _calc_stripe_info() */
++	u64 obj_offset;	/* byte offset inside the component object */
++	u64 group_length;	/* set to T - H: bytes from the offset to the end of its group */
++	u64 total_group_length;	/* set to T: stripe size times group_depth */
++	u64 Major;	/* set to M: file_offset / (T * group_count) */
++	unsigned dev;	/* device index, already multiplied by mirrors_p1 */
++	unsigned unit_off;	/* file_offset modulo stripe_unit */
++};
 +
-+		clr->cb.cbl_recall_type == RETURN_ALL;
-+}
++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,	/* fill *si with the striping position of file_offset */
++			      struct _striping_info *si)
++{
++	u32	stripe_unit = ios->objio_seg->stripe_unit;
++	u32	group_width = ios->objio_seg->group_width;
++	u64	group_depth = ios->objio_seg->group_depth;
++	u32	U = stripe_unit * group_width;	/* bytes in one full stripe across a group */
++
++	u64	T = U * group_depth;	/* bytes in one group */
++	u64	S = T * ios->objio_seg->group_count;	/* bytes in one pass over all groups */
++	u64	M = div64_u64(file_offset, S);	/* number of whole S-sized passes before file_offset */
++
++	/*
++	G = (L - (M * S)) / T
++	H = (L - (M * S)) % T
++	*/
++	u64	LmodU = file_offset - M * S;	/* despite the name this is file_offset modulo S */
++	u32	G = div64_u64(LmodU, T);	/* group index within the current pass */
++	u64	H = LmodU - G * T;	/* byte offset within that group */
 +
-+static int
-+recall_return_partial_match(struct nfs4_layoutrecall *clr,
-+			    struct nfsd4_pnfs_layoutreturn *lrp,
-+			    struct nfs4_file *fp,
-+			    struct svc_fh *current_fh)
-+{
-+	/* iomode matching? */
-+	if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode &&
-+	    clr->cb.cbl_seg.iomode != IOMODE_ANY &&
-+	    lrp->args.lr_seg.iomode != IOMODE_ANY)
-+		return 0;
++	u32	N = div_u64(H, U);	/* stripe row index within the group */
 +
-+	if (clr->cb.cbl_recall_type == RETURN_ALL ||
-+	    lrp->args.lr_return_type == RETURN_ALL)
-+		return 1;
++	div_u64_rem(file_offset, stripe_unit, &si->unit_off);	/* only the remainder is kept */
++	si->obj_offset = si->unit_off + (N * stripe_unit) +
++				  (M * group_depth * stripe_unit);
 +
-+	/* fsid matches? */
-+	if (clr->cb.cbl_recall_type == RETURN_FSID ||
-+	    lrp->args.lr_return_type == RETURN_FSID)
-+		return same_fsid(&clr->cb.cbl_fsid, current_fh);
++	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
++	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
++	si->dev *= ios->objio_seg->mirrors_p1;	/* scale by the number of copies per stripe unit */
 +
-+	/* file matches, range overlapping? */
-+	return clr->clr_file == fp &&
-+	       lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg);
++	si->group_length = T - H;	/* bytes left in the current group */
++	si->total_group_length = T;
++	si->Major = M;
 +}
 +
-+int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh,
-+			    struct nfsd4_pnfs_layoutreturn *lrp)
++static int _add_stripe_unit(struct objio_state *ios,  unsigned *cur_pg,	/* add cur_len bytes of pages, starting at *cur_pg, to the bio of per_dev */
++		unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len)	/* pgbase: offset inside the first page only */
 +{
-+	int status = 0;
-+	int layouts_found = 0;
-+	struct inode *ino = current_fh->fh_dentry->d_inode;
-+	struct nfs4_file *fp = NULL;
-+	struct nfs4_client *clp;
-+	struct nfs4_layout_state *ls = NULL;
-+	struct nfs4_layoutrecall *clr, *nextclr;
-+	u64 ex_fsid = current_fh->fh_export->ex_fsid;
-+	void *recall_cookie = NULL;
++	unsigned pg = *cur_pg;
++	struct request_queue *q =
++			osd_request_queue(_io_od(ios, per_dev->dev));
 +
-+	dprintk("NFSD: %s\n", __func__);
++	per_dev->length += cur_len;	/* NOTE: updated before the bio exists; not rolled back if an error follows */
 +
-+	nfs4_lock_state();
-+	clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid);
-+	if (!clp)
-+		goto out;
++	if (per_dev->bio == NULL) {	/* first unit for this device: allocate its bio */
++		unsigned stripes = ios->ol_state.num_comps /
++						     ios->objio_seg->mirrors_p1;
++		unsigned pages_in_stripe = stripes *
++				      (ios->objio_seg->stripe_unit / PAGE_SIZE);
++		unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /	/* page-vector count requested for the bio */
++				    stripes;
 +
-+	if (lrp->args.lr_return_type == RETURN_FILE) {
-+		fp = find_file(ino);
-+		if (!fp) {
-+			printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for "
-+				"ino %p:%lu\n",
-+				__func__, ino, ino ? ino->i_ino : 0L);
-+			goto out;
++		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
++		if (unlikely(!per_dev->bio)) {
++			dprintk("Faild to allocate BIO size=%u\n", bio_size);
++			return -ENOMEM;
 +		}
-+
-+		/* Check the stateid */
-+		dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino);
-+		status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, &ls);
-+		if (status)
-+			goto out_put_file;
-+
-+		/* update layouts */
-+		layouts_found = pnfs_return_file_layouts(clp, fp, lrp, ls);
-+		/* optimize for the all-empty case */
-+		if (list_empty(&fp->fi_layouts))
-+			recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
-+	} else {
-+		layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid);
 +	}
 +
-+	dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d "
-+		"return_type %d fsid 0x%llx offset %llu length %llu: "
-+		"layouts_found %d\n",
-+		__func__, clp, fp, lrp->args.lr_seg.layout_type,
-+		lrp->args.lr_seg.iomode, lrp->args.lr_return_type,
-+		ex_fsid,
-+		lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found);
++	while (cur_len > 0) {
++		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);	/* bytes taken from the current page */
++		unsigned added_len;
 +
-+	/* update layoutrecalls
-+	 * note: for RETURN_{FSID,ALL}, fp may be NULL
-+	 */
-+	spin_lock(&layout_lock);
-+	list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls,
-+				  clr_perclnt) {
-+		if (clr->cb.cbl_seg.layout_type != lrp->args.lr_seg.layout_type)
-+			continue;
++		BUG_ON(ios->ol_state.nr_pages <= pg);	/* must not run past the caller's page array */
++		cur_len -= pglen;
 +
-+		if (recall_return_perfect_match(clr, lrp, fp, current_fh))
-+			recall_cookie = layoutrecall_done(clr);
-+		else if (layouts_found &&
-+			 recall_return_partial_match(clr, lrp, fp, current_fh))
-+			clr->clr_time = CURRENT_TIME;
++		added_len = bio_add_pc_page(q, per_dev->bio,
++					ios->ol_state.pages[pg], pglen, pgbase);
++		if (unlikely(pglen != added_len))	/* a short add is reported as -ENOMEM */
++			return -ENOMEM;
++		pgbase = 0;	/* only the first page may start at a non-zero offset */
++		++pg;
 +	}
-+	spin_unlock(&layout_lock);
-+
-+out_put_file:
-+	if (fp)
-+		put_nfs4_file(fp);
-+	if (ls)
-+		put_layout_state(ls);
-+out:
-+	nfs4_unlock_state();
-+
-+	/* call exported filesystem layout_return (ignore return-code) */
-+	fs_layout_return(sb, ino, lrp, 0, recall_cookie);
++	BUG_ON(cur_len);	/* the loop must consume exactly cur_len bytes */
 +
-+	dprintk("pNFS %s: exit status %d \n", __func__, status);
-+	return status;
++	*cur_pg = pg;	/* the page cursor is advanced only on success */
++	return 0;
 +}
 +
-+/*
-+ * PNFS Metadata server export operations callback for get_state
-+ *
-+ * called by the cluster fs when it receives a get_state() from a data
-+ * server.
-+ * returns status, or pnfs_get_state* with pnfs_get_state->status set.
-+ *
-+ */
-+int
-+nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg)
++static int _prepare_one_group(struct objio_state *ios, u64 length,	/* spread length bytes over the per_dev slots of one group */
++			      struct _striping_info *si, unsigned first_comp,	/* first_comp: per_dev index where this group starts */
++			      unsigned *last_pg)	/* in/out: cursor into the page array */
 +{
-+	struct nfs4_stateid *stp;
-+	int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */
-+	int status = -EINVAL;
-+	struct inode *ino;
-+	struct nfs4_delegation *dl;
-+	stateid_t *stid = (stateid_t *)&arg->stid;
-+
-+	dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__,
-+		STATEID_VAL(stid), arg->ino);
++	unsigned stripe_unit = ios->objio_seg->stripe_unit;
++	unsigned mirrors_p1 = ios->objio_seg->mirrors_p1;
++	unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1;
++	unsigned dev = si->dev;
++	unsigned first_dev = dev - (dev % devs_in_group);	/* first device index of the group holding si->dev */
++	unsigned comp = first_comp + (dev - first_dev);	/* per_dev slot matching dev */
++	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;	/* highest slot used so far; reverses the numdevs update at out */
++	unsigned cur_pg = *last_pg;
++	int ret = 0;
 +
-+	nfs4_lock_state();
-+	stp = find_stateid(stid, flags);
-+	if (!stp) {
-+		ino = iget_locked(sb, arg->ino);
-+		if (!ino)
-+			goto out;
++	while (length) {
++		struct _objio_per_comp *per_dev = &ios->per_dev[comp];
++		unsigned cur_len, page_off = 0;
 +
-+		if (ino->i_state & I_NEW) {
-+			iget_failed(ino);
-+			goto out;
-+		}
++		if (!per_dev->length) {	/* first visit to this slot: set its device and start offset */
++			per_dev->dev = dev;
++			if (dev < si->dev) {	/* device before the starting one: begins at the next stripe unit */
++				per_dev->offset = si->obj_offset + stripe_unit -
++								   si->unit_off;
++				cur_len = stripe_unit;
++			} else if (dev == si->dev) {	/* the starting device: begins mid-unit at obj_offset */
++				per_dev->offset = si->obj_offset;
++				cur_len = stripe_unit - si->unit_off;
++				page_off = si->unit_off & ~PAGE_MASK;
++				BUG_ON(page_off &&
++				      (page_off != ios->ol_state.pgbase));
++			} else { /* dev > si->dev */
++				per_dev->offset = si->obj_offset - si->unit_off;
++				cur_len = stripe_unit;
++			}
 +
-+		dl = find_delegation_stateid(ino, stid);
-+		if (dl)
-+			status = 0;
++			if (max_comp < comp)
++				max_comp = comp;
 +
-+		iput(ino);
-+	} else {
-+		/* XXX ANDROS: marc removed nfs4_check_fh - how come? */
++			dev += mirrors_p1;	/* step to the next device, wrapping inside the group */
++			dev = (dev % devs_in_group) + first_dev;
++		} else {	/* slot already has data: append one more full stripe unit */
++			cur_len = stripe_unit;
++		}
++		if (cur_len >= length)	/* clamp the final piece to what is left */
++			cur_len = length;
 +
-+		/* arg->devid is the Data server id, set by the cluster fs */
-+		status = nfs4_add_pnfs_ds_dev(stp, arg->dsid);
-+		if (status)
++		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
++				       cur_len);
++		if (unlikely(ret))
 +			goto out;
 +
-+		arg->access = stp->st_access_bmap;
-+		*(clientid_t *)&arg->clid =
-+			stp->st_stateowner->so_client->cl_clientid;
++		comp += mirrors_p1;	/* step to the next slot, wrapping inside the group */
++		comp = (comp % devs_in_group) + first_comp;
++
++		length -= cur_len;
++		ios->length += cur_len;	/* running total of bytes queued for this io */
 +	}
 +out:
-+	nfs4_unlock_state();
-+	return status;
++	ios->numdevs = max_comp + mirrors_p1;	/* updated on the error path as well */
++	*last_pg = cur_pg;
++	return ret;
 +}
 +
-+static int
-+cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile,
-+		   stateid_t *lsid)
++static int _io_rw_pagelist(struct objio_state *ios)	/* map ol_state.offset/count onto per-device bios, one group at a time */
 +{
-+	int found = 0;
-+	struct nfs4_layout *lp;
-+	struct nfs4_layout_state *ls;
++	u64 length = ios->ol_state.count;
++	struct _striping_info si;
++	unsigned devs_in_group = ios->objio_seg->group_width *
++				 ios->objio_seg->mirrors_p1;
++	unsigned first_comp = 0;
++	unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps;
++	unsigned last_pg = 0;
++	int ret = 0;
++
++	_calc_stripe_info(ios, ios->ol_state.offset, &si);
++	while (length) {
++		if (length < si.group_length)	/* the request ends inside this group */
++			si.group_length = length;
++
++		ret = _prepare_one_group(ios, si.group_length, &si, first_comp,
++					 &last_pg);
++		if (unlikely(ret))
++			goto out;
++
++		length -= si.group_length;
++
++		si.group_length = si.total_group_length;	/* later groups are entered at their start */
++		si.unit_off = 0;
++		++si.Major;
++		si.obj_offset = si.Major * ios->objio_seg->stripe_unit *
++						ios->objio_seg->group_depth;
 +
-+	spin_lock(&layout_lock);
-+	list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) {
-+		if (lp->lo_file != lrfile)
-+			continue;
++		si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;	/* first device of the next group */
++		si.dev %= num_comps;	/* wrap around after the last group */
 +
-+		ls = find_get_layout_state(clp, lrfile);
-+		if (!ls) {
-+			/* This shouldn't happen as the file should have a
-+			 * layout stateid if it has a layout.
-+			 */
-+			printk(KERN_ERR "%s: file %p has no layout stateid\n",
-+				__func__, lrfile);
-+			WARN_ON(1);
-+			break;
-+		}
-+		update_layout_stateid(ls, lsid);
-+		put_layout_state_locked(ls);
-+		found = 1;
-+		break;
++		first_comp += devs_in_group;
++		first_comp %= num_comps;
 +	}
-+	spin_unlock(&layout_lock);
 +
-+	return found;
++out:
++	if (!ios->length)	/* nothing was queued: report ret, which is 0 or the failure */
++		return ret;
++
++	return 0;	/* some bytes were queued: a later failure is dropped and the partial io proceeds */
 +}
 +
-+static int
-+cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid)
++static ssize_t _sync_done(struct objio_state *ios)	/* done callback installed by _io_exec() in sync mode */
 +{
-+	int found = 0;
-+	struct nfs4_layout *lp;
++	struct completion *waiting = ios->private;	/* the on-stack completion set by _io_exec() */
 +
-+	/* note: minor version unused */
-+	spin_lock(&layout_lock);
-+	list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt)
-+		if (lp->lo_file->fi_fsid.major == fsid->major) {
-+			found = 1;
-+			break;
-+		}
-+	spin_unlock(&layout_lock);
-+	return found;
++	complete(waiting);	/* wake the submitter blocked in wait_for_completion() */
++	return 0;
 +}
 +
-+static int
-+cl_has_any_layout(struct nfs4_client *clp)
++static void _last_io(struct kref *kref)	/* kref release function: runs when the last reference on ios->kref is dropped */
 +{
-+	return !list_empty(&clp->cl_layouts);
-+}
++	struct objio_state *ios = container_of(kref, struct objio_state, kref);
 +
-+static int
-+cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl,
-+	      struct nfs4_file *lrfile, stateid_t *lsid)
-+{
-+	switch (cbl->cbl_recall_type) {
-+	case RETURN_FILE:
-+		return cl_has_file_layout(clp, lrfile, lsid);
-+	case RETURN_FSID:
-+		return cl_has_fsid_layout(clp, &cbl->cbl_fsid);
-+	default:
-+		return cl_has_any_layout(clp);
-+	}
++	ios->done(ios);	/* all requests have finished: invoke the done callback; its return value is ignored */
 +}
 +
-+/*
-+ * Called without the layout_lock.
-+ */
-+void
-+nomatching_layout(struct nfs4_layoutrecall *clr)
++static void _done_io(struct osd_request *or, void *p)	/* per-request callback passed to osd_execute_request_async(); or is unused */
 +{
-+	struct nfsd4_pnfs_layoutreturn lr = {
-+		.args.lr_return_type = clr->cb.cbl_recall_type,
-+		.args.lr_seg = clr->cb.cbl_seg,
-+	};
-+	struct inode *inode;
-+	void *recall_cookie;
-+
-+	if (clr->clr_file) {
-+		inode = igrab(clr->clr_file->fi_inode);
-+		if (WARN_ON(!inode))
-+			return;
-+	} else {
-+		inode = NULL;
-+	}
-+
-+	dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__,
-+		clr->clr_client, clr->clr_file);
-+
-+	if (clr->cb.cbl_recall_type == RETURN_FILE)
-+		pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr,
-+					 NULL);
-+	else
-+		pnfs_return_client_layouts(clr->clr_client, &lr,
-+					   clr->cb.cbl_fsid.major);
-+
-+	spin_lock(&layout_lock);
-+	recall_cookie = layoutrecall_done(clr);
-+	spin_unlock(&layout_lock);
++	struct objio_state *ios = p;	/* p is the ios handed over by _io_exec() */
 +
-+	fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN,
-+			 recall_cookie);
-+	iput(inode);
++	kref_put(&ios->kref, _last_io);	/* drop the reference taken for this request */
 +}
 +
-+void pnfs_expire_client(struct nfs4_client *clp)
++static ssize_t _io_exec(struct objio_state *ios)	/* submit every prepared request; in sync mode wait and return the io status */
 +{
-+	for (;;) {
-+		struct nfs4_layoutrecall *lrp = NULL;
-+
-+		spin_lock(&layout_lock);
-+		if (!list_empty(&clp->cl_layoutrecalls)) {
-+			lrp = list_entry(clp->cl_layoutrecalls.next,
-+					 struct nfs4_layoutrecall, clr_perclnt);
-+			get_layoutrecall(lrp);
-+		}
-+		spin_unlock(&layout_lock);
-+		if (!lrp)
-+			break;
++	DECLARE_COMPLETION_ONSTACK(wait);
++	ssize_t status = 0; /* sync status */
++	unsigned i;
++	objio_done_fn saved_done_fn = ios->done;	/* the real done callback, called by hand in sync mode */
++	bool sync = ios->ol_state.sync;
 +
-+		dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file);
-+		BUG_ON(lrp->clr_client != clp);
-+		nomatching_layout(lrp);
-+		put_layoutrecall(lrp);
++	if (sync) {	/* sync mode: completion only wakes this function */
++		ios->done = _sync_done;
++		ios->private = &wait;
 +	}
 +
-+	for (;;) {
-+		struct nfs4_layout *lp = NULL;
-+		struct inode *inode = NULL;
-+		struct nfsd4_pnfs_layoutreturn lr;
-+		bool empty = false;
++	kref_init(&ios->kref);	/* initial reference, held by this function until the loop is done */
 +
-+		spin_lock(&layout_lock);
-+		if (!list_empty(&clp->cl_layouts)) {
-+			lp = list_entry(clp->cl_layouts.next,
-+					struct nfs4_layout, lo_perclnt);
-+			inode = igrab(lp->lo_file->fi_inode);
-+			memset(&lr, 0, sizeof(lr));
-+			lr.args.lr_return_type = RETURN_FILE;
-+			lr.args.lr_seg = lp->lo_seg;
-+			empty = list_empty(&lp->lo_file->fi_layouts);
-+			BUG_ON(lp->lo_client != clp);
-+			dequeue_layout(lp);
-+			destroy_layout(lp); /* do not access lp after this */
-+		}
-+		spin_unlock(&layout_lock);
-+		if (!lp)
-+			break;
++	for (i = 0; i < ios->numdevs; i++) {
++		struct osd_request *or = ios->per_dev[i].or;
 +
-+		if (WARN_ON(!inode))
-+			break;
++		if (!or)	/* slot without a request */
++			continue;
 +
-+		dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino,
-+			lp, clp);
++		kref_get(&ios->kref);	/* one reference per in-flight request, dropped in _done_io() */
++		osd_execute_request_async(or, _done_io, ios);	/* NOTE(review): return value is not checked */
++	}
 +
-+		fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE,
-+				 empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL);
-+		iput(inode);
++	kref_put(&ios->kref, _last_io);	/* drop the initial reference; ios->done runs once all requests are back */
++
++	if (sync) {
++		wait_for_completion(&wait);
++		status = saved_done_fn(ios);	/* run the real done callback in the caller context */
 +	}
-+}
 +
-+struct create_recall_list_arg {
-+	struct nfsd4_pnfs_cb_layout *cbl;
-+	struct nfs4_file *lrfile;
-+	struct list_head *todolist;
-+	unsigned todo_count;
-+};
++	return status;	/* stays 0 in async mode */
++}
 +
 +/*
-+ * look for matching layout for the given client
-+ * and add a pending layout recall to the todo list
-+ * if found any.
-+ * returns:
-+ *   0 if layouts found or negative error.
++ * read
 + */
-+static int
-+lo_recall_per_client(struct nfs4_client *clp, void *p)
++static ssize_t _read_done(struct objio_state *ios)	/* done callback for reads: returns bytes read or the error from _io_check() */
 +{
-+	stateid_t lsid;
-+	struct nfs4_layoutrecall *pending;
-+	struct create_recall_list_arg *arg = p;
++	ssize_t status;
++	int ret = _io_check(ios, false);	/* must run before _io_free() because it reads per_dev[].or */
 +
-+	memset(&lsid, 0, sizeof(lsid));
-+	if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid))
-+		return 0;
++	_io_free(ios);
 +
-+	/* Matching put done by layoutreturn */
-+	pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile);
-+	/* out of memory, drain todo queue */
-+	if (!pending)
-+		return -ENOMEM;
++	if (likely(!ret))
++		status = ios->length;	/* success: the full queued length */
++	else
++		status = ret;
 +
-+	*(stateid_t *)&pending->cb.cbl_sid = lsid;
-+	list_add(&pending->clr_perclnt, arg->todolist);
-+	arg->todo_count++;
-+	return 0;
++	objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);	/* report the result to the objlayout layer */
++	return status;
 +}
 +
-+/* Create a layoutrecall structure for each client based on the
-+ * original structure. */
-+int
-+create_layout_recall_list(struct list_head *todolist, unsigned *todo_len,
-+			  struct nfsd4_pnfs_cb_layout *cbl,
-+			  struct nfs4_file *lrfile)
++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)	/* build the osd read request for per_dev[cur_comp] */
 +{
-+	struct nfs4_client *clp;
-+	struct create_recall_list_arg arg = {
-+		.cbl = cbl,
-+		.lrfile = lrfile,
-+		.todolist = todolist,
++	struct osd_request *or = NULL;
++	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
++	unsigned dev = per_dev->dev;
++	struct pnfs_osd_object_cred *cred =
++			&ios->objio_seg->layout->olo_comps[dev];	/* NOTE(review): indexed by dev, not dev - olo_comps_index as in _io_od() - confirm */
++	struct osd_obj_id obj = {
++		.partition = cred->oc_object_id.oid_partition_id,
++		.id = cred->oc_object_id.oid_object_id,
 +	};
-+	int status = 0;
-+
-+	dprintk("%s: -->\n", __func__);
++	int ret;
 +
-+	/* If client given by fs, just do single client */
-+	if (cbl->cbl_seg.clientid) {
-+		clp = find_confirmed_client(
-+				(clientid_t *)&cbl->cbl_seg.clientid);
-+		if (!clp) {
-+			status = -ENOENT;
-+			dprintk("%s: clientid %llx not found\n", __func__,
-+				(unsigned long long)cbl->cbl_seg.clientid);
-+			goto out;
-+		}
++	or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
++	if (unlikely(!or)) {
++		ret = -ENOMEM;
++		goto err;
++	}
++	per_dev->or = or;	/* stored so that _io_free() can end the request */
 +
-+		status = lo_recall_per_client(clp, &arg);
-+	} else {
-+		/* Check all clients for layout matches */
-+		status = filter_confirmed_clients(lo_recall_per_client, &arg);
++	osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
++
++	ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
++	if (ret) {
++		dprintk("%s: Faild to osd_finalize_request() => %d\n",
++			__func__, ret);
++		goto err;	/* or stays in per_dev; the caller frees it through _io_free() */
 +	}
 +
-+out:
-+	*todo_len = arg.todo_count;
-+	dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status);
-+	return status;
++	dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
++		__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
++		per_dev->length);
++
++err:	/* reached on success too; ret is 0 in that case */
++	return ret;
 +}
 +
-+/*
-+ * Recall layouts asynchronously
-+ * Called with state lock.
-+ */
-+static int
-+spawn_layout_recall(struct super_block *sb, struct list_head *todolist,
-+		    unsigned todo_len)
++static ssize_t _read_exec(struct objio_state *ios)	/* build one read request per mirror set, then execute them */
 +{
-+	struct nfs4_layoutrecall *pending;
-+	struct nfs4_layoutrecall *parent = NULL;
-+	int status = 0;
-+
-+	dprintk("%s: -->\n", __func__);
-+
-+	if (todo_len > 1) {
-+		pending = list_entry(todolist->next, struct nfs4_layoutrecall,
-+				     clr_perclnt);
++	unsigned i;
++	int ret;
 +
-+		parent = alloc_init_layoutrecall(&pending->cb, NULL,
-+						 pending->clr_file);
-+		if (unlikely(!parent)) {
-+			/* We want forward progress. If parent cannot be
-+			 * allocated take the first one as parent but don't
-+			 * execute it.  Caller must check for -EAGAIN, if so
-+			 * When the partial recalls return,
-+			 * nfsd_layout_recall_cb should be called again.
-+			 */
-+			list_del_init(&pending->clr_perclnt);
-+			if (todo_len > 2) {
-+				parent = pending;
-+			} else {
-+				parent = NULL;
-+				put_layoutrecall(pending);
-+			}
-+			--todo_len;
-+				status = -ENOMEM;
-+		}
++	for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {	/* step by mirrors_p1: only the first slot of each mirror set is read */
++		if (!ios->per_dev[i].length)	/* nothing queued for this slot */
++			continue;
++		ret = _read_mirrors(ios, i);
++		if (unlikely(ret))
++			goto err;
 +	}
 +
-+	while (!list_empty(todolist)) {
-+		pending = list_entry(todolist->next, struct nfs4_layoutrecall,
-+				     clr_perclnt);
-+		list_del_init(&pending->clr_perclnt);
-+		dprintk("%s: clp %p cb_client %p fp %p\n", __func__,
-+			pending->clr_client,
-+			pending->clr_client->cl_cb_client,
-+			pending->clr_file);
-+		if (unlikely(!pending->clr_client->cl_cb_client)) {
-+			printk(KERN_INFO
-+				"%s: clientid %08x/%08x has no callback path\n",
-+				__func__,
-+				pending->clr_client->cl_clientid.cl_boot,
-+				pending->clr_client->cl_clientid.cl_id);
-+			put_layoutrecall(pending);
-+			continue;
-+		}
++	ios->done = _read_done;	/* set before _io_exec(), which saves and may wrap it */
++	return _io_exec(ios); /* In sync mode exec returns the io status */
 +
-+		pending->clr_time = CURRENT_TIME;
-+		pending->clr_sb = sb;
-+		if (parent) {
-+			/* If we created a parent its initial ref count is 1.
-+			 * We will need to de-ref it eventually. So we just
-+			 * don't increment on behalf of the last one.
-+			 */
-+			if (todo_len != 1)
-+				get_layoutrecall(parent);
-+		}
-+		pending->parent = parent;
-+		get_layoutrecall(pending);
-+		/* Add to list so corresponding layoutreturn can find req */
-+		list_add(&pending->clr_perclnt,
-+			 &pending->clr_client->cl_layoutrecalls);
++err:	/* request setup failed: free everything built so far */
++	_io_free(ios);
++	return ret;
++}
 +
-+		nfsd4_cb_layout(pending);
-+		--todo_len;
-+	}
++ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)	/* read entry point: map the pages onto devices, then issue the reads */
++{
++	struct objio_state *ios = container_of(ol_state, struct objio_state,
++					       ol_state);
++	int ret;
 +
-+	return status;
++	ret = _io_rw_pagelist(ios);	/* non-zero only when nothing at all could be queued */
++	if (unlikely(ret))
++		return ret;
++
++	return _read_exec(ios);
 +}
 +
 +/*
-+ * Spawn a thread to perform a recall layout
-+ *
++ * write
 + */
-+int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode,
-+			  struct nfsd4_pnfs_cb_layout *cbl)
++static ssize_t _write_done(struct objio_state *ios)	/* done callback for writes: returns bytes written or the error from _io_check() */
 +{
-+	int status;
-+	struct nfs4_file *lrfile = NULL;
-+	struct list_head todolist;
-+	unsigned todo_len = 0;
++	ssize_t status;
++	int ret = _io_check(ios, true);	/* must run before _io_free() because it reads per_dev[].or */
 +
-+	dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl);
-+	BUG_ON(!cbl);
-+	BUG_ON(cbl->cbl_recall_type != RETURN_FILE &&
-+	       cbl->cbl_recall_type != RETURN_FSID &&
-+	       cbl->cbl_recall_type != RETURN_ALL);
-+	BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode);
-+	BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ &&
-+	       cbl->cbl_seg.iomode != IOMODE_RW &&
-+	       cbl->cbl_seg.iomode != IOMODE_ANY);
++	_io_free(ios);
 +
-+	if (nfsd_serv == NULL) {
-+		dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n");
-+		return -ENOENT;
++	if (likely(!ret)) {
++		/* FIXME: should be based on the OSD's persistence model
++		 * See OSD2r05 Section 4.13 Data persistence model */
++		ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC;
++		status = ios->length;	/* success: the full queued length */
++	} else {
++		status = ret;
 +	}
 +
-+	nfs4_lock_state();
-+	status = -ENOENT;
-+	if (inode) {
-+		lrfile = find_file(inode);
-+		if (!lrfile) {
-+			dprintk("NFSD nfsd_layout_recall_cb: "
-+				"nfs4_file not found\n");
-+			goto err;
-+		}
-+		if (cbl->cbl_recall_type == RETURN_FSID)
-+			cbl->cbl_fsid = lrfile->fi_fsid;
-+	}
++	objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);	/* report the result to the objlayout layer */
++	return status;
++}
 +
-+	INIT_LIST_HEAD(&todolist);
++static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
++{
++	struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
++	unsigned dev = ios->per_dev[cur_comp].dev;
++	unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1;
++	int ret;
 +
-+	/* If no cookie provided by FS, return a default one */
-+	if (!cbl->cbl_cookie)
-+		cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
++	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
++		struct osd_request *or = NULL;
++		struct pnfs_osd_object_cred *cred =
++					&ios->objio_seg->layout->olo_comps[dev];
++		struct osd_obj_id obj = {
++			.partition = cred->oc_object_id.oid_partition_id,
++			.id = cred->oc_object_id.oid_object_id,
++		};
++		struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
++		struct bio *bio;
 +
-+	status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile);
-+	if (list_empty(&todolist)) {
-+		status = -ENOENT;
-+	} else {
-+		/* process todolist even if create_layout_recall_list
-+		 * returned an error */
-+		int status2 = spawn_layout_recall(sb, &todolist, todo_len);
-+		if (status2)
-+			status = status2;
-+	}
++		or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
++		if (unlikely(!or)) {
++			ret = -ENOMEM;
++			goto err;
++		}
++		per_dev->or = or;
 +
-+err:
-+	nfs4_unlock_state();
-+	if (lrfile)
-+		put_nfs4_file(lrfile);
-+	return (todo_len && status) ? -EAGAIN : status;
-+}
++		if (per_dev != master_dev) {
++			bio = bio_kmalloc(GFP_KERNEL,
++					  master_dev->bio->bi_max_vecs);
++			if (unlikely(!bio)) {
++				dprintk("Faild to allocate BIO size=%u\n",
++					master_dev->bio->bi_max_vecs);
++				ret = -ENOMEM;
++				goto err;
++			}
 +
-+struct create_device_notify_list_arg {
-+	struct list_head *todolist;
-+	struct nfsd4_pnfs_cb_dev_list *ndl;
-+};
++			__bio_clone(bio, master_dev->bio);
++			bio->bi_bdev = NULL;
++			bio->bi_next = NULL;
++			per_dev->bio = bio;
++			per_dev->dev = dev;
++			per_dev->length = master_dev->length;
++			per_dev->offset =  master_dev->offset;
++		} else {
++			bio = master_dev->bio;
++			/* FIXME: bio_set_dir() */
++			bio->bi_rw |= REQ_WRITE;
++		}
 +
-+static int
-+create_device_notify_per_cl(struct nfs4_client *clp, void *p)
-+{
-+	struct nfs4_notify_device *cbnd;
-+	struct create_device_notify_list_arg *arg = p;
++		osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
 +
-+	if (atomic_read(&clp->cl_deviceref) <= 0)
-+		return 0;
++		ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
++		if (ret) {
++			dprintk("%s: Faild to osd_finalize_request() => %d\n",
++				__func__, ret);
++			goto err;
++		}
 +
-+	cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL);
-+	if (!cbnd)
-+		return -ENOMEM;
++		dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
++			__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
++			per_dev->length);
++	}
 +
-+	cbnd->nd_list = arg->ndl;
-+	cbnd->nd_client = clp;
-+	list_add(&cbnd->nd_perclnt, arg->todolist);
-+	return 0;
++err:
++	return ret;
 +}
 +
-+/* Create a list of clients to send device notifications. */
-+int
-+create_device_notify_list(struct list_head *todolist,
-+			  struct nfsd4_pnfs_cb_dev_list *ndl)
++static ssize_t _write_exec(struct objio_state *ios)
 +{
-+	int status;
-+	struct create_device_notify_list_arg arg = {
-+		.todolist = todolist,
-+		.ndl = ndl,
-+	};
++	unsigned i;
++	int ret;
 +
-+	nfs4_lock_state();
-+	status = filter_confirmed_clients(create_device_notify_per_cl, &arg);
-+	nfs4_unlock_state();
++	for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
++		if (!ios->per_dev[i].length)
++			continue;
++		ret = _write_mirrors(ios, i);
++		if (unlikely(ret))
++			goto err;
++	}
 +
-+	return status;
++	ios->done = _write_done;
++	return _io_exec(ios); /* In sync mode exec returns the io->status */
++
++err:
++	_io_free(ios);
++	return ret;
 +}
 +
-+/*
-+ * For each client that a device, send a device notification.
-+ * XXX: Need to track which clients have which devices.
-+ */
-+int nfsd_device_notify_cb(struct super_block *sb,
-+			  struct nfsd4_pnfs_cb_dev_list *ndl)
++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
 +{
-+	struct nfs4_notify_device *cbnd;
-+	unsigned int notify_num = 0;
-+	int status2, status = 0;
-+	struct list_head todolist;
-+
-+	BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list);
-+
-+	dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len);
++	struct objio_state *ios = container_of(ol_state, struct objio_state,
++					       ol_state);
++	int ret;
 +
-+	if (nfsd_serv == NULL)
-+		return -ENOENT;
++	/* TODO: ios->stable = stable; */
++	ret = _io_rw_pagelist(ios);
++	if (unlikely(ret))
++		return ret;
 +
-+	INIT_LIST_HEAD(&todolist);
++	return _write_exec(ios);
++}
 +
-+	status = create_device_notify_list(&todolist, ndl);
++/*
++ * Policy Operations
++ */
 +
-+	while (!list_empty(&todolist)) {
-+		cbnd = list_entry(todolist.next, struct nfs4_notify_device,
-+				  nd_perclnt);
-+		list_del_init(&cbnd->nd_perclnt);
-+		status2 = nfsd4_cb_notify_device(cbnd);
-+		pnfs_clear_device_notify(cbnd->nd_client);
-+		if (status2) {
-+			kfree(cbnd);
-+			status = status2;
-+		}
-+		notify_num++;
-+	}
++/*
++ * Get the max [rw]size
++ */
++static ssize_t
++objlayout_get_blocksize(void)
++{
++	ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE;
 +
-+	dprintk("NFSD %s: status %d clients %u\n",
-+		__func__, status, notify_num);
-+	return status;
++	return sz;
 +}
-diff --git a/fs/nfsd/nfs4pnfsdlm.c b/fs/nfsd/nfs4pnfsdlm.c
-new file mode 100644
-index 0000000..006ded5
---- /dev/null
-+++ b/fs/nfsd/nfs4pnfsdlm.c
-@@ -0,0 +1,461 @@
-+/******************************************************************************
-+ *
-+ * (c) 2007 Network Appliance, Inc.  All Rights Reserved.
-+ * (c) 2009 NetApp.  All Rights Reserved.
-+ *
-+ * NetApp provides this source code under the GPL v2 License.
-+ * The GPL v2 license is available at
-+ * http://opensource.org/licenses/gpl-license.php.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++/*
++ * Don't gather across stripes, but rather gather (coalesce) up to
++ * the stripe size.
 + *
-+ ******************************************************************************/
++ * FIXME: change interface to use merge_align, merge_count
++ */
++static struct pnfs_layoutdriver_type objlayout_type = {
++	.id = LAYOUT_OSD2_OBJECTS,
++	.name = "LAYOUT_OSD2_OBJECTS",
++	.flags                   = PNFS_LAYOUTRET_ON_SETATTR,
 +
-+#include <linux/nfs4.h>
-+#include <linux/nfsd/const.h>
-+#include <linux/nfsd/debug.h>
-+#include <linux/nfsd/nfs4pnfsdlm.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
-+#include <linux/sunrpc/clnt.h>
++	.set_layoutdriver        = objlayout_set_layoutdriver,
++	.clear_layoutdriver      = objlayout_clear_layoutdriver,
 +
-+#include "nfsfh.h"
-+#include "nfsd.h"
++	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
++	.free_layout_hdr         = objlayout_free_layout_hdr,
 +
-+#define NFSDDBG_FACILITY                NFSDDBG_PROC
++	.alloc_lseg              = objlayout_alloc_lseg,
++	.free_lseg               = objlayout_free_lseg,
 +
-+/* Just use a linked list. Do not expect more than 32 dlm_device_entries
-+ * the first implementation will just use one device per cluster file system
-+ */
++	.get_blocksize           = objlayout_get_blocksize,
 +
-+static LIST_HEAD(dlm_device_list);
-+static DEFINE_SPINLOCK(dlm_device_list_lock);
++	.read_pagelist           = objlayout_read_pagelist,
++	.write_pagelist          = objlayout_write_pagelist,
++	.commit                  = objlayout_commit,
 +
-+struct dlm_device_entry {
-+	struct list_head	dlm_dev_list;
-+	char			disk_name[DISK_NAME_LEN];
-+	int			num_ds;
-+	char			ds_list[NFSD_DLM_DS_LIST_MAX];
++	.encode_layoutcommit	 = objlayout_encode_layoutcommit,
++	.encode_layoutreturn     = objlayout_encode_layoutreturn,
 +};
 +
-+static struct dlm_device_entry *
-+_nfsd4_find_pnfs_dlm_device(char *disk_name)
++void *objio_init_mt(void)
 +{
-+	struct dlm_device_entry *dlm_pdev;
++	struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL);
 +
-+	dprintk("--> %s  disk name %s\n", __func__, disk_name);
-+	spin_lock(&dlm_device_list_lock);
-+	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) {
-+		dprintk("%s Look for dlm_pdev %s\n", __func__,
-+			dlm_pdev->disk_name);
-+		if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) {
-+			spin_unlock(&dlm_device_list_lock);
-+			return dlm_pdev;
-+		}
-+	}
-+	spin_unlock(&dlm_device_list_lock);
-+	return NULL;
++	if (!omt)
++		return ERR_PTR(-ENOMEM);
++
++	INIT_LIST_HEAD(&omt->dev_list);
++	spin_lock_init(&omt->dev_list_lock);
++	return omt;
 +}
 +
-+static struct dlm_device_entry *
-+nfsd4_find_pnfs_dlm_device(struct super_block *sb) {
-+	char dname[BDEVNAME_SIZE];
++void objio_fini_mt(void *mountid)
++{
++	_dev_list_remove_all(mountid);
++	kfree(mountid);
++}
 +
-+	bdevname(sb->s_bdev, dname);
-+	return _nfsd4_find_pnfs_dlm_device(dname);
++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
++MODULE_AUTHOR("Benny Halevy <bhalevy at panasas.com>");
++MODULE_LICENSE("GPL");
++
++static int __init
++objlayout_init(void)
++{
++	int ret = pnfs_register_layoutdriver(&objlayout_type);
++
++	if (ret)
++		printk(KERN_INFO
++			"%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
++			__func__, ret);
++	else
++		printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
++			__func__);
++	return ret;
 +}
 +
-+ssize_t
-+nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
++static void __exit
++objlayout_exit(void)
 +{
-+	char *pos = buf;
-+	ssize_t size = 0;
-+	struct dlm_device_entry *dlm_pdev;
-+	int ret = -EINVAL;
++	pnfs_unregister_layoutdriver(&objlayout_type);
++	printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
++	       __func__);
++}
 +
-+	spin_lock(&dlm_device_list_lock);
-+	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list)
-+	{
-+		int advanced;
-+		advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list);
-+		if (advanced >= buflen - size)
-+			goto out;
-+		size += advanced;
-+		pos += advanced;
-+	}
-+	ret = size;
++module_init(objlayout_init);
++module_exit(objlayout_exit);
+diff -up linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.c
+--- linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.c.orig	2011-01-28 09:43:53.334772561 -0500
++++ linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.c	2011-01-28 09:43:53.334772561 -0500
+@@ -0,0 +1,773 @@
++/*
++ *  objlayout.c
++ *
++ *  pNFS layout driver for Panasas OSDs
++ *
++ *  Copyright (C) 2007-2009 Panasas Inc.
++ *  All rights reserved.
++ *
++ *  Benny Halevy <bhalevy at panasas.com>
++ *  Boaz Harrosh <bharrosh at panasas.com>
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License version 2
++ *  See the file COPYING included with this distribution for more details.
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the Panasas company nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
 +
-+out:
-+	spin_unlock(&dlm_device_list_lock);
-+	return ret;
-+}
++#include <scsi/osd_initiator.h>
++#include "objlayout.h"
 +
-+bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
-+{
-+	char *start = ds_list;
++#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 +
-+	*num_ds = 0;
++struct pnfs_client_operations *pnfs_client_ops;
 +
-+	while (*start) {
-+		struct sockaddr_storage tempAddr;
-+		int ipLen = strcspn(start, ",");
++/*
++ * Create an objlayout layout structure for the given inode and return it.
++ */
++struct pnfs_layout_hdr *
++objlayout_alloc_layout_hdr(struct inode *inode)
++{
++	struct objlayout *objlay;
 +
-+		if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
-+			return false;
-+		(*num_ds)++;
-+		start += ipLen + 1;
++	objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL);
++	if (objlay) {
++		spin_lock_init(&objlay->lock);
++		INIT_LIST_HEAD(&objlay->err_list);
 +	}
-+	return true;
++	dprintk("%s: Return %p\n", __func__, objlay);
++	return &objlay->pnfs_layout;
 +}
 +
 +/*
-+ * pnfs_dlm_device string format:
-+ *     block-device-path:<ds1 ipv4 address>,<ds2 ipv4 address>
-+ *
-+ * Examples
-+ *     /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
-+ *     two data servers for the dlm cluster file system mounted on /dev/sda.
-+ *
-+ *     /dev/sda:192.168.1.96,192.168.1.100'
-+ *     replaces the data server list for /dev/sda
-+ *
-+ *     Only the deviceid == 1 is supported. Can add device id to
-+ *     pnfs_dlm_device string when needed.
-+ *
-+ *     Only the round robin each data server once stripe index is supported.
++ * Free an objlayout layout structure
 + */
-+int
-+nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
-+
++void
++objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
 +{
-+	struct dlm_device_entry *new, *found;
-+	char *bufp = pnfs_dlm_device;
-+	char *endp = bufp + strlen(bufp);
-+	int err = -ENOMEM;
++	struct objlayout *objlay = OBJLAYOUT(lo);
 +
-+	dprintk("--> %s len %d\n", __func__, len);
++	dprintk("%s: objlay %p\n", __func__, objlay);
 +
-+	new = kzalloc(sizeof(*new), GFP_KERNEL);
-+	if (!new)
-+		return err;
++	WARN_ON(!list_empty(&objlay->err_list));
++	kfree(objlay);
++}
 +
-+	err = -EINVAL;
-+	/* disk_name */
-+	/* FIXME: need to check for valid disk_name. search superblocks?
-+	 * check for slash dev slash ?
-+	 */
-+	len = strcspn(bufp, ":");
-+	if (len > DISK_NAME_LEN)
-+		goto out_free;
-+	memcpy(new->disk_name, bufp, len);
++/*
++ * Unmarshall layout and store it in pnfslay.
++ */
++struct pnfs_layout_segment *
++objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
++		     struct nfs4_layoutget_res *lgr)
++{
++	int status;
++	void *layout = lgr->layout.buf;
++	struct objlayout_segment *objlseg;
++	struct pnfs_osd_layout *pnfs_osd_layout;
 +
-+	err = -EINVAL;
-+	bufp += len + 1;
-+	if (bufp >= endp)
-+		goto out_free;
++	dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout);
 +
-+	/* data server list */
-+	/* FIXME: need to check for comma separated valid ip format */
-+	len = strcspn(bufp, ":");
-+	if (len > NFSD_DLM_DS_LIST_MAX)
-+		goto out_free;
-+	memcpy(new->ds_list, bufp, len);
++	BUG_ON(!layout);
 +
++	status = -ENOMEM;
++	objlseg = kzalloc(sizeof(*objlseg) +
++			  pnfs_osd_layout_incore_sz(layout), GFP_KERNEL);
++	if (!objlseg)
++		goto err;
 +
-+	/*  validate the ips */
-+	if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds)))
-+		goto out_free;
++	pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout;
++	pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout);
 +
-+	dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__,
-+		new->disk_name, new->num_ds, new->ds_list);
++	objlseg->lseg.range = lgr->range;
++	status = objio_alloc_lseg(&objlseg->internal, pnfslay, &objlseg->lseg,
++				  pnfs_osd_layout);
++	if (status)
++		goto err;
 +
-+	found = _nfsd4_find_pnfs_dlm_device(new->disk_name);
-+	if (found) {
-+		/* FIXME: should compare found->ds_list with new->ds_list
-+		 * and if it is different, kick off a CB_NOTIFY change
-+		 * deviceid.
-+		 */
-+		dprintk("%s pnfs_dlm_device %s:%s already in cache "
-+			" replace ds_list with new ds_list %s\n", __func__,
-+			found->disk_name, found->ds_list, new->ds_list);
-+		memset(found->ds_list, 0, DISK_NAME_LEN);
-+		memcpy(found->ds_list, new->ds_list, strlen(new->ds_list));
-+		found->num_ds = new->num_ds;
-+		kfree(new);
-+	} else {
-+		dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__,
-+				new->disk_name, new->ds_list);
-+		spin_lock(&dlm_device_list_lock);
-+		list_add(&new->dlm_dev_list, &dlm_device_list);
-+		spin_unlock(&dlm_device_list_lock);
-+	}
-+	dprintk("<-- %s Success\n", __func__);
-+	return 0;
++	dprintk("%s: Return %p\n", __func__, &objlseg->lseg);
++	return &objlseg->lseg;
 +
-+out_free:
-+	kfree(new);
-+	dprintk("<-- %s returns %d\n", __func__, err);
-+	return err;
++ err:
++	kfree(objlseg);
++	return ERR_PTR(status);
 +}
 +
-+void nfsd4_pnfs_dlm_shutdown(void)
++/*
++ * Free a layout segment
++ */
++void
++objlayout_free_lseg(struct pnfs_layout_segment *lseg)
 +{
-+	struct dlm_device_entry *dlm_pdev, *next;
++	struct objlayout_segment *objlseg;
 +
-+	dprintk("--> %s\n", __func__);
++	dprintk("%s: freeing layout segment %p\n", __func__, lseg);
 +
-+	spin_lock(&dlm_device_list_lock);
-+	list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list,
-+				  dlm_dev_list) {
-+		list_del(&dlm_pdev->dlm_dev_list);
-+		kfree(dlm_pdev);
-+	}
-+	spin_unlock(&dlm_device_list_lock);
++	if (unlikely(!lseg))
++		return;
++
++	objlseg = container_of(lseg, struct objlayout_segment, lseg);
++	objio_free_lseg(objlseg->internal);
++	kfree(objlseg);
 +}
 +
-+static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb,
-+				     u32 layout_type,
-+				     struct nfsd4_pnfs_dev_iter_res *res)
++/*
++ * I/O Operations
++ */
++static inline u64
++end_offset(u64 start, u64 len)
 +{
-+	if (layout_type != LAYOUT_NFSV4_1_FILES) {
-+		printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
-+			"(type: %x)\n", __func__, layout_type);
-+		return -ENOTSUPP;
-+	}
-+
-+	res->gd_eof = 1;
-+	if (res->gd_cookie)
-+		return -ENOENT;
++	u64 end;
 +
-+	res->gd_cookie = 1;
-+	res->gd_verf = 1;
-+	res->gd_devid = 1;
-+	return 0;
++	end = start + len;
++	return end >= start ? end : NFS4_MAX_UINT64;
 +}
 +
-+static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
-+				     struct exp_xdr_stream *xdr,
-+				     u32 layout_type,
-+				     const struct nfsd4_pnfs_deviceid *devid)
++/* last octet in a range */
++static inline u64
++last_byte_offset(u64 start, u64 len)
 +{
-+	int err, len, i = 0;
-+	struct pnfs_filelayout_device fdev;
-+	struct pnfs_filelayout_devaddr *daddr;
-+	struct dlm_device_entry *dlm_pdev;
-+	char   *bufp;
++	u64 end;
 +
-+	err = -ENOTSUPP;
-+	if (layout_type != LAYOUT_NFSV4_1_FILES) {
-+		dprintk("%s: ERROR: layout type isn't 'file' "
-+			"(type: %x)\n", __func__, layout_type);
-+		return err;
-+	}
++	BUG_ON(!len);
++	end = start + len;
++	return end > start ? end - 1 : NFS4_MAX_UINT64;
++}
 +
-+	/* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
-+	 * with a gdia_device_id != 1 is invalid.
-+	 */
-+	err = -EINVAL;
-+	if (devid->devid != 1) {
-+		dprintk("%s: WARNING: didn't receive a deviceid of "
-+			"1 (got: 0x%llx)\n", __func__, devid->devid);
-+		return err;
-+	}
++static struct objlayout_io_state *
++objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
++			struct page **pages,
++			unsigned pgbase,
++			unsigned nr_pages,
++			loff_t offset,
++			size_t count,
++			struct pnfs_layout_segment *lseg,
++			void *rpcdata)
++{
++	struct objlayout_segment *objlseg =
++		container_of(lseg, struct objlayout_segment, lseg);
++	struct objlayout_io_state *state;
++	u64 lseg_end_offset;
++	size_t size_nr_pages;
 +
-+	/*
-+	 * If the DS list has not been established, return -EINVAL
-+	 */
-+	dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
-+	if (!dlm_pdev) {
-+		dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
-+			sb->s_bdev->bd_disk->disk_name);
-+		return err;
++	dprintk("%s: allocating io_state\n", __func__);
++	if (objio_alloc_io_state(objlseg->internal, &state))
++		return NULL;
++
++	BUG_ON(offset < lseg->range.offset);
++	lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length);
++	BUG_ON(offset >= lseg_end_offset);
++	if (offset + count > lseg_end_offset) {
++		count = lseg->range.length - (offset - lseg->range.offset);
++		dprintk("%s: truncated count %Zd\n", __func__, count);
 +	}
 +
-+	dprintk("%s: Found disk %s with DS list |%s|\n",
-+		__func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
-+
-+	memset(&fdev, '\0', sizeof(fdev));
-+	fdev.fl_device_length = dlm_pdev->num_ds;
++	if (pgbase > PAGE_SIZE) {
++		unsigned n = pgbase >> PAGE_SHIFT;
 +
-+	err = -ENOMEM;
-+	len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
-+	fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
-+	if (!fdev.fl_device_list) {
-+		printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
-+			"buffer for %d DSes.\n", __func__, i);
-+		fdev.fl_device_length = 0;
-+		goto out;
++		pgbase &= ~PAGE_MASK;
++		pages += n;
++		nr_pages -= n;
 +	}
 +
-+	/* Set a simple stripe indicie */
-+	fdev.fl_stripeindices_length = fdev.fl_device_length;
-+	fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
-+				     fdev.fl_stripeindices_length, GFP_KERNEL);
++	size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
++	BUG_ON(nr_pages < size_nr_pages);
++	if (nr_pages > size_nr_pages)
++		nr_pages = size_nr_pages;
 +
-+	if (!fdev.fl_stripeindices_list) {
-+		printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
-+			"list buffer for %d DSes.\n", __func__, i);
-+		goto out;
-+	}
-+	for (i = 0; i < fdev.fl_stripeindices_length; i++)
-+		fdev.fl_stripeindices_list[i] = i;
++	INIT_LIST_HEAD(&state->err_list);
++	state->objlseg = objlseg;
++	state->rpcdata = rpcdata;
++	state->pages = pages;
++	state->pgbase = pgbase;
++	state->nr_pages = nr_pages;
++	state->offset = offset;
++	state->count = count;
++	state->sync = 0;
 +
-+	/* Transfer the data server list with a single multipath entry */
-+	bufp = dlm_pdev->ds_list;
-+	for (i = 0; i < fdev.fl_device_length; i++) {
-+		daddr = kmalloc(sizeof(*daddr), GFP_KERNEL);
-+		if (!daddr) {
-+			printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
-+				"addr buffer.\n", __func__);
-+			goto out;
-+		}
++	return state;
++}
 +
-+		daddr->r_netid.data = "tcp";
-+		daddr->r_netid.len = 3;
++static void
++objlayout_free_io_state(struct objlayout_io_state *state)
++{
++	dprintk("%s: freeing io_state\n", __func__);
++	if (unlikely(!state))
++		return;
 +
-+		len = strcspn(bufp, ",");
-+		daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
-+		memcpy(daddr->r_addr.data, bufp, len);
-+		/*
-+		 * append the port number.  interpreted as two more bytes
-+		 * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
-+		 */
-+		memcpy(daddr->r_addr.data + len, ".8.1", 4);
-+		daddr->r_addr.len = len + 4;
++	objio_free_io_state(state);
++}
 +
-+		fdev.fl_device_list[i].fl_multipath_length = 1;
-+		fdev.fl_device_list[i].fl_multipath_list = daddr;
++/*
++ * I/O done common code
++ */
++static void
++objlayout_iodone(struct objlayout_io_state *state)
++{
++	dprintk("%s: state %p status\n", __func__, state);
 +
-+		dprintk("%s: encoding DS |%s|\n", __func__, bufp);
++	if (likely(state->status >= 0)) {
++		objlayout_free_io_state(state);
++	} else {
++		struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.layout);
 +
-+		bufp += len + 1;
++		spin_lock(&objlay->lock);
++		objlay->delta_space_valid = OBJ_DSU_INVALID;
++		list_add(&objlay->err_list, &state->err_list);
++		spin_unlock(&objlay->lock);
 +	}
++}
 +
-+	/* have nfsd encode the device info */
-+	err = filelayout_encode_devinfo(xdr, &fdev);
-+out:
-+	for (i = 0; i < fdev.fl_device_length; i++)
-+		kfree(fdev.fl_device_list[i].fl_multipath_list);
-+	kfree(fdev.fl_device_list);
-+	kfree(fdev.fl_stripeindices_list);
-+	dprintk("<-- %s returns %d\n", __func__, err);
-+	return err;
++/*
++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
++ *
++ * The @index component IO failed (error returned from target). Register
++ * the error for later reporting at layout-return.
++ */
++void
++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
++			int osd_error, u64 offset, u64 length, bool is_write)
++{
++	struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
++
++	BUG_ON(index >= state->num_comps);
++	if (osd_error) {
++		struct pnfs_osd_layout *layout =
++			(typeof(layout))state->objlseg->pnfs_osd_layout;
++
++		ioerr->oer_component = layout->olo_comps[index].oc_object_id;
++		ioerr->oer_comp_offset = offset;
++		ioerr->oer_comp_length = length;
++		ioerr->oer_iswrite = is_write;
++		ioerr->oer_errno = osd_error;
++
++		dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
++			"par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
++			__func__, index, ioerr->oer_errno,
++			ioerr->oer_iswrite,
++			_DEVID_LO(&ioerr->oer_component.oid_device_id),
++			_DEVID_HI(&ioerr->oer_component.oid_device_id),
++			ioerr->oer_component.oid_partition_id,
++			ioerr->oer_component.oid_object_id,
++			ioerr->oer_comp_offset,
++			ioerr->oer_comp_length);
++	} else {
++		/* User need not call if no error is reported */
++		ioerr->oer_errno = 0;
++	}
 +}
 +
-+static int get_stripe_unit(int blocksize)
++static void _rpc_commit_complete(struct work_struct *work)
 +{
-+	if (blocksize >= NFSSVC_MAXBLKSIZE)
-+		return blocksize;
-+	return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++	struct rpc_task *task;
++	struct nfs_write_data *wdata;
++
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work);
++	wdata = container_of(task, struct nfs_write_data, task);
++
++	pnfs_commit_done(wdata);
 +}
 +
 +/*
-+ * Look up inode block device in pnfs_dlm_device list.
-+ * Hash on the inode->i_ino and number of data servers.
++ * Commit data remotely on OSDs
 + */
-+static int dlm_ino_hash(struct inode *ino)
++enum pnfs_try_status
++objlayout_commit(struct nfs_write_data *wdata, int how)
 +{
-+	struct dlm_device_entry *de;
-+	u32 hash_mask = 0;
++	int status = PNFS_ATTEMPTED;
 +
-+	/* If can't find the inode block device in the pnfs_dlm_deivce list
-+	 * then don't hand out a layout
-+	 */
-+	de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
-+	if (!de)
-+		return -1;
-+	hash_mask = de->num_ds - 1;
-+	return ino->i_ino & hash_mask;
++	INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete);
++	schedule_work(&wdata->task.u.tk_work);
++	dprintk("%s: Return %d\n", __func__, status);
++	return status;
 +}
 +
-+static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
-+			   struct exp_xdr_stream *xdr,
-+			   const struct nfsd4_pnfs_layoutget_arg *args,
-+			   struct nfsd4_pnfs_layoutget_res *res)
++/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
++ * This is because the osd completion is called with interrupts off from
++ * the block layer.
++ */
++static void _rpc_read_complete(struct work_struct *work)
 +{
-+	struct pnfs_filelayout_layout *layout = NULL;
-+	struct knfsd_fh *fhp = NULL;
-+	int index;
-+	enum nfsstat4 rc = NFS4_OK;
++	struct rpc_task *task;
++	struct nfs_read_data *rdata;
 +
-+	dprintk("%s: LAYOUT_GET\n", __func__);
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work);
++	rdata = container_of(task, struct nfs_read_data, task);
 +
-+	/* DLM exported file systems only support layouts for READ */
-+	if (res->lg_seg.iomode == IOMODE_RW)
-+		return NFS4ERR_BADIOMODE;
++	pnfs_read_done(rdata);
++}
 +
-+	index = dlm_ino_hash(inode);
-+	dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
-+		inode->i_ino);
-+	if (index < 0)
-+		return NFS4ERR_LAYOUTUNAVAILABLE;
++void
++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
++{
++	int eof = state->eof;
++	struct nfs_read_data *rdata;
 +
-+	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
-+	/* Always give out whole file layouts */
-+	res->lg_seg.offset = 0;
-+	res->lg_seg.length = NFS4_MAX_UINT64;
-+	/* Always give out READ ONLY layouts */
-+	res->lg_seg.iomode = IOMODE_READ;
++	state->status = status;
++	dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
++	rdata = state->rpcdata;
++	rdata->task.tk_status = status;
++	if (status >= 0) {
++		rdata->res.count = status;
++		rdata->res.eof = eof;
++	}
++	objlayout_iodone(state);
++	/* must not use state after this point */
++
++	if (sync)
++		pnfs_read_done(rdata);
++	else {
++		INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
++		schedule_work(&rdata->task.u.tk_work);
++	}
++}
++
++/*
++ * Perform sync or async reads.
++ */
++enum pnfs_try_status
++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages)
++{
++	loff_t offset = rdata->args.offset;
++	size_t count = rdata->args.count;
++	struct objlayout_io_state *state;
++	ssize_t status = 0;
++	loff_t eof;
++
++	dprintk("%s: Begin inode %p offset %llu count %d\n",
++		__func__, rdata->inode, offset, (int)count);
++
++	eof = i_size_read(rdata->inode);
++	if (unlikely(offset + count > eof)) {
++		if (offset >= eof) {
++			status = 0;
++			rdata->res.count = 0;
++			rdata->res.eof = 1;
++			goto out;
++		}
++		count = eof - offset;
++	}
 +
-+	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
-+	if (layout == NULL) {
-+		rc = NFS4ERR_LAYOUTTRYLATER;
-+		goto error;
++	state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
++					 rdata->args.pages, rdata->args.pgbase,
++					 nr_pages, offset, count,
++					 rdata->pdata.lseg, rdata);
++	if (unlikely(!state)) {
++		status = -ENOMEM;
++		goto out;
 +	}
 +
-+	/* Set file layout response args */
-+	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
-+	layout->lg_stripe_type = STRIPE_SPARSE;
-+	layout->lg_commit_through_mds = false;
-+	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
-+	layout->lg_fh_length = 1;
-+	layout->device_id.sbid = args->lg_sbid;
-+	layout->device_id.devid = 1;                                /*FSFTEMP*/
-+	layout->lg_first_stripe_index = index;                      /*FSFTEMP*/
-+	layout->lg_pattern_offset = 0;
++	state->eof = state->offset + state->count >= eof;
 +
-+	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
-+	if (fhp == NULL) {
-+		rc = NFS4ERR_LAYOUTTRYLATER;
-+		goto error;
-+	}
++	status = objio_read_pagelist(state);
++ out:
++	dprintk("%s: Return status %Zd\n", __func__, status);
++	rdata->pdata.pnfs_error = status;
++	return PNFS_ATTEMPTED;
++}
 +
-+	memcpy(fhp, args->lg_fh, sizeof(*fhp));
-+	pnfs_fh_mark_ds(fhp);
-+	layout->lg_fh_list = fhp;
++/* Function queued with schedule_work() to call pnfs_writeback_done().
++ * This is because the osd completion is called with interrupts off from
++ * the block layer
++ */
++static void _rpc_write_complete(struct work_struct *work)
++{
++	struct rpc_task *task;
++	struct nfs_write_data *wdata;
 +
-+	/* Call nfsd to encode layout */
-+	rc = filelayout_encode_layout(xdr, layout);
-+exit:
-+	kfree(layout);
-+	kfree(fhp);
-+	return rc;
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work);
++	wdata = container_of(task, struct nfs_write_data, task);
 +
-+error:
-+	res->lg_seg.length = 0;
-+	goto exit;
++	pnfs_writeback_done(wdata);
 +}
 +
-+static int
-+nfsd4_pnfs_dlm_layouttype(struct super_block *sb)
++void
++objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
++		     bool sync)
 +{
-+	return LAYOUT_NFSV4_1_FILES;
++	struct nfs_write_data *wdata;
++
++	dprintk("%s: Begin\n", __func__);
++	wdata = state->rpcdata;
++	state->status = status;
++	wdata->task.tk_status = status;
++	if (status >= 0) {
++		wdata->res.count = status;
++		wdata->verf.committed = state->committed;
++		dprintk("%s: Return status %d committed %d\n",
++			__func__, wdata->task.tk_status,
++			wdata->verf.committed);
++	} else
++		dprintk("%s: Return status %d\n",
++			__func__, wdata->task.tk_status);
++	objlayout_iodone(state);
++	/* must not use state after this point */
++
++	if (sync)
++		pnfs_writeback_done(wdata);
++	else {
++		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
++		schedule_work(&wdata->task.u.tk_work);
++	}
 +}
 +
-+/* For use by DLM cluster file systems exported by pNFSD */
-+const struct pnfs_export_operations pnfs_dlm_export_ops = {
-+	.layout_type = nfsd4_pnfs_dlm_layouttype,
-+	.get_device_info = nfsd4_pnfs_dlm_getdevinfo,
-+	.get_device_iter = nfsd4_pnfs_dlm_getdeviter,
-+	.layout_get = nfsd4_pnfs_dlm_layoutget,
-+};
-+EXPORT_SYMBOL(pnfs_dlm_export_ops);
-diff --git a/fs/nfsd/nfs4pnfsds.c b/fs/nfsd/nfs4pnfsds.c
-new file mode 100644
-index 0000000..8ebc64d
---- /dev/null
-+++ b/fs/nfsd/nfs4pnfsds.c
-@@ -0,0 +1,620 @@
 +/*
-+*  linux/fs/nfsd/nfs4pnfsds.c
-+*
-+*  Copyright (c) 2005 The Regents of the University of Michigan.
-+*  All rights reserved.
-+*
-+*  Andy Adamson <andros at umich.edu>
-+*
-+*  Redistribution and use in source and binary forms, with or without
-+*  modification, are permitted provided that the following conditions
-+*  are met:
-+*
-+*  1. Redistributions of source code must retain the above copyright
-+*     notice, this list of conditions and the following disclaimer.
-+*  2. Redistributions in binary form must reproduce the above copyright
-+*     notice, this list of conditions and the following disclaimer in the
-+*     documentation and/or other materials provided with the distribution.
-+*  3. Neither the name of the University nor the names of its
-+*     contributors may be used to endorse or promote products derived
-+*     from this software without specific prior written permission.
-+*
-+*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+*  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+*  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+*  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+*  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+*  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+*  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+*  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*
-+*/
-+#if defined(CONFIG_PNFSD)
++ * Perform sync or async writes.
++ */
++enum pnfs_try_status
++objlayout_write_pagelist(struct nfs_write_data *wdata,
++			 unsigned nr_pages,
++			 int how)
++{
++	struct objlayout_io_state *state;
++	ssize_t status;
 +
-+#define NFSDDBG_FACILITY NFSDDBG_PNFS
++	dprintk("%s: Begin inode %p offset %llu count %u\n",
++		__func__, wdata->inode, wdata->args.offset, wdata->args.count);
 +
-+#include <linux/param.h>
-+#include <linux/sunrpc/svc.h>
-+#include <linux/sunrpc/debug.h>
-+#include <linux/nfs4.h>
-+#include <linux/exportfs.h>
-+#include <linux/sched.h>
++	state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
++					 wdata->args.pages,
++					 wdata->args.pgbase,
++					 nr_pages,
++					 wdata->args.offset,
++					 wdata->args.count,
++					 wdata->pdata.lseg, wdata);
++	if (unlikely(!state)) {
++		status = -ENOMEM;
++		goto out;
++	}
 +
-+#include "nfsd.h"
-+#include "pnfsd.h"
-+#include "state.h"
++	state->sync = how & FLUSH_SYNC;
 +
-+/*
-+ *******************
-+ *   	 PNFS
-+ *******************
-+ */
-+/*
-+ * Hash tables for pNFS Data Server state
-+ *
-+ * mds_nodeid:	list of struct pnfs_mds_id one per Metadata server (MDS) using
-+ *		this data server (DS).
-+ *
-+ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained
-+ *			from any MDS.
-+ *
-+ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained
-+ *			from any MDS.
-+ *
-+ */
-+/* Hash tables for clientid state */
-+#define CLIENT_HASH_BITS                 4
-+#define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS)
-+#define CLIENT_HASH_MASK                (CLIENT_HASH_SIZE - 1)
++	status = objio_write_pagelist(state, how & FLUSH_STABLE);
++ out:
++	dprintk("%s: Return status %Zd\n", __func__, status);
++	wdata->pdata.pnfs_error = status;
++	return PNFS_ATTEMPTED;
++}
 +
-+#define clientid_hashval(id) \
-+	((id) & CLIENT_HASH_MASK)
++void
++objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
++			      struct xdr_stream *xdr,
++			      const struct nfs4_layoutcommit_args *args)
++{
++	struct objlayout *objlay = OBJLAYOUT(pnfslay);
++	struct pnfs_osd_layoutupdate lou;
++	__be32 *start;
 +
-+/* hash table for pnfs_ds_stateid */
-+#define STATEID_HASH_BITS              10
-+#define STATEID_HASH_SIZE              (1 << STATEID_HASH_BITS)
-+#define STATEID_HASH_MASK              (STATEID_HASH_SIZE - 1)
++	dprintk("%s: Begin\n", __func__);
 +
-+#define stateid_hashval(owner_id, file_id)  \
-+	(((owner_id) + (file_id)) & STATEID_HASH_MASK)
++	spin_lock(&objlay->lock);
++	lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
++	lou.dsu_delta = objlay->delta_space_used;
++	objlay->delta_space_used = 0;
++	objlay->delta_space_valid = OBJ_DSU_INIT;
++	lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
++	spin_unlock(&objlay->lock);
 +
-+static struct list_head mds_id_tbl;
-+static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE];
-+static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE];
++	start = xdr_reserve_space(xdr, 4);
 +
-+static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp);
-+static inline void put_ds_mdsid(struct pnfs_mds_id *mdp);
++	BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
 +
-+/* Mutex for data server state.  Needs to be separate from
-+ * mds state mutex since a node can be both mds and ds */
-+static DEFINE_MUTEX(ds_mutex);
-+static struct thread_info *ds_mutex_owner;
++	*start = cpu_to_be32((xdr->p - start - 1) * 4);
 +
-+static void
-+ds_lock_state(void)
++	dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
++		lou.dsu_delta, lou.olu_ioerr_flag);
++}
++
++static int
++err_prio(u32 oer_errno)
 +{
-+	mutex_lock(&ds_mutex);
-+	ds_mutex_owner = current_thread_info();
++	switch (oer_errno) {
++	case 0:
++		return 0;
++
++	case PNFS_OSD_ERR_RESOURCE:
++		return OSD_ERR_PRI_RESOURCE;
++	case PNFS_OSD_ERR_BAD_CRED:
++		return OSD_ERR_PRI_BAD_CRED;
++	case PNFS_OSD_ERR_NO_ACCESS:
++		return OSD_ERR_PRI_NO_ACCESS;
++	case PNFS_OSD_ERR_UNREACHABLE:
++		return OSD_ERR_PRI_UNREACHABLE;
++	case PNFS_OSD_ERR_NOT_FOUND:
++		return OSD_ERR_PRI_NOT_FOUND;
++	case PNFS_OSD_ERR_NO_SPACE:
++		return OSD_ERR_PRI_NO_SPACE;
++	default:
++		WARN_ON(1);
++		/* fallthrough */
++	case PNFS_OSD_ERR_EIO:
++		return OSD_ERR_PRI_EIO;
++	}
 +}
 +
 +static void
-+ds_unlock_state(void)
++merge_ioerr(struct pnfs_osd_ioerr *dest_err,
++	    const struct pnfs_osd_ioerr *src_err)
 +{
-+	BUG_ON(ds_mutex_owner != current_thread_info());
-+	ds_mutex_owner = NULL;
-+	mutex_unlock(&ds_mutex);
-+}
++	u64 dest_end, src_end;
 +
-+static int
-+cmp_clid(const clientid_t *cl1, const clientid_t *cl2)
-+{
-+	return (cl1->cl_boot == cl2->cl_boot) &&
-+	       (cl1->cl_id == cl2->cl_id);
-+}
++	if (!dest_err->oer_errno) {
++		*dest_err = *src_err;
++		/* accumulated device must be blank */
++		memset(&dest_err->oer_component.oid_device_id, 0,
++			sizeof(dest_err->oer_component.oid_device_id));
 +
-+void
-+nfs4_pnfs_state_init(void)
-+{
-+	int i;
++		return;
++	}
 +
-+	for (i = 0; i < CLIENT_HASH_SIZE; i++)
-+		INIT_LIST_HEAD(&mds_clid_hashtbl[i]);
++	if (dest_err->oer_component.oid_partition_id !=
++				src_err->oer_component.oid_partition_id)
++		dest_err->oer_component.oid_partition_id = 0;
 +
-+	for (i = 0; i < STATEID_HASH_SIZE; i++)
-+		INIT_LIST_HEAD(&ds_stid_hashtbl[i]);
++	if (dest_err->oer_component.oid_object_id !=
++				src_err->oer_component.oid_object_id)
++		dest_err->oer_component.oid_object_id = 0;
 +
-+	INIT_LIST_HEAD(&mds_id_tbl);
-+}
++	if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
++		dest_err->oer_comp_offset = src_err->oer_comp_offset;
 +
-+static struct pnfs_mds_id *
-+find_pnfs_mds_id(u32 mdsid)
-+{
-+	struct pnfs_mds_id *local = NULL;
++	dest_end = end_offset(dest_err->oer_comp_offset,
++			      dest_err->oer_comp_length);
++	src_end =  end_offset(src_err->oer_comp_offset,
++			      src_err->oer_comp_length);
++	if (dest_end < src_end)
++		dest_end = src_end;
 +
-+	dprintk("pNFSD: %s\n", __func__);
-+	list_for_each_entry(local, &mds_id_tbl, di_hash) {
-+		if (local->di_mdsid == mdsid)
-+			return local;
++	dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
++
++	if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
++	    (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
++			dest_err->oer_errno = src_err->oer_errno;
++	} else if (src_err->oer_iswrite) {
++		dest_err->oer_iswrite = true;
++		dest_err->oer_errno = src_err->oer_errno;
 +	}
-+	return NULL;
 +}
 +
-+static struct pnfs_ds_clientid *
-+find_pnfs_ds_clientid(const clientid_t *clid)
++static void
++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr)
 +{
-+	struct pnfs_ds_clientid *local = NULL;
-+	unsigned int hashval;
++	struct objlayout_io_state *state, *tmp;
++	struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
 +
-+	dprintk("pNFSD: %s\n", __func__);
++	list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
++		unsigned i;
 +
-+	hashval = clientid_hashval(clid->cl_id);
-+	list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) {
-+		if (cmp_clid(&local->dc_mdsclid, clid))
-+			return local;
-+	}
-+	return NULL;
-+}
++		for (i = 0; i < state->num_comps; i++) {
++			struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
 +
-+static struct pnfs_ds_stateid *
-+find_pnfs_ds_stateid(stateid_t *stid)
-+{
-+	struct pnfs_ds_stateid *local = NULL;
-+	u32 st_id = stid->si_stateownerid;
-+	u32 f_id = stid->si_fileid;
-+	unsigned int hashval;
++			if (!ioerr->oer_errno)
++				continue;
 +
-+	dprintk("pNFSD: %s\n", __func__);
++			printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
++				"dev(%llx:%llx) par=0x%llx obj=0x%llx "
++				"offset=0x%llx length=0x%llx\n",
++				__func__, i, ioerr->oer_errno,
++				ioerr->oer_iswrite,
++				_DEVID_LO(&ioerr->oer_component.oid_device_id),
++				_DEVID_HI(&ioerr->oer_component.oid_device_id),
++				ioerr->oer_component.oid_partition_id,
++				ioerr->oer_component.oid_object_id,
++				ioerr->oer_comp_offset,
++				ioerr->oer_comp_length);
 +
-+	hashval = stateid_hashval(st_id, f_id);
-+	list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash)
-+		if ((local->ds_stid.si_stateownerid == st_id) &&
-+				(local->ds_stid.si_fileid == f_id) &&
-+				(local->ds_stid.si_boot == stid->si_boot)) {
-+			stateid_t *sid = &local->ds_stid;
-+			dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n",
-+				__func__, local, local->ds_flags,
-+				STATEID_VAL(sid));
-+			return local;
++			merge_ioerr(&accumulated_err, ioerr);
 +		}
-+	return NULL;
++		list_del(&state->err_list);
++		objlayout_free_io_state(state);
++	}
++
++	BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err));
 +}
 +
-+static void
-+release_ds_mdsid(struct kref *kref)
++void
++objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
++			      struct xdr_stream *xdr,
++			      const struct nfs4_layoutreturn_args *args)
 +{
-+	struct pnfs_mds_id *mdp =
-+		container_of(kref, struct pnfs_mds_id, di_ref);
-+	dprintk("pNFSD: %s\n", __func__);
++	struct objlayout *objlay = OBJLAYOUT(pnfslay);
++	struct objlayout_io_state *state, *tmp;
++	__be32 *start, *uninitialized_var(last_xdr);
 +
-+	list_del(&mdp->di_hash);
-+	list_del(&mdp->di_mdsclid);
-+	kfree(mdp);
-+}
++	dprintk("%s: Begin\n", __func__);
++	start = xdr_reserve_space(xdr, 4);
++	BUG_ON(!start);
 +
-+static void
-+release_ds_clientid(struct kref *kref)
-+{
-+	struct pnfs_ds_clientid *dcp =
-+		container_of(kref, struct pnfs_ds_clientid, dc_ref);
-+	struct pnfs_mds_id *mdp;
-+	dprintk("pNFSD: %s\n", __func__);
++	spin_lock(&objlay->lock);
 +
-+	mdp = find_pnfs_mds_id(dcp->dc_mdsid);
-+	if (mdp)
-+		put_ds_mdsid(mdp);
++	list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
++		unsigned i;
++		int res = 0;
 +
-+	list_del(&dcp->dc_hash);
-+	list_del(&dcp->dc_stateid);
-+	list_del(&dcp->dc_permdsid);
-+	kfree(dcp);
-+}
++		for (i = 0; i < state->num_comps && !res; i++) {
++			struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
 +
-+static void
-+release_ds_stateid(struct kref *kref)
-+{
-+	struct pnfs_ds_stateid *dsp =
-+		container_of(kref, struct pnfs_ds_stateid, ds_ref);
-+	struct pnfs_ds_clientid *dcp;
-+	dprintk("pNFS %s: dsp %p\n", __func__, dsp);
++			if (!ioerr->oer_errno)
++				continue;
 +
-+	dcp = find_pnfs_ds_clientid(&dsp->ds_mdsclid);
-+	if (dcp)
-+		put_ds_clientid(dcp);
++			dprintk("%s: err[%d]: errno=%d is_write=%d "
++				"dev(%llx:%llx) par=0x%llx obj=0x%llx "
++				"offset=0x%llx length=0x%llx\n",
++				__func__, i, ioerr->oer_errno,
++				ioerr->oer_iswrite,
++				_DEVID_LO(&ioerr->oer_component.oid_device_id),
++				_DEVID_HI(&ioerr->oer_component.oid_device_id),
++				ioerr->oer_component.oid_partition_id,
++				ioerr->oer_component.oid_object_id,
++				ioerr->oer_comp_offset,
++				ioerr->oer_comp_length);
 +
-+	list_del(&dsp->ds_hash);
-+	list_del(&dsp->ds_perclid);
-+	kfree(dsp);
-+}
++			last_xdr = xdr->p;
++			res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]);
++		}
++		if (unlikely(res)) {
++			/* no space for even one error descriptor */
++			BUG_ON(last_xdr == start + 1);
 +
-+static inline void
-+put_ds_clientid(struct pnfs_ds_clientid *dcp)
-+{
-+	dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
-+		atomic_read(&dcp->dc_ref.refcount));
-+	kref_put(&dcp->dc_ref, release_ds_clientid);
-+}
++			/* we've encountered a situation with lots and lots of
++			 * errors and no space to encode them all. Use the last
++			 * available slot to report the union of all the
++			 * remaining errors.
++			 */
++			xdr_rewind_stream(xdr, last_xdr -
++					       pnfs_osd_ioerr_xdr_sz() / 4);
++			encode_accumulated_error(objlay, xdr);
++			goto loop_done;
++		}
++		list_del(&state->err_list);
++		objlayout_free_io_state(state);
++	}
++loop_done:
++	spin_unlock(&objlay->lock);
 +
-+static inline void
-+get_ds_clientid(struct pnfs_ds_clientid *dcp)
-+{
-+	dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
-+		atomic_read(&dcp->dc_ref.refcount));
-+	kref_get(&dcp->dc_ref);
++	*start = cpu_to_be32((xdr->p - start - 1) * 4);
++	dprintk("%s: Return\n", __func__);
 +}
 +
-+static inline void
-+put_ds_mdsid(struct pnfs_mds_id *mdp)
-+{
-+	dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
-+		atomic_read(&mdp->di_ref.refcount));
-+	kref_put(&mdp->di_ref, release_ds_mdsid);
-+}
++struct objlayout_deviceinfo {
++	struct page *page;
++	struct pnfs_osd_deviceaddr da; /* This must be last */
++};
 +
-+static inline void
-+get_ds_mdsid(struct pnfs_mds_id *mdp)
++/* Initialize and call nfs4_proc_getdeviceinfo, then decode and return a
++ * "struct pnfs_osd_deviceaddr *". Eventually objlayout_put_deviceinfo()
++ * should be called.
++ */
++int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
++	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr)
 +{
-+	dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
-+		atomic_read(&mdp->di_ref.refcount));
-+	kref_get(&mdp->di_ref);
-+}
++	struct objlayout_deviceinfo *odi;
++	struct pnfs_device pd;
++	struct super_block *sb;
++	struct page *page;
++	size_t sz;
++	u32 *p;
++	int err;
 +
-+static inline void
-+put_ds_stateid(struct pnfs_ds_stateid *dsp)
-+{
-+	dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
-+		atomic_read(&dsp->ds_ref.refcount));
-+	kref_put(&dsp->ds_ref, release_ds_stateid);
-+}
++	page = alloc_page(GFP_KERNEL);
++	if (!page)
++		return -ENOMEM;
 +
-+static inline void
-+get_ds_stateid(struct pnfs_ds_stateid *dsp)
-+{
-+	dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
-+		atomic_read(&dsp->ds_ref.refcount));
-+	kref_get(&dsp->ds_ref);
-+}
++	pd.area = page_address(page);
 +
-+void
-+nfs4_pnfs_state_shutdown(void)
-+{
-+	struct pnfs_ds_stateid *dsp;
-+	int i;
++	memcpy(&pd.dev_id, d_id, sizeof(*d_id));
++	pd.layout_type = LAYOUT_OSD2_OBJECTS;
++	pd.pages = &page;
++	pd.pgbase = 0;
++	pd.pglen = PAGE_SIZE;
++	pd.mincount = 0;
 +
-+	dprintk("pNFSD %s: -->\n", __func__);
++	sb = pnfslay->inode->i_sb;
++	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->inode), &pd);
++	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
++	if (err)
++		goto err_out;
 +
-+	ds_lock_state();
-+	for (i = 0; i < STATEID_HASH_SIZE; i++) {
-+		while (!list_empty(&ds_stid_hashtbl[i])) {
-+			dsp = list_entry(ds_stid_hashtbl[i].next,
-+					 struct pnfs_ds_stateid, ds_hash);
-+			put_ds_stateid(dsp);
-+		}
++	p = pd.area;
++	sz = pnfs_osd_xdr_deviceaddr_incore_sz(p);
++	odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL);
++	if (!odi) {
++		err = -ENOMEM;
++		goto err_out;
 +	}
-+	ds_unlock_state();
++	pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
++	odi->page = page;
++	*deviceaddr = &odi->da;
++	return 0;
++
++err_out:
++	__free_page(page);
++	return err;
 +}
 +
-+static struct pnfs_mds_id *
-+alloc_init_mds_id(struct pnfs_get_state *gsp)
++void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
 +{
-+	struct pnfs_mds_id *mdp;
-+
-+	dprintk("pNFSD: %s\n", __func__);
++	struct objlayout_deviceinfo *odi = container_of(deviceaddr,
++						struct objlayout_deviceinfo,
++						da);
 +
-+	mdp = kmalloc(sizeof(*mdp), GFP_KERNEL);
-+	if (!mdp)
-+		return NULL;
-+	INIT_LIST_HEAD(&mdp->di_hash);
-+	INIT_LIST_HEAD(&mdp->di_mdsclid);
-+	list_add(&mdp->di_hash, &mds_id_tbl);
-+	mdp->di_mdsid = gsp->dsid;
-+	mdp->di_mdsboot = 0;
-+	kref_init(&mdp->di_ref);
-+	return mdp;
++	__free_page(odi->page);
++	kfree(odi);
 +}
 +
-+static struct pnfs_ds_clientid *
-+alloc_init_ds_clientid(struct pnfs_get_state *gsp)
++/*
++ * Perform the objio specific init_mt method.
++ * Set the layout driver private data pointer for later use.
++ */
++int
++objlayout_set_layoutdriver(struct nfs_server *server,
++			   const struct nfs_fh *mntfh)
 +{
-+	struct pnfs_mds_id *mdp;
-+	struct pnfs_ds_clientid *dcp;
-+	clientid_t *clid = (clientid_t *)&gsp->clid;
-+	unsigned int hashval = clientid_hashval(clid->cl_id);
-+
-+	dprintk("pNFSD: %s\n", __func__);
++	void *data;
 +
-+	mdp = find_pnfs_mds_id(gsp->dsid);
-+	if (!mdp) {
-+		mdp = alloc_init_mds_id(gsp);
-+		if (!mdp)
-+			return NULL;
-+	} else {
-+		get_ds_mdsid(mdp);
++	data = objio_init_mt();
++	if (IS_ERR(data)) {
++		printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n",
++		       __func__, PTR_ERR(data));
++		return PTR_ERR(data);
 +	}
++	server->pnfs_ld_data = data;
 +
-+	dcp = kmalloc(sizeof(*dcp), GFP_KERNEL);
-+	if (!dcp)
-+		return NULL;
-+
-+	INIT_LIST_HEAD(&dcp->dc_hash);
-+	INIT_LIST_HEAD(&dcp->dc_stateid);
-+	INIT_LIST_HEAD(&dcp->dc_permdsid);
-+	list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]);
-+	list_add(&dcp->dc_permdsid, &mdp->di_mdsclid);
-+	dcp->dc_mdsclid = *clid;
-+	kref_init(&dcp->dc_ref);
-+	dcp->dc_mdsid = gsp->dsid;
-+	return dcp;
++	dprintk("%s: Return data=%p\n", __func__, data);
++	return 0;
 +}
 +
-+static struct pnfs_ds_stateid *
-+alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp)
++/*
++ * Perform the objio specific fini_mt method to release the
++ * layoutdriver private data.
++ */
++int
++objlayout_clear_layoutdriver(struct nfs_server *server)
 +{
-+	struct pnfs_ds_stateid *dsp;
-+	u32 st_id = stidp->si_stateownerid;
-+	u32 f_id  = stidp->si_fileid;
-+	unsigned int hashval;
-+
-+	dprintk("pNFSD: %s\n", __func__);
++	dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data);
++	objio_fini_mt(server->pnfs_ld_data);
++	return 0;
++}
+diff -up linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.h
+--- linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.h.orig	2011-01-28 09:43:53.334772561 -0500
++++ linux-2.6.37.noarch/fs/nfs/objlayout/objlayout.h	2011-01-28 09:43:53.334772561 -0500
+@@ -0,0 +1,206 @@
++/*
++ *  objlayout.h
++ *
++ *  Data types and function declarations for interfacing with the
++ *  pNFS standard object layout driver.
++ *
++ *  Copyright (C) 2007-2009 Panasas Inc.
++ *  All rights reserved.
++ *
++ *  Benny Halevy <bhalevy at panasas.com>
++ *  Boaz Harrosh <bharrosh at panasas.com>
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License version 2
++ *  See the file COPYING included with this distribution for more details.
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the Panasas company nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
 +
-+	dsp = kmalloc(sizeof(*dsp), GFP_KERNEL);
-+	if (!dsp)
-+		return dsp;
++#ifndef _OBJLAYOUT_H
++#define _OBJLAYOUT_H
 +
-+	INIT_LIST_HEAD(&dsp->ds_hash);
-+	INIT_LIST_HEAD(&dsp->ds_perclid);
-+	memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t));
-+	fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle);
-+	dsp->ds_access = 0;
-+	dsp->ds_status = 0;
-+	dsp->ds_flags = 0L;
-+	kref_init(&dsp->ds_ref);
-+	set_bit(DS_STATEID_NEW, &dsp->ds_flags);
-+	clear_bit(DS_STATEID_VALID, &dsp->ds_flags);
-+	clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
-+	init_waitqueue_head(&dsp->ds_waitq);
++#include <linux/nfs_fs.h>
++#include <linux/pnfs_osd_xdr.h>
++#include "../pnfs.h"
 +
-+	hashval = stateid_hashval(st_id, f_id);
-+	list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]);
-+	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
-+	return dsp;
-+}
++/*
++ * in-core layout segment
++ */
++struct objlayout_segment {
++	struct pnfs_layout_segment lseg;
++	void *internal;    /* for provider internal use */
++	u8 pnfs_osd_layout[];
++};
 +
-+static int
-+update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh,
-+		  struct pnfs_get_state *gsp)
-+{
-+	struct pnfs_ds_clientid *dcp;
-+	int new = 0;
++/*
++ * per-inode layout
++ */
++struct objlayout {
++	struct pnfs_layout_hdr pnfs_layout;
 +
-+	dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++	 /* for layout_commit */
++	enum osd_delta_space_valid_enum {
++		OBJ_DSU_INIT = 0,
++		OBJ_DSU_VALID,
++		OBJ_DSU_INVALID,
++	} delta_space_valid;
++	s64 delta_space_used;  /* consumed by write ops */
 +
-+	dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid);
-+	if (!dcp) {
-+		dcp = alloc_init_ds_clientid(gsp);
-+		if (!dcp)
-+			return 1;
-+		new = 1;
-+	}
-+	if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) {
-+		list_add(&dsp->ds_perclid, &dcp->dc_stateid);
-+		if (!new)
-+			get_ds_clientid(dcp);
-+	}
++	 /* for layout_return */
++	spinlock_t lock;
++	struct list_head err_list;
++};
 +
-+	memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t));
-+	dsp->ds_access = gsp->access;
-+	dsp->ds_status = 0;
-+	dsp->ds_verifier[0] = gsp->verifier[0];
-+	dsp->ds_verifier[1] = gsp->verifier[1];
-+	memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t));
-+	set_bit(DS_STATEID_VALID, &dsp->ds_flags);
-+	clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
-+	clear_bit(DS_STATEID_NEW, &dsp->ds_flags);
-+	return 0;
++static inline struct objlayout *
++OBJLAYOUT(struct pnfs_layout_hdr *lo)
++{
++	return container_of(lo, struct objlayout, pnfs_layout);
 +}
 +
-+int
-+nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs)
-+{
-+	stateid_t *stid = (stateid_t *)&gs->stid;
-+	struct pnfs_ds_stateid *dsp;
++/*
++ * per-I/O operation state
++ * embedded in objects provider io_state data structure
++ */
++struct objlayout_io_state {
++	struct objlayout_segment *objlseg;
 +
-+	dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__,
-+		STATEID_VAL(stid));
++	struct page **pages;
++	unsigned pgbase;
++	unsigned nr_pages;
++	unsigned long count;
++	loff_t offset;
++	bool sync;
 +
-+	ds_lock_state();
-+	dsp = find_pnfs_ds_stateid(stid);
-+	if (dsp)
-+		put_ds_stateid(dsp);
-+	ds_unlock_state();
++	void *rpcdata;
++	int status;             /* res */
++	int eof;                /* res */
++	int committed;          /* res */
 +
-+	dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++	/* Error reporting (layout_return) */
++	struct list_head err_list;
++	unsigned num_comps;
++	/* Pointer to array of error descriptors of size num_comps.
++	 * It should contain as many entries as devices in the osd_layout
++	 * that participate in the I/O. It is up to the io_engine to allocate
++	 * needed space and set num_comps.
++	 */
++	struct pnfs_osd_ioerr *ioerrs;
++};
 +
-+	if (dsp)
-+		return 0;
-+	return -ENOENT;
-+}
++/*
++ * Raid engine I/O API
++ */
++extern void *objio_init_mt(void);
++extern void objio_fini_mt(void *mt);
 +
-+/* Retrieves and validates stateid.
-+ * If stateid exists and its fields match, return it.
-+ * If stateid exists but either the generation or
-+ * ownerids don't match, check with mds to see if it is valid.
-+ * If the stateid doesn't exist, the first thread creates a
-+ * invalid *marker* stateid, then checks to see if the
-+ * stateid exists on the mds.  If so, it validates the *marker*
-+ * stateid and updates its fields.  Subsequent threads that
-+ * find the *marker* stateid wait until it is valid or an error
-+ * occurs.
-+ * Called with ds_state_lock.
++extern int objio_alloc_lseg(void **outp,
++	struct pnfs_layout_hdr *pnfslay,
++	struct pnfs_layout_segment *lseg,
++	struct pnfs_osd_layout *layout);
++extern void objio_free_lseg(void *p);
++
++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp);
++extern void objio_free_io_state(struct objlayout_io_state *state);
++
++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
++				    bool stable);
++
++/*
++ * callback API
 + */
-+static struct pnfs_ds_stateid *
-+nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
++extern void objlayout_io_set_result(struct objlayout_io_state *state,
++				    unsigned index, int osd_error,
++				    u64 offset, u64 length, bool is_write);
++
++static inline void
++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
 +{
-+	struct inode *ino = cfh->fh_dentry->d_inode;
-+	struct super_block *sb;
-+	struct pnfs_ds_stateid *dsp = NULL;
-+	struct pnfs_get_state gs = {
-+		.access = 0,
-+	};
-+	int status = 0, waiter = 0;
++	struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.layout);
 +
-+	dprintk("pNFSD: %s -->\n", __func__);
++	/* If one of the I/Os errored out and the delta_space_used was
++	 * invalid, we render the complete report as invalid. The protocol
++	 * mandates that the DSU be accurate or not reported at all.
++	 */
++	spin_lock(&objlay->lock);
++	if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
++		objlay->delta_space_valid = OBJ_DSU_VALID;
++		objlay->delta_space_used += space_used;
++	}
++	spin_unlock(&objlay->lock);
++}
 +
-+	dsp = find_pnfs_ds_stateid(stidp);
-+	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
-+	    (stidp->si_generation == dsp->ds_stid.si_generation))
-+		goto out_noput;
++extern void objlayout_read_done(struct objlayout_io_state *state,
++				ssize_t status, bool sync);
++extern void objlayout_write_done(struct objlayout_io_state *state,
++				 ssize_t status, bool sync);
 +
-+	sb = ino->i_sb;
-+	if (!sb || !sb->s_pnfs_op->get_state)
-+		goto out_noput;
++extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
++	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr);
++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
 +
-+	/* Uninitialize current state if it exists yet it doesn't match.
-+	 * If it is already invalid, another thread is checking state */
-+	if (dsp) {
-+		if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags))
-+			waiter = 1;
-+	} else {
-+		dsp = alloc_init_ds_stateid(cfh, stidp);
-+		if (!dsp)
-+			goto out_noput;
-+	}
++/*
++ * exported generic objects function vectors
++ */
 +
-+	dprintk("pNFSD: %s Starting loop\n", __func__);
-+	get_ds_stateid(dsp);
-+	while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
-+		ds_unlock_state();
++extern int objlayout_set_layoutdriver(
++	struct nfs_server *,
++	const struct nfs_fh *);
++extern int objlayout_clear_layoutdriver(struct nfs_server *);
 +
-+		/* Another thread is checking the state */
-+		if (waiter) {
-+			dprintk("pNFSD: %s waiting\n", __func__);
-+			wait_event_interruptible_timeout(dsp->ds_waitq,
-+				(test_bit(DS_STATEID_VALID, &dsp->ds_flags) ||
-+				 test_bit(DS_STATEID_ERROR, &dsp->ds_flags)),
-+				 msecs_to_jiffies(1024));
-+			dprintk("pNFSD: %s awake\n", __func__);
-+			ds_lock_state();
-+			if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
-+				goto out;
++extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *);
++extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
 +
-+			continue;
-+		}
++extern struct pnfs_layout_segment *objlayout_alloc_lseg(
++	struct pnfs_layout_hdr *,
++	struct nfs4_layoutget_res *);
++extern void objlayout_free_lseg(struct pnfs_layout_segment *);
 +
-+		/* Validate stateid on mds */
-+		dprintk("pNFSD: %s Checking state on MDS\n", __func__);
-+		memcpy(&gs.stid, stidp, sizeof(stateid_t));
-+		status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs);
-+		dprintk("pNFSD: %s from MDS status %d\n", __func__, status);
-+		ds_lock_state();
-+		/* if !status and stateid is valid, update id and mark valid */
-+		if (status || update_ds_stateid(dsp, cfh, &gs)) {
-+			set_bit(DS_STATEID_ERROR, &dsp->ds_flags);
-+			/* remove invalid stateid from list */
-+			put_ds_stateid(dsp);
-+			wake_up(&dsp->ds_waitq);
-+			goto out;
-+		}
++extern enum pnfs_try_status objlayout_read_pagelist(
++	struct nfs_read_data *,
++	unsigned nr_pages);
 +
-+		wake_up(&dsp->ds_waitq);
-+	}
-+out:
-+	if (dsp)
-+		put_ds_stateid(dsp);
-+out_noput:
-+	if (dsp)
-+		dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n",
-+			__func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid));
-+	/* If error, return null */
-+	if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
-+		dsp = NULL;
-+	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
-+	return dsp;
-+}
++extern enum pnfs_try_status objlayout_write_pagelist(
++	struct nfs_write_data *,
++	unsigned nr_pages,
++	int how);
 +
-+int
-+nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid)
-+{
-+	struct pnfs_ds_stateid *dsp;
-+	int status = 0;
++extern enum pnfs_try_status objlayout_commit(
++	struct nfs_write_data *,
++	int how);
 +
-+	dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__,
-+		STATEID_VAL(stateid));
++extern void objlayout_encode_layoutcommit(
++	struct pnfs_layout_hdr *,
++	struct xdr_stream *,
++	const struct nfs4_layoutcommit_args *);
 +
-+	/* Must release state lock while verifying stateid on mds */
-+	nfs4_unlock_state();
-+	ds_lock_state();
-+	dsp = nfsv4_ds_get_state(cfh, stateid);
-+	if (dsp) {
-+		get_ds_stateid(dsp);
-+		dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__,
-+			STATEID_VAL(&dsp->ds_stid));
++extern void objlayout_encode_layoutreturn(
++	struct pnfs_layout_hdr *,
++	struct xdr_stream *,
++	const struct nfs4_layoutreturn_args *);
 +
-+		dprintk("NFSD: %s: dsp %p fh_size %u:%u "
-+			"fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] "
-+			"gen %x:%x\n",
-+			__func__, dsp,
-+			cfh->fh_handle.fh_size, dsp->ds_fh.fh_size,
-+			((unsigned *)&cfh->fh_handle.fh_base)[0],
-+			((unsigned *)&cfh->fh_handle.fh_base)[1],
-+			((unsigned *)&cfh->fh_handle.fh_base)[2],
-+			((unsigned *)&cfh->fh_handle.fh_base)[3],
-+			((unsigned *)&dsp->ds_fh.fh_base)[0],
-+			((unsigned *)&dsp->ds_fh.fh_base)[1],
-+			((unsigned *)&dsp->ds_fh.fh_base)[2],
-+			((unsigned *)&dsp->ds_fh.fh_base)[3],
-+			stateid->si_generation, dsp->ds_stid.si_generation);
-+	}
++#endif /* _OBJLAYOUT_H */
+diff -up linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.c
+--- linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.c.orig	2011-01-28 09:43:53.335772417 -0500
++++ linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.c	2011-01-28 09:43:53.335772417 -0500
+@@ -0,0 +1,702 @@
++/*
++ *  panfs_shim.c
++ *
++ *  Shim layer for interfacing with the Panasas DirectFlow module I/O stack
++ *
++ *  Copyright (C) 2007-2009 Panasas Inc.
++ *  All rights reserved.
++ *
++ *  Benny Halevy <bhalevy at panasas.com>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the Panasas company nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ * See the file COPYING included with this distribution for more details.
++ *
++ */
 +
-+	if (!dsp ||
-+	    (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) ||
-+	    (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base,
-+		    dsp->ds_fh.fh_size) != 0) ||
-+	    (stateid->si_generation > dsp->ds_stid.si_generation))
-+		status = nfserr_bad_stateid;
-+	else if (stateid->si_generation < dsp->ds_stid.si_generation)
-+		status = nfserr_old_stateid;
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <asm/byteorder.h>
 +
-+	if (dsp)
-+		put_ds_stateid(dsp);
-+	ds_unlock_state();
-+	nfs4_lock_state();
-+	dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status));
-+	return status;
-+}
++#include "objlayout.h"
++#include "panfs_shim.h"
 +
-+void
-+nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p)
-+{
-+	struct pnfs_ds_stateid *dsp = NULL;
++#include <linux/panfs_shim_api.h>
 +
-+	dprintk("pNFSD: %s --> stid %p\n", __func__, stateid);
++#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 +
-+	ds_lock_state();
-+	if (stateid != NULL) {
-+		dsp = find_pnfs_ds_stateid(stateid);
-+		if (dsp)
-+			get_ds_stateid(dsp);
-+	}
++struct panfs_export_operations *panfs_export_ops;
 +
-+	/* XXX: Should we fetch the stateid or wait if some other
-+	 * thread is currently retrieving the stateid ? */
-+	if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
-+		*p++ = dsp->ds_verifier[0];
-+		*p++ = dsp->ds_verifier[1];
-+		put_ds_stateid(dsp);
-+	} else {
-+		/* must be on MDS */
-+		ds_unlock_state();
-+		sb->s_pnfs_op->get_verifier(sb, p);
-+		ds_lock_state();
-+		p += 2;
-+	}
-+	ds_unlock_state();
-+	dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
-+	return;
++void *
++objio_init_mt(void)
++{
++	return panfs_export_ops == NULL ? ERR_PTR(-EAGAIN) : NULL;
 +}
 +
-+#endif /* CONFIG_PNFSD */
-diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
-index 59ec449..00cbf11 100644
---- a/fs/nfsd/nfs4proc.c
-+++ b/fs/nfsd/nfs4proc.c
-@@ -34,10 +34,14 @@
-  */
- #include <linux/file.h>
- #include <linux/slab.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
-+#include <linux/nfsd4_spnfs.h>
-+#include <linux/nfsd4_block.h>
- 
- #include "cache.h"
- #include "xdr4.h"
- #include "vfs.h"
-+#include "pnfsd.h"
- 
- #define NFSDDBG_FACILITY		NFSDDBG_PROC
- 
-@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- 	 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
- 	 */
- 	status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
-+#if defined(CONFIG_SPNFS)
-+	if (!status && spnfs_enabled()) {
-+		struct inode *inode = cstate->current_fh.fh_dentry->d_inode;
-+
-+		status = spnfs_open(inode, open);
-+		if (status) {
-+			dprintk(
-+			     "nfsd: pNFS could not be enabled for inode: %lu\n",
-+			     inode->i_ino);
-+			/*
-+			 * XXX When there's a failure then need to indicate to
-+			 * future ops that no pNFS is available.  Should I save
-+			 * the status in the inode?  It's kind of a big hammer.
-+			 * But there may be no stripes available?
-+			 */
-+		}
-+	}
-+#endif /* CONFIG_SPNFS */
- out:
- 	if (open->op_stateowner) {
- 		nfs4_get_stateowner(open->op_stateowner);
-@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- 			   &access->ac_supported);
- }
- 
-+static void
-+nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf)
++void objio_fini_mt(void *mountid)
 +{
-+	u32 *p = (u32 *)verf->data;
-+
-+#if defined(CONFIG_PNFSD)
-+	if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) {
-+		nfs4_ds_get_verifier(NULL, sb, p);
-+		return;
-+	}
-+#endif /* CONFIG_PNFSD */
++}
 +
-+	*p++ = nfssvc_boot.tv_sec;
-+	*p++ = nfssvc_boot.tv_usec;
++static int
++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout,
++		       struct pnfs_osd_data_map *lo_map,
++		       pan_agg_layout_hdr_t *hdr)
++{
++	if (lo_map->odm_mirror_cnt) {
++		hdr->type = PAN_AGG_RAID1;
++		hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1;
++	} else if (layout->olo_num_comps > 1) {
++		hdr->type = PAN_AGG_RAID0;
++		hdr->hdr.raid0.num_comps = layout->olo_num_comps;
++		hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit;
++	} else
++		hdr->type = PAN_AGG_SIMPLE;
++	return 0;
 +}
 +
- static __be32
- nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- 	     struct nfsd4_commit *commit)
- {
- 	__be32 status;
- 
--	u32 *p = (u32 *)commit->co_verf.data;
--	*p++ = nfssvc_boot.tv_sec;
--	*p++ = nfssvc_boot.tv_usec;
--
-+	nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb,
-+			   &commit->co_verf);
- 	status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
- 			     commit->co_count);
- 	if (status == nfserr_symlink)
-@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- {
- 	stateid_t *stateid = &write->wr_stateid;
- 	struct file *filp = NULL;
--	u32 *p;
- 	__be32 status = nfs_ok;
- 	unsigned long cnt;
- 
-@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- 
- 	cnt = write->wr_buflen;
- 	write->wr_how_written = write->wr_stable_how;
--	p = (u32 *)write->wr_verifier.data;
--	*p++ = nfssvc_boot.tv_sec;
--	*p++ = nfssvc_boot.tv_usec;
- 
-+	nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb,
-+			   &write->wr_verifier);
-+#if defined(CONFIG_SPNFS)
-+#if defined(CONFIG_SPNFS_BLOCK)
-+	if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) {
-+                status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode,
-+		    RETURN_FILE, write->wr_offset, write->wr_buflen);
-+                if (!status) {
-+                        status =  nfsd_write(rqstp, &cstate->current_fh, filp,
-+			     write->wr_offset, rqstp->rq_vec, write->wr_vlen,
-+			     &cnt, &write->wr_how_written);
-+                }
-+        } else
-+#endif
-+		
-+	if (spnfs_enabled()) {
-+		status = spnfs_write(cstate->current_fh.fh_dentry->d_inode,
-+			write->wr_offset, write->wr_buflen, write->wr_vlen,
-+			rqstp);
-+		if (status == nfs_ok) {
-+			/* DMXXX: HACK to get filesize set */
-+			/* write one byte at offset+length-1 */
-+			struct kvec k[1];
-+			char zero = 0;
-+			unsigned long cnt = 1;
++static int
++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout,
++		      struct pnfs_osd_data_map *lo_map,
++		      pan_agg_layout_hdr_t *hdr)
++{
++	if (lo_map->odm_mirror_cnt)
++		goto err;
 +
-+			k[0].iov_base = (void *)&zero;
-+			k[0].iov_len = 1;
-+			nfsd_write(rqstp, &cstate->current_fh, filp,
-+				   write->wr_offset+write->wr_buflen-1, k, 1,
-+				   &cnt, &write->wr_how_written);
-+		}
-+	} else /* we're not an MDS */
-+		status =  nfsd_write(rqstp, &cstate->current_fh, filp,
-+			     write->wr_offset, rqstp->rq_vec, write->wr_vlen,
-+			     &cnt, &write->wr_how_written);
-+#else
- 	status =  nfsd_write(rqstp, &cstate->current_fh, filp,
- 			     write->wr_offset, rqstp->rq_vec, write->wr_vlen,
- 			     &cnt, &write->wr_how_written);
-+#endif /* CONFIG_SPNFS */
++	if (lo_map->odm_group_width || lo_map->odm_group_depth) {
++		if (!lo_map->odm_group_width || !lo_map->odm_group_depth)
++			goto err;
 +
- 	if (filp)
- 		fput(filp);
- 
-@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- 	return status == nfserr_same ? nfs_ok : status;
- }
- 
-+#if defined(CONFIG_PNFSD)
++		hdr->type = PAN_AGG_GRP_RAID5_LEFT;
++		hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps;
++		if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps)
++			goto err;
++		hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit;
++		hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width;
++		hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth;
++		/* This is a guess; the Panasas server is not supposed to
++		   hand out a layout otherwise */
++		hdr->hdr.grp_raid5_left.group_layout_policy =
++			PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN;
++	} else {
++		hdr->type = PAN_AGG_RAID5_LEFT;
++		hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps;
++		if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps)
++			goto err;
++		hdr->hdr.raid5_left.stripe_unit2 =
++		hdr->hdr.raid5_left.stripe_unit1 =
++		hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit;
++	}
 +
-+static __be32
-+nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp,
-+		    unsigned int layout_type)
++	return 0;
++err:
++	return -EINVAL;
++}
++
++/*
++ * Convert a pnfs_osd data map into a Panasas aggregation layout header
++ */
++static int
++panfs_shim_conv_pnfs_osd_data_map(
++	struct pnfs_osd_layout *layout,
++	pan_agg_layout_hdr_t *hdr)
 +{
-+	int status, type;
++	int status = -EINVAL;
++	struct pnfs_osd_data_map *lo_map = &layout->olo_map;
 +
-+	/* check to see if pNFS  is supported. */
-+	status = nfserr_layoutunavailable;
-+	if (exp && exp->ex_pnfs == 0) {
-+		dprintk("%s: Underlying file system "
-+			"is not exported over pNFS\n", __func__);
-+		goto out;
++	if (!layout->olo_num_comps) {
++		dprintk("%s: !!layout.n_comps(%u)\n", __func__,
++			layout->olo_num_comps);
++		goto err;
 +	}
-+	if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) {
-+		dprintk("%s: Underlying file system "
-+			"does not support pNFS\n", __func__);
-+		goto out;
++
++	switch (lo_map->odm_raid_algorithm) {
++	case PNFS_OSD_RAID_0:
++		if (layout->olo_num_comps != lo_map->odm_num_comps ||
++		    layout->olo_comps_index) {
++			dprintk("%s: !!PNFS_OSD_RAID_0 "
++				"layout.n_comps(%u) map.n_comps(%u) "
++				"comps_index(%u)\n", __func__,
++				layout->olo_num_comps,
++				lo_map->odm_num_comps,
++				layout->olo_comps_index);
++			goto err;
++		}
++		status = panfs_shim_conv_raid01(layout, lo_map, hdr);
++		break;
++
++	case PNFS_OSD_RAID_5:
++		if (!lo_map->odm_group_width) {
++			if (layout->olo_num_comps != lo_map->odm_num_comps ||
++			    layout->olo_comps_index) {
++				dprintk("%s: !!PNFS_OSD_RAID_5 !group_width "
++					"layout.n_comps(%u)!=map.n_comps(%u) "
++					"|| comps_index(%u)\n", __func__,
++					layout->olo_num_comps,
++					lo_map->odm_num_comps,
++					layout->olo_comps_index);
++				goto err;
++			}
++		} else if ((layout->olo_num_comps != lo_map->odm_num_comps &&
++			    layout->olo_num_comps > lo_map->odm_group_width) ||
++			   (layout->olo_comps_index % lo_map->odm_group_width)){
++				dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) "
++					"layout.n_comps(%u) map.n_comps(%u) "
++					"comps_index(%u)\n", __func__,
++					lo_map->odm_group_width,
++					layout->olo_num_comps,
++					lo_map->odm_num_comps,
++					layout->olo_comps_index);
++				goto err;
++			}
++		status = panfs_shim_conv_raid5(layout, lo_map, hdr);
++		break;
++
++	case PNFS_OSD_RAID_4:
++	case PNFS_OSD_RAID_PQ:
++	default:
++		dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__,
++			lo_map->odm_raid_algorithm);
++		goto err;
 +	}
 +
-+	type = sb->s_pnfs_op->layout_type(sb);
++	return 0;
 +
-+	/* check to see if requested layout type is supported. */
-+	status = nfserr_unknown_layouttype;
-+	if (!type)
-+		dprintk("BUG: %s: layout_type 0 is reserved and must not be "
-+			"used by filesystem\n", __func__);
-+	else if (type != layout_type)
-+		dprintk("%s: requested layout type %d "
-+		       "does not match supported type %d\n",
-+			__func__, layout_type, type);
-+	else
-+		status = nfs_ok;
-+out:
++err:
 +	return status;
 +}
 +
-+static __be32
-+nfsd4_getdevlist(struct svc_rqst *rqstp,
-+		struct nfsd4_compound_state *cstate,
-+		struct nfsd4_pnfs_getdevlist *gdlp)
++/*
++ * Convert pnfs_osd layout into Panasas map and caps type
++ */
++int
++objio_alloc_lseg(void **outp,
++	struct pnfs_layout_hdr *pnfslay,
++	struct pnfs_layout_segment *lseg,
++	struct pnfs_osd_layout *layout)
 +{
-+	struct super_block *sb;
-+	struct svc_fh *current_fh = &cstate->current_fh;
++	int i, total_comps;
 +	int status;
++	struct pnfs_osd_object_cred *lo_comp;
++	pan_size_t alloc_sz, local_sz;
++	pan_sm_map_cap_t *mcs = NULL;
++	u8 *buf;
++	pan_agg_comp_obj_t *pan_comp;
++	pan_sm_sec_t *pan_sec;
 +
-+	dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n",
-+		__func__, gdlp->gd_layout_type, gdlp->gd_maxdevices,
-+		gdlp->gd_cookie, gdlp->gd_verf);
++	status = -EINVAL;
++	if (layout->olo_num_comps < layout->olo_map.odm_group_width) {
++		total_comps = layout->olo_comps_index + layout->olo_num_comps;
++	} else {
++		/* allocate full map, otherwise SAM gets confused */
++		total_comps = layout->olo_map.odm_num_comps;
++	}
++	alloc_sz = total_comps *
++		   (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t));
++	for (i = 0; i < layout->olo_num_comps; i++) {
++		void *p = layout->olo_comps[i].oc_cap.cred;
++		if (panfs_export_ops->sm_sec_t_get_size_otw(
++			(pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL))
++			goto err;
++		alloc_sz += local_sz;
++	}
 +
++	status = -ENOMEM;
++	mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL);
++	if (!mcs)
++		goto err;
++	buf = (u8 *)&mcs[1];
 +
-+	status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
++	mcs->offset = lseg->range.offset;
++	mcs->length = lseg->range.length;
++#if 0
++	/* FIXME: for now */
++	mcs->expiration_time.ts_sec  = 0;
++	mcs->expiration_time.ts_nsec = 0;
++#endif
++	mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL;
++	status = panfs_shim_conv_pnfs_osd_data_map(layout,
++						   &mcs->full_map.layout_hdr);
 +	if (status)
-+		goto out;
++		goto err;
 +
-+	status = nfserr_inval;
-+	sb = current_fh->fh_dentry->d_inode->i_sb;
-+	if (!sb)
-+		goto out;
++	mcs->full_map.components.size = total_comps;
++	mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf;
++	buf += total_comps * sizeof(pan_agg_comp_obj_t);
 +
-+	/* We must be able to encode at list one device */
-+	if (!gdlp->gd_maxdevices)
-+		goto out;
++	mcs->secs.size = total_comps;
++	mcs->secs.data = (pan_sm_sec_t *)buf;
++	buf += total_comps * sizeof(pan_sm_sec_t);
 +
-+	/* Ensure underlying file system supports pNFS and,
-+	 * if so, the requested layout type
-+	 */
-+	status = nfsd4_layout_verify(sb, current_fh->fh_export,
-+				     gdlp->gd_layout_type);
-+	if (status)
-+		goto out;
++	lo_comp = layout->olo_comps;
++	pan_comp = mcs->full_map.components.data + layout->olo_comps_index;
++	pan_sec = mcs->secs.data + layout->olo_comps_index;
++	for (i = 0; i < layout->olo_num_comps; i++) {
++		void *p;
++		pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id;
++		struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id;
++		u64 dev_id = __be64_to_cpup(
++			(__be64 *)oc_obj_id->oid_device_id.data + 1);
 +
-+	/* Do nothing if underlying file system does not support
-+	 * getdevicelist */
-+	if (!sb->s_pnfs_op->get_device_iter) {
-+		status = nfserr_notsupp;
-+		goto out;
-+	}
++		dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n",
++			__func__, i,
++			__be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data),
++			__be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1),
++			oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id);
 +
-+	/* Set up arguments so device can be retrieved at encode time */
-+	gdlp->gd_fhp = &cstate->current_fh;
-+out:
-+	return status;
-+}
++		if (i == 0) {
++			/* make up mgr_id to calm sam down */
++			pan_mgr_id_construct_artificial(PAN_MGR_SM, 0,
++							&obj_id->dev_id);
++			obj_id->grp_id = oc_obj_id->oid_partition_id;
++			obj_id->obj_id = oc_obj_id->oid_object_id;
++		}
 +
-+static __be32
-+nfsd4_getdevinfo(struct svc_rqst *rqstp,
-+		struct nfsd4_compound_state *cstate,
-+		struct nfsd4_pnfs_getdevinfo *gdp)
-+{
-+	struct super_block *sb;
-+	int status;
-+	clientid_t clid;
++		if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) {
++			dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n",
++				__func__, i, (u64)obj_id->grp_id,
++				lo_comp->oc_object_id.oid_partition_id);
++			status = -EINVAL;
++			goto err;
++		}
 +
-+	dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n",
-+	       __func__, gdp->gd_layout_type, gdp->gd_devid.sbid,
-+	       gdp->gd_devid.devid, gdp->gd_maxcount);
++		if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) {
++			dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n",
++				__func__, i, obj_id->obj_id,
++				lo_comp->oc_object_id.oid_object_id);
++			status = -EINVAL;
++			goto err;
++		}
++
++		pan_comp->dev_id = dev_id;
++		if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) {
++			dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n",
++				__func__, i, obj_id->dev_id);
++			status = -EINVAL;
++			goto err;
++		}
++		if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) {
++			dprintk("%s: degraded maps not supported yet\n",
++				__func__);
++			status = -ENOTSUPP;
++			goto err;
++		}
++		pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL;
++		if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) {
++			dprintk("%s: cap key security not supported yet\n",
++				__func__);
++			status = -ENOTSUPP;
++			goto err;
++		}
++
++		p = lo_comp->oc_cap.cred;
++		panfs_export_ops->sm_sec_t_unmarshall(
++			(pan_sm_sec_otw_t *)&p,
++			pan_sec,
++			buf,
++			alloc_sz,
++			NULL,
++			&local_sz);
++		buf += local_sz;
++		alloc_sz -= local_sz;
 +
-+	status = nfserr_inval;
-+	sb = find_sbid_id(gdp->gd_devid.sbid);
-+	dprintk("%s: sb %p\n", __func__, sb);
-+	if (!sb) {
-+		status = nfserr_noent;
-+		goto out;
++		lo_comp++;
++		pan_comp++;
++		pan_sec++;
 +	}
 +
-+	/* Ensure underlying file system supports pNFS and,
-+	 * if so, the requested layout type
-+	 */
-+	status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type);
-+	if (status)
-+		goto out;
-+
-+	/* Set up arguments so device can be retrieved at encode time */
-+	gdp->gd_sb = sb;
++	*outp = mcs;
++	dprintk("%s:Return mcs=%p\n", __func__, mcs);
++	return 0;
 +
-+	/* Update notifications */
-+	copy_clientid(&clid, cstate->session);
-+	pnfs_set_device_notify(&clid, gdp->gd_notify_types);
-+out:
++err:
++	objio_free_lseg(mcs);
++	dprintk("%s:Error %d\n", __func__, status);
 +	return status;
 +}
 +
-+static __be32
-+nfsd4_layoutget(struct svc_rqst *rqstp,
-+		struct nfsd4_compound_state *cstate,
-+		struct nfsd4_pnfs_layoutget *lgp)
++/*
++ * Free a Panasas map and caps type
++ */
++void
++objio_free_lseg(void *p)
 +{
-+	int status;
-+	struct super_block *sb;
-+	struct svc_fh *current_fh = &cstate->current_fh;
-+
-+	status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
-+	if (status)
-+		goto out;
-+
-+	status = nfserr_inval;
-+	sb = current_fh->fh_dentry->d_inode->i_sb;
-+	if (!sb)
-+		goto out;
++	kfree(p);
++}
 +
-+	/* Ensure underlying file system supports pNFS and,
-+	 * if so, the requested layout type
-+	 */
-+	status = nfsd4_layout_verify(sb, current_fh->fh_export,
-+				     lgp->lg_seg.layout_type);
-+	if (status)
-+		goto out;
++/*
++ * I/O routines
++ */
++int
++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
++{
++	struct panfs_shim_io_state *p;
 +
-+	status = nfserr_badiomode;
-+	if (lgp->lg_seg.iomode != IOMODE_READ &&
-+	    lgp->lg_seg.iomode != IOMODE_RW) {
-+		dprintk("pNFS %s: invalid iomode %d\n", __func__,
-+			lgp->lg_seg.iomode);
-+		goto out;
-+	}
++	dprintk("%s: allocating io_state\n", __func__);
++	p = kzalloc(sizeof(*p), GFP_KERNEL);
++	if (!p)
++		return -ENOMEM;
 +
-+	/* Set up arguments so layout can be retrieved at encode time */
-+	lgp->lg_fhp = current_fh;
-+	copy_clientid((clientid_t *)&lgp->lg_seg.clientid, cstate->session);
-+	status = nfs_ok;
-+out:
-+	return status;
++	*outp = &p->ol_state;
++	return 0;
 +}
 +
-+static __be32
-+nfsd4_layoutcommit(struct svc_rqst *rqstp,
-+		struct nfsd4_compound_state *cstate,
-+		struct nfsd4_pnfs_layoutcommit *lcp)
++/*
++ * Free an I/O state
++ */
++void
++objio_free_io_state(struct objlayout_io_state *ol_state)
 +{
-+	int status;
-+	struct inode *ino = NULL;
-+	struct iattr ia;
-+	struct super_block *sb;
-+	struct svc_fh *current_fh = &cstate->current_fh;
++	struct panfs_shim_io_state *state = container_of(ol_state,
++					struct panfs_shim_io_state, ol_state);
++	int i;
 +
-+	dprintk("NFSD: nfsd4_layoutcommit \n");
-+	status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
-+	if (status)
-+		goto out;
++	dprintk("%s: freeing io_state\n", __func__);
++	for (i = 0; i < state->ol_state.nr_pages; i++)
++		kunmap(state->ol_state.pages[i]);
 +
-+	status = nfserr_inval;
-+	ino = current_fh->fh_dentry->d_inode;
-+	if (!ino)
-+		goto out;
++	if (state->ucreds)
++		panfs_export_ops->ucreds_put(state->ucreds);
++	kfree(state->sg_list);
++	kfree(state);
++}
 +
-+	status = nfserr_inval;
-+	sb = ino->i_sb;
-+	if (!sb)
-+		goto out;
++static int
++panfs_shim_pages_to_sg(
++	struct panfs_shim_io_state *state,
++	struct page **pages,
++	unsigned int pgbase,
++	unsigned nr_pages,
++	size_t count)
++{
++	unsigned i, n;
++	pan_sg_entry_t *sg;
 +
-+	/* Ensure underlying file system supports pNFS and,
-+	 * if so, the requested layout type
-+	 */
-+	status = nfsd4_layout_verify(sb, current_fh->fh_export,
-+				     lcp->args.lc_seg.layout_type);
-+	if (status)
-+		goto out;
++	dprintk("%s pgbase %u nr_pages %u count %d "
++		"pg0 %p flags 0x%x index %llu\n",
++		__func__, pgbase, nr_pages, (int)count, pages[0],
++		(unsigned)pages[0]->flags, (unsigned long long)pages[0]->index);
 +
-+	/* This will only extend the file length.  Do a quick
-+	 * check to see if there is any point in waiting for the update
-+	 * locks.
-+	 * TODO: Is this correct for all back ends?
-+	 */
-+	dprintk("%s:new offset: %d new size: %llu old size: %lld\n",
-+		__func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1,
-+		ino->i_size);
++	sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL);
++	if (sg == NULL)
++		return -ENOMEM;
 +
-+	/* Set clientid from sessionid */
-+	copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session);
-+	lcp->res.lc_size_chg = 0;
-+	if (sb->s_pnfs_op->layout_commit) {
-+		status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res);
-+		dprintk("%s:layout_commit result %d\n", __func__, status);
-+	} else {
-+		fh_lock(current_fh);
-+		if ((lcp->args.lc_newoffset == 0) ||
-+		    ((lcp->args.lc_last_wr + 1) <= ino->i_size)) {
-+			status = 0;
-+			lcp->res.lc_size_chg = 0;
-+			fh_unlock(current_fh);
-+			goto out;
-+		}
++	dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n",
++		__func__, sg, pages, pgbase, nr_pages);
 +
-+		/* Try our best to update the file size */
-+		dprintk("%s: Modifying file size\n", __func__);
-+		ia.ia_valid = ATTR_SIZE;
-+		ia.ia_size = lcp->args.lc_last_wr + 1;
-+		status = notify_change(current_fh->fh_dentry, &ia);
-+		fh_unlock(current_fh);
-+		dprintk("%s:notify_change result %d\n", __func__, status);
++	for (i = 0; i < nr_pages; i++) {
++		sg[i].buffer = (char *)kmap(pages[i]) + pgbase;
++		n = PAGE_SIZE - pgbase;
++		pgbase = 0;
++		if (n > count)
++			n = count;
++		sg[i].chunk_size = n;
++		count -= n;
++		if (likely(count)) {
++			sg[i].next = &sg[i+1];
++		} else {
++			/* we're done */
++			sg[i].next = NULL;
++			break;
++		}
 +	}
++	BUG_ON(count);
 +
-+	if (!status && lcp->res.lc_size_chg &&
-+	    EX_ISSYNC(current_fh->fh_export)) {
-+		dprintk("%s: Synchronously writing inode size %llu\n",
-+			__func__, ino->i_size);
-+		write_inode_now(ino, 1);
-+		lcp->res.lc_newsize = i_size_read(ino);
-+	}
-+out:
-+	return status;
++	state->sg_list = sg;
++	return 0;
 +}
 +
-+static __be32
-+nfsd4_layoutreturn(struct svc_rqst *rqstp,
-+		struct nfsd4_compound_state *cstate,
-+		struct nfsd4_pnfs_layoutreturn *lrp)
++/*
++ * Callback function for async reads
++ */
++static void
++panfs_shim_read_done(
++	void *arg1,
++	void *arg2,
++	pan_sam_read_res_t *res_p,
++	pan_status_t rc)
 +{
-+	int status;
-+	struct super_block *sb;
-+	struct svc_fh *current_fh = &cstate->current_fh;
++	struct panfs_shim_io_state *state = arg1;
++	ssize_t status;
 +
-+	status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
-+	if (status)
-+		goto out;
++	dprintk("%s: Begin\n", __func__);
++	if (!res_p)
++		res_p = &state->u.read.res;
++	if (rc == PAN_SUCCESS)
++		rc = res_p->result;
++	if (rc == PAN_SUCCESS) {
++		status = res_p->length;
++		WARN_ON(status < 0);
++	} else {
++		status = -panfs_export_ops->convert_rc(rc);
++		dprintk("%s: pan_sam_read rc %d: status %Zd\n",
++			__func__, rc, status);
++	}
++	dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc);
++	objlayout_read_done(&state->ol_state, status, true);
++}
 +
-+	status = nfserr_inval;
-+	sb = current_fh->fh_dentry->d_inode->i_sb;
-+	if (!sb)
-+		goto out;
++ssize_t
++objio_read_pagelist(struct objlayout_io_state *ol_state)
++{
++	struct panfs_shim_io_state *state = container_of(ol_state,
++					struct panfs_shim_io_state, ol_state);
++	pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal;
++	ssize_t status = 0;
++	pan_status_t rc = PAN_SUCCESS;
++
++	dprintk("%s: Begin\n", __func__);
 +
-+	/* Ensure underlying file system supports pNFS and,
-+	 * if so, the requested layout type
-+	 */
-+	status = nfsd4_layout_verify(sb, current_fh->fh_export,
-+				     lrp->args.lr_seg.layout_type);
-+	if (status)
-+		goto out;
++	status = panfs_shim_pages_to_sg(state, ol_state->pages,
++					ol_state->pgbase, ol_state->nr_pages,
++					ol_state->count);
++	if (unlikely(status))
++		goto err;
 +
-+	status = nfserr_inval;
-+	if (lrp->args.lr_return_type != RETURN_FILE &&
-+	    lrp->args.lr_return_type != RETURN_FSID &&
-+	    lrp->args.lr_return_type != RETURN_ALL) {
-+		dprintk("pNFS %s: invalid return_type %d\n", __func__,
-+			lrp->args.lr_return_type);
-+		goto out;
-+	}
++	state->obj_sec.min_security = 0;
++	state->obj_sec.map_ccaps = mcs;
 +
-+	status = nfserr_inval;
-+	if (lrp->args.lr_seg.iomode != IOMODE_READ &&
-+	    lrp->args.lr_seg.iomode != IOMODE_RW &&
-+	    lrp->args.lr_seg.iomode != IOMODE_ANY) {
-+		dprintk("pNFS %s: invalid iomode %d\n", __func__,
-+			lrp->args.lr_seg.iomode);
-+		goto out;
++	rc = panfs_export_ops->ucreds_get(&state->ucreds);
++	if (unlikely(rc)) {
++		status = -EACCES;
++		goto err;
 +	}
 +
-+	/* Set clientid from sessionid */
-+	copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session);
-+	lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE);
-+	status = nfs4_pnfs_return_layout(sb, current_fh, lrp);
-+out:
-+	dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n",
-+		__func__, status, lrp->args.lr_return_type, lrp->lrs_present);
++	state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id;
++	state->u.read.args.offset = ol_state->offset;
++	rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP,
++					&state->u.read.args,
++					&state->obj_sec,
++					state->sg_list,
++					state->ucreds,
++					ol_state->sync ?
++						NULL : panfs_shim_read_done,
++					state, NULL,
++					&state->u.read.res);
++	if (rc != PAN_ERR_IN_PROGRESS)
++		panfs_shim_read_done(state, NULL, &state->u.read.res, rc);
++ err:
++	dprintk("%s: Return %Zd\n", __func__, status);
 +	return status;
 +}
-+#endif /* CONFIG_PNFSD */
 +
- /*
-  * NULL call.
-  */
-@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[] = {
- 		.op_flags = ALLOWED_WITHOUT_FH,
- 		.op_name = "OP_RECLAIM_COMPLETE",
- 	},
-+#if defined(CONFIG_PNFSD)
-+	[OP_GETDEVICELIST] = {
-+		.op_func = (nfsd4op_func)nfsd4_getdevlist,
-+		.op_name = "OP_GETDEVICELIST",
-+	},
-+	[OP_GETDEVICEINFO] = {
-+		.op_func = (nfsd4op_func)nfsd4_getdevinfo,
-+		.op_flags = ALLOWED_WITHOUT_FH,
-+		.op_name = "OP_GETDEVICEINFO",
-+	},
-+	[OP_LAYOUTGET] = {
-+		.op_func = (nfsd4op_func)nfsd4_layoutget,
-+		.op_name = "OP_LAYOUTGET",
-+	},
-+	[OP_LAYOUTCOMMIT] = {
-+		.op_func = (nfsd4op_func)nfsd4_layoutcommit,
-+		.op_name = "OP_LAYOUTCOMMIT",
-+	},
-+	[OP_LAYOUTRETURN] = {
-+		.op_func = (nfsd4op_func)nfsd4_layoutreturn,
-+		.op_name = "OP_LAYOUTRETURN",
-+	},
-+#endif /* CONFIG_PNFSD */
- };
- 
- static const char *nfsd4_op_name(unsigned opnum)
-diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
-index cf0d2ff..770b494 100644
---- a/fs/nfsd/nfs4state.c
-+++ b/fs/nfsd/nfs4state.c
-@@ -42,6 +42,8 @@
- #include "xdr4.h"
- #include "vfs.h"
- 
-+#include "pnfsd.h"
++/*
++ * Callback function for async writes
++ */
++static void
++panfs_shim_write_done(
++	void *arg1,
++	void *arg2,
++	pan_sam_write_res_t *res_p,
++	pan_status_t rc)
++{
++	struct panfs_shim_io_state *state = arg1;
++	ssize_t status;
 +
- #define NFSDDBG_FACILITY                NFSDDBG_PROC
- 
- /* Globals */
-@@ -59,8 +61,6 @@ static u64 current_sessionid = 1;
- #define ONE_STATEID(stateid)  (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
- 
- /* forward declarations */
--static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
--static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
- static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
- static void nfs4_set_recdir(char *recdir);
- 
-@@ -68,6 +68,7 @@ static void nfs4_set_recdir(char *recdir);
- 
- /* Currently used for almost all code touching nfsv4 state: */
- static DEFINE_MUTEX(client_mutex);
-+struct task_struct *client_mutex_owner;
- 
- /*
-  * Currently used for the del_recall_lru and file hash table.  In an
-@@ -85,11 +86,21 @@ void
- nfs4_lock_state(void)
- {
- 	mutex_lock(&client_mutex);
-+	client_mutex_owner = current;
-+}
++	dprintk("%s: Begin\n", __func__);
++	if (!res_p)
++		res_p = &state->u.write.res;
++	if (rc == PAN_SUCCESS)
++		rc = res_p->result;
++	if (rc == PAN_SUCCESS) {
++/*		state->ol_state.committed = NFS_FILE_SYNC;*/
++		state->ol_state.committed = NFS_UNSTABLE;
++		status = res_p->length;
++		WARN_ON(status < 0);
 +
-+#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current)
++		objlayout_add_delta_space_used(&state->ol_state,
++					       res_p->delta_capacity_used);
++	} else {
++		status = -panfs_export_ops->convert_rc(rc);
++		dprintk("%s: pan_sam_write rc %u: status %Zd\n",
++			__func__, rc, status);
++	}
++	dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc);
++	objlayout_write_done(&state->ol_state, status, true);
++}
 +
-+void
-+nfs4_bug_on_unlocked_state(void)
++ssize_t
++objio_write_pagelist(struct objlayout_io_state *ol_state,
++		     bool stable /* unused, PanOSD writes are stable */)
 +{
-+	BUG_ON(client_mutex_owner != current);
- }
- 
- void
- nfs4_unlock_state(void)
- {
-+	client_mutex_owner = NULL;
- 	mutex_unlock(&client_mutex);
- }
- 
-@@ -108,7 +119,7 @@ opaque_hashval(const void *ptr, int nbytes)
- 
- static struct list_head del_recall_lru;
- 
--static inline void
-+inline void
- put_nfs4_file(struct nfs4_file *fi)
- {
- 	if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
-@@ -119,7 +130,7 @@ put_nfs4_file(struct nfs4_file *fi)
- 	}
- }
- 
--static inline void
-+inline void
- get_nfs4_file(struct nfs4_file *fi)
- {
- 	atomic_inc(&fi->fi_ref);
-@@ -179,10 +190,16 @@ static void nfs4_file_get_access(struct nfs4_file *fp, int oflag)
- 
- static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
- {
--	if (fp->fi_fds[oflag]) {
--		fput(fp->fi_fds[oflag]);
--		fp->fi_fds[oflag] = NULL;
--	}
-+	struct file *fd = fp->fi_fds[oflag];
++	struct panfs_shim_io_state *state = container_of(ol_state,
++					struct panfs_shim_io_state, ol_state);
++	pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal;
++	ssize_t status = 0;
++	pan_status_t rc = PAN_SUCCESS;
 +
-+	if (!fd)
-+		return;
++	dprintk("%s: Begin\n", __func__);
 +
-+	fp->fi_fds[oflag] = NULL;
-+	BUG_ON_UNLOCKED_STATE();
-+	nfs4_unlock_state();	/* allow nested layout recall/return */
-+	fput(fd);
-+	nfs4_lock_state();
- }
- 
- static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
-@@ -308,8 +325,8 @@ static DEFINE_SPINLOCK(client_lock);
-  * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
-  * used in reboot/reset lease grace period processing
-  *
-- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
-- * setclientid_confirmed info. 
-+ * conf_id_hashtbl[], and conf_str_hashtbl[] hold
-+ * confirmed setclientid_confirmed info.
-  *
-  * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed 
-  * setclientid info.
-@@ -334,6 +351,7 @@ static void unhash_generic_stateid(struct nfs4_stateid *stp)
- 	list_del(&stp->st_hash);
- 	list_del(&stp->st_perfile);
- 	list_del(&stp->st_perstateowner);
-+	release_pnfs_ds_dev_list(stp);
- }
- 
- static void free_generic_stateid(struct nfs4_stateid *stp)
-@@ -856,6 +874,8 @@ expire_client(struct nfs4_client *clp)
- 	struct nfs4_delegation *dp;
- 	struct list_head reaplist;
- 
-+	BUG_ON_UNLOCKED_STATE();
++	status = panfs_shim_pages_to_sg(state, ol_state->pages,
++					ol_state->pgbase, ol_state->nr_pages,
++					ol_state->count);
++	if (unlikely(status))
++		goto err;
++
++	state->obj_sec.min_security = 0;
++	state->obj_sec.map_ccaps = mcs;
++
++	rc = panfs_export_ops->ucreds_get(&state->ucreds);
++	if (unlikely(rc)) {
++		status = -EACCES;
++		goto err;
++	}
++
++	state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id;
++	state->u.write.args.offset = ol_state->offset;
++	rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE,
++					 &state->u.write.args,
++					 &state->obj_sec,
++					 state->sg_list,
++					 state->ucreds,
++					 ol_state->sync ?
++						NULL : panfs_shim_write_done,
++					 state,
++					 NULL,
++					 &state->u.write.res);
++	if (rc != PAN_ERR_IN_PROGRESS)
++		panfs_shim_write_done(state, NULL, &state->u.write.res, rc);
++ err:
++	dprintk("%s: Return %Zd\n", __func__, status);
++	return status;
++}
 +
- 	INIT_LIST_HEAD(&reaplist);
- 	spin_lock(&recall_lock);
- 	while (!list_empty(&clp->cl_delegations)) {
-@@ -875,6 +895,7 @@ expire_client(struct nfs4_client *clp)
- 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
- 		release_openowner(sop);
- 	}
-+	pnfs_expire_client(clp);
- 	nfsd4_set_callback_client(clp, NULL);
- 	if (clp->cl_cb_conn.cb_xprt)
- 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
-@@ -887,6 +908,13 @@ expire_client(struct nfs4_client *clp)
- 	spin_unlock(&client_lock);
- }
- 
-+void expire_client_lock(struct nfs4_client *clp)
++int
++panfs_shim_register(struct panfs_export_operations *ops)
 +{
-+	nfs4_lock_state();
-+	expire_client(clp);
-+	nfs4_unlock_state();
++	if (panfs_export_ops) {
++		printk(KERN_INFO
++		       "%s: panfs already registered (panfs ops %p)\n",
++		       __func__, panfs_export_ops);
++		return -EINVAL;
++	}
++
++	printk(KERN_INFO "%s: registering panfs ops %p\n",
++	       __func__, ops);
++
++	panfs_export_ops = ops;
++	return 0;
 +}
++EXPORT_SYMBOL(panfs_shim_register);
 +
- static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
- {
- 	memcpy(target->cl_verifier.data, source->data,
-@@ -976,6 +1004,11 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
- 	INIT_LIST_HEAD(&clp->cl_strhash);
- 	INIT_LIST_HEAD(&clp->cl_openowners);
- 	INIT_LIST_HEAD(&clp->cl_delegations);
-+#if defined(CONFIG_PNFSD)
-+	INIT_LIST_HEAD(&clp->cl_layouts);
-+	INIT_LIST_HEAD(&clp->cl_layoutrecalls);
-+	atomic_set(&clp->cl_deviceref, 0);
-+#endif /* CONFIG_PNFSD */
- 	INIT_LIST_HEAD(&clp->cl_sessions);
- 	INIT_LIST_HEAD(&clp->cl_lru);
- 	clp->cl_time = get_seconds();
-@@ -1025,7 +1058,7 @@ move_to_confirmed(struct nfs4_client *clp)
- 	renew_client(clp);
- }
- 
--static struct nfs4_client *
-+struct nfs4_client *
- find_confirmed_client(clientid_t *clid)
- {
- 	struct nfs4_client *clp;
-@@ -1095,6 +1128,24 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
- 	return NULL;
- }
- 
 +int
-+filter_confirmed_clients(int (* func)(struct nfs4_client *, void *),
-+			 void *arg)
++panfs_shim_unregister(void)
 +{
-+	struct nfs4_client *clp, *next;
-+	int i, status = 0;
++	if (!panfs_export_ops) {
++		printk(KERN_INFO "%s: panfs is not registered\n", __func__);
++		return -EINVAL;
++	}
 +
-+	for (i = 0; i < CLIENT_HASH_SIZE; i++)
-+		list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i],
-+					  cl_strhash) {
-+			status = func(clp, arg);
-+			if (status)
-+				break;
-+		}
++	printk(KERN_INFO "%s: unregistering panfs ops %p\n",
++	       __func__, panfs_export_ops);
 +
-+	return status;
++	panfs_export_ops = NULL;
++	return 0;
 +}
++EXPORT_SYMBOL(panfs_shim_unregister);
 +
- static void
- gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
- {
-@@ -1227,8 +1278,12 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
- static void
- nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
- {
--	/* pNFS is not supported */
-+#if defined(CONFIG_PNFSD)
-+	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS |
-+				  EXCHGID4_FLAG_USE_PNFS_DS;
-+#else  /* CONFIG_PNFSD */
- 	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
-+#endif /* CONFIG_PNFSD */
- 
- 	/* Referrals are supported, Migration is not. */
- 	new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
-@@ -1418,6 +1473,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
- 	struct nfsd4_clid_slot *cs_slot = NULL;
- 	int status = 0;
- 
-+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
-+	/* XXX hack to get local ip address */
-+	memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local,
-+		sizeof(pnfsd_lexp_addr));
-+	pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen;
-+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
-+
- 	nfs4_lock_state();
- 	unconf = find_unconfirmed_client(&cr_ses->clientid);
- 	conf = find_confirmed_client(&cr_ses->clientid);
-@@ -1457,25 +1519,26 @@ nfsd4_create_session(struct svc_rqst *rqstp,
- 		cs_slot->sl_seqid++; /* from 0 to 1 */
- 		move_to_confirmed(unconf);
- 
--		if (cr_ses->flags & SESSION4_BACK_CHAN) {
--			unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
--			svc_xprt_get(rqstp->rq_xprt);
--			rpc_copy_addr(
--				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
--				sa);
--			unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
--			unconf->cl_cb_conn.cb_minorversion =
--				cstate->minorversion;
--			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
--			unconf->cl_cb_seq_nr = 1;
--			nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
--		}
-+		if (is_ds_only_session(unconf->cl_exchange_flags))
-+			cr_ses->flags &= ~SESSION4_BACK_CHAN;
++/*
++ * Policy Operations
++ */
 +
- 		conf = unconf;
- 	} else {
- 		status = nfserr_stale_clientid;
- 		goto out;
- 	}
- 
-+	if (cr_ses->flags & SESSION4_BACK_CHAN) {
-+		conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
-+		svc_xprt_get(rqstp->rq_xprt);
-+		rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa);
-+		conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-+		conf->cl_cb_conn.cb_minorversion = cstate->minorversion;
-+		conf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
-+		conf->cl_cb_seq_nr = 1;
-+		nfsd4_probe_callback(conf, &conf->cl_cb_conn);
-+	}
++#define PANLAYOUT_DEF_STRIPE_UNIT    (64*1024)
++#define PANLAYOUT_DEF_STRIPE_WIDTH   9
++#define PANLAYOUT_MAX_STRIPE_WIDTH   11
++#define PANLAYOUT_MAX_GATHER_STRIPES 8
 +
- 	/*
- 	 * We do not support RDMA or persistent sessions
- 	 */
-@@ -1863,7 +1926,7 @@ out:
- 
- /* OPEN Share state helper functions */
- static inline struct nfs4_file *
--alloc_init_file(struct inode *ino)
-+alloc_init_file(struct inode *ino, struct svc_fh *current_fh)
- {
- 	struct nfs4_file *fp;
- 	unsigned int hashval = file_hashval(ino);
-@@ -1879,6 +1942,16 @@ alloc_init_file(struct inode *ino)
- 		fp->fi_had_conflict = false;
- 		memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
- 		memset(fp->fi_access, 0, sizeof(fp->fi_access));
-+#if defined(CONFIG_PNFSD)
-+		INIT_LIST_HEAD(&fp->fi_layouts);
-+		INIT_LIST_HEAD(&fp->fi_layout_states);
-+		fp->fi_fsid.major = current_fh->fh_export->ex_fsid;
-+		fp->fi_fsid.minor = 0;
-+		fp->fi_fhlen = current_fh->fh_handle.fh_size;
-+		BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval));
-+		memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base,
-+		       fp->fi_fhlen);
-+#endif /* CONFIG_PNFSD */
- 		spin_lock(&recall_lock);
- 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
- 		spin_unlock(&recall_lock);
-@@ -1887,7 +1960,7 @@ alloc_init_file(struct inode *ino)
- 	return NULL;
- }
- 
--static void
-+void
- nfsd4_free_slab(struct kmem_cache **slab)
- {
- 	if (*slab == NULL)
-@@ -1903,6 +1976,7 @@ nfsd4_free_slabs(void)
- 	nfsd4_free_slab(&file_slab);
- 	nfsd4_free_slab(&stateid_slab);
- 	nfsd4_free_slab(&deleg_slab);
-+	nfsd4_free_pnfs_slabs();
- }
- 
- static int
-@@ -1924,6 +1998,8 @@ nfsd4_init_slabs(void)
- 			sizeof(struct nfs4_delegation), 0, 0, NULL);
- 	if (deleg_slab == NULL)
- 		goto out_nomem;
-+	if (nfsd4_init_pnfs_slabs())
-+		goto out_nomem;
- 	return 0;
- out_nomem:
- 	nfsd4_free_slabs();
-@@ -1997,6 +2073,9 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
- 	INIT_LIST_HEAD(&stp->st_perstateowner);
- 	INIT_LIST_HEAD(&stp->st_lockowners);
- 	INIT_LIST_HEAD(&stp->st_perfile);
-+#if defined(CONFIG_PNFSD)
-+	INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
-+#endif /* CONFIG_PNFSD */
- 	list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
- 	list_add(&stp->st_perstateowner, &sop->so_stateids);
- 	list_add(&stp->st_perfile, &fp->fi_stateids);
-@@ -2038,6 +2117,7 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
- {
- 	struct nfs4_stateowner *so = NULL;
- 
-+	BUG_ON_UNLOCKED_STATE();
- 	list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
- 		if (same_owner_str(so, &open->op_owner, &open->op_clientid))
- 			return so;
-@@ -2046,7 +2126,7 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
- }
- 
- /* search file_hashtbl[] for file */
--static struct nfs4_file *
-+struct nfs4_file *
- find_file(struct inode *ino)
- {
- 	unsigned int hashval = file_hashval(ino);
-@@ -2064,6 +2144,18 @@ find_file(struct inode *ino)
- 	return NULL;
- }
- 
-+struct nfs4_file *
-+find_alloc_file(struct inode *ino, struct svc_fh *current_fh)
++/*
++ * Get the max [rw]size
++ */
++static ssize_t
++panlayout_get_blocksize(void)
 +{
-+	struct nfs4_file *fp;
++	ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) *
++		      PANLAYOUT_DEF_STRIPE_UNIT *
++		      PANLAYOUT_MAX_GATHER_STRIPES;
++	dprintk("%s: Return %Zd\n", __func__, sz);
++	return sz;
++}
 +
-+	fp = find_file(ino);
-+	if (fp)
-+		return fp;
++/*
++ * Don't gather across stripes, but rather gather (coalesce) up to
++ * the stripe size.
++ *
++ * FIXME: change interface to use merge_align, merge_count
++ */
++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS)
 +
-+	return alloc_init_file(ino, current_fh);
++static struct pnfs_layoutdriver_type panlayout_type = {
++	.id = PNFS_LAYOUT_PANOSD,
++	.name = "PNFS_LAYOUT_PANOSD",
++	.flags                   = PNFS_LAYOUTRET_ON_SETATTR,
++
++	.set_layoutdriver        = objlayout_set_layoutdriver,
++	.clear_layoutdriver      = objlayout_clear_layoutdriver,
++
++	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
++	.free_layout_hdr         = objlayout_free_layout_hdr,
++
++	.alloc_lseg              = objlayout_alloc_lseg,
++	.free_lseg               = objlayout_free_lseg,
++
++	.get_blocksize           = panlayout_get_blocksize,
++
++	.read_pagelist           = objlayout_read_pagelist,
++	.write_pagelist          = objlayout_write_pagelist,
++	.commit                  = objlayout_commit,
++
++	.encode_layoutcommit	 = objlayout_encode_layoutcommit,
++	.encode_layoutreturn     = objlayout_encode_layoutreturn,
++};
++
++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs");
++MODULE_AUTHOR("Benny Halevy <bhalevy at panasas.com>");
++MODULE_LICENSE("GPL");
++
++static int __init
++panlayout_init(void)
++{
++	int ret = pnfs_register_layoutdriver(&panlayout_type);
++
++	if (ret)
++		printk(KERN_INFO
++			"%s: Registering Panasas OSD pNFS Layout Driver failed: error=%d\n",
++			__func__, ret);
++	else
++		printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n",
++			__func__);
++	return ret;
 +}
 +
- static inline int access_valid(u32 x, u32 minorversion)
- {
- 	if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
-@@ -2592,7 +2684,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
- 		if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
- 			goto out;
- 		status = nfserr_resource;
--		fp = alloc_init_file(ino);
-+		fp = alloc_init_file(ino, current_fh);
- 		if (fp == NULL)
- 			goto out;
- 	}
-@@ -2813,7 +2905,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
- 	return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
- }
- 
--static int
-+int
- STALE_STATEID(stateid_t *stateid)
- {
- 	if (stateid->si_boot == boot_time)
-@@ -2823,6 +2915,16 @@ STALE_STATEID(stateid_t *stateid)
- 	return 1;
- }
- 
-+__be32
-+nfs4_check_stateid(stateid_t *stateid)
++static void __exit
++panlayout_exit(void)
 +{
-+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
-+		return nfserr_bad_stateid;
-+	if (STALE_STATEID(stateid))
-+		return nfserr_stale_stateid;
-+	return 0;
++	pnfs_unregister_layoutdriver(&panlayout_type);
++	printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n",
++	       __func__);
 +}
 +
- static inline int
- access_permit_read(unsigned long access_bmap)
- {
-@@ -2934,6 +3036,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
- 	if (grace_disallows_io(ino))
- 		return nfserr_grace;
- 
-+#if defined(CONFIG_PNFSD)
-+	if (pnfs_fh_is_ds(&current_fh->fh_handle)) {
-+		if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
-+			status = nfserr_bad_stateid;
-+		else
-+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
-+		{
-+			dprintk("%s Don't check DS stateid\n", __func__);
-+			return 0;
-+		}
-+#else /* CONFIG_GFS2_FS_LOCKING_DLM */
-+			status = nfs4_preprocess_pnfs_ds_stateid(current_fh,
-+								 stateid);
-+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
-+		goto out;
-+	}
-+#endif /* CONFIG_PNFSD */
++module_init(panlayout_init);
++module_exit(panlayout_exit);
+diff -up linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.h
+--- linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.h.orig	2011-01-28 09:43:53.336772273 -0500
++++ linux-2.6.37.noarch/fs/nfs/objlayout/panfs_shim.h	2011-01-28 09:43:53.336772273 -0500
+@@ -0,0 +1,482 @@
++/*
++ *  panfs_shim.h
++ *
++ *  Data types and external function declarations for interfacing with
++ *  panfs (Panasas DirectFlow) I/O stack
++ *
++ *  Copyright (C) 2007 Panasas Inc.
++ *  All rights reserved.
++ *
++ *  Benny Halevy <bhalevy at panasas.com>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the Panasas company nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ * See the file COPYING included with this distribution for more details.
++ *
++ */
 +
- 	if (nfsd4_has_session(cstate))
- 		flags |= HAS_SESSION;
- 
-@@ -3015,13 +3135,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
- 	*stpp = NULL;
- 	*sopp = NULL;
- 
--	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
--		dprintk("NFSD: preprocess_seqid_op: magic stateid!\n");
--		return nfserr_bad_stateid;
--	}
--
--	if (STALE_STATEID(stateid))
--		return nfserr_stale_stateid;
-+	status = nfs4_check_stateid(stateid);
-+	if (status)
-+		return status;
- 
- 	if (nfsd4_has_session(cstate))
- 		flags |= HAS_SESSION;
-@@ -3295,11 +3411,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- 	if (nfsd4_has_session(cstate))
- 		flags |= HAS_SESSION;
- 	nfs4_lock_state();
--	status = nfserr_bad_stateid;
--	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
--		goto out;
--	status = nfserr_stale_stateid;
--	if (STALE_STATEID(stateid))
-+	status = nfs4_check_stateid(stateid);
-+	if (status)
- 		goto out;
- 	status = nfserr_bad_stateid;
- 	if (!is_delegation_stateid(stateid))
-@@ -3328,26 +3441,6 @@ out:
- #define LOCK_HASH_SIZE             (1 << LOCK_HASH_BITS)
- #define LOCK_HASH_MASK             (LOCK_HASH_SIZE - 1)
- 
--static inline u64
--end_offset(u64 start, u64 len)
--{
--	u64 end;
--
--	end = start + len;
--	return end >= start ? end: NFS4_MAX_UINT64;
--}
--
--/* last octet in a range */
--static inline u64
--last_byte_offset(u64 start, u64 len)
--{
--	u64 end;
--
--	BUG_ON(!len);
--	end = start + len;
--	return end > start ? end - 1: NFS4_MAX_UINT64;
--}
--
- #define lockownerid_hashval(id) \
-         ((id) & LOCK_HASH_MASK)
- 
-@@ -3364,7 +3457,7 @@ static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE];
- static struct list_head	lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
- static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
- 
--static struct nfs4_stateid *
-+struct nfs4_stateid *
- find_stateid(stateid_t *stid, int flags)
- {
- 	struct nfs4_stateid *local;
-@@ -3393,7 +3486,7 @@ find_stateid(stateid_t *stid, int flags)
- 	return NULL;
- }
- 
--static struct nfs4_delegation *
-+struct nfs4_delegation *
- find_delegation_stateid(struct inode *ino, stateid_t *stid)
- {
- 	struct nfs4_file *fp;
-@@ -3524,6 +3617,9 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
- 	INIT_LIST_HEAD(&stp->st_perfile);
- 	INIT_LIST_HEAD(&stp->st_perstateowner);
- 	INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
-+#if defined(CONFIG_PNFSD)
-+	INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
-+#endif /* CONFIG_PNFSD */
- 	list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
- 	list_add(&stp->st_perfile, &fp->fi_stateids);
- 	list_add(&stp->st_perstateowner, &sop->so_stateids);
-@@ -4100,6 +4196,9 @@ nfs4_state_init(void)
- 	INIT_LIST_HEAD(&client_lru);
- 	INIT_LIST_HEAD(&del_recall_lru);
- 	reclaim_str_hashtbl_size = 0;
-+#if defined(CONFIG_PNFSD)
-+	nfs4_pnfs_state_init();
-+#endif /* CONFIG_PNFSD */
- 	return 0;
- }
- 
-@@ -4204,6 +4303,7 @@ __nfs4_state_shutdown(void)
- 	}
- 
- 	nfsd4_shutdown_recdir();
-+	nfs4_pnfs_state_shutdown();
- }
- 
- void
-diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
-index 1a468bb..b93906b 100644
---- a/fs/nfsd/nfs4xdr.c
-+++ b/fs/nfsd/nfs4xdr.c
-@@ -47,9 +47,14 @@
- #include <linux/nfsd_idmap.h>
- #include <linux/nfs4_acl.h>
- #include <linux/sunrpc/svcauth_gss.h>
-+#include <linux/exportfs.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
-+#include <linux/nfsd4_spnfs.h>
-+#include <linux/nfsd4_block.h>
- 
- #include "xdr4.h"
- #include "vfs.h"
-+#include "pnfsd.h"
- 
- #define NFSDDBG_FACILITY		NFSDDBG_XDR
- 
-@@ -1244,6 +1249,138 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
- 	DECODE_TAIL;
- }
- 
-+#if defined(CONFIG_PNFSD)
-+static __be32
-+nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp,
-+			struct nfsd4_pnfs_getdevlist *gdevl)
-+{
-+	DECODE_HEAD;
++#ifndef _PANLAYOUT_PANFS_SHIM_H
++#define _PANLAYOUT_PANFS_SHIM_H
 +
-+	READ_BUF(16 + sizeof(nfs4_verifier));
-+	READ32(gdevl->gd_layout_type);
-+	READ32(gdevl->gd_maxdevices);
-+	READ64(gdevl->gd_cookie);
-+	COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
++typedef s8 pan_int8_t;
++typedef u8 pan_uint8_t;
++typedef s16 pan_int16_t;
++typedef u16 pan_uint16_t;
++typedef s32 pan_int32_t;
++typedef u32 pan_uint32_t;
++typedef s64 pan_int64_t;
++typedef u64 pan_uint64_t;
 +
-+	DECODE_TAIL;
-+}
++/*
++ * from pan_base_types.h
++ */
++typedef  pan_uint64_t pan_rpc_none_t;
++typedef pan_uint32_t  pan_rpc_arrdim_t;
++typedef pan_uint32_t  pan_status_t;
++typedef pan_uint8_t   pan_otw_t;
++typedef pan_uint8_t   pan_pad_t;
 +
-+static __be32
-+nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp,
-+			struct nfsd4_pnfs_getdevinfo *gdev)
-+{
-+	u32 num;
-+	DECODE_HEAD;
++typedef pan_uint32_t  pan_timespec_sec_t;
++typedef pan_uint32_t  pan_timespec_nsec_t;
 +
-+	READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid));
-+	READ64(gdev->gd_devid.sbid);
-+	READ64(gdev->gd_devid.devid);
-+	READ32(gdev->gd_layout_type);
-+	READ32(gdev->gd_maxcount);
-+	READ32(num);
-+	if (num) {
-+		READ_BUF(4);
-+		READ32(gdev->gd_notify_types);
-+	} else {
-+		gdev->gd_notify_types = 0;
-+	}
++typedef  struct pan_timespec_s  pan_timespec_t;
++struct pan_timespec_s {
++  pan_timespec_sec_t   ts_sec;
++  pan_timespec_nsec_t  ts_nsec;
++};
 +
-+	DECODE_TAIL;
-+}
++/*
++ * from pan_std_types.h
++ */
++typedef pan_uint32_t pan_size_t;
++typedef  int  pan_bool_t;
 +
-+static __be32
-+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
-+			struct nfsd4_pnfs_layoutget *lgp)
-+{
-+	DECODE_HEAD;
++/*
++ * from pan_common_error.h
++ */
++#define PAN_SUCCESS                                         ((pan_status_t)0)
++#define PAN_ERR_IN_PROGRESS                                 ((pan_status_t)55)
 +
-+	READ_BUF(36);
-+	READ32(lgp->lg_signal);
-+	READ32(lgp->lg_seg.layout_type);
-+	READ32(lgp->lg_seg.iomode);
-+	READ64(lgp->lg_seg.offset);
-+	READ64(lgp->lg_seg.length);
-+	READ64(lgp->lg_minlength);
-+	nfsd4_decode_stateid(argp, &lgp->lg_sid);
-+	READ_BUF(4);
-+	READ32(lgp->lg_maxcount);
++/*
++ * from pan_sg.h
++ */
++typedef struct pan_sg_entry_s pan_sg_entry_t;
++struct pan_sg_entry_s {
++  void                  *buffer;       /* pointer to memory */
++  pan_uint32_t           chunk_size;   /* size of each chunk (bytes) */
++  pan_sg_entry_t        *next;
++};
 +
-+	DECODE_TAIL;
-+}
++/*
++ * from pan_storage.h
++ */
++typedef pan_uint64_t pan_stor_dev_id_t;
++typedef pan_uint32_t pan_stor_obj_grp_id_t;
++typedef pan_uint64_t pan_stor_obj_uniq_t;
++typedef pan_uint32_t pan_stor_action_t;
++typedef pan_uint8_t pan_stor_cap_key_t[20];
 +
-+static __be32
-+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
-+			  struct nfsd4_pnfs_layoutcommit *lcp)
-+{
-+	DECODE_HEAD;
-+	u32 timechange;
++typedef pan_uint8_t pan_stor_key_type_t;
++typedef pan_uint64_t pan_stor_len_t;
++typedef pan_int64_t pan_stor_delta_len_t;
++typedef pan_uint64_t pan_stor_offset_t;
++typedef pan_uint16_t pan_stor_op_t;
 +
-+	READ_BUF(20);
-+	READ64(lcp->args.lc_seg.offset);
-+	READ64(lcp->args.lc_seg.length);
-+	READ32(lcp->args.lc_reclaim);
-+	nfsd4_decode_stateid(argp, &lcp->lc_sid);
-+	READ_BUF(4);
-+	READ32(lcp->args.lc_newoffset);
-+	if (lcp->args.lc_newoffset) {
-+		READ_BUF(8);
-+		READ64(lcp->args.lc_last_wr);
-+	} else
-+		lcp->args.lc_last_wr = 0;
-+	READ_BUF(4);
-+	READ32(timechange);
-+	if (timechange) {
-+		READ_BUF(12);
-+		READ64(lcp->args.lc_mtime.seconds);
-+		READ32(lcp->args.lc_mtime.nseconds);
-+	} else {
-+		lcp->args.lc_mtime.seconds = 0;
-+		lcp->args.lc_mtime.nseconds = 0;
-+	}
-+	READ_BUF(8);
-+	READ32(lcp->args.lc_seg.layout_type);
-+	/* XXX: saving XDR'ed layout update. Since we don't have the
-+	 * current_fh yet, and therefore no export_ops, we can't call
-+	 * the layout specific decode routines. File and pVFS2
-+	 * do not use the layout update....
-+	 */
-+	READ32(lcp->args.lc_up_len);
-+	if (lcp->args.lc_up_len > 0) {
-+		READ_BUF(lcp->args.lc_up_len);
-+		READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len);
-+	}
++typedef pan_uint16_t pan_stor_sec_level_t;
 +
-+	DECODE_TAIL;
-+}
++struct pan_stor_obj_id_s {
++  pan_stor_dev_id_t      dev_id;
++  pan_stor_obj_uniq_t    obj_id;
++  pan_stor_obj_grp_id_t  grp_id;
++};
 +
-+static __be32
-+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
-+			  struct nfsd4_pnfs_layoutreturn *lrp)
-+{
-+	DECODE_HEAD;
++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t;
 +
-+	READ_BUF(16);
-+	READ32(lrp->args.lr_reclaim);
-+	READ32(lrp->args.lr_seg.layout_type);
-+	READ32(lrp->args.lr_seg.iomode);
-+	READ32(lrp->args.lr_return_type);
-+	if (lrp->args.lr_return_type == RETURN_FILE) {
-+		READ_BUF(16);
-+		READ64(lrp->args.lr_seg.offset);
-+		READ64(lrp->args.lr_seg.length);
-+		nfsd4_decode_stateid(argp, &lrp->lr_sid);
-+		READ_BUF(4);
-+		READ32(lrp->args.lrf_body_len);
-+		if (lrp->args.lrf_body_len > 0) {
-+			READ_BUF(lrp->args.lrf_body_len);
-+			READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len);
-+		}
-+	}
++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U)
++#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U)
++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U)
++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U)
++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U)
++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U)
++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U)
++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U)
 +
-+	DECODE_TAIL;
-+}
-+#endif /* CONFIG_PNFSD */
++/*
++ * from pan_aggregation_map.h
++ */
++typedef pan_uint8_t pan_agg_type_t;
++typedef pan_uint64_t pan_agg_map_version_t;
++typedef pan_uint8_t pan_agg_obj_state_t;
++typedef pan_uint8_t pan_agg_comp_state_t;
++typedef pan_uint8_t pan_agg_comp_flag_t;
 +
- static __be32
- nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
- {
-@@ -1345,11 +1482,19 @@ static nfsd4_dec nfsd41_dec_ops[] = {
- 	[OP_DESTROY_SESSION]	= (nfsd4_dec)nfsd4_decode_destroy_session,
- 	[OP_FREE_STATEID]	= (nfsd4_dec)nfsd4_decode_notsupp,
- 	[OP_GET_DIR_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
-+#if defined(CONFIG_PNFSD)
-+	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_getdevinfo,
-+	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_getdevlist,
-+	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_layoutcommit,
-+	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_layoutget,
-+	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_layoutreturn,
-+#else  /* CONFIG_PNFSD */
- 	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_notsupp,
- 	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_notsupp,
- 	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_notsupp,
- 	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_notsupp,
- 	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_notsupp,
-+#endif /* CONFIG_PNFSD */
- 	[OP_SECINFO_NO_NAME]	= (nfsd4_dec)nfsd4_decode_notsupp,
- 	[OP_SEQUENCE]		= (nfsd4_dec)nfsd4_decode_sequence,
- 	[OP_SET_SSV]		= (nfsd4_dec)nfsd4_decode_notsupp,
-@@ -1805,19 +1950,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- 				goto out_nfserr;
- 		}
- 	}
--	if ((buflen -= 16) < 0)
--		goto out_resource;
- 
- 	if (unlikely(bmval2)) {
-+		if ((buflen -= 16) < 0)
-+			goto out_resource;
- 		WRITE32(3);
- 		WRITE32(bmval0);
- 		WRITE32(bmval1);
- 		WRITE32(bmval2);
- 	} else if (likely(bmval1)) {
-+		if ((buflen -= 12) < 0)
-+			goto out_resource;
- 		WRITE32(2);
- 		WRITE32(bmval0);
- 		WRITE32(bmval1);
- 	} else {
-+		if ((buflen -= 8) < 0)
-+			goto out_resource;
- 		WRITE32(1);
- 		WRITE32(bmval0);
- 	}
-@@ -1828,15 +1977,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- 		u32 word1 = nfsd_suppattrs1(minorversion);
- 		u32 word2 = nfsd_suppattrs2(minorversion);
- 
--		if ((buflen -= 12) < 0)
--			goto out_resource;
- 		if (!aclsupport)
- 			word0 &= ~FATTR4_WORD0_ACL;
- 		if (!word2) {
-+			if ((buflen -= 12) < 0)
-+				goto out_resource;
- 			WRITE32(2);
- 			WRITE32(word0);
- 			WRITE32(word1);
- 		} else {
-+			if ((buflen -= 16) < 0)
-+				goto out_resource;
- 			WRITE32(3);
- 			WRITE32(word0);
- 			WRITE32(word1);
-@@ -2150,6 +2301,36 @@ out_acl:
- 		}
- 		WRITE64(stat.ino);
- 	}
-+#if defined(CONFIG_PNFSD)
-+	if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
-+		struct super_block *sb = dentry->d_inode->i_sb;
-+		int type = 0;
++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00)
++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01)
++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02)
++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03)
++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04)
++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05)
++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06)
++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07)
++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00)
++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01)
++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02)
++#define PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03)
++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00)
++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01)
++#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02)
++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04)
 +
-+		/* Query the filesystem for supported pNFS layout types.
-+		 * Currently, we only support one layout type per file system.
-+		 * The export_ops->layout_type() returns the pnfs_layouttype4.
-+		 */
-+		buflen -= 4;
-+		if (buflen < 0)		/* length */
-+			goto out_resource;
++struct pan_aggregation_map_s {
++  pan_agg_map_version_t  version;
++  pan_agg_obj_state_t    avail_state;
++  pan_stor_obj_id_t      obj_id;
++};
 +
-+		if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type)
-+			type = sb->s_pnfs_op->layout_type(sb);
-+		if (type) {
-+			if ((buflen -= 4) < 0)	/* type */
-+				goto out_resource;
-+			WRITE32(1); 	/* length */
-+			WRITE32(type);  /* type */
-+		} else
-+			WRITE32(0);  /* length */
-+	}
++typedef struct pan_aggregation_map_s pan_aggregation_map_t;
 +
-+	if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
-+		if ((buflen -= 4) < 0)
-+			goto out_resource;
-+		WRITE32(stat.blksize);
-+	}
-+#endif /* CONFIG_PNFSD */
- 	if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- 		WRITE32(3);
- 		WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
-@@ -2380,6 +2561,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
- 	if (!nfserr) {
- 		RESERVE_SPACE(8);
- 		WRITEMEM(commit->co_verf.data, 8);
-+		dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n",
-+			((u32 *)(&commit->co_verf.data))[0],
-+			((u32 *)(&commit->co_verf.data))[1]);
++struct pan_agg_comp_obj_s {
++  pan_stor_dev_id_t     dev_id;
++  pan_agg_comp_state_t  avail_state;
++  pan_agg_comp_flag_t   comp_flags;
++};
 +
- 		ADJUST_ARGS();
- 	}
- 	return nfserr;
-@@ -2634,6 +2819,13 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
- 	}
- 	read->rd_vlen = v;
- 
-+#if defined(CONFIG_SPNFS)
-+	if (spnfs_enabled())
-+		nfserr = spnfs_read(read->rd_fhp->fh_dentry->d_inode,
-+				    read->rd_offset, &maxcount, read->rd_vlen,
-+				    resp->rqstp);
-+	else /* we're not an MDS */
-+#endif /* CONFIG_SPNFS */
- 	nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
- 			read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
- 			&maxcount);
-@@ -2940,6 +3132,9 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
- 		WRITE32(write->wr_bytes_written);
- 		WRITE32(write->wr_how_written);
- 		WRITEMEM(write->wr_verifier.data, 8);
-+		dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n",
-+			((u32 *)(&write->wr_verifier.data))[0],
-+			((u32 *)(&write->wr_verifier.data))[1]);
- 		ADJUST_ARGS();
- 	}
- 	return nfserr;
-@@ -3083,6 +3278,343 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
- 	return 0;
- }
- 
-+#if defined(CONFIG_PNFSD)
++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t;
 +
-+/* Uses the export interface to iterate through the available devices
-+ * and encodes them on the response stream.
-+ */
-+static  __be32
-+nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp,
-+			      struct nfsd4_pnfs_getdevlist *gdevl,
-+			      unsigned int *dev_count)
-+{
-+	struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb;
-+	__be32 nfserr;
-+	int status;
-+	__be32 *p;
-+	struct nfsd4_pnfs_dev_iter_res res = {
-+		.gd_cookie = gdevl->gd_cookie,
-+		.gd_verf = gdevl->gd_verf,
-+		.gd_eof = 0
-+	};
-+	u64 sbid;
++struct pan_agg_simple_header_s {
++  pan_uint8_t  unused;
++};
 +
-+	dprintk("%s: Begin\n", __func__);
++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t;
 +
-+	sbid = find_create_sbid(sb);
-+	*dev_count = 0;
-+	do {
-+		status = sb->s_pnfs_op->get_device_iter(sb,
-+							gdevl->gd_layout_type,
-+							&res);
-+		if (status) {
-+			if (status == -ENOENT) {
-+				res.gd_eof = 1;
-+				/* return success */
-+				break;
-+			}
-+			nfserr = nfserrno(status);
-+			goto out_err;
-+		}
++struct pan_agg_raid1_header_s {
++  pan_uint16_t  num_comps;
++};
 +
-+		/* Encode device id and layout type */
-+		RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid));
-+		WRITE64((__be64)sbid);
-+		WRITE64(res.gd_devid);	/* devid minor */
-+		ADJUST_ARGS();
-+		(*dev_count)++;
-+	} while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof);
-+	gdevl->gd_cookie = res.gd_cookie;
-+	gdevl->gd_verf = res.gd_verf;
-+	gdevl->gd_eof = res.gd_eof;
-+	nfserr = nfs_ok;
-+out_err:
-+	dprintk("%s: Encoded %u devices\n", __func__, *dev_count);
-+	return nfserr;
-+}
++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t;
 +
-+/* Encodes the response of get device list.
-+*/
-+static __be32
-+nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr,
-+			struct nfsd4_pnfs_getdevlist *gdevl)
-+{
-+	unsigned int dev_count = 0, lead_count;
-+	u32 *p_in = resp->p;
-+	__be32 *p;
++struct pan_agg_raid0_header_s {
++  pan_uint16_t  num_comps;
++  pan_uint32_t  stripe_unit;
++};
 +
-+	dprintk("%s: err %d\n", __func__, nfserr);
-+	if (nfserr)
-+		return nfserr;
++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t;
 +
-+	/* Ensure we have room for cookie, verifier, and devlist len,
-+	 * which we will backfill in after we encode as many devices as possible
-+	 */
-+	lead_count = 8 + sizeof(nfs4_verifier) + 4;
-+	RESERVE_SPACE(lead_count);
-+	/* skip past these values */
-+	p += XDR_QUADLEN(lead_count);
-+	ADJUST_ARGS();
++struct pan_agg_raid5_left_header_s {
++  pan_uint16_t  num_comps;
++  pan_uint32_t  stripe_unit0;
++  pan_uint32_t  stripe_unit1;
++  pan_uint32_t  stripe_unit2;
++};
 +
-+	/* Iterate over as many device ids as possible on the xdr stream */
-+	nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count);
-+	if (nfserr)
-+		goto out_err;
++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t;
 +
-+	/* Backfill in cookie, verf and number of devices encoded */
-+	p = p_in;
-+	WRITE64(gdevl->gd_cookie);
-+	WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
-+	WRITE32(dev_count);
++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t;
 +
-+	/* Skip over devices */
-+	p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid));
-+	ADJUST_ARGS();
++struct pan_agg_grp_raid5_left_header_s {
++  pan_uint16_t  num_comps;
++  pan_uint32_t  stripe_unit;
++  pan_uint16_t  rg_width;
++  pan_uint16_t  rg_depth;
++  pan_uint8_t   group_layout_policy;
++};
 +
-+	/* are we at the end of devices? */
-+	RESERVE_SPACE(4);
-+	WRITE32(gdevl->gd_eof);
-+	ADJUST_ARGS();
++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00)
++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01)
 +
-+	dprintk("%s: done.\n", __func__);
++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00)
++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01)
++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02)
++#define PAN_AGG_RAID0 ((pan_agg_type_t) 0x03)
++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04)
++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06)
++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01)
++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06)
 +
-+	nfserr = nfs_ok;
-+out:
-+	return nfserr;
-+out_err:
-+	p = p_in;
-+	ADJUST_ARGS();
-+	goto out;
-+}
++struct pan_agg_layout_hdr_s {
++  pan_agg_type_t type;
++  pan_pad_t pad[3];
++  union {
++    pan_uint64_t                        null;
++    pan_agg_simple_header_t             simple;
++    pan_agg_raid1_header_t              raid1;
++    pan_agg_raid0_header_t              raid0;
++    pan_agg_raid5_left_header_t         raid5_left;
++    pan_agg_grp_raid5_left_header_t     grp_raid5_left;
++  } hdr;
++};
 +
-+/* For a given device id, have the file system retrieve and encode the
-+ * associated device.  For file layout, the encoding function is
-+ * passed down to the file system.  The file system then has the option
-+ * of using this encoding function or one of its own.
-+ *
-+ * Note: the file system must return the XDR size of struct device_addr4
-+ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the
-+ * gdir_mincount calculation.
++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t;
++
++struct pan_agg_comp_obj_a_s {
++  pan_rpc_arrdim_t size;
++  pan_agg_comp_obj_t *data;
++};
++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a;
++
++struct pan_agg_full_map_s {
++  pan_aggregation_map_t  map_hdr;
++  pan_agg_layout_hdr_t   layout_hdr;
++  pan_agg_comp_obj_a     components;
++};
++
++typedef struct pan_agg_full_map_s pan_agg_full_map_t;
++
++/*
++ * from pan_obsd_rpc_types.h
 + */
-+static __be32
-+nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
-+			struct nfsd4_pnfs_getdevinfo *gdev)
-+{
-+	struct super_block *sb;
-+	int maxcount = 0, type_notify_len = 12;
-+	__be32 *p, *p_save = NULL, *p_in = resp->p;
-+	struct exp_xdr_stream xdr;
++typedef pan_uint8_t pan_obsd_security_key_a[16];
 +
-+	dprintk("%s: err %d\n", __func__, nfserr);
-+	if (nfserr)
-+		return nfserr;
++typedef pan_uint8_t pan_obsd_capability_key_a[20];
 +
-+	sb = gdev->gd_sb;
++typedef pan_uint8_t pan_obsd_key_holder_id_t;
 +
-+	if (gdev->gd_maxcount != 0) {
-+		/* FIXME: this will be bound by the session max response */
-+		maxcount = svc_max_payload(resp->rqstp);
-+		if (maxcount > gdev->gd_maxcount)
-+			maxcount = gdev->gd_maxcount;
++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01)
++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02)
 +
-+		/* Ensure have room for type and notify field */
-+		maxcount -= type_notify_len;
-+		if (maxcount < 0) {
-+			nfserr = -ETOOSMALL;
-+			goto toosmall;
-+		}
-+	}
++struct pan_obsd_key_holder_s {
++  pan_obsd_key_holder_id_t select;
++  pan_pad_t pad[3];
++  union {
++    pan_obsd_security_key_a    basis_key;
++    pan_obsd_capability_key_a  cap_key;
++  } key;
++};
++
++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t;
++
++/*
++ * from pan_sm_sec.h
++ */
++typedef pan_uint8_t pan_sm_sec_type_t;
++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t;
++
++struct pan_obsd_capability_generic_otw_t_s {
++  pan_rpc_arrdim_t size;
++  pan_uint8_t *data;
++};
++typedef struct pan_obsd_capability_generic_otw_t_s
++				pan_obsd_capability_generic_otw_t;
++
++struct pan_sm_sec_obsd_s {
++  pan_obsd_key_holder_t              key;
++  pan_obsd_capability_generic_otw_t  cap_otw;
++  pan_sm_sec_otw_allo_mode_t         allo_mode;
++};
 +
-+	RESERVE_SPACE(4);
-+	WRITE32(gdev->gd_layout_type);
-+	ADJUST_ARGS();
++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t;
 +
-+	/* If maxcount is 0 then just update notifications */
-+	if (gdev->gd_maxcount == 0)
-+		goto handle_notifications;
++struct pan_sm_sec_s {
++  pan_sm_sec_type_t type;
++  pan_pad_t pad[3];
++  union {
++    pan_rpc_none_t     none;
++    pan_sm_sec_obsd_t  obsd;
++  } variant;
++};
 +
-+	xdr.p = p_save = resp->p;
-+	xdr.end = resp->end;
-+	if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3))
-+		xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3);
++typedef struct pan_sm_sec_s pan_sm_sec_t;
 +
-+	nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type,
-+						&gdev->gd_devid);
-+	if (nfserr)
-+		goto err;
++struct pan_sm_sec_a_s {
++  pan_rpc_arrdim_t size;
++  pan_sm_sec_t *data;
++};
++typedef struct pan_sm_sec_a_s pan_sm_sec_a;
++typedef pan_otw_t *pan_sm_sec_otw_t;
 +
-+	/* The file system should never write 0 bytes without
-+	 * returning an error
-+	 */
-+	BUG_ON(xdr.p == p_save);
-+	BUG_ON(xdr.p > xdr.end);
++/*
++ * from pan_sm_types.h
++ */
++typedef pan_uint64_t pan_sm_cap_handle_t;
 +
-+	/* Update the xdr stream with the number of bytes encoded
-+	 * by the file system.
-+	 */
-+	p = xdr.p;
-+	ADJUST_ARGS();
++struct pan_sm_map_cap_s {
++  pan_agg_full_map_t   full_map;
++  pan_stor_offset_t    offset;
++  pan_stor_len_t       length;
++  pan_sm_sec_a         secs;
++  pan_sm_cap_handle_t  handle;
++  pan_timespec_t       expiration_time;
++  pan_stor_action_t    action_mask;
++  pan_uint32_t         flags;
++};
 +
-+handle_notifications:
-+	/* Encode supported device notifications */
-+	RESERVE_SPACE(4);
-+	if (sb->s_pnfs_op->set_device_notify) {
-+		struct pnfs_devnotify_arg dn_args;
++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t;
 +
-+		dn_args.dn_layout_type = gdev->gd_layout_type;
-+		dn_args.dn_devid = gdev->gd_devid;
-+		dn_args.dn_notify_types = gdev->gd_notify_types;
-+		nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args);
-+		if (nfserr)
-+			goto err;
-+		WRITE32(dn_args.dn_notify_types);
-+	} else {
-+		WRITE32(0);
-+	}
-+	ADJUST_ARGS();
++/*
++ * from pan_sm_ops.h
++ */
++typedef pan_rpc_none_t pan_sm_cache_ptr_t;
 +
-+out:
-+	return nfserrno(nfserr);
-+toosmall:
-+	dprintk("%s: maxcount too small\n", __func__);
-+	RESERVE_SPACE(4);
-+	WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len);
-+	ADJUST_ARGS();
-+	goto out;
-+err:
-+	/* Rewind to the beginning */
-+	p = p_in;
-+	ADJUST_ARGS();
-+	if (nfserr == -ETOOSMALL)
-+		goto toosmall;
-+	printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr);
-+	goto out;
-+}
++/*
++ * from pan_sam_api.h
++ */
++typedef pan_uint32_t    pan_sam_access_flags_t;
 +
-+static __be32
-+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp,
-+		       __be32 nfserr,
-+		       struct nfsd4_pnfs_layoutget *lgp)
-+{
-+	int maxcount, leadcount;
-+	struct super_block *sb;
-+	struct exp_xdr_stream xdr;
-+	__be32 *p, *p_save, *p_start = resp->p;
++typedef struct pan_sam_dev_error_s  pan_sam_dev_error_t;
++struct pan_sam_dev_error_s {
++    pan_stor_dev_id_t       dev_id;
++    pan_stor_op_t           stor_op;
++    pan_status_t            error;
++};
 +
-+	dprintk("%s: err %d\n", __func__, nfserr);
-+	if (nfserr)
-+		return nfserr;
++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t;
++struct pan_sam_ext_status_s {
++    pan_uint32_t        available;
++    pan_uint32_t        size;
++    pan_sam_dev_error_t *errors;
++};
 +
-+	sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb;
-+	maxcount = PAGE_SIZE;
-+	if (maxcount > lgp->lg_maxcount)
-+		maxcount = lgp->lg_maxcount;
++enum pan_sam_rpc_sec_sel_e {
++    PAN_SAM_RPC_SEC_DEFAULT,
++    PAN_SAM_RPC_SEC_ATLEAST,
++    PAN_SAM_RPC_SEC_EXACTLY
++};
++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t;
 +
-+	/* Check for space on xdr stream */
-+	leadcount = 36 + sizeof(stateid_opaque_t);
-+	RESERVE_SPACE(leadcount);
-+	/* encode layout metadata after file system encodes layout */
-+	p += XDR_QUADLEN(leadcount);
-+	ADJUST_ARGS();
++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t;
++struct pan_sam_obj_sec_s {
++    pan_stor_sec_level_t    min_security;
++    pan_sm_map_cap_t        *map_ccaps;
++};
 +
-+	/* Ensure have room for ret_on_close, off, len, iomode, type */
-+	maxcount -= leadcount;
-+	if (maxcount < 0) {
-+		printk(KERN_ERR "%s: buffer too small\n", __func__);
-+		nfserr = nfserr_toosmall;
-+		goto err;
-+	}
++typedef struct  pan_sam_rpc_sec_s   pan_sam_rpc_sec_t;
++struct pan_sam_rpc_sec_s {
++    pan_sam_rpc_sec_sel_t   selector;
++};
 +
-+	/* Set xdr info so file system can encode layout */
-+	xdr.p = p_save = resp->p;
-+	xdr.end = resp->end;
-+	if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3))
-+		xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3);
++typedef struct pan_sam_read_args_s pan_sam_read_args_t;
++struct pan_sam_read_args_s {
++    pan_stor_obj_id_t                obj_id;
++    pan_sm_cache_ptr_t               obj_ent;
++    void                            *return_attr;
++    void                            *checksum;
++    pan_stor_offset_t                offset;
++    pan_uint16_t                     sm_options;
++    void                            *callout;
++    void                            *callout_arg;
++};
 +
-+	/* Retrieve, encode, and merge layout; process stateid */
-+	nfserr = nfs4_pnfs_get_layout(lgp, &xdr);
-+	if (nfserr)
-+		goto err;
++typedef struct pan_sam_read_res_s pan_sam_read_res_t;
++struct pan_sam_read_res_s {
++    pan_status_t             result;
++    pan_sam_ext_status_t     ext_status;
++    pan_stor_len_t           length;
++    void                    *attr;
++    void                    *checksum;
++};
 +
-+	/* Ensure file system returned enough bytes for the client
-+	 * to access.
-+	 */
-+	if (lgp->lg_seg.length < lgp->lg_minlength) {
-+		nfserr = nfserr_badlayout;
-+		goto err;
-+	}
++typedef void (*pan_sam_read_cb_t)(
++    void                *user_arg1,
++    void                *user_arg2,
++    pan_sam_read_res_t  *res_p,
++    pan_status_t        status);
 +
-+	/* The file system should never write 0 bytes without
-+	 * returning an error
-+	 */
-+	BUG_ON(xdr.p == p_save);
++#define PAN_SAM_ACCESS_NONE                             0x0000
++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP                 0x0020
 +
-+	/* Rewind to beginning and encode attrs */
-+	resp->p = p_start;
-+	RESERVE_SPACE(4);
-+	WRITE32(lgp->lg_roc);	/* return on close */
-+	ADJUST_ARGS();
-+	nfsd4_encode_stateid(resp, &lgp->lg_sid);
-+	RESERVE_SPACE(28);
-+	/* Note: response logr_layout array count, always one for now */
-+	WRITE32(1);
-+	WRITE64(lgp->lg_seg.offset);
-+	WRITE64(lgp->lg_seg.length);
-+	WRITE32(lgp->lg_seg.iomode);
-+	WRITE32(lgp->lg_seg.layout_type);
++typedef struct pan_sam_write_args_s pan_sam_write_args_t;
++struct pan_sam_write_args_s {
++    pan_stor_obj_id_t   obj_id;
++    pan_sm_cache_ptr_t  obj_ent;
++    pan_stor_offset_t   offset;
++    void                *attr;
++    void                *return_attr;
++};
 +
-+	/* Update the xdr stream with the number of bytes written
-+	 * by the file system
-+	 */
-+	p = xdr.p;
-+	ADJUST_ARGS();
++typedef struct pan_sam_write_res_s pan_sam_write_res_t;
++struct pan_sam_write_res_s {
++    pan_status_t            result;
++    pan_sam_ext_status_t    ext_status;
++    pan_stor_len_t          length;
++    pan_stor_delta_len_t    delta_capacity_used;
++    pan_bool_t              parity_dirty;
++    void                   *attr;
++};
 +
-+	return nfs_ok;
-+err:
-+	resp->p = p_start;
-+	return nfserr;
-+}
++typedef void (*pan_sam_write_cb_t)(
++    void                *user_arg1,
++    void                *user_arg2,
++    pan_sam_write_res_t *res_p,
++    pan_status_t        status);
 +
-+static __be32
-+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
-+			  struct nfsd4_pnfs_layoutcommit *lcp)
-+{
-+	__be32 *p;
++/*
++ * from pan_mgr_types.h
++ */
++#define PAN_MGR_ID_TYPE_SHIFT 56
++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL)
++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL)
 +
-+	if (nfserr)
-+		goto out;
++typedef pan_uint16_t pan_mgr_type_t;
++typedef pan_uint64_t pan_mgr_id_t;
 +
-+	RESERVE_SPACE(4);
-+	WRITE32(lcp->res.lc_size_chg);
-+	ADJUST_ARGS();
-+	if (lcp->res.lc_size_chg) {
-+		RESERVE_SPACE(8);
-+		WRITE64(lcp->res.lc_newsize);
-+		ADJUST_ARGS();
-+	}
-+out:
-+	return nfserr;
++#define PAN_MGR_SM ((pan_mgr_type_t) 2U)
++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U)
++
++/*
++ * from pan_mgr_types_c.h
++ */
++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \
++  pan_mgr_id_t  _id1, _id2; \
++\
++  _id1 = (_mgr_type_); \
++  _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \
++  _id1 &= PAN_MGR_ID_TYPE_MASK; \
++  _id2 = (_mgr_uniq_); \
++  _id2 &= PAN_MGR_ID_UNIQ_MASK; \
++  _id1 |= _id2; \
++  *(_mgr_id_p_) = _id1; \
 +}
 +
-+static __be32
-+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
-+			  struct nfsd4_pnfs_layoutreturn *lrp)
-+{
-+	__be32 *p;
++/*
++ * from pan_storage_c.h
++ */
++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \
++    ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \
++	== PAN_MGR_OBSD)
 +
-+	if (nfserr)
-+		goto out;
++/*
++ * pnfs_shim internal definitions
++ */
 +
-+	RESERVE_SPACE(4);
-+	WRITE32(lrp->lrs_present != 0);    /* got stateid? */
-+	ADJUST_ARGS();
-+	if (lrp->lrs_present)
-+		nfsd4_encode_stateid(resp, &lrp->lr_sid);
-+out:
-+	return nfserr;
-+}
-+#endif /* CONFIG_PNFSD */
++struct panfs_shim_io_state {
++	struct objlayout_io_state ol_state;
 +
- static __be32
- nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
- {
-@@ -3143,11 +3675,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
- 	[OP_DESTROY_SESSION]	= (nfsd4_enc)nfsd4_encode_destroy_session,
- 	[OP_FREE_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
- 	[OP_GET_DIR_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
-+#if defined(CONFIG_PNFSD)
-+	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_getdevinfo,
-+	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_getdevlist,
-+	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_layoutcommit,
-+	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_layoutget,
-+	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_layoutreturn,
-+#else  /* CONFIG_PNFSD */
- 	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_noop,
- 	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_noop,
- 	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_noop,
- 	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_noop,
- 	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
-+#endif /* CONFIG_PNFSD */
- 	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_noop,
- 	[OP_SEQUENCE]		= (nfsd4_enc)nfsd4_encode_sequence,
- 	[OP_SET_SSV]		= (nfsd4_enc)nfsd4_encode_noop,
-diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
-index b53b1d0..1bbd9c2 100644
---- a/fs/nfsd/nfsctl.c
-+++ b/fs/nfsd/nfsctl.c
-@@ -13,10 +13,15 @@
- #include <linux/nfsd/syscall.h>
- #include <linux/lockd/lockd.h>
- #include <linux/sunrpc/clnt.h>
-+#include <linux/nfsd/nfs4pnfsdlm.h>
- 
- #include "nfsd.h"
- #include "cache.h"
- 
-+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
-+#include <linux/nfsd4_spnfs.h>
-+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
++	pan_sg_entry_t *sg_list;
++	pan_sam_obj_sec_t obj_sec;
++	void *ucreds;
++	union {
++		struct {
++			pan_sam_read_args_t args;
++			pan_sam_read_res_t res;
++		} read;
++		struct {
++			pan_sam_write_args_t args;
++			pan_sam_write_res_t res;
++		} write;
++	} u;
++};
 +
- /*
-  *	We have a single directory with 9 nodes in it.
-  */
-@@ -49,6 +54,9 @@ enum {
- 	NFSD_Gracetime,
- 	NFSD_RecoveryDir,
- #endif
-+#ifdef CONFIG_PNFSD
-+	NFSD_pnfs_dlm_device,
-+#endif
- };
- 
- /*
-@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
- static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
- static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
- #endif
-+#ifdef CONFIG_PNFSD
-+static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size);
-+#endif
- 
- static ssize_t (*write_op[])(struct file *, char *, size_t) = {
- 	[NFSD_Svc] = write_svc,
-@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
- 	[NFSD_Gracetime] = write_gracetime,
- 	[NFSD_RecoveryDir] = write_recoverydir,
- #endif
-+#ifdef CONFIG_PNFSD
-+	[NFSD_pnfs_dlm_device] = write_pnfs_dlm_device,
-+#endif
- };
- 
- static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
-@@ -1347,6 +1361,68 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
- 
- #endif
- 
-+#ifdef CONFIG_PNFSD
++#endif /* _PANLAYOUT_PANFS_SHIM_H */
+diff -up linux-2.6.37.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.37.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+--- linux-2.6.37.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig	2011-01-28 09:43:53.337772132 -0500
++++ linux-2.6.37.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c	2011-01-28 09:43:53.337772132 -0500
+@@ -0,0 +1,435 @@
++/*
++ *  pnfs_osd_xdr.c
++ *
++ *  Object-Based pNFS Layout XDR layer
++ *
++ *  Copyright (C) 2007-2009 Panasas Inc.
++ *  All rights reserved.
++ *
++ *  Benny Halevy <bhalevy at panasas.com>
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License version 2
++ *  See the file COPYING included with this distribution for more details.
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the Panasas company nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
 +
-+static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf,
-+				       size_t size)
-+{
-+	char *mesg = buf;
-+	char *pnfs_dlm_device;
-+	int max_size = NFSD_PNFS_DLM_DEVICE_MAX;
-+	int len, ret = 0;
++#include <linux/pnfs_osd_xdr.h>
 +
-+	if (size > 0) {
-+		ret = -EINVAL;
-+		if (size > max_size || buf[size-1] != '\n')
-+			return ret;
-+		buf[size-1] = 0;
++#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 +
-+		pnfs_dlm_device = mesg;
-+		len = qword_get(&mesg, pnfs_dlm_device, size);
-+		if (len <= 0)
-+			return ret;
++/*
++ * The following implementation is based on these Internet Drafts:
++ *
++ * draft-ietf-nfsv4-minorversion-21
++ * draft-ietf-nfsv4-pnfs-obj-12
++ */
 +
-+		ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len);
-+	} else
-+		return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT);
++/*
++ * struct pnfs_osd_objid {
++ * 	struct pnfs_deviceid	oid_device_id;
++ * 	u64			oid_partition_id;
++ * 	u64			oid_object_id;
++ * };
++ */
++static inline u32 *
++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid)
++{
++	COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data));
++	READ64(objid->oid_partition_id);
++	READ64(objid->oid_object_id);
++	return p;
++}
 +
-+	return ret <= 0 ? ret : strlen(buf);
++static inline u32 *
++pnfs_osd_xdr_decode_opaque_cred(u32 *p,
++				struct pnfs_osd_opaque_cred *opaque_cred)
++{
++	READ32(opaque_cred->cred_len);
++	COPYMEM(opaque_cred->cred, opaque_cred->cred_len);
++	return p;
 +}
 +
-+/**
-+ * write_pnfs_dlm_device - Set or report the current pNFS data server list
-+ *
-+ * Input:
-+ *			buf:		ignored
-+ *			size:		zero
-+ *
-+ * OR
-+ *
-+ * Input:
-+ *			buf:		C string containing a block device name,
-+ *					a colon, and then a comma separated
-+ *					list of pNFS data server IPv4 addresses
-+ *			size:		non-zero length of C string in @buf
-+ * Output:
-+ *	On success:	passed-in buffer filled with '\n'-terminated C
-+ *			string containing a block device name, a colon, and
-+ *			then a comma separated list of pNFS
-+ *			data server IPv4 addresses.
-+ *			return code is the size in bytes of the string
-+ *	On error:	return code is a negative errno value
++/*
++ * struct pnfs_osd_object_cred {
++ * 	struct pnfs_osd_objid		oc_object_id;
++ * 	u32				oc_osd_version;
++ * 	u32				oc_cap_key_sec;
++ * 	struct pnfs_osd_opaque_cred	oc_cap_key
++ * 	struct pnfs_osd_opaque_cred	oc_cap;
++ * };
 + */
-+static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size)
++static inline u32 *
++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp,
++				u8 **credp)
 +{
-+	ssize_t rv;
++	u8 *cred;
 +
-+	mutex_lock(&nfsd_mutex);
-+	rv = __write_pnfs_dlm_device(file, buf, size);
-+	mutex_unlock(&nfsd_mutex);
-+	return rv;
-+}
++	p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id);
++	READ32(comp->oc_osd_version);
++	READ32(comp->oc_cap_key_sec);
 +
-+#endif /* CONFIG_PNFSD */
++	cred = *credp;
++	comp->oc_cap_key.cred = cred;
++	p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key);
++	cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len));
++	comp->oc_cap.cred = cred;
++	p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap);
++	cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len));
++	*credp = cred;
 +
- /*----------------------------------------------------------------------------*/
- /*
-  *	populating the filesystem.
-@@ -1381,6 +1457,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
- 		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
- 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
- #endif
-+#ifdef CONFIG_PNFSD
-+		[NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops,
-+					   S_IWUSR|S_IRUSR},
-+#endif
- 		/* last one */ {""}
- 	};
- 	return simple_fill_super(sb, 0x6e667364, nfsd_files);
-@@ -1419,6 +1499,9 @@ static int create_proc_exports_entry(void)
- }
- #endif
- 
-+#if defined(CONFIG_SPNFS_BLOCK)
-+int nfsd_bl_init(void);
-+#endif
- static int __init init_nfsd(void)
- {
- 	int retval;
-@@ -1441,6 +1524,15 @@ static int __init init_nfsd(void)
- 	retval = create_proc_exports_entry();
- 	if (retval)
- 		goto out_free_idmap;
-+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
-+	retval = spnfs_init_proc();
-+	if (retval != 0)
-+		goto out_free_idmap;
-+#if defined(CONFIG_SPNFS_BLOCK)
-+	nfsd_bl_init();
-+#endif /* CONFIG_SPNFS_BLOCK */
-+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
++	return p;
++}
 +
- 	retval = register_filesystem(&nfsd_fs_type);
- 	if (retval)
- 		goto out_free_all;
-@@ -1463,7 +1555,22 @@ out_free_stat:
- 
- static void __exit exit_nfsd(void)
- {
-+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
-+	remove_proc_entry("fs/nfs/spnfs/recall", NULL);
-+	remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL);
-+	remove_proc_entry("fs/nfs/spnfs/getfh", NULL);
-+	remove_proc_entry("fs/nfs/spnfs/config", NULL);
-+	remove_proc_entry("fs/nfs/spnfs/ctl", NULL);
-+	remove_proc_entry("fs/nfs/spnfs", NULL);
-+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
++/*
++ * struct pnfs_osd_data_map {
++ * 	u32	odm_num_comps;
++ * 	u64	odm_stripe_unit;
++ * 	u32	odm_group_width;
++ * 	u32	odm_group_depth;
++ * 	u32	odm_mirror_cnt;
++ * 	u32	odm_raid_algorithm;
++ * };
++ */
++static inline u32 *
++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map)
++{
++	READ32(data_map->odm_num_comps);
++	READ64(data_map->odm_stripe_unit);
++	READ32(data_map->odm_group_width);
++	READ32(data_map->odm_group_depth);
++	READ32(data_map->odm_mirror_cnt);
++	READ32(data_map->odm_raid_algorithm);
++	dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
++		"odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
++		__func__,
++		data_map->odm_num_comps,
++		(unsigned long long)data_map->odm_stripe_unit,
++		data_map->odm_group_width,
++		data_map->odm_group_depth,
++		data_map->odm_mirror_cnt,
++		data_map->odm_raid_algorithm);
++	return p;
++}
 +
-+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
-+	remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL);
-+	remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL);
-+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */
++struct pnfs_osd_layout *
++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p)
++{
++	int i;
++	u32 *start = p;
++	struct pnfs_osd_object_cred *comp;
++	u8 *cred;
 +
- 	nfsd_export_shutdown();
-+	nfsd4_pnfs_dlm_shutdown();
- 	nfsd_reply_cache_shutdown();
- 	remove_proc_entry("fs/nfs/exports", NULL);
- 	remove_proc_entry("fs/nfs", NULL);
-diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
-index b76ac3a..cef6770 100644
---- a/fs/nfsd/nfsd.h
-+++ b/fs/nfsd/nfsd.h
-@@ -286,11 +286,22 @@ extern time_t nfsd4_grace;
- #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
- 	NFSD4_SUPPORTED_ATTRS_WORD0
- 
-+#if defined(CONFIG_PNFSD)
-+#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
-+	(NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES)
-+#else /* CONFIG_PNFSD */
- #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
- 	NFSD4_SUPPORTED_ATTRS_WORD1
-+#endif /* CONFIG_PNFSD */
- 
-+#if defined(CONFIG_PNFSD)
-+#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
-+	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \
-+	 FATTR4_WORD2_LAYOUT_BLKSIZE)
-+#else /* CONFIG_PNFSD */
- #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
- 	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
-+#endif /* CONFIG_PNFSD */
- 
- static inline u32 nfsd_suppattrs0(u32 minorversion)
- {
-diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
-index 55c8e63..544c957 100644
---- a/fs/nfsd/nfsfh.c
-+++ b/fs/nfsd/nfsfh.c
-@@ -10,6 +10,7 @@
- #include <linux/exportfs.h>
- 
- #include <linux/sunrpc/svcauth_gss.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
- #include "nfsd.h"
- #include "vfs.h"
- #include "auth.h"
-@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
- static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
- {
- 	struct knfsd_fh	*fh = &fhp->fh_handle;
-+	int fsid_type;
- 	struct fid *fid = NULL, sfid;
- 	struct svc_export *exp;
- 	struct dentry *dentry;
-@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
- 			return error;
- 		if (fh->fh_auth_type != 0)
- 			return error;
--		len = key_len(fh->fh_fsid_type) / 4;
-+		fsid_type = pnfs_fh_fsid_type(fh);
-+		len = key_len(fsid_type) / 4;
- 		if (len == 0)
- 			return error;
- 		if  (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
-@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
- 		data_left -= len;
- 		if (data_left < 0)
- 			return error;
--		exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth);
-+		exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth);
- 		fid = (struct fid *)(fh->fh_auth + len);
- 	} else {
- 		__u32 tfh[2];
-diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
-index c16f8d8..4263812 100644
---- a/fs/nfsd/nfsfh.h
-+++ b/fs/nfsd/nfsfh.h
-@@ -14,6 +14,7 @@ enum nfsd_fsid {
- 	FSID_UUID8,
- 	FSID_UUID16,
- 	FSID_UUID16_INUM,
-+	FSID_MAX
- };
- 
- enum fsid_source {
-@@ -203,4 +204,42 @@ fh_unlock(struct svc_fh *fhp)
- 	}
- }
- 
-+#if defined(CONFIG_PNFSD)
++	p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map);
++	READ32(layout->olo_comps_index);
++	READ32(layout->olo_num_comps);
++	layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1);
++	comp = layout->olo_comps;
++	cred = (u8 *)(comp + layout->olo_num_comps);
++	dprintk("%s: comps_index=%u num_comps=%u\n",
++		__func__, layout->olo_comps_index, layout->olo_num_comps);
++	for (i = 0; i < layout->olo_num_comps; i++) {
++		p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred);
++		dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx "
++			"key_len=%u cap_len=%u\n",
++			__func__, i,
++			_DEVID_LO(&comp->oc_object_id.oid_device_id),
++			_DEVID_HI(&comp->oc_object_id.oid_device_id),
++			comp->oc_object_id.oid_partition_id,
++			comp->oc_object_id.oid_object_id,
++			comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
++		comp++;
++	}
++	dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__,
++	       (char *)p - (char *)start, cred, (char *)cred - (char *)layout);
++	return layout;
++}
 +
 +/*
-+ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied
-+ * to a DS by LAYOUTGET.  nfs4_preprocess_stateid_op() uses this to decide how
-+ * to handle a given stateid.
++ * Get Device Information Decoding
++ *
++ * Note: since Device Information is currently done synchronously, most
++ *       of the actual fields are left inside the rpc buffer and are only
++ *       pointed to by the pnfs_osd_deviceaddr members. So the read buffer
++ *       should not be freed while the returned information is in use.
 + */
-+static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
-+{
-+	return fh->fh_fsid_type >= FSID_MAX;
-+}
 +
-+static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh)
++u32 *__xdr_read_calc_nfs4_string(
++	u32 *p, struct nfs4_string *str, u8 **freespace)
 +{
-+	BUG_ON(fh->fh_version != 1);
-+	BUG_ON(pnfs_fh_is_ds(fh));
-+	fh->fh_fsid_type += FSID_MAX;
++	u32 len;
++	char *data;
++	bool need_copy;
++
++	READ32(len);
++	data = (char *)p;
++
++	if (data[len]) { /* Not null terminated we'll need extra space */
++		data = *freespace;
++		*freespace += len + 1;
++		need_copy = true;
++	} else {
++		need_copy = false;
++	}
++
++	if (str) {
++		str->len = len;
++		str->data = data;
++		if (need_copy) {
++			memcpy(data, p, len);
++			data[len] = 0;
++		}
++	}
++
++	p += XDR_QUADLEN(len);
++	return p;
 +}
 +
-+#else  /* CONFIG_PNFSD */
-+
-+static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
++u32 *__xdr_read_calc_u8_opaque(
++	u32 *p, struct nfs4_string *str)
 +{
-+	return 0;
-+}
++	u32 len;
 +
-+#endif /* CONFIG_PNFSD */
++	READ32(len);
 +
-+/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). */
-+static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh)
-+{
-+	int fsid_type = fh->fh_fsid_type;
++	if (str) {
++		str->len = len;
++		str->data = (char *)p;
++	}
 +
-+	if (pnfs_fh_is_ds(fh))
-+		return fsid_type - FSID_MAX;
-+	return fsid_type;
++	p += XDR_QUADLEN(len);
++	return p;
 +}
 +
- #endif /* _LINUX_NFSD_FH_INT_H */
-diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
-index e2c4346..d1262ec 100644
---- a/fs/nfsd/nfssvc.c
-+++ b/fs/nfsd/nfssvc.c
-@@ -115,7 +115,7 @@ struct svc_program		nfsd_program = {
- 
- };
- 
--u32 nfsd_supported_minorversion;
-+u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION;
- 
- int nfsd_vers(int vers, enum vers_op change)
- {
-diff --git a/fs/nfsd/pnfsd.h b/fs/nfsd/pnfsd.h
-new file mode 100644
-index 0000000..a181bc3
---- /dev/null
-+++ b/fs/nfsd/pnfsd.h
-@@ -0,0 +1,143 @@
 +/*
-+ *  Copyright (c) 2005 The Regents of the University of Michigan.
-+ *  All rights reserved.
-+ *
-+ *  Andy Adamson <andros at umich.edu>
-+ *
-+ *  Redistribution and use in source and binary forms, with or without
-+ *  modification, are permitted provided that the following conditions
-+ *  are met:
-+ *
-+ *  1. Redistributions of source code must retain the above copyright
-+ *     notice, this list of conditions and the following disclaimer.
-+ *  2. Redistributions in binary form must reproduce the above copyright
-+ *     notice, this list of conditions and the following disclaimer in the
-+ *     documentation and/or other materials provided with the distribution.
-+ *  3. Neither the name of the University nor the names of its
-+ *     contributors may be used to endorse or promote products derived
-+ *     from this software without specific prior written permission.
-+ *
-+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
++ * struct pnfs_osd_targetid {
++ * 	u32			oti_type;
++ * 	struct nfs4_string	oti_scsi_device_id;
++ * };
 + */
++u32 *__xdr_read_calc_targetid(
++	u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace)
++{
++	u32 oti_type;
 +
-+#ifndef LINUX_NFSD_PNFSD_H
-+#define LINUX_NFSD_PNFSD_H
-+
-+#include <linux/list.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+
-+#include "state.h"
-+#include "xdr4.h"
-+
-+/* outstanding layout stateid */
-+struct nfs4_layout_state {
-+	struct list_head	ls_perfile;
-+	struct list_head	ls_layouts; /* list of nfs4_layouts */
-+	struct kref		ls_ref;
-+	struct nfs4_client	*ls_client;
-+	struct nfs4_file	*ls_file;
-+	stateid_t		ls_stateid;
-+};
-+
-+/* outstanding layout */
-+struct nfs4_layout {
-+	struct list_head		lo_perfile;	/* hash by f_id */
-+	struct list_head		lo_perclnt;	/* hash by clientid */
-+	struct list_head		lo_perstate;
-+	struct nfs4_file		*lo_file;	/* backpointer */
-+	struct nfs4_client		*lo_client;
-+	struct nfs4_layout_state	*lo_state;
-+	struct nfsd4_layout_seg 	lo_seg;
-+};
++	READ32(oti_type);
++	if (targetid)
++		targetid->oti_type = oti_type;
 +
-+struct pnfs_inval_state {
-+	struct knfsd_fh		mdsfh; /* needed only by invalidate all */
-+	stateid_t		stid;
-+	clientid_t		clid;
-+	u32			status;
-+};
++	switch (oti_type) {
++	case OBJ_TARGET_SCSI_NAME:
++	case OBJ_TARGET_SCSI_DEVICE_ID:
++		p = __xdr_read_calc_u8_opaque(p,
++			targetid ? &targetid->oti_scsi_device_id : NULL);
++	}
 +
-+/* pNFS Data Server state */
-+#define DS_STATEID_VALID   0
-+#define DS_STATEID_ERROR   1
-+#define DS_STATEID_NEW     2
++	return p;
++}
 +
-+struct pnfs_ds_stateid {
-+	struct list_head	ds_hash;        /* ds_stateid hash entry */
-+	struct list_head	ds_perclid;     /* per client hash entry */
-+	stateid_t		ds_stid;
-+	struct knfsd_fh		ds_fh;
-+	unsigned long		ds_access;
-+	u32			ds_status;      /* from MDS */
-+	u32			ds_verifier[2]; /* from MDS */
-+	wait_queue_head_t	ds_waitq;
-+	unsigned long		ds_flags;
-+	struct kref		ds_ref;
-+	clientid_t		ds_mdsclid;
-+};
++/*
++ * struct pnfs_osd_net_addr {
++ * 	struct nfs4_string	r_netid;
++ * 	struct nfs4_string	r_addr;
++ * };
++ */
++u32 *__xdr_read_calc_net_addr(
++	u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace)
++{
 +
-+struct pnfs_ds_clientid {
-+	struct list_head	dc_hash;        /* mds_clid_hashtbl entry */
-+	struct list_head	dc_stateid;     /* ds_stateid head */
-+	struct list_head	dc_permdsid;    /* per mdsid hash entry */
-+	clientid_t		dc_mdsclid;
-+	struct kref		dc_ref;
-+	uint32_t		dc_mdsid;
-+};
++	p = __xdr_read_calc_nfs4_string(p,
++			netaddr ? &netaddr->r_netid : NULL,
++			freespace);
 +
-+struct pnfs_mds_id {
-+	struct list_head	di_hash;        /* mds_nodeid list entry */
-+	struct list_head	di_mdsclid;     /* mds_clientid head */
-+	uint32_t		di_mdsid;
-+	time_t			di_mdsboot;	/* mds boot time */
-+	struct kref		di_ref;
-+};
++	p = __xdr_read_calc_nfs4_string(p,
++			netaddr ? &netaddr->r_addr : NULL,
++			freespace);
 +
-+/* notify device request (from exported filesystem) */
-+struct nfs4_notify_device {
-+	struct nfsd4_pnfs_cb_dev_list  *nd_list;
-+	struct nfs4_client	       *nd_client;
-+	struct list_head	        nd_perclnt;
++	return p;
++}
 +
-+	void				*nd_args;	/* nfsd internal */
-+};
++/*
++ * struct pnfs_osd_targetaddr {
++ * 	u32				ota_available;
++ * 	struct pnfs_osd_net_addr	ota_netaddr;
++ * };
++ */
++u32 *__xdr_read_calc_targetaddr(
++	u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace)
++{
++	u32 ota_available;
 +
-+u64 find_create_sbid(struct super_block *);
-+struct super_block *find_sbid_id(u64);
-+__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *);
-+int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *,
-+					struct nfsd4_pnfs_layoutreturn *);
-+int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *);
-+int nfs4_pnfs_cb_change_state(struct pnfs_get_state *);
-+void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
-+int put_layoutrecall(struct nfs4_layoutrecall *);
-+void nomatching_layout(struct nfs4_layoutrecall *);
-+void *layoutrecall_done(struct nfs4_layoutrecall *);
-+int nfsd4_cb_layout(struct nfs4_layoutrecall *);
-+int nfsd_layout_recall_cb(struct super_block *, struct inode *,
-+			  struct nfsd4_pnfs_cb_layout *);
-+int nfsd_device_notify_cb(struct super_block *,
-+			  struct nfsd4_pnfs_cb_dev_list *);
-+int nfsd4_cb_notify_device(struct nfs4_notify_device *);
-+void pnfs_set_device_notify(clientid_t *, unsigned int types);
-+void pnfs_clear_device_notify(struct nfs4_client *);
++	READ32(ota_available);
++	if (targetaddr)
++		targetaddr->ota_available = ota_available;
 +
-+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
-+extern struct sockaddr pnfsd_lexp_addr;
-+extern size_t pnfs_lexp_addr_len;
++	if (ota_available) {
++		p = __xdr_read_calc_net_addr(p,
++				targetaddr ? &targetaddr->ota_netaddr : NULL,
++				freespace);
++	}
 +
-+extern void pnfsd_lexp_init(struct inode *);
-+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
++	return p;
++}
 +
-+#endif /* LINUX_NFSD_PNFSD_H */
-diff --git a/fs/nfsd/pnfsd_lexp.c b/fs/nfsd/pnfsd_lexp.c
-new file mode 100644
-index 0000000..bf2f403
---- /dev/null
-+++ b/fs/nfsd/pnfsd_lexp.c
-@@ -0,0 +1,225 @@
 +/*
-+ * linux/fs/nfsd/pnfs_lexp.c
-+ *
-+ * pNFS export of local filesystems.
-+ *
-+ * Export local file systems over the files layout type.
-+ * The MDS (metadata server) functions also as a single DS (data server).
-+ * This is mostly useful for development and debugging purposes.
-+ *
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2 of the License, or
-+ * (at your option) any later version.
-+ *
-+ * Copyright (C) 2008 Benny Halevy, <bhalevy at panasas.com>
-+ *
-+ * Initial implementation was based on the pnfs-gfs2 patches done
-+ * by David M. Richter <richterd at citi.umich.edu>
++ * struct pnfs_osd_deviceaddr {
++ * 	struct pnfs_osd_targetid	oda_targetid;
++ * 	struct pnfs_osd_targetaddr	oda_targetaddr;
++ * 	u8				oda_lun[8];
++ * 	struct nfs4_string		oda_systemid;
++ * 	struct pnfs_osd_object_cred	oda_root_obj_cred;
++ * 	struct nfs4_string		oda_osdname;
++ * };
 + */
++u32 *__xdr_read_calc_deviceaddr(
++	u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace)
++{
++	p = __xdr_read_calc_targetid(p,
++			deviceaddr ? &deviceaddr->oda_targetid : NULL,
++			freespace);
 +
-+#include <linux/sunrpc/svc_xprt.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
++	p = __xdr_read_calc_targetaddr(p,
++			deviceaddr ? &deviceaddr->oda_targetaddr : NULL,
++			freespace);
++
++	if (deviceaddr)
++		COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun));
++	else
++		p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun));
 +
-+#include "pnfsd.h"
++	p = __xdr_read_calc_u8_opaque(p,
++			deviceaddr ? &deviceaddr->oda_systemid : NULL);
 +
-+#define NFSDDBG_FACILITY NFSDDBG_PNFS
++	if (deviceaddr) {
++		p = pnfs_osd_xdr_decode_object_cred(p,
++				&deviceaddr->oda_root_obj_cred, freespace);
++	} else {
++		*freespace += pnfs_osd_object_cred_incore_sz(p);
++		p += pnfs_osd_object_cred_xdr_sz(p);
++	}
 +
-+struct sockaddr pnfsd_lexp_addr;
-+size_t pnfs_lexp_addr_len;
++	p = __xdr_read_calc_u8_opaque(p,
++			deviceaddr ? &deviceaddr->oda_osdname : NULL);
 +
-+static int
-+pnfsd_lexp_layout_type(struct super_block *sb)
++	return p;
++}
++
++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p)
 +{
-+	int ret = LAYOUT_NFSV4_1_FILES;
-+	dprintk("<-- %s: return %d\n", __func__, ret);
-+	return ret;
++	u8 *null_freespace = NULL;
++	size_t sz;
++
++	__xdr_read_calc_deviceaddr(p, NULL, &null_freespace);
++	sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace;
++
++	return sz;
 +}
 +
-+static int
-+pnfsd_lexp_get_device_iter(struct super_block *sb,
-+			   u32 layout_type,
-+			   struct nfsd4_pnfs_dev_iter_res *res)
++void pnfs_osd_xdr_decode_deviceaddr(
++	struct pnfs_osd_deviceaddr *deviceaddr, u32 *p)
 +{
-+	dprintk("--> %s: sb=%p\n", __func__, sb);
++	u8 *freespace = (u8 *)(deviceaddr + 1);
 +
-+	BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
++	__xdr_read_calc_deviceaddr(p, deviceaddr, &freespace);
++}
 +
-+	res->gd_eof = 1;
-+	if (res->gd_cookie)
-+		return -ENOENT;
-+	res->gd_cookie = 1;
-+	res->gd_verf = 1;
-+	res->gd_devid = 1;
++/*
++ * struct pnfs_osd_layoutupdate {
++ * 	u32	dsu_valid;
++ * 	s64	dsu_delta;
++ * 	u32	olu_ioerr_flag;
++ * };
++ */
++int
++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
++				 struct pnfs_osd_layoutupdate *lou)
++{
++	__be32 *p = xdr_reserve_space(xdr, 16);
 +
-+	dprintk("<-- %s: return 0\n", __func__);
++	if (!p)
++		return -E2BIG;
++
++	*p++ = cpu_to_be32(lou->dsu_valid);
++	if (lou->dsu_valid)
++		p = xdr_encode_hyper(p, lou->dsu_delta);
++	*p++ = cpu_to_be32(lou->olu_ioerr_flag);
 +	return 0;
 +}
 +
-+static int
-+pnfsd_lexp_get_device_info(struct super_block *sb,
-+			   struct exp_xdr_stream *xdr,
-+			   u32 layout_type,
-+			   const struct nfsd4_pnfs_deviceid *devid)
++/*
++ * struct pnfs_osd_objid {
++ * 	struct pnfs_deviceid	oid_device_id;
++ * 	u64			oid_partition_id;
++ * 	u64			oid_object_id;
++ */
++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr,
++					    struct pnfs_osd_objid *object_id)
 +{
-+	int err;
-+	struct pnfs_filelayout_device fdev;
-+	struct pnfs_filelayout_multipath fl_devices[1];
-+	u32 fl_stripe_indices[1] = { 0 };
-+	struct pnfs_filelayout_devaddr daddr;
-+	/* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */
-+	char daddr_buf[8*4 + 2*3 + 10];
++	__be32 *p;
 +
-+	dprintk("--> %s: sb=%p\n", __func__, sb);
++	p = xdr_reserve_space(xdr, 32);
++	if (!p)
++		return -E2BIG;
 +
-+	BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
++	p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
++				    sizeof(object_id->oid_device_id.data));
++	p = xdr_encode_hyper(p, object_id->oid_partition_id);
++	p = xdr_encode_hyper(p, object_id->oid_object_id);
 +
-+	memset(&fdev, '\0', sizeof(fdev));
++	return 0;
++}
 +
-+	if (devid->devid != 1) {
-+		printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 "
-+			"(got: 0x%llx)\n", __func__, devid->devid);
-+		err = -EINVAL;
-+		goto out;
-+	}
++/*
++ * struct pnfs_osd_ioerr {
++ * 	struct pnfs_osd_objid	oer_component;
++ * 	u64			oer_comp_offset;
++ * 	u64			oer_comp_length;
++ * 	u32			oer_iswrite;
++ * 	u32			oer_errno;
++ * };
++ */
++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
++			      struct pnfs_osd_ioerr *ioerr)
++{
++	__be32 *p;
++	int ret;
 +
-+	/* count the number of comma-delimited DS IPs */
-+	fdev.fl_device_length = 1;
-+	fdev.fl_device_list = fl_devices;
++	ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component);
++	if (ret)
++		return ret;
 +
-+	fdev.fl_stripeindices_length = fdev.fl_device_length;
-+	fdev.fl_stripeindices_list = fl_stripe_indices;
++	p = xdr_reserve_space(xdr, 24);
++	if (!p)
++		return -E2BIG;
 +
-+	daddr.r_addr.data = daddr_buf;
-+	daddr.r_addr.len = sizeof(daddr_buf);
-+	err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr);
-+	if (err < 0)
-+		goto out;
-+	daddr.r_addr.len = err;
-+	switch (pnfsd_lexp_addr.sa_family) {
-+	case AF_INET:
-+		daddr.r_netid.data = "tcp";
-+		daddr.r_netid.len = 3;
-+		break;
-+	case AF_INET6:
-+		daddr.r_netid.data = "tcp6";
-+		daddr.r_netid.len = 4;
-+		break;
-+	default:
-+		BUG();
-+	}
-+	fdev.fl_device_list[0].fl_multipath_length = 1;
-+	fdev.fl_device_list[0].fl_multipath_list = &daddr;
++	p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
++	p = xdr_encode_hyper(p, ioerr->oer_comp_length);
++	*p++ = cpu_to_be32(ioerr->oer_iswrite);
++	*p   = cpu_to_be32(ioerr->oer_errno);
 +
-+	/* have nfsd encode the device info */
-+	err = filelayout_encode_devinfo(xdr, &fdev);
-+out:
-+	dprintk("<-- %s: return %d\n", __func__, err);
-+	return err;
++	return 0;
 +}
-+
-+static int get_stripe_unit(int blocksize)
+diff -up linux-2.6.37.noarch/fs/nfs/pagelist.c.orig linux-2.6.37.noarch/fs/nfs/pagelist.c
+--- linux-2.6.37.noarch/fs/nfs/pagelist.c.orig	2011-01-28 09:37:32.547979774 -0500
++++ linux-2.6.37.noarch/fs/nfs/pagelist.c	2011-01-28 09:43:53.338771991 -0500
+@@ -20,6 +20,7 @@
+ #include <linux/nfs_mount.h>
+ 
+ #include "internal.h"
++#include "pnfs.h"
+ 
+ static struct kmem_cache *nfs_page_cachep;
+ 
+@@ -53,7 +54,8 @@ nfs_page_free(struct nfs_page *p)
+ struct nfs_page *
+ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
+ 		   struct page *page,
+-		   unsigned int offset, unsigned int count)
++		   unsigned int offset, unsigned int count,
++		   struct pnfs_layout_segment *lseg)
+ {
+ 	struct nfs_page		*req;
+ 
+@@ -84,6 +86,9 @@ nfs_create_request(struct nfs_open_conte
+ 	req->wb_bytes   = count;
+ 	req->wb_context = get_nfs_open_context(ctx);
+ 	kref_init(&req->wb_kref);
++	req->wb_lseg    = lseg;
++	if (lseg)
++		get_lseg(lseg);
+ 	return req;
+ }
+ 
+@@ -159,9 +164,12 @@ void nfs_clear_request(struct nfs_page *
+ 		put_nfs_open_context(ctx);
+ 		req->wb_context = NULL;
+ 	}
++	if (req->wb_lseg != NULL) {
++		put_lseg(req->wb_lseg);
++		req->wb_lseg = NULL;
++	}
+ }
+ 
+-
+ /**
+  * nfs_release_request - Release the count on an NFS read/write request
+  * @req: request to release
+@@ -240,7 +248,8 @@ void nfs_pageio_init(struct nfs_pageio_d
+  * Return 'true' if this is the case, else return 'false'.
+  */
+ static int nfs_can_coalesce_requests(struct nfs_page *prev,
+-				     struct nfs_page *req)
++				     struct nfs_page *req,
++				     struct nfs_pageio_descriptor *pgio)
+ {
+ 	if (req->wb_context->cred != prev->wb_context->cred)
+ 		return 0;
+@@ -254,6 +263,12 @@ static int nfs_can_coalesce_requests(str
+ 		return 0;
+ 	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+ 		return 0;
++	if (req->wb_lseg != prev->wb_lseg)
++		return 0;
++#ifdef CONFIG_NFS_V4_1
++	if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
++		return 0;
++#endif /* CONFIG_NFS_V4_1 */
+ 	return 1;
+ }
+ 
+@@ -286,7 +301,7 @@ static int nfs_pageio_do_add_request(str
+ 		if (newlen > desc->pg_bsize)
+ 			return 0;
+ 		prev = nfs_list_entry(desc->pg_list.prev);
+-		if (!nfs_can_coalesce_requests(prev, req))
++		if (!nfs_can_coalesce_requests(prev, req, desc))
+ 			return 0;
+ 	} else
+ 		desc->pg_base = req->wb_pgbase;
+@@ -375,6 +390,7 @@ void nfs_pageio_cond_complete(struct nfs
+  * @idx_start: lower bound of page->index to scan
+  * @npages: idx_start + npages sets the upper bound to scan.
+  * @tag: tag to scan for
++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver
+  *
+  * Moves elements from one of the inode request lists.
+  * If the number of requests is set to 0, the entire address_space
+@@ -384,7 +400,7 @@ void nfs_pageio_cond_complete(struct nfs
+  */
+ int nfs_scan_list(struct nfs_inode *nfsi,
+ 		struct list_head *dst, pgoff_t idx_start,
+-		unsigned int npages, int tag)
++		  unsigned int npages, int tag, int *use_pnfs)
+ {
+ 	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
+ 	struct nfs_page *req;
+@@ -415,6 +431,8 @@ int nfs_scan_list(struct nfs_inode *nfsi
+ 				radix_tree_tag_clear(&nfsi->nfs_page_tree,
+ 						req->wb_index, tag);
+ 				nfs_list_add_request(req, dst);
++				if (req->wb_lseg)
++					*use_pnfs = 1;
+ 				res++;
+ 				if (res == INT_MAX)
+ 					goto out;
+diff -up linux-2.6.37.noarch/fs/nfs/pnfs.c.orig linux-2.6.37.noarch/fs/nfs/pnfs.c
+--- linux-2.6.37.noarch/fs/nfs/pnfs.c.orig	2011-01-28 09:37:32.548979739 -0500
++++ linux-2.6.37.noarch/fs/nfs/pnfs.c	2011-01-28 09:43:53.339771853 -0500
+@@ -30,6 +30,7 @@
+ #include <linux/nfs_fs.h>
+ #include "internal.h"
+ #include "pnfs.h"
++#include "iostat.h"
+ 
+ #define NFSDBG_FACILITY		NFSDBG_PNFS
+ 
+@@ -71,6 +72,52 @@ find_pnfs_driver(u32 id)
+ 	return local;
+ }
+ 
++/* Set cred to indicate we require a layoutcommit
++ * If we don't even have a layout, we don't need to commit it.
++ */
++void
++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx)
 +{
-+	if (blocksize < NFSSVC_MAXBLKSIZE)
-+		blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
-+	dprintk("%s: return %d\n", __func__, blocksize);
-+	return blocksize;
++	dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx);
++	spin_lock(&nfsi->vfs_inode.i_lock);
++	if (has_layout(nfsi) &&
++	    !test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags)) {
++		nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred);
++		__set_bit(NFS_LAYOUT_NEED_LCOMMIT,
++			  &nfsi->layout->plh_flags);
++		nfsi->change_attr++;
++		spin_unlock(&nfsi->vfs_inode.i_lock);
++		dprintk("%s: Set layoutcommit\n", __func__);
++		return;
++	}
++	spin_unlock(&nfsi->vfs_inode.i_lock);
 +}
 +
-+static enum nfsstat4
-+pnfsd_lexp_layout_get(struct inode *inode,
-+		      struct exp_xdr_stream *xdr,
-+		      const struct nfsd4_pnfs_layoutget_arg *arg,
-+		      struct nfsd4_pnfs_layoutget_res *res)
++/* Update last_write_offset for layoutcommit.
++ * TODO: We should only use committed extents, but the current nfs
++ * implementation does not calculate the written range in nfs_commit_done.
++ * We therefore update this field in writeback_done.
++ */
++void
++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent)
 +{
-+	enum nfsstat4 rc = NFS4_OK;
-+	struct pnfs_filelayout_layout *layout = NULL;
-+	struct knfsd_fh *fhp = NULL;
-+
-+	dprintk("--> %s: inode=%p\n", __func__, inode);
-+
-+	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
-+	res->lg_seg.offset = 0;
-+	res->lg_seg.length = NFS4_MAX_UINT64;
-+
-+	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
-+	if (layout == NULL) {
-+		rc = -ENOMEM;
-+		goto error;
-+	}
++	loff_t end_pos;
 +
-+	/* Set file layout response args */
-+	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
-+	layout->lg_stripe_type = STRIPE_SPARSE;
-+	layout->lg_commit_through_mds = true;
-+	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
-+	layout->lg_fh_length = 1;
-+	layout->device_id.sbid = arg->lg_sbid;
-+	layout->device_id.devid = 1;				/*FSFTEMP*/
-+	layout->lg_first_stripe_index = 0;			/*FSFTEMP*/
-+	layout->lg_pattern_offset = 0;
++	spin_lock(&nfsi->vfs_inode.i_lock);
++	if (offset < nfsi->layout->write_begin_pos)
++		nfsi->layout->write_begin_pos = offset;
++	end_pos = offset + extent - 1; /* I'm being inclusive */
++	if (end_pos > nfsi->layout->write_end_pos)
++		nfsi->layout->write_end_pos = end_pos;
++	dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n",
++		__func__,
++		(unsigned long) extent,
++		(unsigned long) offset ,
++		(unsigned long) nfsi->layout->write_begin_pos,
++		(unsigned long) nfsi->layout->write_end_pos);
++	spin_unlock(&nfsi->vfs_inode.i_lock);
++}
 +
-+	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
-+	if (fhp == NULL) {
-+		rc = -ENOMEM;
-+		goto error;
+ void
+ unset_pnfs_layoutdriver(struct nfs_server *nfss)
+ {
+@@ -88,7 +135,8 @@ unset_pnfs_layoutdriver(struct nfs_serve
+  * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+  */
+ void
+-set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
++		      u32 id)
+ {
+ 	struct pnfs_layoutdriver_type *ld_type = NULL;
+ 
+@@ -115,7 +163,7 @@ set_pnfs_layoutdriver(struct nfs_server 
+ 		goto out_no_driver;
+ 	}
+ 	server->pnfs_curr_ld = ld_type;
+-	if (ld_type->set_layoutdriver(server)) {
++	if (ld_type->set_layoutdriver(server, mntfh)) {
+ 		printk(KERN_ERR
+ 		       "%s: Error initializing mount point for layout driver %u.\n",
+ 		       __func__, id);
+@@ -146,6 +194,14 @@ pnfs_register_layoutdriver(struct pnfs_l
+ 		return status;
+ 	}
+ 
++	if (!ld_type->read_pagelist || !ld_type->write_pagelist ||
++	    !ld_type->commit) {
++		printk(KERN_ERR "%s Layout driver must provide "
++		       "read_pagelist, write_pagelist, and commit.\n",
++		       __func__);
++		return status;
 +	}
 +
-+	memcpy(fhp, arg->lg_fh, sizeof(*fhp));
-+	pnfs_fh_mark_ds(fhp);
-+	layout->lg_fh_list = fhp;
-+
-+	/* Call nfsd to encode layout */
-+	rc = filelayout_encode_layout(xdr, layout);
-+exit:
-+	kfree(layout);
-+	kfree(fhp);
-+	dprintk("<-- %s: return %d\n", __func__, rc);
-+	return rc;
-+
-+error:
-+	res->lg_seg.length = 0;
-+	goto exit;
+ 	spin_lock(&pnfs_spinlock);
+ 	tmp = find_pnfs_driver_locked(ld_type->id);
+ 	if (!tmp) {
+@@ -184,18 +240,35 @@ get_layout_hdr(struct pnfs_layout_hdr *l
+ 	atomic_inc(&lo->plh_refcount);
+ }
+ 
++static struct pnfs_layout_hdr *
++pnfs_alloc_layout_hdr(struct inode *ino)
++{
++	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
++	return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino) :
++		kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
 +}
 +
-+static int
-+pnfsd_lexp_layout_commit(struct inode *inode,
-+			 const struct nfsd4_pnfs_layoutcommit_arg *args,
-+			 struct nfsd4_pnfs_layoutcommit_res *res)
++static void
++pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
 +{
-+	dprintk("%s: (unimplemented)\n", __func__);
-+
-+	return 0;
++	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->inode)->pnfs_curr_ld;
++	return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
 +}
 +
-+static int
-+pnfsd_lexp_layout_return(struct inode *inode,
-+			 const struct nfsd4_pnfs_layoutreturn_arg *args)
+ static void
+ destroy_layout_hdr(struct pnfs_layout_hdr *lo)
+ {
+ 	dprintk("%s: freeing layout cache %p\n", __func__, lo);
+-	BUG_ON(!list_empty(&lo->plh_layouts));
+-	NFS_I(lo->plh_inode)->layout = NULL;
+-	kfree(lo);
++	BUG_ON(!list_empty(&lo->layouts));
++	NFS_I(lo->inode)->layout = NULL;
++	pnfs_free_layout_hdr(lo);
+ }
+ 
+ static void
+ put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+ {
++	assert_spin_locked(&lo->inode->i_lock);
++	BUG_ON(atomic_read(&lo->plh_refcount) == 0);
+ 	if (atomic_dec_and_test(&lo->plh_refcount))
+ 		destroy_layout_hdr(lo);
+ }
+@@ -203,8 +276,9 @@ put_layout_hdr_locked(struct pnfs_layout
+ void
+ put_layout_hdr(struct pnfs_layout_hdr *lo)
+ {
+-	struct inode *inode = lo->plh_inode;
++	struct inode *inode = lo->inode;
+ 
++	BUG_ON(atomic_read(&lo->plh_refcount) == 0);
+ 	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+ 		destroy_layout_hdr(lo);
+ 		spin_unlock(&inode->i_lock);
+@@ -214,27 +288,52 @@ put_layout_hdr(struct pnfs_layout_hdr *l
+ static void
+ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+ {
+-	INIT_LIST_HEAD(&lseg->pls_list);
++	INIT_LIST_HEAD(&lseg->fi_list);
+ 	atomic_set(&lseg->pls_refcount, 1);
+ 	smp_mb();
+ 	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
+-	lseg->pls_layout = lo;
++	lseg->layout = lo;
++	lseg->pls_notify_mask = 0;
+ }
+ 
+ static void free_lseg(struct pnfs_layout_segment *lseg)
+ {
+-	struct inode *ino = lseg->pls_layout->plh_inode;
++	struct inode *ino = lseg->layout->inode;
++	u64 mask = lseg->pls_notify_mask;
+ 
++	BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
+ 	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+-	/* Matched by get_layout_hdr in pnfs_insert_layout */
++	notify_drained(NFS_SERVER(ino)->nfs_client, mask);
++	/* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+ 	put_layout_hdr(NFS_I(ino)->layout);
+ }
+ 
++static void
++_put_lseg_common(struct pnfs_layout_segment *lseg)
 +{
-+	dprintk("%s: (unimplemented)\n", __func__);
++	struct inode *ino = lseg->layout->inode;
 +
-+	return 0;
-+}
++	BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
++	list_del(&lseg->fi_list);
++	if (list_empty(&lseg->layout->segs)) {
++		struct nfs_client *clp;
 +
-+static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh,
-+				struct pnfs_get_state *p)
-+{
-+	return 0;	/* just use the current stateid */
++		clp = NFS_SERVER(ino)->nfs_client;
++		spin_lock(&clp->cl_lock);
++		/* List does not take a reference, so no need for put here */
++		list_del_init(&lseg->layout->layouts);
++		spin_unlock(&clp->cl_lock);
++		clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->layout->plh_flags);
++		if (!pnfs_layoutgets_blocked(lseg->layout, NULL))
++			rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
++	}
++	rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq);
 +}
 +
-+static struct pnfs_export_operations pnfsd_lexp_ops = {
-+	.layout_type = pnfsd_lexp_layout_type,
-+	.get_device_info = pnfsd_lexp_get_device_info,
-+	.get_device_iter = pnfsd_lexp_get_device_iter,
-+	.layout_get = pnfsd_lexp_layout_get,
-+	.layout_commit = pnfsd_lexp_layout_commit,
-+	.layout_return = pnfsd_lexp_layout_return,
-+	.get_state = pnfsd_lexp_get_state,
-+};
-+
+ /* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
+  * could sleep, so must be called outside of the lock.
+- * Returns 1 if object was removed, otherwise return 0.
+  */
+-static int
++static void
+ put_lseg_locked(struct pnfs_layout_segment *lseg,
+ 		struct list_head *tmp_list)
+ {
+@@ -242,74 +341,142 @@ put_lseg_locked(struct pnfs_layout_segme
+ 		atomic_read(&lseg->pls_refcount),
+ 		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ 	if (atomic_dec_and_test(&lseg->pls_refcount)) {
+-		struct inode *ino = lseg->pls_layout->plh_inode;
++		_put_lseg_common(lseg);
++		list_add(&lseg->fi_list, tmp_list);
++	}
++}
+ 
+-		BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+-		list_del(&lseg->pls_list);
+-		if (list_empty(&lseg->pls_layout->plh_segs)) {
+-			struct nfs_client *clp;
 +void
-+pnfsd_lexp_init(struct inode *inode)
++put_lseg(struct pnfs_layout_segment *lseg)
 +{
-+	dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops);
-+	inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops;
-+}
-diff --git a/fs/nfsd/spnfs_com.c b/fs/nfsd/spnfs_com.c
-new file mode 100644
-index 0000000..1ce9ee6
---- /dev/null
-+++ b/fs/nfsd/spnfs_com.c
-@@ -0,0 +1,535 @@
-+/*
-+ * fs/nfsd/spnfs_com.c
-+ *
-+ * Communcation layer between spNFS kernel and userspace
-+ * Based heavily on idmap.c
-+ *
-+ */
-+
-+/*
-+ *  Copyright (c) 2002 The Regents of the University of Michigan.
-+ *  All rights reserved.
-+ *
-+ *  Marius Aamodt Eriksen <marius at umich.edu>
-+ *
-+ *  Redistribution and use in source and binary forms, with or without
-+ *  modification, are permitted provided that the following conditions
-+ *  are met:
-+ *
-+ *  1. Redistributions of source code must retain the above copyright
-+ *     notice, this list of conditions and the following disclaimer.
-+ *  2. Redistributions in binary form must reproduce the above copyright
-+ *     notice, this list of conditions and the following disclaimer in the
-+ *     documentation and/or other materials provided with the distribution.
-+ *  3. Neither the name of the University nor the names of its
-+ *     contributors may be used to endorse or promote products derived
-+ *     from this software without specific prior written permission.
-+ *
-+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ */
-+#include <linux/namei.h>
-+#include <linux/mount.h>
-+#include <linux/path.h>
-+#include <linux/sunrpc/clnt.h>
-+#include <linux/sunrpc/rpc_pipe_fs.h>
-+#include <linux/nfsd/debug.h>
++	struct inode *ino;
+ 
+-			clp = NFS_SERVER(ino)->nfs_client;
+-			spin_lock(&clp->cl_lock);
+-			/* List does not take a reference, so no need for put here */
+-			list_del_init(&lseg->pls_layout->plh_layouts);
+-			spin_unlock(&clp->cl_lock);
+-			clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
+-		}
+-		rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
+-		list_add(&lseg->pls_list, tmp_list);
+-		return 1;
++	if (!lseg)
++		return;
 +
-+#include <linux/nfsd4_spnfs.h>
++	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
++		atomic_read(&lseg->pls_refcount),
++		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
++	ino = lseg->layout->inode;
++	if (atomic_dec_and_lock(&lseg->pls_refcount, &ino->i_lock)) {
++		_put_lseg_common(lseg);
++		spin_unlock(&ino->i_lock);
++		free_lseg(lseg);
+ 	}
+-	return 0;
+ }
++EXPORT_SYMBOL_GPL(put_lseg);
+ 
+-static bool
+-should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
++void get_lseg(struct pnfs_layout_segment *lseg)
+ {
+-	return (recall_iomode == IOMODE_ANY ||
+-		lseg_iomode == recall_iomode);
++	atomic_inc(&lseg->pls_refcount);
++	smp_mb__after_atomic_inc();
+ }
++EXPORT_SYMBOL_GPL(get_lseg);
+ 
+-/* Returns 1 if lseg is removed from list, 0 otherwise */
+-static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
+-			     struct list_head *tmp_list)
++static inline u64
++end_offset(u64 start, u64 len)
+ {
+-	int rv = 0;
++	u64 end;
 +
-+#define	NFSDDBG_FACILITY		NFSDDBG_PROC
++	end = start + len;
++	return end >= start ? end: NFS4_MAX_UINT64;
++}
+ 
++/* last octet in a range */
++static inline u64
++last_byte_offset(u64 start, u64 len)
++{
++	u64 end;
 +
-+static ssize_t   spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-+		     char __user *, size_t);
-+static ssize_t   spnfs_pipe_downcall(struct file *, const char __user *,
-+		     size_t);
-+static void      spnfs_pipe_destroy_msg(struct rpc_pipe_msg *);
++	BUG_ON(!len);
++	end = start + len;
++	return end > start ? end - 1: NFS4_MAX_UINT64;
++}
 +
-+static struct rpc_pipe_ops spnfs_upcall_ops = {
-+	.upcall		= spnfs_pipe_upcall,
-+	.downcall	= spnfs_pipe_downcall,
-+	.destroy_msg	= spnfs_pipe_destroy_msg,
-+};
++/*
++ * is l2 fully contained in l1?
++ *   start1                             end1
++ *   [----------------------------------)
++ *           start2           end2
++ *           [----------------)
++ */
++static inline int
++lo_seg_contained(struct pnfs_layout_range *l1,
++		 struct pnfs_layout_range *l2)
++{
++	u64 start1 = l1->offset;
++	u64 end1 = end_offset(start1, l1->length);
++	u64 start2 = l2->offset;
++	u64 end2 = end_offset(start2, l2->length);
 +
-+/* evil global variable */
-+struct spnfs *global_spnfs;
-+struct spnfs_config *spnfs_config;
-+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
-+int spnfs_use_layoutsegments;
-+uint64_t layoutsegment_size;
-+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++	return (start1 <= start2) && (end1 >= end2);
++}
 +
 +/*
-+ * Used by spnfs_enabled()
-+ * Tracks if the subsystem has been initialized at some point.  It doesn't
-+ * matter if it's not currently initialized.
++ * is l1 and l2 intersecting?
++ *   start1                             end1
++ *   [----------------------------------)
++ *                              start2           end2
++ *                              [----------------)
 + */
-+static int spnfs_enabled_at_some_point;
-+
-+/* call this to start the ball rolling */
-+/* code it like we're going to avoid the global variable in the future */
-+int
-+nfsd_spnfs_new(void)
++static inline int
++lo_seg_intersecting(struct pnfs_layout_range *l1,
++		    struct pnfs_layout_range *l2)
 +{
-+	struct spnfs *spnfs = NULL;
-+	struct path path;
-+	struct nameidata nd;
-+	int rc;
++	u64 start1 = l1->offset;
++	u64 end1 = end_offset(start1, l1->length);
++	u64 start2 = l2->offset;
++	u64 end2 = end_offset(start2, l2->length);
 +
-+	if (global_spnfs != NULL)
-+		return -EEXIST;
++	return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
++	       (end2 == NFS4_MAX_UINT64 || end2 > start1);
++}
 +
-+	path.mnt = rpc_get_mount();
-+	if (IS_ERR(path.mnt))
-+		return PTR_ERR(path.mnt);
++bool
++should_free_lseg(struct pnfs_layout_range *lseg_range,
++		 struct pnfs_layout_range *recall_range)
++{
++	return (recall_range->iomode == IOMODE_ANY ||
++		lseg_range->iomode == recall_range->iomode) &&
++	       lo_seg_intersecting(lseg_range, recall_range);
++}
 +
-+	/* FIXME: do not abuse rpc_pipefs/nfs */
-+	rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
-+	if (rc)
-+		goto err;
++static void mark_lseg_invalid(struct pnfs_layout_segment *lseg,
++			      struct list_head *tmp_list)
++{
++	assert_spin_locked(&lseg->layout->inode->i_lock);
+ 	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+ 		/* Remove the reference keeping the lseg in the
+ 		 * list.  It will now be removed when all
+ 		 * outstanding io is finished.
+ 		 */
+-		rv = put_lseg_locked(lseg, tmp_list);
++		put_lseg_locked(lseg, tmp_list);
+ 	}
+-	return rv;
+ }
+ 
+-/* Returns count of number of matching invalid lsegs remaining in list
+- * after call.
+- */
+-int
+-mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+-			    struct list_head *tmp_list,
+-			    u32 iomode)
++/* Returns false if there was nothing to do, true otherwise */
++static bool
++pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
++		     struct pnfs_layout_range *range)
+ {
+ 	struct pnfs_layout_segment *lseg, *next;
+-	int invalid = 0, removed = 0;
++	bool rv = false;
+ 
+-	dprintk("%s:Begin lo %p\n", __func__, lo);
++	dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
++		__func__, lo, range->offset, range->length, range->iomode);
+ 
+-	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+-		if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
++	assert_spin_locked(&lo->inode->i_lock);
++	list_for_each_entry_safe(lseg, next, &lo->segs, fi_list)
++		if (should_free_lseg(&lseg->range, range)) {
+ 			dprintk("%s: freeing lseg %p iomode %d "
+ 				"offset %llu length %llu\n", __func__,
+-				lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
+-				lseg->pls_range.length);
+-			invalid++;
+-			removed += mark_lseg_invalid(lseg, tmp_list);
++				lseg, lseg->range.iomode, lseg->range.offset,
++				lseg->range.length);
++			mark_lseg_invalid(lseg, tmp_list);
++			rv = true;
+ 		}
+-	dprintk("%s:Return %i\n", __func__, invalid - removed);
+-	return invalid - removed;
++	dprintk("%s:Return\n", __func__);
++	return rv;
+ }
+ 
+ void
+@@ -317,23 +484,29 @@ pnfs_free_lseg_list(struct list_head *fr
+ {
+ 	struct pnfs_layout_segment *lseg, *tmp;
+ 
+-	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
+-		list_del(&lseg->pls_list);
++	list_for_each_entry_safe(lseg, tmp, free_me, fi_list)
+ 		free_lseg(lseg);
+-	}
++	INIT_LIST_HEAD(free_me);
+ }
+ 
+ void
+ pnfs_destroy_layout(struct nfs_inode *nfsi)
+ {
+ 	struct pnfs_layout_hdr *lo;
++	struct pnfs_layout_range range = {
++		.iomode = IOMODE_ANY,
++		.offset = 0,
++		.length = NFS4_MAX_UINT64,
++	};
+ 	LIST_HEAD(tmp_list);
+ 
+ 	spin_lock(&nfsi->vfs_inode.i_lock);
+ 	lo = nfsi->layout;
+ 	if (lo) {
+-		set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
+-		mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
++		pnfs_clear_lseg_list(lo, &tmp_list, &range);
++		WARN_ON(!list_empty(&nfsi->layout->segs));
++		WARN_ON(!list_empty(&nfsi->layout->layouts));
 +
-+	spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL);
-+	if (spnfs == NULL){
-+		rc = -ENOMEM;
-+		goto err;
+ 		/* Matched by refcount set to 1 in alloc_init_layout_hdr */
+ 		put_layout_hdr_locked(lo);
+ 	}
+@@ -357,30 +530,28 @@ pnfs_destroy_all_layouts(struct nfs_clie
+ 
+ 	while (!list_empty(&tmp_list)) {
+ 		lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
+-				plh_layouts);
++				layouts);
+ 		dprintk("%s freeing layout for inode %lu\n", __func__,
+-			lo->plh_inode->i_ino);
+-		pnfs_destroy_layout(NFS_I(lo->plh_inode));
++			lo->inode->i_ino);
++		pnfs_destroy_layout(NFS_I(lo->inode));
+ 	}
+ }
+ 
+-/* update lo->plh_stateid with new if is more recent */
++/* update lo->stateid with new if is more recent */
+ void
+ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
+ 			bool update_barrier)
+ {
+ 	u32 oldseq, newseq;
+ 
+-	oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
++	assert_spin_locked(&lo->inode->i_lock);
++	oldseq = be32_to_cpu(lo->stateid.stateid.seqid);
+ 	newseq = be32_to_cpu(new->stateid.seqid);
+ 	if ((int)(newseq - oldseq) > 0) {
+-		memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
+-		if (update_barrier) {
+-			u32 new_barrier = be32_to_cpu(new->stateid.seqid);
+-
+-			if ((int)(new_barrier - lo->plh_barrier))
+-				lo->plh_barrier = new_barrier;
+-		} else {
++		memcpy(&lo->stateid, &new->stateid, sizeof(new->stateid));
++		if (update_barrier)
++			lo->plh_barrier = be32_to_cpu(new->stateid.seqid);
++		else {
+ 			/* Because of wraparound, we want to keep the barrier
+ 			 * "close" to the current seqids.  It needs to be
+ 			 * within 2**31 to count as "behind", so if it
+@@ -394,20 +565,6 @@ pnfs_set_layout_stateid(struct pnfs_layo
+ 	}
+ }
+ 
+-/* lget is set to 1 if called from inside send_layoutget call chain */
+-static bool
+-pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
+-			int lget)
+-{
+-	if ((stateid) &&
+-	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
+-		return true;
+-	return lo->plh_block_lgets ||
+-		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+-		(list_empty(&lo->plh_segs) &&
+-		 (atomic_read(&lo->plh_outstanding) > lget));
+-}
+-
+ int
+ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+ 			      struct nfs4_state *open_state)
+@@ -415,10 +572,11 @@ pnfs_choose_layoutget_stateid(nfs4_state
+ 	int status = 0;
+ 
+ 	dprintk("--> %s\n", __func__);
+-	spin_lock(&lo->plh_inode->i_lock);
+-	if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
++	assert_spin_locked(&lo->inode->i_lock);
++	if (lo->plh_block_lgets ||
++	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ 		status = -EAGAIN;
+-	} else if (list_empty(&lo->plh_segs)) {
++	} else if (list_empty(&lo->segs)) {
+ 		int seq;
+ 
+ 		do {
+@@ -427,9 +585,8 @@ pnfs_choose_layoutget_stateid(nfs4_state
+ 			       sizeof(open_state->stateid.data));
+ 		} while (read_seqretry(&open_state->seqlock, seq));
+ 	} else
+-		memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
+-	spin_unlock(&lo->plh_inode->i_lock);
+-	dprintk("<-- %s\n", __func__);
++		memcpy(dst->data, lo->stateid.data, sizeof(lo->stateid.data));
++	dprintk("<-- %s status=%d\n", __func__, status);
+ 	return status;
+ }
+ 
+@@ -442,9 +599,9 @@ pnfs_choose_layoutget_stateid(nfs4_state
+ static struct pnfs_layout_segment *
+ send_layoutget(struct pnfs_layout_hdr *lo,
+ 	   struct nfs_open_context *ctx,
+-	   u32 iomode)
++	   struct pnfs_layout_range *range)
+ {
+-	struct inode *ino = lo->plh_inode;
++	struct inode *ino = lo->inode;
+ 	struct nfs_server *server = NFS_SERVER(ino);
+ 	struct nfs4_layoutget *lgp;
+ 	struct pnfs_layout_segment *lseg = NULL;
+@@ -453,13 +610,15 @@ send_layoutget(struct pnfs_layout_hdr *l
+ 
+ 	BUG_ON(ctx == NULL);
+ 	lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
+-	if (lgp == NULL)
++	if (lgp == NULL) {
++		put_layout_hdr(lo);
+ 		return NULL;
+-	lgp->args.minlength = NFS4_MAX_UINT64;
++	}
++	lgp->args.minlength = PAGE_CACHE_SIZE;
++	if (lgp->args.minlength > range->length)
++		lgp->args.minlength = range->length;
+ 	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+-	lgp->args.range.iomode = iomode;
+-	lgp->args.range.offset = 0;
+-	lgp->args.range.length = NFS4_MAX_UINT64;
++	lgp->args.range = *range;
+ 	lgp->args.type = server->pnfs_curr_ld->id;
+ 	lgp->args.inode = ino;
+ 	lgp->args.ctx = get_nfs_open_context(ctx);
+@@ -471,86 +630,119 @@ send_layoutget(struct pnfs_layout_hdr *l
+ 	nfs4_proc_layoutget(lgp);
+ 	if (!lseg) {
+ 		/* remember that LAYOUTGET failed and suspend trying */
+-		set_bit(lo_fail_bit(iomode), &lo->plh_flags);
++		set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
+ 	}
+ 	return lseg;
+ }
+ 
+-bool pnfs_roc(struct inode *ino)
++void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
++				struct pnfs_layout_range *range,
++				int notify_bit, atomic_t *notify_count,
++				struct list_head *tmp_list)
+ {
+-	struct pnfs_layout_hdr *lo;
+ 	struct pnfs_layout_segment *lseg, *tmp;
+-	LIST_HEAD(tmp_list);
+-	bool found = false;
+ 
+-	spin_lock(&ino->i_lock);
+-	lo = NFS_I(ino)->layout;
+-	if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
+-	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+-		goto out_nolayout;
+-	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
+-		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+-			mark_lseg_invalid(lseg, &tmp_list);
+-			found = true;
++	assert_spin_locked(&lo->inode->i_lock);
++	list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list)
++		if (should_free_lseg(&lseg->range, range)) {
++			lseg->pls_notify_mask |= (1 << notify_bit);
++			atomic_inc(notify_count);
++			mark_lseg_invalid(lseg, tmp_list);
+ 		}
+-	if (!found)
+-		goto out_nolayout;
+-	lo->plh_block_lgets++;
+-	get_layout_hdr(lo); /* matched in pnfs_roc_release */
+-	spin_unlock(&ino->i_lock);
+-	pnfs_free_lseg_list(&tmp_list);
+-	return true;
+-
+-out_nolayout:
+-	spin_unlock(&ino->i_lock);
+-	return false;
+ }
+ 
+-void pnfs_roc_release(struct inode *ino)
++/* Return true if there is layout based io in progress in the given range.
++ * Assumes range has already been marked invalid, and layout marked to
++ * prevent any new lseg from being inserted.
++ */
++bool
++pnfs_return_layout_barrier(struct nfs_inode *nfsi,
++			   struct pnfs_layout_range *range)
+ {
+-	struct pnfs_layout_hdr *lo;
++	struct pnfs_layout_segment *lseg;
++	bool ret = false;
+ 
+-	spin_lock(&ino->i_lock);
+-	lo = NFS_I(ino)->layout;
+-	lo->plh_block_lgets--;
+-	put_layout_hdr_locked(lo);
+-	spin_unlock(&ino->i_lock);
++	spin_lock(&nfsi->vfs_inode.i_lock);
++	list_for_each_entry(lseg, &nfsi->layout->segs, fi_list)
++		if (should_free_lseg(&lseg->range, range)) {
++			ret = true;
++			break;
++		}
++	spin_unlock(&nfsi->vfs_inode.i_lock);
++	dprintk("%s:Return %d\n", __func__, ret);
++	return ret;
+ }
+ 
+-void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
++static int
++return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
+ {
+-	struct pnfs_layout_hdr *lo;
++	struct nfs4_layoutreturn *lrp;
++	struct nfs_server *server = NFS_SERVER(ino);
++	int status = -ENOMEM;
+ 
+-	spin_lock(&ino->i_lock);
+-	lo = NFS_I(ino)->layout;
+-	if ((int)(barrier - lo->plh_barrier) > 0)
+-		lo->plh_barrier = barrier;
+-	spin_unlock(&ino->i_lock);
++	dprintk("--> %s\n", __func__);
++
++	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
++	if (lrp == NULL) {
++		put_layout_hdr(NFS_I(ino)->layout);
++		goto out;
 +	}
++	lrp->args.reclaim = 0;
++	lrp->args.layout_type = server->pnfs_curr_ld->id;
++	lrp->args.return_type = RETURN_FILE;
++	lrp->args.range = *range;
++	lrp->args.inode = ino;
++	lrp->clp = server->nfs_client;
 +
-+	spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs,
-+					 &spnfs_upcall_ops, 0);
-+	if (IS_ERR(spnfs->spnfs_dentry)) {
-+		rc = -EPIPE;
-+		goto err;
++	status = nfs4_proc_layoutreturn(lrp, wait);
++out:
++	dprintk("<-- %s status: %d\n", __func__, status);
++	return status;
+ }
+ 
+-bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
++/* Initiates a LAYOUTRETURN(FILE) */
++int
++_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
++		    bool wait)
+ {
++	struct pnfs_layout_hdr *lo = NULL;
+ 	struct nfs_inode *nfsi = NFS_I(ino);
+-	struct pnfs_layout_segment *lseg;
+-	bool found = false;
++	struct pnfs_layout_range arg;
++	LIST_HEAD(tmp_list);
++	int status = 0;
+ 
+-	spin_lock(&ino->i_lock);
+-	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
+-		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+-			found = true;
+-			break;
+-		}
+-	if (!found) {
+-		struct pnfs_layout_hdr *lo = nfsi->layout;
+-		u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
++	dprintk("--> %s\n", __func__);
+ 
+-		/* Since close does not return a layout stateid for use as
+-		 * a barrier, we choose the worst-case barrier.
+-		 */
+-		*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
++	arg.iomode = range ? range->iomode : IOMODE_ANY;
++	arg.offset = 0;
++	arg.length = NFS4_MAX_UINT64;
++
++	spin_lock(&ino->i_lock);
++	lo = nfsi->layout;
++	if (!lo || !pnfs_clear_lseg_list(lo, &tmp_list, &arg)) {
++		spin_unlock(&ino->i_lock);
++		dprintk("%s: no layout segments to return\n", __func__);
++		goto out;
+ 	}
++	lo->plh_block_lgets++;
++	/* Reference matched in nfs4_layoutreturn_release */
++	get_layout_hdr(lo);
+ 	spin_unlock(&ino->i_lock);
+-	return found;
++	pnfs_free_lseg_list(&tmp_list);
++
++	if (layoutcommit_needed(nfsi)) {
++		status = pnfs_layoutcommit_inode(ino, wait);
++		if (status) {
++			/* Return layout even if layoutcommit fails */
++			dprintk("%s: layoutcommit failed, status=%d. "
++				"Returning layout anyway\n",
++				__func__, status);
++		}
 +	}
++	status = return_layout(ino, &arg, wait);
++out:
++	dprintk("<-- %s status: %d\n", __func__, status);
++	return status;
+ }
+ 
+ /*
+@@ -559,10 +751,24 @@ bool pnfs_roc_drain(struct inode *ino, u
+  * are seen first.
+  */
+ static s64
+-cmp_layout(u32 iomode1, u32 iomode2)
++cmp_layout(struct pnfs_layout_range *l1,
++	   struct pnfs_layout_range *l2)
+ {
++	s64 d;
 +
-+	mutex_init(&spnfs->spnfs_lock);
-+	mutex_init(&spnfs->spnfs_plock);
-+	init_waitqueue_head(&spnfs->spnfs_wq);
++	/* higher offset > lower offset */
++	d = l1->offset - l2->offset;
++	if (d)
++		return d;
 +
-+	global_spnfs = spnfs;
-+	spnfs_enabled_at_some_point = 1;
++	/* longer length > shorter length */
++	d = l1->length - l2->length;
++	if (d)
++		return d;
 +
-+	return 0;
-+err:
-+	rpc_put_mount();
-+	kfree(spnfs);
-+	return rc;
-+}
+ 	/* read > read/write */
+-	return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
++	return (int)(l2->iomode == IOMODE_READ) -
++		(int)(l1->iomode == IOMODE_READ);
+ }
+ 
+ static void
+@@ -574,27 +780,30 @@ pnfs_insert_layout(struct pnfs_layout_hd
+ 
+ 	dprintk("%s:Begin\n", __func__);
+ 
+-	assert_spin_locked(&lo->plh_inode->i_lock);
+-	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
+-		if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
++	assert_spin_locked(&lo->inode->i_lock);
++	list_for_each_entry(lp, &lo->segs, fi_list) {
++		if (cmp_layout(&lp->range, &lseg->range) > 0)
+ 			continue;
+-		list_add_tail(&lseg->pls_list, &lp->pls_list);
++		list_add_tail(&lseg->fi_list, &lp->fi_list);
+ 		dprintk("%s: inserted lseg %p "
+ 			"iomode %d offset %llu length %llu before "
+ 			"lp %p iomode %d offset %llu length %llu\n",
+-			__func__, lseg, lseg->pls_range.iomode,
+-			lseg->pls_range.offset, lseg->pls_range.length,
+-			lp, lp->pls_range.iomode, lp->pls_range.offset,
+-			lp->pls_range.length);
++			__func__, lseg, lseg->range.iomode,
++			lseg->range.offset, lseg->range.length,
++			lp, lp->range.iomode, lp->range.offset,
++			lp->range.length);
+ 		found = 1;
+ 		break;
+ 	}
+ 	if (!found) {
+-		list_add_tail(&lseg->pls_list, &lo->plh_segs);
++		list_add_tail(&lseg->fi_list, &lo->segs);
++		if (list_is_singular(&lo->segs) &&
++		    !pnfs_layoutgets_blocked(lo, NULL))
++			rpc_wake_up(&NFS_I(lo->inode)->lo_rpcwaitq_stateid);
+ 		dprintk("%s: inserted lseg %p "
+ 			"iomode %d offset %llu length %llu at tail\n",
+-			__func__, lseg, lseg->pls_range.iomode,
+-			lseg->pls_range.offset, lseg->pls_range.length);
++			__func__, lseg, lseg->range.iomode,
++			lseg->range.offset, lseg->range.length);
+ 	}
+ 	get_layout_hdr(lo);
+ 
+@@ -606,14 +815,14 @@ alloc_init_layout_hdr(struct inode *ino)
+ {
+ 	struct pnfs_layout_hdr *lo;
+ 
+-	lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
++	lo = pnfs_alloc_layout_hdr(ino);
+ 	if (!lo)
+ 		return NULL;
+ 	atomic_set(&lo->plh_refcount, 1);
+-	INIT_LIST_HEAD(&lo->plh_layouts);
+-	INIT_LIST_HEAD(&lo->plh_segs);
++	INIT_LIST_HEAD(&lo->layouts);
++	INIT_LIST_HEAD(&lo->segs);
+ 	INIT_LIST_HEAD(&lo->plh_bulk_recall);
+-	lo->plh_inode = ino;
++	lo->inode = ino;
+ 	return lo;
+ }
+ 
+@@ -626,12 +835,9 @@ pnfs_find_alloc_layout(struct inode *ino
+ 	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
+ 
+ 	assert_spin_locked(&ino->i_lock);
+-	if (nfsi->layout) {
+-		if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
+-			return NULL;
+-		else
+-			return nfsi->layout;
+-	}
++	if (nfsi->layout)
++		return nfsi->layout;
 +
-+/* again, code it like we're going to remove the global variable */
-+void
-+nfsd_spnfs_delete(void)
-+{
-+	struct spnfs *spnfs = global_spnfs;
+ 	spin_unlock(&ino->i_lock);
+ 	new = alloc_init_layout_hdr(ino);
+ 	spin_lock(&ino->i_lock);
+@@ -639,13 +845,13 @@ pnfs_find_alloc_layout(struct inode *ino
+ 	if (likely(nfsi->layout == NULL))	/* Won the race? */
+ 		nfsi->layout = new;
+ 	else
+-		kfree(new);
++		pnfs_free_layout_hdr(new);
+ 	return nfsi->layout;
+ }
+ 
+ /*
+  * iomode matching rules:
+- * iomode	lseg	match
++ * range	lseg	match
+  * -----	-----	-----
+  * ANY		READ	true
+  * ANY		RW	true
+@@ -655,34 +861,47 @@ pnfs_find_alloc_layout(struct inode *ino
+  * READ		RW	true
+  */
+ static int
+-is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
++is_matching_lseg(struct pnfs_layout_segment *lseg,
++		 struct pnfs_layout_range *range)
+ {
+-	return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
++	struct pnfs_layout_range range1;
 +
-+	if (!spnfs)
-+		return;
-+	rpc_unlink(spnfs->spnfs_dentry);
-+	rpc_put_mount();
-+	global_spnfs = NULL;
-+	kfree(spnfs);
-+}
++	if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) ||
++	    !lo_seg_intersecting(&lseg->range, range))
++		return 0;
 +
-+/* RPC pipefs upcall/downcall routines */
-+/* looks like this code is invoked by the rpc_pipe code */
-+/* to handle upcalls on things we've queued elsewhere */
-+/* See nfs_idmap_id for an exmaple of enqueueing */
-+static ssize_t
-+spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-+    char __user *dst, size_t buflen)
++	/* range1 covers only the first byte in the range */
++	range1 = *range;
++	range1.length = 1;
++	return lo_seg_contained(&lseg->range, &range1);
+ }
+ 
+ /*
+  * lookup range in layout
+  */
+ static struct pnfs_layout_segment *
+-pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
++pnfs_find_lseg(struct pnfs_layout_hdr *lo,
++		struct pnfs_layout_range *range)
+ {
+ 	struct pnfs_layout_segment *lseg, *ret = NULL;
+ 
+ 	dprintk("%s:Begin\n", __func__);
+ 
+-	assert_spin_locked(&lo->plh_inode->i_lock);
+-	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
++	assert_spin_locked(&lo->inode->i_lock);
++	list_for_each_entry(lseg, &lo->segs, fi_list) {
+ 		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
+-		    is_matching_lseg(lseg, iomode)) {
++		    is_matching_lseg(lseg, range)) {
++			get_lseg(lseg);
+ 			ret = lseg;
+ 			break;
+ 		}
+-		if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
++		if (cmp_layout(range, &lseg->range) > 0)
+ 			break;
+ 	}
+ 
+-	dprintk("%s:Return lseg %p ref %d\n",
+-		__func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
++	dprintk("%s:Return lseg %p ref %d valid %d\n",
++		__func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0,
++		ret ? test_bit(NFS_LSEG_VALID, &ret->pls_flags) : 0);
+ 	return ret;
+ }
+ 
+@@ -693,8 +912,15 @@ pnfs_find_lseg(struct pnfs_layout_hdr *l
+ struct pnfs_layout_segment *
+ pnfs_update_layout(struct inode *ino,
+ 		   struct nfs_open_context *ctx,
++		   loff_t pos,
++		   u64 count,
+ 		   enum pnfs_iomode iomode)
+ {
++	struct pnfs_layout_range arg = {
++		.iomode = iomode,
++		.offset = pos,
++		.length = count,
++	};
+ 	struct nfs_inode *nfsi = NFS_I(ino);
+ 	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
+ 	struct pnfs_layout_hdr *lo;
+@@ -709,14 +935,8 @@ pnfs_update_layout(struct inode *ino,
+ 		goto out_unlock;
+ 	}
+ 
+-	/* Do we even need to bother with this? */
+-	if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
+-	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+-		dprintk("%s matches recall, use MDS\n", __func__);
+-		goto out_unlock;
+-	}
+ 	/* Check to see if the layout for the given range already exists */
+-	lseg = pnfs_find_lseg(lo, iomode);
++	lseg = pnfs_find_lseg(lo, &arg);
+ 	if (lseg)
+ 		goto out_unlock;
+ 
+@@ -724,35 +944,29 @@ pnfs_update_layout(struct inode *ino,
+ 	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
+ 		goto out_unlock;
+ 
+-	if (pnfs_layoutgets_blocked(lo, NULL, 0))
+-		goto out_unlock;
+-	atomic_inc(&lo->plh_outstanding);
+-
+-	get_layout_hdr(lo);
+-	if (list_empty(&lo->plh_segs)) {
++	get_layout_hdr(lo); /* Matched in pnfs_layoutget_release */
++	if (list_empty(&lo->segs)) {
+ 		/* The lo must be on the clp list if there is any
+ 		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
+ 		 */
+ 		spin_lock(&clp->cl_lock);
+-		BUG_ON(!list_empty(&lo->plh_layouts));
+-		list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
++		BUG_ON(!list_empty(&lo->layouts));
++		list_add_tail(&lo->layouts, &clp->cl_layouts);
+ 		spin_unlock(&clp->cl_lock);
+ 	}
+ 	spin_unlock(&ino->i_lock);
+ 
+-	lseg = send_layoutget(lo, ctx, iomode);
++	lseg = send_layoutget(lo, ctx, &arg);
+ 	if (!lseg) {
+ 		spin_lock(&ino->i_lock);
+-		if (list_empty(&lo->plh_segs)) {
++		if (list_empty(&lo->segs)) {
+ 			spin_lock(&clp->cl_lock);
+-			list_del_init(&lo->plh_layouts);
++			list_del_init(&lo->layouts);
+ 			spin_unlock(&clp->cl_lock);
+ 			clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ 		}
+ 		spin_unlock(&ino->i_lock);
+ 	}
+-	atomic_dec(&lo->plh_outstanding);
+-	put_layout_hdr(lo);
+ out:
+ 	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
+ 		nfsi->layout->plh_flags, lseg);
+@@ -762,27 +976,29 @@ out_unlock:
+ 	goto out;
+ }
+ 
++bool
++pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid)
 +{
-+	char *data = (char *)msg->data + msg->copied;
-+	ssize_t mlen = msg->len - msg->copied;
-+	ssize_t left;
-+
-+	if (mlen > buflen)
-+		mlen = buflen;
++	assert_spin_locked(&lo->inode->i_lock);
++	if ((stateid) &&
++	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
++		return true;
++	return lo->plh_block_lgets ||
++		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
++		(list_empty(&lo->segs) &&
++		 (atomic_read(&lo->plh_outstanding) != 0));
++}
 +
-+	left = copy_to_user(dst, data, mlen);
-+	if (left < 0) {
-+		msg->errno = left;
-+		return left;
-+	}
-+	mlen -= left;
-+	msg->copied += mlen;
-+	msg->errno = 0;
-+	return mlen;
+ int
+ pnfs_layout_process(struct nfs4_layoutget *lgp)
+ {
+ 	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+ 	struct nfs4_layoutget_res *res = &lgp->res;
+ 	struct pnfs_layout_segment *lseg;
+-	struct inode *ino = lo->plh_inode;
++	struct inode *ino = lo->inode;
+ 	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
+ 	int status = 0;
+ 
+-	/* Verify we got what we asked for.
+-	 * Note that because the xdr parsing only accepts a single
+-	 * element array, this can fail even if the server is behaving
+-	 * correctly.
+-	 */
+-	if (lgp->args.range.iomode > res->range.iomode ||
+-	    res->range.offset != 0 ||
+-	    res->range.length != NFS4_MAX_UINT64) {
+-		status = -EINVAL;
+-		goto out;
+-	}
+ 	/* Inject layout blob into I/O device driver */
+ 	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
+ 	if (!lseg || IS_ERR(lseg)) {
+@@ -792,43 +1008,572 @@ pnfs_layout_process(struct nfs4_layoutge
+ 			status = PTR_ERR(lseg);
+ 		dprintk("%s: Could not allocate layout: error %d\n",
+ 		       __func__, status);
++		spin_lock(&ino->i_lock);
+ 		goto out;
+ 	}
+ 
+ 	spin_lock(&ino->i_lock);
+-	if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
+-	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
++	/* decrement needs to be done before call to pnfs_layoutgets_blocked */
++	atomic_dec(&lo->plh_outstanding);
++	spin_lock(&clp->cl_lock);
++	if (matches_outstanding_recall(ino, &res->range)) {
++		spin_unlock(&clp->cl_lock);
+ 		dprintk("%s forget reply due to recall\n", __func__);
+ 		goto out_forget_reply;
+ 	}
++	spin_unlock(&clp->cl_lock);
+ 
+-	if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
++	if (pnfs_layoutgets_blocked(lo, &res->stateid)) {
+ 		dprintk("%s forget reply due to state\n", __func__);
+ 		goto out_forget_reply;
+ 	}
+ 	init_lseg(lo, lseg);
+-	lseg->pls_range = res->range;
++	lseg->range = res->range;
++	get_lseg(lseg);
+ 	*lgp->lsegpp = lseg;
+ 	pnfs_insert_layout(lo, lseg);
+ 
+ 	if (res->return_on_close) {
+-		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+-		set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
++		/* FI: This needs to be re-examined.  At lo level,
++		 * all it needs is a bit indicating whether any of
++		 * the lsegs in the list have the flags set.
++		 */
++		lo->roc_iomode |= res->range.iomode;
+ 	}
+ 
+ 	/* Done processing layoutget. Set the layout stateid */
+ 	pnfs_set_layout_stateid(lo, &res->stateid, false);
+-	spin_unlock(&ino->i_lock);
+ out:
++	if (!pnfs_layoutgets_blocked(lo, NULL))
++		rpc_wake_up(&NFS_I(ino)->lo_rpcwaitq_stateid);
++	spin_unlock(&ino->i_lock);
+ 	return status;
+ 
+ out_forget_reply:
+ 	spin_unlock(&ino->i_lock);
+-	lseg->pls_layout = lo;
++	lseg->layout = lo;
+ 	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
++	spin_lock(&ino->i_lock);
++	goto out;
 +}
 +
-+static ssize_t
-+spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
++void
++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset,
++		size_t *count)
 +{
-+	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
-+	struct spnfs *spnfs = (struct spnfs *)rpci->private;
-+	struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im;
-+	int ret;
-+
-+	if (mlen != sizeof(struct spnfs_msg))
-+		return -ENOSPC;
-+
-+	im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+	if (im_in == NULL)
-+		return -ENOMEM;
-+
-+	if (copy_from_user(im_in, src, mlen) != 0)
-+		return -EFAULT;
-+
-+	mutex_lock(&spnfs->spnfs_plock);
-+
-+	ret = mlen;
-+	im->im_status = im_in->im_status;
-+	/* If we got an error, terminate now, and wake up pending upcalls */
-+	if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) {
-+		wake_up(&spnfs->spnfs_wq);
-+		goto out;
-+	}
++	struct page *first, *last;
++	loff_t foff, i_size = i_size_read(inode);
++	pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
++	size_t range;
 +
-+	ret = -EINVAL;
-+	/* Did we match the current upcall? */
-+	/* DMXXX: do not understand the comment above, from original code */
-+	/* DMXXX: when do we _not_ match the current upcall? */
-+	/* DMXXX: anyway, let's to a simplistic check */
-+	if (im_in->im_type == im->im_type) {
-+		/* copy the response into the spnfs struct */
-+		memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res));
-+		ret = mlen;
-+	} else
-+		dprintk("spnfs: downcall type != upcall type\n");
++	first = list_entry((pages)->prev, struct page, lru);
++	last = list_entry((pages)->next, struct page, lru);
 +
++	foff = (loff_t)first->index << PAGE_CACHE_SHIFT;
 +
-+	wake_up(&spnfs->spnfs_wq);
-+/* DMXXX handle rval processing */
-+out:
-+	mutex_unlock(&spnfs->spnfs_plock);
-+	kfree(im_in);
-+	return ret;
++	range = (last->index - first->index) * PAGE_CACHE_SIZE;
++	if (last->index == end_index)
++		range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
++	else
++		range += PAGE_CACHE_SIZE;
++	dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff,
++		range);
++	*offset = foff;
++	*count = range;
 +}
 +
-+static void
-+spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg)
++void
++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
 +{
-+	struct spnfs_msg *im = msg->data;
-+	struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im);
++	struct pnfs_layout_hdr *lo;
++	struct pnfs_layoutdriver_type *ld;
 +
-+	if (msg->errno >= 0)
++	pgio->pg_test = NULL;
++
++	lo = NFS_I(inode)->layout;
++	ld = NFS_SERVER(inode)->pnfs_curr_ld;
++	if (!ld || !lo)
 +		return;
-+	mutex_lock(&spnfs->spnfs_plock);
-+	im->im_status = SPNFS_STATUS_FAIL;  /* DMXXX */
-+	wake_up(&spnfs->spnfs_wq);
-+	mutex_unlock(&spnfs->spnfs_plock);
++
++	pgio->pg_test = ld->pg_test;
 +}
 +
-+/* generic upcall.  called by functions in spnfs_ops.c  */
-+int
-+spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg,
-+		union spnfs_msg_res *res)
++/*
++ * rsize is already set by caller to MDS rsize.
++ */
++void
++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
++		  struct inode *inode,
++		  struct nfs_open_context *ctx,
++		  struct list_head *pages,
++		  size_t *rsize)
 +{
-+	struct rpc_pipe_msg msg;
-+	struct spnfs_msg *im;
-+	DECLARE_WAITQUEUE(wq, current);
-+	int ret = -EIO;
-+	int rval;
++	struct nfs_server *nfss = NFS_SERVER(inode);
++	size_t count = 0;
++	loff_t loff;
 +
-+	im = &spnfs->spnfs_im;
++	pgio->pg_iswrite = 0;
++	pgio->pg_test = NULL;
++	pgio->pg_lseg = NULL;
 +
-+	mutex_lock(&spnfs->spnfs_lock);
-+	mutex_lock(&spnfs->spnfs_plock);
++	if (!pnfs_enabled_sb(nfss))
++		return;
 +
-+	memset(im, 0, sizeof(*im));
-+	memcpy(im, upmsg, sizeof(*upmsg));
++	readahead_range(inode, pages, &loff, &count);
++	pgio->pg_lseg = pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ);
++	if (pgio->pg_lseg) {
++		pnfs_set_pg_test(inode, pgio);
++		*rsize = NFS_SERVER(inode)->ds_rsize;
++	}
++}
 +
-+	memset(&msg, 0, sizeof(msg));
-+	msg.data = im;
-+	msg.len = sizeof(*im);
++void
++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
++		       size_t *wsize)
++{
++	struct nfs_server *server = NFS_SERVER(inode);
 +
-+	add_wait_queue(&spnfs->spnfs_wq, &wq);
-+	rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg);
-+	if (rval < 0) {
-+		remove_wait_queue(&spnfs->spnfs_wq, &wq);
-+		goto out;
++	pgio->pg_iswrite = 1;
++	if (!pnfs_enabled_sb(server))
++		pgio->pg_test = NULL;
++	else {
++		pnfs_set_pg_test(inode, pgio);
++		*wsize = server->ds_wsize;
 +	}
++}
 +
-+	set_current_state(TASK_UNINTERRUPTIBLE);
-+	mutex_unlock(&spnfs->spnfs_plock);
-+	schedule();
-+	current->state = TASK_RUNNING;
-+	remove_wait_queue(&spnfs->spnfs_wq, &wq);
-+	mutex_lock(&spnfs->spnfs_plock);
++/* Set buffer size for data servers */
++void
++pnfs_set_ds_iosize(struct nfs_server *server)
++{
++	unsigned dssize = 0;
 +
-+	if (im->im_status & SPNFS_STATUS_SUCCESS) {
-+		/* copy our result from the upcall */
-+		memcpy(res, &im->im_res, sizeof(*res));
-+		ret = 0;
++	if (server->pnfs_curr_ld && server->pnfs_curr_ld->get_blocksize)
++		dssize = server->pnfs_curr_ld->get_blocksize();
++	if (dssize)
++		server->ds_rsize = server->ds_wsize =
++			nfs_block_size(dssize, NULL);
++	else {
++		server->ds_wsize = server->wsize;
++		server->ds_rsize = server->rsize;
 +	}
++}
 +
-+out:
-+	memset(im, 0, sizeof(*im));
-+	mutex_unlock(&spnfs->spnfs_plock);
-+	mutex_unlock(&spnfs->spnfs_lock);
-+	return(ret);
++static int
++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data)
++{
++	put_lseg(pdata->lseg);
++	pdata->lseg = NULL;
++	pdata->call_ops->rpc_call_done(task, data);
++	if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN)
++		return -EAGAIN;
++	if (pdata->pnfsflags & PNFS_NO_RPC) {
++		pdata->call_ops->rpc_release(data);
++	} else {
++		/*
++		 * just restore original rpc call ops
++		 * rpc_release will be called later by the rpc scheduling layer.
++		 */
++		task->tk_ops = pdata->call_ops;
++	}
++	return 0;
 +}
 +
-+/*
-+ * This is used to determine if the spnfsd daemon has been started at
-+ * least once since the system came up.  This is used to by the export
-+ * mechanism to decide if spnfs is in use.
++/* Post-write completion function
++ * Invoked by all layout drivers when write_pagelist is done.
 + *
-+ * Returns non-zero if the spnfsd has initialized the communication pipe
-+ * at least once.
++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC
++ * so that the NFS cleanup routines perform only the page cache
++ * cleanup.
 + */
-+int spnfs_enabled(void)
++static void
++pnfs_write_retry(struct work_struct *work)
 +{
-+	return spnfs_enabled_at_some_point;
++	struct rpc_task *task;
++	struct nfs_write_data *wdata;
++	struct pnfs_layout_range range;
++
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work);
++	wdata = container_of(task, struct nfs_write_data, task);
++	range.iomode = IOMODE_RW;
++	range.offset = wdata->args.offset;
++	range.length = wdata->args.count;
++	_pnfs_return_layout(wdata->inode, &range, true);
++	pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode),
++			    wdata->pdata.call_ops, wdata->pdata.how);
 +}
 +
-+#ifdef CONFIG_PROC_FS
++void
++pnfs_writeback_done(struct nfs_write_data *data)
++{
++	struct pnfs_call_data *pdata = &data->pdata;
 +
-+/*
-+ * procfs virtual files for user/kernel space communication:
-+ *
-+ * ctl - currently just an on/off switch...can be expanded
-+ * getfh - fd to fh conversion
-+ * recall - recall a layout from the command line, for example:
-+ *		echo <path> > /proc/fs/spnfs/recall
-+ * config - configuration info, e.g., stripe size, num ds, etc.
-+ */
++	dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
 +
-+/*-------------- start ctl -------------------------*/
-+static ssize_t ctl_write(struct file *file, const char __user *buf,
-+			 size_t count, loff_t *offset)
-+{
-+	int cmd, rc;
++	/* update last write offset and need layout commit
++	 * for non-files layout types (files layout calls
++	 * pnfs4_write_done for this)
++	 */
++	if ((pdata->pnfsflags & PNFS_NO_RPC) &&
++	    data->task.tk_status >= 0 && data->res.count > 0) {
++		struct nfs_inode *nfsi = NFS_I(data->inode);
 +
-+	if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int)))
-+		return -EFAULT;
-+	if (cmd) {
-+		rc = nfsd_spnfs_new();
-+		if (rc != 0)
-+			return rc;
-+	} else
-+		nfsd_spnfs_delete();
++		pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
++		pnfs_need_layoutcommit(nfsi, data->args.context);
++	}
 +
-+	return count;
++	if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
++		INIT_WORK(&data->task.u.tk_work, pnfs_write_retry);
++		queue_work(nfsiod_workqueue, &data->task.u.tk_work);
++	}
 +}
++EXPORT_SYMBOL_GPL(pnfs_writeback_done);
 +
-+static const struct file_operations ctl_ops = {
-+	.write		= ctl_write,
-+};
-+/*-------------- end ctl ---------------------------*/
++static void _pnfs_clear_lseg_from_pages(struct list_head *head)
++{
++	struct nfs_page *req;
 +
-+/*-------------- start config -------------------------*/
-+static ssize_t config_write(struct file *file, const char __user *buf,
-+			    size_t count, loff_t *offset)
++	list_for_each_entry(req, head, wb_list) {
++		put_lseg(req->wb_lseg);
++		req->wb_lseg = NULL;
++	}
++}
++
++/*
++ * Call the appropriate parallel I/O subsystem write function.
++ * If no I/O device driver exists, or one does not match the returned
++ * fstype, then return a positive status for regular NFS processing.
++ *
++ * TODO: Is wdata->how and wdata->args.stable always the same value?
++ * TODO: It seems in NFS, the server may not do a stable write even
++ * though it was requested (and vice-versa?).  To check, it looks
++ * in data->res.verf->committed.  Do we need this ability
++ * for non-file layout drivers?
++ */
++enum pnfs_try_status
++pnfs_try_to_write_data(struct nfs_write_data *wdata,
++			const struct rpc_call_ops *call_ops, int how)
 +{
-+	static struct spnfs_config cfg;
++	struct inode *inode = wdata->inode;
++	enum pnfs_try_status trypnfs;
++	struct nfs_server *nfss = NFS_SERVER(inode);
++	struct pnfs_layout_segment *lseg = wdata->req->wb_lseg;
 +
-+	if (copy_from_user(&cfg, buf, count))
-+		return -EFAULT;
++	wdata->pdata.call_ops = call_ops;
++	wdata->pdata.pnfs_error = 0;
++	wdata->pdata.how = how;
 +
-+	spnfs_config = &cfg;
-+	return 0;
-+}
++	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
++		inode->i_ino, wdata->args.count, wdata->args.offset, how);
 +
-+static const struct file_operations config_ops = {
-+	.write		= config_write,
-+};
-+/*-------------- end config ---------------------------*/
++	get_lseg(lseg);
 +
-+/*-------------- start getfh -----------------------*/
-+static int getfh_open(struct inode *inode, struct file *file)
-+{
-+	file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
-+	if (file->private_data == NULL)
-+		return -ENOMEM;
++	if (!pnfs_use_rpc(nfss))
++		wdata->pdata.pnfsflags |= PNFS_NO_RPC;
++	wdata->pdata.lseg = lseg;
++	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata,
++		nfs_page_array_len(wdata->args.pgbase, wdata->args.count),
++		how);
 +
-+	return 0;
++	if (trypnfs == PNFS_NOT_ATTEMPTED) {
++		wdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
++		wdata->pdata.lseg = NULL;
++		put_lseg(lseg);
++		_pnfs_clear_lseg_from_pages(&wdata->pages);
++	} else {
++		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
++	}
++	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
++	return trypnfs;
 +}
 +
-+static ssize_t getfh_read(struct file *file, char __user *buf, size_t count,
-+			  loff_t *offset)
++/* Post-read completion function.  Invoked by all layout drivers when
++ * read_pagelist is done
++ */
++static void
++pnfs_read_retry(struct work_struct *work)
 +{
-+	if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh)))
-+		return -EFAULT;
++	struct rpc_task *task;
++	struct nfs_read_data *rdata;
++	struct pnfs_layout_range range;
 +
-+	return count;
++	dprintk("%s enter\n", __func__);
++	task = container_of(work, struct rpc_task, u.tk_work);
++	rdata = container_of(task, struct nfs_read_data, task);
++	range.iomode = IOMODE_RW;
++	range.offset = rdata->args.offset;
++	range.length = rdata->args.count;
++	_pnfs_return_layout(rdata->inode, &range, true);
++	pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode),
++			   rdata->pdata.call_ops);
 +}
 +
-+static ssize_t getfh_write(struct file *file, const char __user *buf,
-+			   size_t count, loff_t *offset)
++void
++pnfs_read_done(struct nfs_read_data *data)
 +{
-+	int fd;
++	struct pnfs_call_data *pdata = &data->pdata;
 +
-+	if (copy_from_user((int *)&fd, (int *)buf, sizeof(int)))
-+		return -EFAULT;
-+	if (spnfs_getfh(fd, file->private_data) != 0)
-+		return -EIO;
++	dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
 +
-+	return count;
++	if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
++		INIT_WORK(&data->task.u.tk_work, pnfs_read_retry);
++		queue_work(nfsiod_workqueue, &data->task.u.tk_work);
++	}
 +}
++EXPORT_SYMBOL_GPL(pnfs_read_done);
 +
-+static int getfh_release(struct inode *inode, struct file *file)
++/*
++ * Call the appropriate parallel I/O subsystem read function.
++ * If no I/O device driver exists, or one does not match the returned
++ * fstype, then return a positive status for regular NFS processing.
++ */
++enum pnfs_try_status
++pnfs_try_to_read_data(struct nfs_read_data *rdata,
++		       const struct rpc_call_ops *call_ops)
 +{
-+	kfree(file->private_data);
-+	return 0;
-+}
++	struct inode *inode = rdata->inode;
++	struct nfs_server *nfss = NFS_SERVER(inode);
++	struct pnfs_layout_segment *lseg = rdata->req->wb_lseg;
++	enum pnfs_try_status trypnfs;
 +
-+static const struct file_operations getfh_ops = {
-+	.open		= getfh_open,
-+	.read		= getfh_read,
-+	.write		= getfh_write,
-+	.release	= getfh_release,
-+};
-+/*-------------- end getfh ------------------------*/
++	rdata->pdata.call_ops = call_ops;
++	rdata->pdata.pnfs_error = 0;
 +
++	dprintk("%s: Reading ino:%lu %u@%llu\n",
++		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);
 +
-+/*-------------- start recall layout --------------*/
-+static ssize_t recall_write(struct file *file, const char __user *buf,
-+			    size_t count, loff_t *offset)
-+{
-+	char input[128];
-+	char *path, *str, *p;
-+	int rc;
-+	u64 off = 0, len = 0;
++	get_lseg(lseg);
 +
-+	if (count > 128)
-+		return -EINVAL;
++	if (!pnfs_use_rpc(nfss))
++		rdata->pdata.pnfsflags |= PNFS_NO_RPC;
++	rdata->pdata.lseg = lseg;
++	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata,
++		nfs_page_array_len(rdata->args.pgbase, rdata->args.count));
++	if (trypnfs == PNFS_NOT_ATTEMPTED) {
++		rdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
++		rdata->pdata.lseg = NULL;
++		put_lseg(lseg);
++		_pnfs_clear_lseg_from_pages(&rdata->pages);
++	} else {
++		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
++	}
++	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
++	return trypnfs;
++}
 +
-+	if (copy_from_user(input, buf, count))
-+		return -EFAULT;
++/*
++ * This gives the layout driver an opportunity to read in page "around"
++ * the data to be written.  It returns 0 on success, otherwise an error code
++ * which will either be passed up to user, or ignored if
++ * some previous part of write succeeded.
++ * Note the range [pos, pos+len-1] is entirely within the page.
++ */
++int _pnfs_write_begin(struct inode *inode, struct page *page,
++		      loff_t pos, unsigned len,
++		      struct pnfs_layout_segment *lseg,
++		      struct pnfs_fsdata **fsdata)
++{
++	struct pnfs_fsdata *data;
++	int status = 0;
 +
-+	/* assumes newline-terminated path */
-+	p = memchr(input, '\n', count);
-+	if (p == NULL)
-+		return -EINVAL;
-+	*p = '\0';
++	dprintk("--> %s: pos=%llu len=%u\n",
++		__func__, (unsigned long long)pos, len);
++	data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
++	if (!data) {
++		status = -ENOMEM;
++		goto out;
++	}
++	data->lseg = lseg; /* refcount passed into data to be managed there */
++	status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin(
++						lseg, page, pos, len, data);
++	if (status) {
++		kfree(data);
++		data = NULL;
++	}
++out:
++	*fsdata = data;
++	dprintk("<-- %s: status=%d\n", __func__, status);
++	return status;
++}
 +
-+	/*
-+	 * Scan for path and, optionally, an offset and length
-+	 * of a layout segment to be recalled; if there are two
-+	 * fields, they're assumed to be path and offset.
-+	 */
-+	p = input;
-+	path = strsep(&p, " ");
-+	if (path == NULL)
-+		return -EINVAL;
++/* pNFS Commit callback function for all layout drivers */
++void
++pnfs_commit_done(struct nfs_write_data *data)
++{
++	struct pnfs_call_data *pdata = &data->pdata;
 +
-+	str = strsep(&p, " ");
-+	if (str != NULL) {
-+		rc = strict_strtoull(str, 10, &off);
-+		if (rc != 0)
-+			return -EINVAL;
++	dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
 +
-+		str = strsep(&p, " ");
-+		if (str != NULL) {
-+			rc = strict_strtoull(str, 10, &len);
-+			if (rc != 0)
-+				return -EINVAL;
-+		}
++	if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
++		struct pnfs_layout_range range = {
++			.iomode = IOMODE_RW,
++			.offset = data->args.offset,
++			.length = data->args.count,
++		};
++		dprintk("%s: retrying\n", __func__);
++		_pnfs_return_layout(data->inode, &range, true);
++		pnfs_initiate_commit(data, NFS_CLIENT(data->inode),
++				     pdata->call_ops, pdata->how, 1);
 +	}
++}
++EXPORT_SYMBOL_GPL(pnfs_commit_done);
 +
-+	rc = spnfs_test_layoutrecall(path, off, len);
-+	if (rc != 0)
-+		return rc;
++enum pnfs_try_status
++pnfs_try_to_commit(struct nfs_write_data *data,
++		    const struct rpc_call_ops *call_ops, int sync)
++{
++	struct inode *inode = data->inode;
++	struct nfs_server *nfss = NFS_SERVER(data->inode);
++	enum pnfs_try_status trypnfs;
 +
-+	return count;
++	dprintk("%s: Begin\n", __func__);
++
++	if (!pnfs_use_rpc(nfss))
++		data->pdata.pnfsflags |= PNFS_NO_RPC;
++	/* We need to account for possibility that
++	 * each nfs_page can point to a different lseg (or be NULL).
++	 * For the immediate case of whole-file-only layouts, we at
++	 * least know there can be only a single lseg.
++	 * We still have to account for the possibility of some being NULL.
++	 * This will be done by passing the buck to the layout driver.
++	 */
++	data->pdata.call_ops = call_ops;
++	data->pdata.pnfs_error = 0;
++	data->pdata.how = sync;
++	data->pdata.lseg = NULL;
++	trypnfs = nfss->pnfs_curr_ld->commit(data, sync);
++	if (trypnfs == PNFS_NOT_ATTEMPTED) {
++		data->pdata.pnfsflags &= ~PNFS_NO_RPC;
++		_pnfs_clear_lseg_from_pages(&data->pages);
++	} else
++		nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT);
++	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
++	return trypnfs;
 +}
 +
-+static const struct file_operations recall_ops = {
-+	.write		= recall_write,
-+};
-+/*-------------- end recall layout --------------*/
++void pnfs_cleanup_layoutcommit(struct inode *inode,
++			       struct nfs4_layoutcommit_data *data)
++{
++	struct nfs_server *nfss = NFS_SERVER(inode);
 +
++	/* TODO: Maybe we should avoid this by allowing the layout driver
++	* to directly xdr its layout on the wire.
++	*/
++	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
++		nfss->pnfs_curr_ld->cleanup_layoutcommit(
++					NFS_I(inode)->layout, data);
++}
 +
-+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
-+/*-------------- start layoutseg -------------------------*/
-+static ssize_t layoutseg_write(struct file *file, const char __user *buf,
-+			       size_t count, loff_t *offset)
++/*
++ * Set up the argument/result storage required for the RPC call.
++ */
++static int
++pnfs_setup_layoutcommit(struct inode *inode,
++			struct nfs4_layoutcommit_data *data,
++			loff_t write_begin_pos, loff_t write_end_pos)
 +{
-+	char cmd[3];
++	struct nfs_server *nfss = NFS_SERVER(inode);
++	int result = 0;
 +
-+	if (copy_from_user(cmd, buf, 1))
-+		return -EFAULT;
-+	if (cmd[0] == '0')
-+		spnfs_use_layoutsegments = 0;
-+	else
-+		spnfs_use_layoutsegments = 1;
++	dprintk("--> %s\n", __func__);
 +
-+	return count;
-+}
++	data->args.inode = inode;
++	data->args.fh = NFS_FH(inode);
++	data->args.layout_type = nfss->pnfs_curr_ld->id;
++	data->res.fattr = &data->fattr;
++	nfs_fattr_init(&data->fattr);
 +
-+static const struct file_operations layoutseg_ops = {
-+	.write		= layoutseg_write,
-+};
-+/*-------------- end layoutseg ---------------------------*/
++	/* TODO: Need to determine the correct values */
++	data->args.time_modify_changed = 0;
 +
-+/*-------------- start layoutsegsize -------------------------*/
-+static ssize_t layoutsegsize_write(struct file *file, const char __user *buf,
-+				   size_t count, loff_t *offset)
-+{
-+	char cmd[50];
++	/* Set values from inode so it can be reset
++	 */
++	data->args.range.iomode = IOMODE_RW;
++	data->args.range.offset = write_begin_pos;
++	data->args.range.length = write_end_pos - write_begin_pos + 1;
++	data->args.lastbytewritten =  min(write_end_pos,
++					  i_size_read(inode) - 1);
++	data->args.bitmask = nfss->attr_bitmask;
++	data->res.server = nfss;
 +
-+	if (copy_from_user(cmd, buf, 49))
-+		return -EFAULT;
-+	layoutsegment_size = simple_strtoull(cmd, NULL, 10);
++	/* Call layout driver to set the arguments */
++	if (nfss->pnfs_curr_ld->setup_layoutcommit)
++		result = nfss->pnfs_curr_ld->setup_layoutcommit(
++				NFS_I(inode)->layout, &data->args);
 +
-+	return count;
++	dprintk("<-- %s Status %d\n", __func__, result);
++	return result;
 +}
 +
-+static const struct file_operations layoutsegsize_ops = {
-+	.write		= layoutsegsize_write,
-+};
-+/*-------------- end layoutsegsize ---------------------------*/
-+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
-+
++/* Issue an async layoutcommit for an inode.
++ */
 +int
-+spnfs_init_proc(void)
++pnfs_layoutcommit_inode(struct inode *inode, int sync)
 +{
-+	struct proc_dir_entry *entry;
-+
-+	entry = proc_mkdir("fs/spnfs", NULL);
-+	if (!entry)
-+		return -ENOMEM;
-+
-+	entry = create_proc_entry("fs/spnfs/ctl", 0, NULL);
-+	if (!entry)
-+		return -ENOMEM;
-+	entry->proc_fops = &ctl_ops;
-+
-+	entry = create_proc_entry("fs/spnfs/config", 0, NULL);
-+	if (!entry)
-+		return -ENOMEM;
-+	entry->proc_fops = &config_ops;
++	struct nfs4_layoutcommit_data *data;
++	struct nfs_inode *nfsi = NFS_I(inode);
++	loff_t write_begin_pos;
++	loff_t write_end_pos;
 +
-+	entry = create_proc_entry("fs/spnfs/getfh", 0, NULL);
-+	if (!entry)
-+		return -ENOMEM;
-+	entry->proc_fops = &getfh_ops;
++	int status = 0;
 +
-+	entry = create_proc_entry("fs/spnfs/recall", 0, NULL);
-+	if (!entry)
-+		return -ENOMEM;
-+	entry->proc_fops = &recall_ops;
++	dprintk("%s Begin (sync:%d)\n", __func__, sync);
 +
-+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
-+	entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL);
-+	if (!entry)
-+		return -ENOMEM;
-+	entry->proc_fops = &layoutseg_ops;
++	BUG_ON(!has_layout(nfsi));
 +
-+	entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL);
-+	if (!entry)
++	data = kzalloc(sizeof(*data), GFP_NOFS);
++	if (!data)
 +		return -ENOMEM;
-+	entry->proc_fops = &layoutsegsize_ops;
-+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
 +
-+	return 0;
-+}
-+#endif /* CONFIG_PROC_FS */
-diff --git a/fs/nfsd/spnfs_ops.c b/fs/nfsd/spnfs_ops.c
-new file mode 100644
-index 0000000..b97a5af
---- /dev/null
-+++ b/fs/nfsd/spnfs_ops.c
-@@ -0,0 +1,878 @@
-+/*
-+ * fs/nfsd/spnfs_ops.c
-+ *
-+ * Communcation layer between spNFS kernel and userspace
-+ *
-+ */
-+/******************************************************************************
++	spin_lock(&inode->i_lock);
++	if (!layoutcommit_needed(nfsi)) {
++		spin_unlock(&inode->i_lock);
++		goto out_free;
++	}
 +
-+(c) 2007 Network Appliance, Inc.  All Rights Reserved.
++	/* Clear layoutcommit properties in the inode so
++	 * new lc info can be generated
++	 */
++	write_begin_pos = nfsi->layout->write_begin_pos;
++	write_end_pos = nfsi->layout->write_end_pos;
++	data->cred = nfsi->layout->cred;
++	nfsi->layout->write_begin_pos = 0;
++	nfsi->layout->write_end_pos = 0;
++	nfsi->layout->cred = NULL;
++	__clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags);
++	memcpy(data->args.stateid.data, nfsi->layout->stateid.data,
++	       NFS4_STATEID_SIZE);
 +
-+Network Appliance provides this source code under the GPL v2 License.
-+The GPL v2 license is available at
-+http://opensource.org/licenses/gpl-license.php.
++	/* Reference for layoutcommit matched in pnfs_layoutcommit_release */
++	get_layout_hdr(NFS_I(inode)->layout);
 +
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++	spin_unlock(&inode->i_lock);
 +
-+******************************************************************************/
++	/* Set up layout commit args */
++	status = pnfs_setup_layoutcommit(inode, data, write_begin_pos,
++					 write_end_pos);
++	if (status) {
++		/* The layout driver failed to setup the layoutcommit */
++		put_rpccred(data->cred);
++		put_layout_hdr(NFS_I(inode)->layout);
++		goto out_free;
++	}
++	status = nfs4_proc_layoutcommit(data, sync);
++out:
++	dprintk("%s end (err:%d)\n", __func__, status);
++	return status;
++out_free:
++	kfree(data);
+ 	goto out;
+ }
+ 
++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
++{
++	/* lseg refcounting handled directly in nfs_write_end */
++	kfree(fsdata);
++}
 +
-+#include <linux/sched.h>
-+#include <linux/file.h>
-+#include <linux/namei.h>
-+#include <linux/nfs_fs.h>
-+#include <linux/nfsd4_spnfs.h>
-+#include <linux/nfsd/debug.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
+ /*
+  * Device ID cache. Currently supports one layout type per struct nfs_client.
+  * Add layout type to the lookup key to expand to support multiple types.
+@@ -861,6 +1606,25 @@ pnfs_alloc_init_deviceid_cache(struct nf
+ }
+ EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
+ 
++/* Must be called with locked c->dc_lock */
++static struct pnfs_deviceid_node *
++pnfs_unhash_deviceid(struct pnfs_deviceid_cache *c,
++		     struct nfs4_deviceid *id)
++{
++	struct pnfs_deviceid_node *d;
++	struct hlist_node *n;
++	long h = nfs4_deviceid_hash(id);
 +
-+#include "pnfsd.h"
++	dprintk("%s hash %ld\n", __func__, h);
++	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
++		if (!memcmp(&d->de_id, id, sizeof(*id))) {
++			hlist_del_rcu(&d->de_node);
++			return d;
++		}
 +
-+/* comment out CONFIG_SPNFS_TEST for non-test behaviour */
-+/* #define CONFIG_SPNFS_TEST 1 */
++	return NULL;
++}
 +
-+#define	NFSDDBG_FACILITY		NFSDDBG_PNFS
+ /*
+  * Called from pnfs_layoutdriver_type->free_lseg
+  * last layout segment reference frees deviceid
+@@ -869,29 +1633,33 @@ void
+ pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+ 		  struct pnfs_deviceid_node *devid)
+ {
+-	struct nfs4_deviceid *id = &devid->de_id;
+-	struct pnfs_deviceid_node *d;
+-	struct hlist_node *n;
+-	long h = nfs4_deviceid_hash(id);
+-
+ 	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
+ 	if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
+ 		return;
+ 
+-	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
+-		if (!memcmp(&d->de_id, id, sizeof(*id))) {
+-			hlist_del_rcu(&d->de_node);
+-			spin_unlock(&c->dc_lock);
+-			synchronize_rcu();
+-			c->dc_free_callback(devid);
+-			return;
+-		}
++	pnfs_unhash_deviceid(c, &devid->de_id);
+ 	spin_unlock(&c->dc_lock);
+-	/* Why wasn't it found in  the list? */
+-	BUG();
++	synchronize_rcu();
++	c->dc_free_callback(devid);
+ }
+ EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
+ 
++void
++pnfs_delete_deviceid(struct pnfs_deviceid_cache *c,
++		     struct nfs4_deviceid *id)
++{
++	struct pnfs_deviceid_node *devid;
 +
-+/*
-+ * The functions that are called from elsewhere in the kernel
-+ * to perform tasks in userspace
-+ *
-+ */
++	spin_lock(&c->dc_lock);
++	devid = pnfs_unhash_deviceid(c, id);
++	spin_unlock(&c->dc_lock);
++	synchronize_rcu();
++	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
++	if (atomic_dec_and_test(&devid->de_ref))
++		c->dc_free_callback(devid);
++}
++EXPORT_SYMBOL_GPL(pnfs_delete_deviceid);
 +
-+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
-+extern int spnfs_use_layoutsegments;
-+extern uint64_t layoutsegment_size;
-+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
-+extern struct spnfs *global_spnfs;
+ /* Find and reference a deviceid */
+ struct pnfs_deviceid_node *
+ pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
+diff -up linux-2.6.37.noarch/fs/nfs/pnfs.h.orig linux-2.6.37.noarch/fs/nfs/pnfs.h
+--- linux-2.6.37.noarch/fs/nfs/pnfs.h.orig	2011-01-28 09:37:32.549979704 -0500
++++ linux-2.6.37.noarch/fs/nfs/pnfs.h	2011-01-28 09:43:53.341771581 -0500
+@@ -30,17 +30,31 @@
+ #ifndef FS_NFS_PNFS_H
+ #define FS_NFS_PNFS_H
+ 
++#include <linux/nfs_page.h>
++#include "callback.h"
 +
-+int
-+spnfs_layout_type(struct super_block *sb)
-+{
-+	return LAYOUT_NFSV4_1_FILES;
-+}
+ enum {
+ 	NFS_LSEG_VALID = 0,	/* cleared when lseg is recalled/returned */
+-	NFS_LSEG_ROC,		/* roc bit received from server */
+ };
+ 
+ struct pnfs_layout_segment {
+-	struct list_head pls_list;
+-	struct pnfs_layout_range pls_range;
++	struct list_head fi_list;
++	struct pnfs_layout_range range;
+ 	atomic_t pls_refcount;
+ 	unsigned long pls_flags;
+-	struct pnfs_layout_hdr *pls_layout;
++	struct pnfs_layout_hdr *layout;
++	u64 pls_notify_mask;
++};
 +
-+enum nfsstat4
-+spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr,
-+		const struct nfsd4_pnfs_layoutget_arg *lg_arg,
-+		struct nfsd4_pnfs_layoutget_res *lg_res)
-+{
-+	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
-+	struct spnfs_msg *im = NULL;
-+	union spnfs_msg_res *res = NULL;
-+	struct pnfs_filelayout_layout *flp = NULL;
-+	int status, i;
-+	enum nfsstat4 nfserr;
++enum pnfs_try_status {
++	PNFS_ATTEMPTED     = 0,
++	PNFS_NOT_ATTEMPTED = 1,
++};
 +
-+	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+	if (im == NULL) {
-+		nfserr = NFS4ERR_LAYOUTTRYLATER;
-+		goto layoutget_cleanup;
-+	}
++struct pnfs_fsdata {
++	struct pnfs_layout_segment *lseg;
++	int bypass_eof;
++	void *private;
+ };
+ 
+ #ifdef CONFIG_NFS_V4_1
+@@ -51,8 +65,15 @@ enum {
+ 	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
+ 	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
+ 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
+-	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
+-	NFS_LAYOUT_DESTROYED,		/* no new use of layout allowed */
++	NFS_LAYOUT_NEED_LCOMMIT,	/* LAYOUTCOMMIT needed */
++};
 +
-+	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
-+	if (res == NULL) {
-+		nfserr = NFS4ERR_LAYOUTTRYLATER;
-+		goto layoutget_cleanup;
-+	}
++enum layoutdriver_policy_flags {
++	/* Should the full nfs rpc cleanup code be used after io */
++	PNFS_USE_RPC_CODE		= 1 << 0,
 +
-+	im->im_type = SPNFS_TYPE_LAYOUTGET;
-+	im->im_args.layoutget_args.inode = inode->i_ino;
-+	im->im_args.layoutget_args.generation = inode->i_generation;
++	/* Should the pNFS client commit and return the layout upon a setattr */
++	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 1,
+ };
+ 
+ /* Per-layout driver specific registration structure */
+@@ -61,23 +82,88 @@ struct pnfs_layoutdriver_type {
+ 	const u32 id;
+ 	const char *name;
+ 	struct module *owner;
+-	int (*set_layoutdriver) (struct nfs_server *);
++	unsigned flags;
++	int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
+ 	int (*clear_layoutdriver) (struct nfs_server *);
 +
-+	/* call function to queue the msg for upcall */
-+	if (spnfs_upcall(spnfs, im, res) != 0) {
-+		dprintk("failed spnfs upcall: layoutget\n");
-+		nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
-+		goto layoutget_cleanup;
-+	}
-+	status = res->layoutget_res.status;
-+	if (status != 0) {
-+		/* FIXME? until user mode is fixed, translate system error */
-+		switch (status) {
-+		case -E2BIG:
-+		case -ETOOSMALL:
-+			nfserr = NFS4ERR_TOOSMALL;
-+			break;
-+		case -ENOMEM:
-+		case -EAGAIN:
-+		case -EINTR:
-+			nfserr = NFS4ERR_LAYOUTTRYLATER;
-+			break;
-+		case -ENOENT:
-+			nfserr = NFS4ERR_BADLAYOUT;
-+			break;
-+ 		default:
-+			nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
-+		}
-+		dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n",
-+			status, nfserr);
-+		goto layoutget_cleanup;
-+	}
++	struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode);
++	void (*free_layout_hdr) (struct pnfs_layout_hdr *);
 +
-+	lg_res->lg_return_on_close = 0;
-+#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
-+	/* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */
-+	/* the amount requested by the client.			      */
-+	if (spnfs_use_layoutsegments) {
-+		if (layoutsegment_size != 0)
-+			lg_res->lg_seg.length = layoutsegment_size;
-+	} else
-+		lg_res->lg_seg.length = NFS4_MAX_UINT64;
-+#else
-+	lg_res->lg_seg.length = NFS4_MAX_UINT64;
-+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
+ 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
+ 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 +
-+	flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL);
-+	if (flp == NULL) {
-+		nfserr = NFS4ERR_LAYOUTTRYLATER;
-+		goto layoutget_cleanup;
-+	}
-+	flp->device_id.sbid = lg_arg->lg_sbid;
-+	flp->device_id.devid = res->layoutget_res.devid;
-+	flp->lg_layout_type = 1; /* XXX */
-+	flp->lg_stripe_type = res->layoutget_res.stripe_type;
-+	flp->lg_commit_through_mds = 0;
-+	flp->lg_stripe_unit =  res->layoutget_res.stripe_size;
-+	flp->lg_first_stripe_index = 0;
-+	flp->lg_pattern_offset = 0;
-+	flp->lg_fh_length = res->layoutget_res.stripe_count;
++	/* test for nfs page cache coalescing */
++	int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
 +
-+	flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh),
-+				  GFP_KERNEL);
-+	if (flp->lg_fh_list == NULL) {
-+		nfserr = NFS4ERR_LAYOUTTRYLATER;
-+		goto layoutget_cleanup;
-+	}
-+	/*
-+	 * FIX: Doing an extra copy here.  Should group res.flist's fh_len
-+	 * and fh_val into a knfsd_fh structure.
++	/* Retrieve the block size of the file system.
++	 * If gather_across_stripes == 1, then the file system will gather
++	 * requests into the block size.
++	 * TODO: Where will the layout driver get this info?  It is hard
++	 * coded in PVFS2.
 +	 */
-+	for (i = 0; i < flp->lg_fh_length; i++) {
-+		flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len;
-+		memcpy(&flp->lg_fh_list[i].fh_base,
-+		       res->layoutget_res.flist[i].fh_val,
-+		       res->layoutget_res.flist[i].fh_len);
-+	}
++	ssize_t (*get_blocksize) (void);
 +
-+	/* encode the layoutget body */
-+	nfserr = filelayout_encode_layout(xdr, flp);
++	/* read and write pagelist should return just PNFS_ATTEMPTED (0: the
++	 * layout code has taken control) or PNFS_NOT_ATTEMPTED (1: the layout
++	 * code wishes to fall back to normal nfs).  If PNFS_ATTEMPTED is
++	 * returned, information can be passed back through nfs_data->res and
++	 * nfs_data->task.tk_status, and the appropriate pnfs done function
++	 * MUST be called.
++	 */
++	enum pnfs_try_status
++	(*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages);
++	enum pnfs_try_status
++	(*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how);
++	int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page,
++			    loff_t pos, unsigned count,
++			    struct pnfs_fsdata *fsdata);
++	int (*write_end)(struct inode *inode, struct page *page, loff_t pos,
++			 unsigned count, unsigned copied,
++			 struct pnfs_layout_segment *lseg);
++	void (*write_end_cleanup)(struct file *filp,
++				  struct pnfs_fsdata *fsdata);
 +
-+layoutget_cleanup:
-+	if (flp) {
-+		if (flp->lg_fh_list)
-+			kfree(flp->lg_fh_list);
-+		kfree(flp);
-+	}
-+	kfree(im);
-+	kfree(res);
++	/* Consistency ops */
++	/* 2 problems:
++	 * 1) the page list contains nfs_pages, NOT pages
++	 * 2) currently the NFS code doesn't create a page array (as it does with read/write)
++	 */
++	enum pnfs_try_status
++	(*commit) (struct nfs_write_data *nfs_data, int how);
 +
-+	return nfserr;
-+}
++	int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
++				   struct nfs4_layoutcommit_args *args);
 +
-+int
-+spnfs_layoutcommit(void)
-+{
-+	return 0;
-+}
++	void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
++				     struct xdr_stream *xdr,
++				     const struct nfs4_layoutcommit_args *args);
 +
-+int
-+spnfs_layoutreturn(struct inode *inode,
-+		   const struct nfsd4_pnfs_layoutreturn_arg *args)
-+{
-+	return 0;
-+}
++	void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
++				      struct nfs4_layoutcommit_data *data);
 +
-+int
-+spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
-+{
-+	struct super_block *sb;
-+	struct nfsd4_pnfs_cb_layout lr;
++	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
++				     struct xdr_stream *xdr,
++				     const struct nfs4_layoutreturn_args *args);
+ };
+ 
+ struct pnfs_layout_hdr {
+ 	atomic_t		plh_refcount;
+-	struct list_head	plh_layouts;   /* other client layouts */
++	struct list_head	layouts;   /* other client layouts */
+ 	struct list_head	plh_bulk_recall; /* clnt list of bulk recalls */
+-	struct list_head	plh_segs;      /* layout segments list */
+-	nfs4_stateid		plh_stateid;
++	struct list_head	segs;      /* layout segments list */
++	int			roc_iomode;/* return on close iomode, 0=none */
++	nfs4_stateid		stateid;
+ 	atomic_t		plh_outstanding; /* number of RPCs out */
+ 	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */
+ 	u32			plh_barrier; /* ignore lower seqids */
+ 	unsigned long		plh_flags;
+-	struct inode		*plh_inode;
++	struct rpc_cred		*cred;     /* layoutcommit credential */
++	/* DH: These vars keep track of the maximum write range
++	 * so the values can be used for layoutcommit.
++	 */
++	loff_t			write_begin_pos;
++	loff_t			write_end_pos;
++	struct inode		*inode;
+ };
+ 
+ struct pnfs_device {
+@@ -90,6 +176,23 @@ struct pnfs_device {
+ 	unsigned int  pglen;
+ };
+ 
++struct pnfs_cb_lrecall_info {
++	struct list_head	pcl_list; /* hook into cl_layoutrecalls list */
++	atomic_t		pcl_count;
++	int			pcl_notify_bit;
++	struct nfs_client	*pcl_clp;
++	struct inode		*pcl_ino;
++	struct cb_layoutrecallargs pcl_args;
++};
 +
-+	switch (type) {
-+	case RETURN_FILE:
-+		sb = inode->i_sb;
-+		dprintk("%s: recalling layout for ino = %lu\n",
-+			__func__, inode->i_ino);
-+		break;
-+	case RETURN_FSID:
-+		sb = inode->i_sb;
-+		dprintk("%s: recalling layout for fsid x (unimplemented)\n",
-+			__func__);
-+		return 0;
-+	case RETURN_ALL:
-+		/* XXX figure out how to get a sb since there's no inode ptr */
-+		dprintk("%s: recalling all layouts (unimplemented)\n",
-+			__func__);
-+		return 0;
-+	default:
-+		return -EINVAL;
-+	}
++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
 +
-+	lr.cbl_recall_type = type;
-+	lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES;
-+	lr.cbl_seg.clientid = 0;
-+	lr.cbl_seg.offset = offset;
-+	lr.cbl_seg.length = len;
-+	lr.cbl_seg.iomode = IOMODE_ANY;
-+	lr.cbl_layoutchanged = 0;
++struct pnfs_devicelist {
++	unsigned int		eof;
++	unsigned int		num_devs;
++	struct nfs4_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
++};
++
+ /*
+  * Device ID RCU cache. A device ID is unique per client ID and layout type.
+  */
+@@ -135,22 +238,55 @@ extern struct pnfs_deviceid_node *pnfs_a
+ 				struct pnfs_deviceid_node *);
+ extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+ 			      struct pnfs_deviceid_node *devid);
++extern void pnfs_delete_deviceid(struct pnfs_deviceid_cache *,
++				 struct nfs4_deviceid *);
+ 
+ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
+ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+ 
+ /* nfs4proc.c */
++extern int nfs4_proc_getdevicelist(struct nfs_server *server,
++				   const struct nfs_fh *fh,
++				   struct pnfs_devicelist *devlist);
+ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+ 				   struct pnfs_device *dev);
+ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
++extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
++				   int issync);
++extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
+ 
+ /* pnfs.c */
+ void get_layout_hdr(struct pnfs_layout_hdr *lo);
++void get_lseg(struct pnfs_layout_segment *lseg);
++void put_lseg(struct pnfs_layout_segment *lseg);
++bool should_free_lseg(struct pnfs_layout_range *lseg_range,
++		      struct pnfs_layout_range *recall_range);
+ struct pnfs_layout_segment *
+ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+-		   enum pnfs_iomode access_type);
+-void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
++		   loff_t pos, u64 count, enum pnfs_iomode access_type);
++bool pnfs_return_layout_barrier(struct nfs_inode *, struct pnfs_layout_range *);
++int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait);
++void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id);
+ void unset_pnfs_layoutdriver(struct nfs_server *);
++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
++					     const struct rpc_call_ops *, int);
++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
++					    const struct rpc_call_ops *);
++void pnfs_cleanup_layoutcommit(struct inode *,
++			       struct nfs4_layoutcommit_data *);
++int pnfs_layoutcommit_inode(struct inode *inode, int sync);
++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent);
++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx);
++void pnfs_set_ds_iosize(struct nfs_server *server);
++enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
++					 const struct rpc_call_ops *, int);
++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
++			   struct nfs_open_context *, struct list_head *,
++			   size_t *);
++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *,
++			    size_t *);
++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
++bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid);
+ int pnfs_layout_process(struct nfs4_layoutget *lgp);
+ void pnfs_free_lseg_list(struct list_head *tmp_list);
+ void pnfs_destroy_layout(struct nfs_inode *);
+@@ -162,14 +298,23 @@ void pnfs_set_layout_stateid(struct pnfs
+ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
+ 				  struct pnfs_layout_hdr *lo,
+ 				  struct nfs4_state *open_state);
+-int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+-				struct list_head *tmp_list,
+-				u32 iomode);
+-bool pnfs_roc(struct inode *ino);
+-void pnfs_roc_release(struct inode *ino);
+-void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
+-bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
++void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo,
++				struct pnfs_layout_range *range,
++				int notify_bit, atomic_t *notify_count,
++				struct list_head *tmp_list);
++void pnfs_read_done(struct nfs_read_data *);
++void pnfs_writeback_done(struct nfs_write_data *);
++void pnfs_commit_done(struct nfs_write_data *);
++int _pnfs_write_begin(struct inode *inode, struct page *page,
++		      loff_t pos, unsigned len,
++		      struct pnfs_layout_segment *lseg,
++		      struct pnfs_fsdata **fsdata);
+ 
++static inline bool
++has_layout(struct nfs_inode *nfsi)
++{
++	return nfsi->layout != NULL;
++}
+ 
+ static inline int lo_fail_bit(u32 iomode)
+ {
+@@ -183,6 +328,125 @@ static inline int pnfs_enabled_sb(struct
+ 	return nfss->pnfs_curr_ld != NULL;
+ }
+ 
++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
++			       struct pnfs_fsdata *fsdata)
++{
++	return !fsdata  || ((struct pnfs_layout_segment *)fsdata == lseg) ||
++		!fsdata->bypass_eof;
++}
 +
-+	nfsd_layout_recall_cb(sb, inode, &lr);
++/* Should the pNFS client commit and return the layout upon a setattr */
++static inline bool
++pnfs_ld_layoutret_on_setattr(struct inode *inode)
++{
++	if (!pnfs_enabled_sb(NFS_SERVER(inode)))
++		return false;
++	return NFS_SERVER(inode)->pnfs_curr_ld->flags &
++		PNFS_LAYOUTRET_ON_SETATTR;
++}
 +
-+	return 0;
++static inline bool pnfs_use_rpc(struct nfs_server *nfss)
++{
++	if (pnfs_enabled_sb(nfss))
++		return nfss->pnfs_curr_ld->flags & PNFS_USE_RPC_CODE;
++
++	return true;
 +}
 +
++/* Should the pNFS client commit and return the layout on close
++ */
++static inline int
++pnfs_layout_roc_iomode(struct nfs_inode *nfsi)
++{
++	return nfsi->layout->roc_iomode;
++}
 +
-+int
-+spnfs_test_layoutrecall(char *path, u64 offset, u64 len)
++static inline int pnfs_write_begin(struct file *filp, struct page *page,
++				   loff_t pos, unsigned len,
++				   struct pnfs_layout_segment *lseg,
++				   void **fsdata)
 +{
-+	struct nameidata nd;
-+	struct inode *inode;
-+	int type, rc;
++	struct inode *inode = filp->f_dentry->d_inode;
++	struct nfs_server *nfss = NFS_SERVER(inode);
++	int status = 0;
 +
-+	dprintk("%s: path=%s, offset=%llu, len=%llu\n",
-+		__func__, path, offset, len);
++	*fsdata = lseg;
++	if (lseg && nfss->pnfs_curr_ld->write_begin)
++		status = _pnfs_write_begin(inode, page, pos, len, lseg,
++					   (struct pnfs_fsdata **) fsdata);
++	return status;
++}
 +
-+	if (strcmp(path, "all") == 0) {
-+		inode = NULL;
-+		type = RETURN_ALL;
-+	} else {
-+		rc = path_lookup(path, 0, &nd);
-+		if (rc != 0)
-+			return -ENOENT;
++/* CAREFUL - what happens if copied < len??? */
++static inline int pnfs_write_end(struct file *filp, struct page *page,
++				 loff_t pos, unsigned len, unsigned copied,
++				 struct pnfs_layout_segment *lseg)
++{
++	struct inode *inode = filp->f_dentry->d_inode;
++	struct nfs_server *nfss = NFS_SERVER(inode);
 +
-+		/*
-+		 * XXX todo: add a RETURN_FSID scenario here...maybe if
-+		 * inode is a dir...
-+		 */
++	if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end)
++		return nfss->pnfs_curr_ld->write_end(inode, page, pos, len,
++						     copied, lseg);
++	else
++		return 0;
++}
 +
-+		inode = nd.path.dentry->d_inode;
-+		type = RETURN_FILE;
++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
++{
++	struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
++
++	if (fsdata && nfss->pnfs_curr_ld) {
++		if (nfss->pnfs_curr_ld->write_end_cleanup)
++			nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata);
++		if (nfss->pnfs_curr_ld->write_begin)
++			pnfs_free_fsdata(fsdata);
 +	}
++}
 +
-+	if (len == 0)
-+		len = NFS4_MAX_UINT64;
++static inline int pnfs_return_layout(struct inode *ino,
++				     struct pnfs_layout_range *range,
++				     bool wait)
++{
++	struct nfs_inode *nfsi = NFS_I(ino);
++	struct nfs_server *nfss = NFS_SERVER(ino);
 +
-+	rc = spnfs_layoutrecall(inode, type, offset, len);
++	if (pnfs_enabled_sb(nfss) && has_layout(nfsi))
++		return _pnfs_return_layout(ino, range, wait);
 +
-+	if (type != RETURN_ALL)
-+		path_put(&nd.path);
-+	return rc;
++	return 0;
 +}
 +
-+int
-+spnfs_getdeviceiter(struct super_block *sb,
-+		    u32 layout_type,
-+		    struct nfsd4_pnfs_dev_iter_res *gd_res)
++static inline bool
++layoutcommit_needed(struct nfs_inode *nfsi)
 +{
-+	struct spnfs *spnfs = global_spnfs;   /* XXX keep up the pretence */
-+	struct spnfs_msg *im = NULL;
-+	union spnfs_msg_res *res = NULL;
-+	int status = 0;
-+
-+	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+	if (im == NULL) {
-+		status = -ENOMEM;
-+		goto getdeviceiter_out;
-+	}
++	return has_layout(nfsi) &&
++	       test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->plh_flags);
++}
 +
-+	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
-+	if (res == NULL) {
-+		status = -ENOMEM;
-+		goto getdeviceiter_out;
-+	}
++static inline int pnfs_get_write_status(struct nfs_write_data *data)
++{
++	return data->pdata.pnfs_error;
++}
 +
-+	im->im_type = SPNFS_TYPE_GETDEVICEITER;
-+	im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie;
-+	im->im_args.getdeviceiter_args.verf = gd_res->gd_verf;
++static inline int pnfs_get_read_status(struct nfs_read_data *data)
++{
++	return data->pdata.pnfs_error;
++}
 +
-+	/* call function to queue the msg for upcall */
-+	status = spnfs_upcall(spnfs, im, res);
-+	if (status != 0) {
-+		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
-+		status = -EIO;
-+		goto getdeviceiter_out;
-+	}
-+	status = res->getdeviceiter_res.status;
++static inline struct pnfs_layout_segment *
++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
++{
++	if (fsdata) {
++		struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
 +
-+	if (res->getdeviceiter_res.eof)
-+		gd_res->gd_eof = 1;
-+	else {
-+		gd_res->gd_devid = res->getdeviceiter_res.devid;
-+		gd_res->gd_cookie = res->getdeviceiter_res.cookie;
-+		gd_res->gd_verf = res->getdeviceiter_res.verf;
-+		gd_res->gd_eof = 0;
++		if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin)
++			return ((struct pnfs_fsdata *) fsdata)->lseg;
++		return (struct pnfs_layout_segment *)fsdata;
 +	}
-+
-+getdeviceiter_out:
-+	kfree(im);
-+	kfree(res);
-+
-+	return status;
++	return NULL;
 +}
 +
-+#ifdef CONFIG_SPNFS_TEST
-+/*
-+ * Setup the rq_res xdr_buf.  The svc_rqst rq_respages[1] page contains the
-+ * 1024 encoded stripe indices.
-+ *
-+ * Skip the devaddr4 length and encode the indicies count (1024) in the
-+ * rq_res.head and set the rq_res.head length.
-+ *
-+ * Set the rq_res page_len to 4096 (for the 1024 stripe indices).
-+ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the
-+ * rq_res head to hold the rest of the getdeviceinfo return.
-+ *
-+ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and
-+ * rq_respages[rq_resused] contains the rq_res.pages.
-+ */
-+static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info,
-+				  const struct pnfs_filelayout_device *fdev)
+ #else  /* CONFIG_NFS_V4_1 */
+ 
+ static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
+@@ -193,36 +457,90 @@ static inline void pnfs_destroy_layout(s
+ {
+ }
+ 
++static inline void get_lseg(struct pnfs_layout_segment *lseg)
 +{
-+	struct nfsd4_compoundres *resp = info->resp;
-+	struct svc_rqst *rqstp = resp->rqstp;
-+	struct xdr_buf *xb = &resp->rqstp->rq_res;
-+	__be32 *p;
++}
 +
-+	p = nfsd4_xdr_reserve_space(resp, 8);
-+	p++; /* Fill in length later */
-+	*p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */
-+	resp->p = p;
++static inline void put_lseg(struct pnfs_layout_segment *lseg)
++{
++}
 +
-+	xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base;
-+	xb->pages = &rqstp->rq_respages[rqstp->rq_resused];
-+	xb->page_base = 0;
-+	xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */
-+	xb->tail[0].iov_base = resp->p;
-+	resp->end = xb->head[0].iov_base + PAGE_SIZE;
-+	xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p;
+ static inline struct pnfs_layout_segment *
+ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+-		   enum pnfs_iomode access_type)
++		   loff_t pos, u64 count, enum pnfs_iomode access_type)
+ {
+ 	return NULL;
+ }
+ 
+ static inline bool
+-pnfs_roc(struct inode *ino)
++has_layout(struct nfs_inode *nfsi)
+ {
+ 	return false;
+ }
+ 
+-static inline void
+-pnfs_roc_release(struct inode *ino)
++static inline bool
++layoutcommit_needed(struct nfs_inode *nfsi)
+ {
 +	return 0;
+ }
+ 
+-static inline void
+-pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
++			       struct pnfs_fsdata *fsdata)
+ {
++	return 1;
 +}
-+/*
-+ * Return a stripeindices of length 1024 to test
-+ * the pNFS client multipage getdeviceinfo implementation.
-+ *
-+ * Encode a page of stripe indices.
-+ */
-+static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev,
-+				  struct spnfs_device *dev,
-+				  struct pnfs_devinfo_arg *info)
++
++static inline enum pnfs_try_status
++pnfs_try_to_read_data(struct nfs_read_data *data,
++		      const struct rpc_call_ops *call_ops)
 +{
-+	struct svc_rqst *rqstp = info->xdr.resp->rqstp;
-+	__be32 *p;
-+	int i, j = 0;
++	return PNFS_NOT_ATTEMPTED;
++}
 +
-+	p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]);
-+	fldev->fl_stripeindices_length = 1024;
-+	/* round-robin the data servers device index into the stripe indicie */
-+	for (i = 0; i < 1024; i++) {
-+		*p++ = cpu_to_be32(j);
-+		if (j < dev->dscount - 1)
-+			j++;
-+		else
-+			j = 0;
-+	}
-+	fldev->fl_stripeindices_list = NULL;
++static inline enum pnfs_try_status
++pnfs_try_to_write_data(struct nfs_write_data *data,
++		       const struct rpc_call_ops *call_ops, int how)
++{
++	return PNFS_NOT_ATTEMPTED;
 +}
-+#endif /* CONFIG_SPNFS_TEST */
 +
-+int
-+spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
-+		    u32 layout_type,
-+		    const struct nfsd4_pnfs_deviceid *devid)
++static inline enum pnfs_try_status
++pnfs_try_to_commit(struct nfs_write_data *data,
++		   const struct rpc_call_ops *call_ops, int how)
 +{
-+	struct spnfs *spnfs = global_spnfs;
-+	struct spnfs_msg *im = NULL;
-+	union spnfs_msg_res *res = NULL;
-+	struct spnfs_device *dev;
-+	struct pnfs_filelayout_device *fldev = NULL;
-+	struct pnfs_filelayout_multipath *mp = NULL;
-+	struct pnfs_filelayout_devaddr *fldap = NULL;
-+	int status = 0, i, len;
-+
-+	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+	if (im == NULL) {
-+		status = -ENOMEM;
-+		goto getdeviceinfo_out;
-+	}
++	return PNFS_NOT_ATTEMPTED;
++}
 +
-+	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
-+	if (res == NULL) {
-+		status = -ENOMEM;
-+		goto getdeviceinfo_out;
-+	}
++static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync)
++{
++	return 0;
+ }
+ 
+ static inline bool
+-pnfs_roc_drain(struct inode *ino, u32 *barrier)
++pnfs_ld_layoutret_on_setattr(struct inode *inode)
+ {
+ 	return false;
+ }
+ 
+-static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
++static inline bool pnfs_use_rpc(struct nfs_server *nfss)
++{
++	return true;
++}
 +
-+	im->im_type = SPNFS_TYPE_GETDEVICEINFO;
-+	/* XXX FIX: figure out what to do about fsid */
-+	im->im_args.getdeviceinfo_args.devid = devid->devid;
++static inline int
++pnfs_layout_roc_iomode(struct nfs_inode *nfsi)
++{
++	return 0;
++}
 +
-+	/* call function to queue the msg for upcall */
-+	status = spnfs_upcall(spnfs, im, res);
-+	if (status != 0) {
-+		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
-+		status = -EIO;
-+		goto getdeviceinfo_out;
-+	}
-+	status = res->getdeviceinfo_res.status;
-+	if (status != 0)
-+		goto getdeviceinfo_out;
++static inline int pnfs_return_layout(struct inode *ino,
++				     struct pnfs_layout_range *range,
++				     bool wait)
++{
++	return 0;
++}
 +
-+	dev = &res->getdeviceinfo_res.devinfo;
++static inline void set_pnfs_layoutdriver(struct nfs_server *s, const struct nfs_fh *mntfh, u32 id)
+ {
+ }
+ 
+@@ -230,6 +548,62 @@ static inline void unset_pnfs_layoutdriv
+ {
+ }
+ 
++static inline void pnfs_set_ds_iosize(struct nfs_server *server)
++{
++	server->ds_wsize = server->ds_rsize = -1;
++}
 +
-+	/* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */
-+	fldev = kzalloc(sizeof(struct pnfs_filelayout_device), GFP_KERNEL);
-+	if (fldev == NULL) {
-+		status = -ENOMEM;
-+		goto getdeviceinfo_out;
-+	}
++static inline int pnfs_write_begin(struct file *filp, struct page *page,
++				   loff_t pos, unsigned len,
++				   struct pnfs_layout_segment *lseg,
++				   void **fsdata)
++{
++	*fsdata = NULL;
++	return 0;
++}
 +
-+	/*
-+	 * Stripe count is the same as data server count for our purposes
-+	 */
-+	fldev->fl_stripeindices_length = dev->dscount;
-+	fldev->fl_device_length = dev->dscount;
++static inline int pnfs_write_end(struct file *filp, struct page *page,
++				 loff_t pos, unsigned len, unsigned copied,
++				 struct pnfs_layout_segment *lseg)
++{
++	return 0;
++}
 +
-+	/* Set stripe indices */
-+#ifdef CONFIG_SPNFS_TEST
-+	spnfs_set_test_indices(fldev, dev, info);
-+	fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr;
-+#else /* CONFIG_SPNFS_TEST */
-+	fldev->fl_stripeindices_list =
-+		kmalloc(fldev->fl_stripeindices_length * sizeof(u32),
-+			GFP_KERNEL);
-+	if (fldev->fl_stripeindices_list == NULL) {
-+		status = -ENOMEM;
-+		goto getdeviceinfo_out;
-+	}
-+	for (i = 0; i < fldev->fl_stripeindices_length; i++)
-+		fldev->fl_stripeindices_list[i] = i;
-+#endif /* CONFIG_SPNFS_TEST */
++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
++{
++}
 +
-+	/*
-+	 * Set the device's data server addresses  No multipath for spnfs,
-+	 * so mp length is always 1.
-+	 *
-+	 */
-+	fldev->fl_device_list =
-+		kmalloc(fldev->fl_device_length *
-+			sizeof(struct pnfs_filelayout_multipath),
-+			GFP_KERNEL);
-+	if (fldev->fl_device_list == NULL) {
-+		status = -ENOMEM;
-+		goto getdeviceinfo_out;
-+	}
-+	for (i = 0; i < fldev->fl_device_length; i++) {
-+		mp = &fldev->fl_device_list[i];
-+		mp->fl_multipath_length = 1;
-+		mp->fl_multipath_list =
-+			kmalloc(sizeof(struct pnfs_filelayout_devaddr),
-+				GFP_KERNEL);
-+		if (mp->fl_multipath_list == NULL) {
-+			status = -ENOMEM;
-+			goto getdeviceinfo_out;
-+		}
-+		fldap = mp->fl_multipath_list;
++static inline int pnfs_get_write_status(struct nfs_write_data *data)
++{
++	return 0;
++}
 +
-+		/*
-+		 * Copy the netid into the device address, for example: "tcp"
-+		 */
-+		len = strlen(dev->dslist[i].netid);
-+		fldap->r_netid.data = kmalloc(len, GFP_KERNEL);
-+		if (fldap->r_netid.data == NULL) {
-+			status = -ENOMEM;
-+			goto getdeviceinfo_out;
-+		}
-+		memcpy(fldap->r_netid.data, dev->dslist[i].netid, len);
-+		fldap->r_netid.len = len;
++static inline int pnfs_get_read_status(struct nfs_read_data *data)
++{
++	return 0;
++}
 +
-+		/*
-+		 * Copy the network address into the device address,
-+		 * for example: "10.35.9.16.08.01"
-+		 */
-+		len = strlen(dev->dslist[i].addr);
-+		fldap->r_addr.data = kmalloc(len, GFP_KERNEL);
-+		if (fldap->r_addr.data == NULL) {
-+			status = -ENOMEM;
-+			goto getdeviceinfo_out;
-+		}
-+		memcpy(fldap->r_addr.data, dev->dslist[i].addr, len);
-+		fldap->r_addr.len = len;
-+	}
++static inline void
++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino,
++		      struct nfs_open_context *ctx, struct list_head *pages,
++		      size_t *rsize)
++{
++	pgio->pg_lseg = NULL;
++}
 +
-+	/* encode the device data */
-+	status = filelayout_encode_devinfo(xdr, fldev);
++static inline void
++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino,
++		       size_t *wsize)
++{
++	pgio->pg_lseg = NULL;
++}
 +
-+getdeviceinfo_out:
-+	if (fldev) {
-+		kfree(fldev->fl_stripeindices_list);
-+		if (fldev->fl_device_list) {
-+			for (i = 0; i < fldev->fl_device_length; i++) {
-+				fldap =
-+				    fldev->fl_device_list[i].fl_multipath_list;
-+				kfree(fldap->r_netid.data);
-+				kfree(fldap->r_addr.data);
-+				kfree(fldap);
-+			}
-+			kfree(fldev->fl_device_list);
-+		}
-+		kfree(fldev);
-+	}
++static inline struct pnfs_layout_segment *
++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
++{
++	return NULL;
++}
 +
-+	kfree(im);
-+	kfree(res);
+ #endif /* CONFIG_NFS_V4_1 */
+ 
+ #endif /* FS_NFS_PNFS_H */
+diff -up linux-2.6.37.noarch/fs/nfs/read.c.orig linux-2.6.37.noarch/fs/nfs/read.c
+--- linux-2.6.37.noarch/fs/nfs/read.c.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfs/read.c	2011-01-28 09:43:53.342771448 -0500
+@@ -18,14 +18,17 @@
+ #include <linux/sunrpc/clnt.h>
+ #include <linux/nfs_fs.h>
+ #include <linux/nfs_page.h>
++#include <linux/smp_lock.h>
++#include <linux/module.h>
+ 
+ #include <asm/system.h>
++#include <linux/module.h>
++#include "pnfs.h"
+ 
+ #include "nfs4_fs.h"
+ #include "internal.h"
+ #include "iostat.h"
+ #include "fscache.h"
+-#include "pnfs.h"
+ 
+ #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
+ 
+@@ -117,12 +120,16 @@ int nfs_readpage_async(struct nfs_open_c
+ 	LIST_HEAD(one_request);
+ 	struct nfs_page	*new;
+ 	unsigned int len;
++	loff_t pgoffs;
++	struct pnfs_layout_segment *lseg;
+ 
+ 	len = nfs_page_length(page);
+ 	if (len == 0)
+ 		return nfs_return_empty_page(page);
+-	pnfs_update_layout(inode, ctx, IOMODE_READ);
+-	new = nfs_create_request(ctx, inode, page, 0, len);
++	pgoffs = (loff_t)page->index << PAGE_CACHE_SHIFT;
++	lseg = pnfs_update_layout(inode, ctx, pgoffs, len, IOMODE_READ);
++	new = nfs_create_request(ctx, inode, page, 0, len, lseg);
++	put_lseg(lseg);
+ 	if (IS_ERR(new)) {
+ 		unlock_page(page);
+ 		return PTR_ERR(new);
+@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct 
+ 	nfs_release_request(req);
+ }
+ 
+-/*
+- * Set up the NFS read request struct
+- */
+-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+-		const struct rpc_call_ops *call_ops,
+-		unsigned int count, unsigned int offset)
++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
++		      const struct rpc_call_ops *call_ops)
+ {
+-	struct inode *inode = req->wb_context->path.dentry->d_inode;
++	struct inode *inode = data->inode;
+ 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
+ 	struct rpc_task *task;
+ 	struct rpc_message msg = {
+ 		.rpc_argp = &data->args,
+ 		.rpc_resp = &data->res,
+-		.rpc_cred = req->wb_context->cred,
++		.rpc_cred = data->cred,
+ 	};
+ 	struct rpc_task_setup task_setup_data = {
+ 		.task = &data->task,
+-		.rpc_client = NFS_CLIENT(inode),
++		.rpc_client = clnt,
+ 		.rpc_message = &msg,
+ 		.callback_ops = call_ops,
+ 		.callback_data = data,
+@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_
+ 		.flags = RPC_TASK_ASYNC | swap_flags,
+ 	};
+ 
++	/* Set up the initial task struct. */
++	NFS_PROTO(inode)->read_setup(data, &msg);
 +
-+	return status;
-+}
++	dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
++			data->task.tk_pid,
++			inode->i_sb->s_id,
++			(long long)NFS_FILEID(inode),
++			data->args.count,
++			(unsigned long long)data->args.offset);
 +
-+int
-+spnfs_setattr(void)
-+{
++	task = rpc_run_task(&task_setup_data);
++	if (IS_ERR(task))
++		return PTR_ERR(task);
++	rpc_put_task(task);
 +	return 0;
 +}
++EXPORT_SYMBOL(nfs_initiate_read);
 +
-+int
-+spnfs_open(struct inode *inode, struct nfsd4_open *open)
++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
++		       const struct rpc_call_ops *call_ops)
 +{
-+	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
-+	struct spnfs_msg *im = NULL;
-+	union spnfs_msg_res *res = NULL;
-+	int status = 0;
-+
-+	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+	if (im == NULL) {
-+		status = -ENOMEM;
-+		goto open_out;
-+	}
-+
-+	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
-+	if (res == NULL) {
-+		status = -ENOMEM;
-+		goto open_out;
-+	}
-+
-+	im->im_type = SPNFS_TYPE_OPEN;
-+	im->im_args.open_args.inode = inode->i_ino;
-+	im->im_args.open_args.generation = inode->i_generation;
-+	im->im_args.open_args.create = open->op_create;
-+	im->im_args.open_args.createmode = open->op_createmode;
-+	im->im_args.open_args.truncate = open->op_truncate;
-+
-+	/* call function to queue the msg for upcall */
-+	status = spnfs_upcall(spnfs, im, res);
-+	if (status != 0) {
-+		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
-+		status = -EIO;
-+		goto open_out;
-+	}
-+	status = res->open_res.status;
-+
-+open_out:
-+	kfree(im);
-+	kfree(res);
-+
-+	return status;
-+}
++	if (data->req->wb_lseg &&
++	    (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
++		return pnfs_get_read_status(data);
 +
-+int
-+spnfs_create(void)
-+{
-+	return 0;
++	return nfs_initiate_read(data, clnt, call_ops);
 +}
 +
 +/*
-+ * Invokes the spnfsd with the inode number of the object to remove.
-+ * The file has already been removed on the MDS, so all the spnsfd
-+ * daemon does is remove the stripes.
-+ * Returns 0 on success otherwise error code
++ * Set up the NFS read request struct
 + */
-+int
-+spnfs_remove(unsigned long ino, unsigned long generation)
++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
++		const struct rpc_call_ops *call_ops,
++		unsigned int count, unsigned int offset)
 +{
-+	struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
-+	struct spnfs_msg *im = NULL;
-+	union spnfs_msg_res *res = NULL;
-+	int status = 0;
-+
-+	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+	if (im == NULL) {
-+		status = -ENOMEM;
-+		goto remove_out;
-+	}
++	struct inode *inode = req->wb_context->path.dentry->d_inode;
 +
-+	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
-+	if (res == NULL) {
-+		status = -ENOMEM;
-+		goto remove_out;
+ 	data->req	  = req;
+ 	data->inode	  = inode;
+-	data->cred	  = msg.rpc_cred;
++	data->cred	  = req->wb_context->cred;
+ 
+ 	data->args.fh     = NFS_FH(inode);
+ 	data->args.offset = req_offset(req) + offset;
+@@ -197,21 +237,7 @@ static int nfs_read_rpcsetup(struct nfs_
+ 	data->res.eof     = 0;
+ 	nfs_fattr_init(&data->fattr);
+ 
+-	/* Set up the initial task struct. */
+-	NFS_PROTO(inode)->read_setup(data, &msg);
+-
+-	dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+-			data->task.tk_pid,
+-			inode->i_sb->s_id,
+-			(long long)NFS_FILEID(inode),
+-			count,
+-			(unsigned long long)data->args.offset);
+-
+-	task = rpc_run_task(&task_setup_data);
+-	if (IS_ERR(task))
+-		return PTR_ERR(task);
+-	rpc_put_task(task);
+-	return 0;
++	return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
+ }
+ 
+ static void
+@@ -355,7 +381,14 @@ static void nfs_readpage_retry(struct rp
+ {
+ 	struct nfs_readargs *argp = &data->args;
+ 	struct nfs_readres *resp = &data->res;
++	struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client;
+ 
++#ifdef CONFIG_NFS_V4_1
++	if (data->fldata.ds_nfs_client) {
++		dprintk("%s DS read\n", __func__);
++		clp = data->fldata.ds_nfs_client;
 +	}
-+
-+	im->im_type = SPNFS_TYPE_REMOVE;
-+	im->im_args.remove_args.inode = ino;
-+	im->im_args.remove_args.generation = generation;
-+
-+	/* call function to queue the msg for upcall */
-+	status = spnfs_upcall(spnfs, im, res);
-+	if (status != 0) {
-+		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
-+		status = -EIO;
-+		goto remove_out;
++#endif /* CONFIG_NFS_V4_1 */
+ 	if (resp->eof || resp->count == argp->count)
+ 		return;
+ 
+@@ -369,7 +402,10 @@ static void nfs_readpage_retry(struct rp
+ 	argp->offset += resp->count;
+ 	argp->pgbase += resp->count;
+ 	argp->count -= resp->count;
+-	nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
++#ifdef CONFIG_NFS_V4_1
++	data->pdata.pnfs_error = -EAGAIN;
++#endif /* CONFIG_NFS_V4_1 */
++	nfs_restart_rpc(task, clp);
+ }
+ 
+ /*
+@@ -410,13 +446,19 @@ static void nfs_readpage_release_partial
+ void nfs_read_prepare(struct rpc_task *task, void *calldata)
+ {
+ 	struct nfs_read_data *data = calldata;
++	struct nfs4_session *ds_session = NULL;
+ 
+-	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
++	if (data->fldata.ds_nfs_client) {
++		dprintk("%s DS read\n", __func__);
++		ds_session = data->fldata.ds_nfs_client->cl_session;
 +	}
-+	status = res->remove_res.status;
-+
-+remove_out:
-+	kfree(im);
-+	kfree(res);
-+
-+	return status;
-+}
-+
-+static int
-+read_one(struct inode *inode, loff_t offset, size_t len, char *buf,
-+	 struct file **filp)
-+{
-+	loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
-+	size_t iolen;
-+	int completed = 0, ds, err;
-+
-+	while (len > 0) {
-+		tmp = offset;
-+		soff = do_div(tmp, spnfs_config->stripe_size);
-+		snum = tmp;
-+		ds = do_div(tmp, spnfs_config->num_ds);
-+		if (spnfs_config->dense_striping == 0)
-+			soffset = offset;
-+		else {
-+			tmp = snum;
-+			do_div(tmp, spnfs_config->num_ds);
-+			soffset = tmp * spnfs_config->stripe_size + soff;
-+		}
-+		if (len < spnfs_config->stripe_size - soff)
-+			iolen = len;
-+		else
-+			iolen = spnfs_config->stripe_size - soff;
-+
-+		pos = soffset;
-+		err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos);
-+		if (err < 0)
-+			return -EIO;
-+		if (err == 0)
-+			break;
-+		filp[ds]->f_pos = pos;
-+		iolen = err;
-+		completed += iolen;
-+		len -= iolen;
-+		offset += iolen;
-+		bufoffset += iolen;
++	if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
+ 				&data->args.seq_args, &data->res.seq_res,
+ 				0, task))
+ 		return;
+ 	rpc_call_start(task);
+ }
++EXPORT_SYMBOL(nfs_read_prepare);
+ #endif /* CONFIG_NFS_V4_1 */
+ 
+ static const struct rpc_call_ops nfs_read_partial_ops = {
+@@ -569,7 +611,20 @@ readpage_async_filler(void *data, struct
+ 	if (len == 0)
+ 		return nfs_return_empty_page(page);
+ 
+-	new = nfs_create_request(desc->ctx, inode, page, 0, len);
++	if (desc->pgio->pg_lseg) {
++		loff_t pgoff = (loff_t)page->index << PAGE_CACHE_SHIFT;
++		struct pnfs_layout_range *range = &desc->pgio->pg_lseg->range;
++
++		/* retry later with the right lseg? */
++		if (range->offset > pgoff + len ||
++		    range->offset + range->length < pgoff) {
++			new = ERR_PTR(-EAGAIN);
++			goto out_error;
++		}
 +	}
 +
-+	return completed;
++	new = nfs_create_request(desc->ctx, inode, page, 0, len,
++				 desc->pgio->pg_lseg);
+ 	if (IS_ERR(new))
+ 		goto out_error;
+ 
+@@ -625,7 +680,7 @@ int nfs_readpages(struct file *filp, str
+ 	if (ret == 0)
+ 		goto read_complete; /* all pages were read */
+ 
+-	pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
++	pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize);
+ 	if (rsize < PAGE_CACHE_SIZE)
+ 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
+ 	else
+@@ -634,6 +689,7 @@ int nfs_readpages(struct file *filp, str
+ 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
+ 
+ 	nfs_pageio_complete(&pgio);
++	put_lseg(pgio.pg_lseg);
+ 	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ 	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+ read_complete:
+diff -up linux-2.6.37.noarch/fs/nfs/super.c.orig linux-2.6.37.noarch/fs/nfs/super.c
+--- linux-2.6.37.noarch/fs/nfs/super.c.orig	2011-01-28 09:37:32.551979635 -0500
++++ linux-2.6.37.noarch/fs/nfs/super.c	2011-01-28 09:43:53.343771315 -0500
+@@ -63,6 +63,7 @@
+ #include "iostat.h"
+ #include "internal.h"
+ #include "fscache.h"
++#include "pnfs.h"
+ 
+ #define NFSDBG_FACILITY		NFSDBG_VFS
+ 
+@@ -725,6 +726,28 @@ static int nfs_show_options(struct seq_f
+ 
+ 	return 0;
+ }
++#ifdef CONFIG_NFS_V4_1
++void show_sessions(struct seq_file *m, struct nfs_server *server)
++{
++	if (nfs4_has_session(server->nfs_client))
++		seq_printf(m, ",sessions");
 +}
++#else
++void show_sessions(struct seq_file *m, struct nfs_server *server) {}
++#endif
 +
-+static __be32
-+read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
-+     struct svc_rqst *rqstp)
++#ifdef CONFIG_NFS_V4_1
++void show_pnfs(struct seq_file *m, struct nfs_server *server)
 +{
-+	int i, vnum, err, bytecount = 0;
-+	char path[128];
-+	struct file *filp[SPNFS_MAX_DATA_SERVERS];
-+	size_t iolen;
-+	__be32 status = nfs_ok;
++	seq_printf(m, ",pnfs=");
++	if (server->pnfs_curr_ld)
++		seq_printf(m, "%s", server->pnfs_curr_ld->name);
++	else
++		seq_printf(m, "not configured");
++}
++#else  /* CONFIG_NFS_V4_1 */
++void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
++#endif /* CONFIG_NFS_V4_1 */
+ 
+ /*
+  * Present statistical information for this VFS mountpoint
+@@ -763,6 +786,8 @@ static int nfs_show_stats(struct seq_fil
+ 		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
+ 		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+ 		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
++		show_sessions(m, nfss);
++		show_pnfs(m, nfss);
+ 	}
+ #endif
+ 
+diff -up linux-2.6.37.noarch/fs/nfs/unlink.c.orig linux-2.6.37.noarch/fs/nfs/unlink.c
+--- linux-2.6.37.noarch/fs/nfs/unlink.c.orig	2011-01-28 09:37:32.552979600 -0500
++++ linux-2.6.37.noarch/fs/nfs/unlink.c	2011-01-28 09:43:53.344771185 -0500
+@@ -113,7 +113,7 @@ void nfs_unlink_prepare(struct rpc_task 
+ 	struct nfs_unlinkdata *data = calldata;
+ 	struct nfs_server *server = NFS_SERVER(data->dir);
+ 
+-	if (nfs4_setup_sequence(server, &data->args.seq_args,
++	if (nfs4_setup_sequence(server, NULL, &data->args.seq_args,
+ 				&data->res.seq_res, 1, task))
+ 		return;
+ 	rpc_call_start(task);
+@@ -388,7 +388,7 @@ static void nfs_rename_prepare(struct rp
+ 	struct nfs_renamedata *data = calldata;
+ 	struct nfs_server *server = NFS_SERVER(data->old_dir);
+ 
+-	if (nfs4_setup_sequence(server, &data->args.seq_args,
++	if (nfs4_setup_sequence(server, NULL, &data->args.seq_args,
+ 				&data->res.seq_res, 1, task))
+ 		return;
+ 	rpc_call_start(task);
+diff -up linux-2.6.37.noarch/fs/nfs/write.c.orig linux-2.6.37.noarch/fs/nfs/write.c
+--- linux-2.6.37.noarch/fs/nfs/write.c.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/fs/nfs/write.c	2011-01-28 09:43:53.345771055 -0500
+@@ -28,6 +28,7 @@
+ #include "iostat.h"
+ #include "nfs4_fs.h"
+ #include "fscache.h"
++#include "pnfs.h"
+ 
+ #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
+ 
+@@ -58,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_al
+ 	}
+ 	return p;
+ }
++EXPORT_SYMBOL(nfs_commitdata_alloc);
+ 
+ void nfs_commit_free(struct nfs_write_data *p)
+ {
+@@ -426,6 +428,17 @@ static void nfs_inode_remove_request(str
+ 		spin_unlock(&inode->i_lock);
+ 	nfs_release_request(req);
+ }
++static void
++nfs_mark_request_nopnfs(struct nfs_page *req)
++{
++	struct pnfs_layout_segment *lseg = req->wb_lseg;
 +
-+	/*
-+	 * XXX We should just be doing this at open time, but it gets
-+	 * kind of messy storing this info in nfsd's state structures
-+	 * and piggybacking its path through the various state handling
-+	 * functions.  Revisit this.
-+	 */
-+	memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
-+	for (i = 0; i < spnfs_config->num_ds; i++) {
-+		sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i],
-+			inode->i_ino, inode->i_generation);
-+		filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0);
-+		if (filp[i] == NULL) {
-+			status = nfserr_io;
-+			goto read_out;
-+		}
-+		get_file(filp[i]);
-+	}
++	if (req->wb_lseg == NULL)
++		return;
++	req->wb_lseg = NULL;
++	put_lseg(lseg);
++	dprintk(" retry through MDS\n");
++}
+ 
+ static void
+ nfs_mark_request_dirty(struct nfs_page *req)
+@@ -531,7 +544,7 @@ nfs_need_commit(struct nfs_inode *nfsi)
+  * The requests are *not* checked to ensure that they form a contiguous set.
+  */
+ static int
+-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs)
+ {
+ 	struct nfs_inode *nfsi = NFS_I(inode);
+ 	int ret;
+@@ -539,7 +552,8 @@ nfs_scan_commit(struct inode *inode, str
+ 	if (!nfs_need_commit(nfsi))
+ 		return 0;
+ 
+-	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
++	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT,
++			    use_pnfs);
+ 	if (ret > 0)
+ 		nfsi->ncommit -= ret;
+ 	if (nfs_need_commit(NFS_I(inode)))
+@@ -568,7 +582,8 @@ static inline int nfs_scan_commit(struct
+ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
+ 		struct page *page,
+ 		unsigned int offset,
+-		unsigned int bytes)
++		unsigned int bytes,
++		struct pnfs_layout_segment *lseg)
+ {
+ 	struct nfs_page *req;
+ 	unsigned int rqend;
+@@ -593,8 +608,8 @@ static struct nfs_page *nfs_try_to_updat
+ 		 * Note: nfs_flush_incompatible() will already
+ 		 * have flushed out requests having wrong owners.
+ 		 */
+-		if (offset > rqend
+-		    || end < req->wb_offset)
++		if (offset > rqend || end < req->wb_offset ||
++		    req->wb_lseg != lseg)
+ 			goto out_flushme;
+ 
+ 		if (nfs_set_page_tag_locked(req))
+@@ -642,16 +657,17 @@ out_err:
+  * already called nfs_flush_incompatible() if necessary.
+  */
+ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
+-		struct page *page, unsigned int offset, unsigned int bytes)
++		struct page *page, unsigned int offset, unsigned int bytes,
++		struct pnfs_layout_segment *lseg)
+ {
+ 	struct inode *inode = page->mapping->host;
+ 	struct nfs_page	*req;
+ 	int error;
+ 
+-	req = nfs_try_to_update_request(inode, page, offset, bytes);
++	req = nfs_try_to_update_request(inode, page, offset, bytes, lseg);
+ 	if (req != NULL)
+ 		goto out;
+-	req = nfs_create_request(ctx, inode, page, offset, bytes);
++	req = nfs_create_request(ctx, inode, page, offset, bytes, lseg);
+ 	if (IS_ERR(req))
+ 		goto out;
+ 	error = nfs_inode_add_request(inode, req);
+@@ -664,23 +680,27 @@ out:
+ }
+ 
+ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
+-		unsigned int offset, unsigned int count)
++			       unsigned int offset, unsigned int count,
++			       struct pnfs_layout_segment *lseg,
++			       void *fsdata)
+ {
+ 	struct nfs_page	*req;
+ 
+-	req = nfs_setup_write_request(ctx, page, offset, count);
++	req = nfs_setup_write_request(ctx, page, offset, count, lseg);
+ 	if (IS_ERR(req))
+ 		return PTR_ERR(req);
+ 	nfs_mark_request_dirty(req);
+ 	/* Update file length */
+-	nfs_grow_file(page, offset, count);
++	if (pnfs_grow_ok(lseg, fsdata))
++		nfs_grow_file(page, offset, count);
+ 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+ 	nfs_mark_request_dirty(req);
+ 	nfs_clear_page_tag_locked(req);
+ 	return 0;
+ }
+ 
+-int nfs_flush_incompatible(struct file *file, struct page *page)
++int nfs_flush_incompatible(struct file *file, struct page *page,
++			   struct pnfs_layout_segment *lseg)
+ {
+ 	struct nfs_open_context *ctx = nfs_file_open_context(file);
+ 	struct nfs_page	*req;
+@@ -699,7 +719,8 @@ int nfs_flush_incompatible(struct file *
+ 			return 0;
+ 		do_flush = req->wb_page != page || req->wb_context != ctx ||
+ 			req->wb_lock_context->lockowner != current->files ||
+-			req->wb_lock_context->pid != current->tgid;
++			req->wb_lock_context->pid != current->tgid ||
++			req->wb_lseg != lseg;
+ 		nfs_release_request(req);
+ 		if (!do_flush)
+ 			return 0;
+@@ -726,7 +747,8 @@ static int nfs_write_pageuptodate(struct
+  * things with a page scheduled for an RPC call (e.g. invalidate it).
+  */
+ int nfs_updatepage(struct file *file, struct page *page,
+-		unsigned int offset, unsigned int count)
++		   unsigned int offset, unsigned int count,
++		   struct pnfs_layout_segment *lseg, void *fsdata)
+ {
+ 	struct nfs_open_context *ctx = nfs_file_open_context(file);
+ 	struct inode	*inode = page->mapping->host;
+@@ -751,7 +773,7 @@ int nfs_updatepage(struct file *file, st
+ 		offset = 0;
+ 	}
+ 
+-	status = nfs_writepage_setup(ctx, page, offset, count);
++	status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata);
+ 	if (status < 0)
+ 		nfs_set_pageerror(page);
+ 
+@@ -781,25 +803,21 @@ static int flush_task_priority(int how)
+ 	return RPC_PRIORITY_NORMAL;
+ }
+ 
+-/*
+- * Set up the argument/result storage required for the RPC call.
+- */
+-static int nfs_write_rpcsetup(struct nfs_page *req,
+-		struct nfs_write_data *data,
+-		const struct rpc_call_ops *call_ops,
+-		unsigned int count, unsigned int offset,
+-		int how)
++int nfs_initiate_write(struct nfs_write_data *data,
++		       struct rpc_clnt *clnt,
++		       const struct rpc_call_ops *call_ops,
++		       int how)
+ {
+-	struct inode *inode = req->wb_context->path.dentry->d_inode;
++	struct inode *inode = data->inode;
+ 	int priority = flush_task_priority(how);
+ 	struct rpc_task *task;
+ 	struct rpc_message msg = {
+ 		.rpc_argp = &data->args,
+ 		.rpc_resp = &data->res,
+-		.rpc_cred = req->wb_context->cred,
++		.rpc_cred = data->cred,
+ 	};
+ 	struct rpc_task_setup task_setup_data = {
+-		.rpc_client = NFS_CLIENT(inode),
++		.rpc_client = clnt,
+ 		.task = &data->task,
+ 		.rpc_message = &msg,
+ 		.callback_ops = call_ops,
+@@ -810,12 +828,62 @@ static int nfs_write_rpcsetup(struct nfs
+ 	};
+ 	int ret = 0;
+ 
++	/* Set up the initial task struct.  */
++	NFS_PROTO(inode)->write_setup(data, &msg);
 +
-+	for (vnum = 0 ; vnum < vlen ; vnum++) {
-+		iolen = rqstp->rq_vec[vnum].iov_len;
-+		err = read_one(inode, offset + bytecount, iolen,
-+			       (char *)rqstp->rq_vec[vnum].iov_base, filp);
-+		if (err < 0) {
-+			status = nfserr_io;
-+			goto read_out;
-+		}
-+		if (err < iolen) {
-+			bytecount += err;
-+			goto read_out;
-+		}
-+		bytecount += rqstp->rq_vec[vnum].iov_len;
-+	}
++	dprintk("NFS: %5u initiated write call "
++		"(req %s/%lld, %u bytes @ offset %llu)\n",
++		data->task.tk_pid,
++		inode->i_sb->s_id,
++		(long long)NFS_FILEID(inode),
++		data->args.count,
++		(unsigned long long)data->args.offset);
 +
-+read_out:
-+	*lenp = bytecount;
-+	for (i = 0; i < spnfs_config->num_ds; i++) {
-+		if (filp[i]) {
-+			filp_close(filp[i], current->files);
-+			fput(filp[i]);
-+		}
++	task = rpc_run_task(&task_setup_data);
++	if (IS_ERR(task)) {
++		ret = PTR_ERR(task);
++		goto out;
 +	}
-+	return status;
-+}
-+
-+__be32
-+spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
-+	   struct svc_rqst *rqstp)
-+{
-+	if (spnfs_config)
-+		return read(inode, offset, lenp, vlen, rqstp);
-+	else {
-+		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
-+		return nfserr_notsupp;
++	if (how & FLUSH_SYNC) {
++		ret = rpc_wait_for_completion_task(task);
++		if (ret == 0)
++			ret = task->tk_status;
 +	}
++	rpc_put_task(task);
++out:
++	return ret;
 +}
++EXPORT_SYMBOL(nfs_initiate_write);
 +
-+static int
-+write_one(struct inode *inode, loff_t offset, size_t len, char *buf,
-+	  struct file **filp)
++int pnfs_initiate_write(struct nfs_write_data *data,
++			struct rpc_clnt *clnt,
++			const struct rpc_call_ops *call_ops,
++			int how)
 +{
-+	loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
-+	size_t iolen;
-+	int completed = 0, ds, err;
-+
-+	while (len > 0) {
-+		tmp = offset;
-+		soff = do_div(tmp, spnfs_config->stripe_size);
-+		snum = tmp;
-+		ds = do_div(tmp, spnfs_config->num_ds);
-+		if (spnfs_config->dense_striping == 0)
-+			soffset = offset;
-+		else {
-+			tmp = snum;
-+			do_div(tmp, spnfs_config->num_ds);
-+			soffset = tmp * spnfs_config->stripe_size + soff;
-+		}
-+		if (len < spnfs_config->stripe_size - soff)
-+			iolen = len;
-+		else
-+			iolen = spnfs_config->stripe_size - soff;
-+
-+		pos = soffset;
-+		err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos);
-+		if (err < 0)
-+			return -EIO;
-+		filp[ds]->f_pos = pos;
-+		iolen = err;
-+		completed += iolen;
-+		len -= iolen;
-+		offset += iolen;
-+		bufoffset += iolen;
-+	}
++	if (data->req->wb_lseg &&
++	    (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
++		return pnfs_get_write_status(data);
 +
-+	return completed;
++	return nfs_initiate_write(data, clnt, call_ops, how);
 +}
 +
-+static __be32
-+write(struct inode *inode, loff_t offset, size_t len, int vlen,
-+      struct svc_rqst *rqstp)
++/*
++ * Set up the argument/result storage required for the RPC call.
++ */
++static int nfs_write_rpcsetup(struct nfs_page *req,
++		struct nfs_write_data *data,
++		const struct rpc_call_ops *call_ops,
++		unsigned int count, unsigned int offset,
++		int how)
 +{
-+	int i, vnum, err, bytecount = 0;
-+	char path[128];
-+	struct file *filp[SPNFS_MAX_DATA_SERVERS];
-+	size_t iolen;
-+	__be32 status = nfs_ok;
++	struct inode *inode = req->wb_context->path.dentry->d_inode;
 +
-+	/*
-+	 * XXX We should just be doing this at open time, but it gets
-+	 * kind of messy storing this info in nfsd's state structures
-+	 * and piggybacking its path through the various state handling
-+	 * functions.  Revisit this.
-+	 */
-+	memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
-+	for (i = 0; i < spnfs_config->num_ds; i++) {
-+		sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i],
-+			inode->i_ino, inode->i_generation);
-+		filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0);
-+		if (filp[i] == NULL) {
-+			status = nfserr_io;
-+			goto write_out;
-+		}
-+		get_file(filp[i]);
-+	}
+ 	/* Set up the RPC argument and reply structs
+ 	 * NB: take care not to mess about with data->commit et al. */
+ 
+ 	data->req = req;
+ 	data->inode = inode = req->wb_context->path.dentry->d_inode;
+-	data->cred = msg.rpc_cred;
++	data->cred = req->wb_context->cred;
+ 
+ 	data->args.fh     = NFS_FH(inode);
+ 	data->args.offset = req_offset(req) + offset;
+@@ -836,30 +904,7 @@ static int nfs_write_rpcsetup(struct nfs
+ 	data->res.verf    = &data->verf;
+ 	nfs_fattr_init(&data->fattr);
+ 
+-	/* Set up the initial task struct.  */
+-	NFS_PROTO(inode)->write_setup(data, &msg);
+-
+-	dprintk("NFS: %5u initiated write call "
+-		"(req %s/%lld, %u bytes @ offset %llu)\n",
+-		data->task.tk_pid,
+-		inode->i_sb->s_id,
+-		(long long)NFS_FILEID(inode),
+-		count,
+-		(unsigned long long)data->args.offset);
+-
+-	task = rpc_run_task(&task_setup_data);
+-	if (IS_ERR(task)) {
+-		ret = PTR_ERR(task);
+-		goto out;
+-	}
+-	if (how & FLUSH_SYNC) {
+-		ret = rpc_wait_for_completion_task(task);
+-		if (ret == 0)
+-			ret = task->tk_status;
+-	}
+-	rpc_put_task(task);
+-out:
+-	return ret;
++	return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
+ }
+ 
+ /* If a nfs_flush_* function fails, it should remove reqs from @head and
+@@ -870,6 +915,7 @@ static void nfs_redirty_request(struct n
+ {
+ 	struct page *page = req->wb_page;
+ 
++	nfs_mark_request_nopnfs(req);
+ 	nfs_mark_request_dirty(req);
+ 	nfs_clear_page_tag_locked(req);
+ 	nfs_end_page_writeback(page);
+@@ -982,6 +1028,8 @@ static void nfs_pageio_init_write(struct
+ {
+ 	size_t wsize = NFS_SERVER(inode)->wsize;
+ 
++	pnfs_pageio_init_write(pgio, inode, &wsize);
 +
-+	for (vnum = 0; vnum < vlen; vnum++) {
-+		iolen = rqstp->rq_vec[vnum].iov_len;
-+		err = write_one(inode, offset + bytecount, iolen,
-+				(char *)rqstp->rq_vec[vnum].iov_base, filp);
-+		if (err != iolen) {
-+			dprintk("spnfs_write: err=%d expected %Zd\n", err, len);
-+			status = nfserr_io;
-+			goto write_out;
-+		}
-+		bytecount += rqstp->rq_vec[vnum].iov_len;
+ 	if (wsize < PAGE_CACHE_SIZE)
+ 		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
+ 	else
+@@ -1047,13 +1095,27 @@ out:
+ void nfs_write_prepare(struct rpc_task *task, void *calldata)
+ {
+ 	struct nfs_write_data *data = calldata;
++	struct nfs4_session *ds_session = NULL;
+ 
+-	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
++	if (data->fldata.ds_nfs_client) {
++		dprintk("%s DS write\n", __func__);
++		ds_session = data->fldata.ds_nfs_client->cl_session;
++	} else if (data->args.count > NFS_SERVER(data->inode)->wsize) {
++		/* no DS client (retry via MDS?): clamp count to server wsize */
++		data->pdata.orig_count = data->args.count;
++		data->args.count = NFS_SERVER(data->inode)->wsize;
++		dprintk("%s: trimmed count %u to wsize %u\n", __func__,
++			data->pdata.orig_count, data->args.count);
++	} else
++		data->pdata.orig_count = 0;	/* count was not trimmed */
++
++	if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
+ 				&data->args.seq_args,
+ 				&data->res.seq_res, 1, task))
+ 		return;
+ 	rpc_call_start(task);
+ }
++EXPORT_SYMBOL(nfs_write_prepare);
+ #endif /* CONFIG_NFS_V4_1 */
+ 
+ static const struct rpc_call_ops nfs_write_partial_ops = {
+@@ -1137,10 +1199,11 @@ int nfs_writeback_done(struct rpc_task *
+ 	struct nfs_writeargs	*argp = &data->args;
+ 	struct nfs_writeres	*resp = &data->res;
+ 	struct nfs_server	*server = NFS_SERVER(data->inode);
++	struct nfs_client	*clp = server->nfs_client;
+ 	int status;
+ 
+-	dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
+-		task->tk_pid, task->tk_status);
++	dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n",
++		task->tk_pid, task->tk_status, resp->count);
+ 
+ 	/*
+ 	 * ->write_done will attempt to use post-op attributes to detect
+@@ -1153,6 +1216,13 @@ int nfs_writeback_done(struct rpc_task *
+ 	if (status != 0)
+ 		return status;
+ 	nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
++#ifdef CONFIG_NFS_V4_1
++	/* Is this a DS session */
++	if (data->fldata.ds_nfs_client) {
++		dprintk("%s DS write\n", __func__);
++		clp = data->fldata.ds_nfs_client;
 +	}
++#endif /* CONFIG_NFS_V4_1 */
+ 
+ #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+ 	if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
+@@ -1169,7 +1239,7 @@ int nfs_writeback_done(struct rpc_task *
+ 		if (time_before(complain, jiffies)) {
+ 			dprintk("NFS:       faulty NFS server %s:"
+ 				" (committed = %d) != (stable = %d)\n",
+-				server->nfs_client->cl_hostname,
++				clp->cl_hostname,
+ 				resp->verf->committed, argp->stable);
+ 			complain = jiffies + 300 * HZ;
+ 		}
+@@ -1195,6 +1265,9 @@ int nfs_writeback_done(struct rpc_task *
+ 				 */
+ 				argp->stable = NFS_FILE_SYNC;
+ 			}
++#ifdef CONFIG_NFS_V4_1
++			data->pdata.pnfs_error = -EAGAIN;
++#endif /* CONFIG_NFS_V4_1 */
+ 			nfs_restart_rpc(task, server->nfs_client);
+ 			return -EAGAIN;
+ 		}
+@@ -1239,40 +1312,73 @@ static void nfs_commitdata_release(void 
+ 	nfs_commit_free(wdata);
+ }
+ 
+-/*
+- * Set up the argument/result storage required for the RPC call.
+- */
+-static int nfs_commit_rpcsetup(struct list_head *head,
+-		struct nfs_write_data *data,
+-		int how)
++int nfs_initiate_commit(struct nfs_write_data *data,
++			struct rpc_clnt *clnt,
++			const struct rpc_call_ops *call_ops,
++			int how)
+ {
+-	struct nfs_page *first = nfs_list_entry(head->next);
+-	struct inode *inode = first->wb_context->path.dentry->d_inode;
++	struct inode *inode = data->inode;
+ 	int priority = flush_task_priority(how);
+ 	struct rpc_task *task;
+ 	struct rpc_message msg = {
+ 		.rpc_argp = &data->args,
+ 		.rpc_resp = &data->res,
+-		.rpc_cred = first->wb_context->cred,
++		.rpc_cred = data->cred,
+ 	};
+ 	struct rpc_task_setup task_setup_data = {
+ 		.task = &data->task,
+-		.rpc_client = NFS_CLIENT(inode),
++		.rpc_client = clnt,
+ 		.rpc_message = &msg,
+-		.callback_ops = &nfs_commit_ops,
++		.callback_ops = call_ops,
+ 		.callback_data = data,
+ 		.workqueue = nfsiod_workqueue,
+ 		.flags = RPC_TASK_ASYNC,
+ 		.priority = priority,
+ 	};
+ 
++	/* Set up the initial task struct.  */
++	NFS_PROTO(inode)->commit_setup(data, &msg);
 +
-+write_out:
-+	for (i = 0; i < spnfs_config->num_ds; i++) {
-+		if (filp[i]) {
-+			filp_close(filp[i], current->files);
-+			fput(filp[i]);
-+		}
-+	}
++	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 +
-+	return status;
++	task = rpc_run_task(&task_setup_data);
++	if (IS_ERR(task))
++		return PTR_ERR(task);
++	rpc_put_task(task);
++	return 0;
 +}
++EXPORT_SYMBOL(nfs_initiate_commit);
 +
-+__be32
-+spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen,
-+	    struct svc_rqst *rqstp)
-+{
-+	if (spnfs_config)
-+		return write(inode, offset, len, vlen, rqstp);
-+	else {
-+		printk(KERN_ERR "Please upgrade to latest spnfsd\n");
-+		return nfserr_notsupp;
-+	}
-+}
 +
-+int
-+spnfs_commit(void)
++int pnfs_initiate_commit(struct nfs_write_data *data,
++			 struct rpc_clnt *clnt,
++			 const struct rpc_call_ops *call_ops,
++			 int how, int pnfs)
 +{
-+	return 0;
-+}
++	if (pnfs &&
++	    (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED))
++		return pnfs_get_write_status(data);
 +
-+/*
-+ * Return the state for this object.
-+ * At this time simply return 0 to indicate success and use the existing state
-+ */
-+int
-+spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg)
-+{
-+	return 0;
++	return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how);
 +}
 +
 +/*
-+ * Return the filehandle for the specified file descriptor
++ * Set up the argument/result storage required for the RPC call.
 + */
-+int
-+spnfs_getfh(int fd, struct nfs_fh *fh)
++static int nfs_commit_rpcsetup(struct list_head *head,
++		struct nfs_write_data *data,
++		int how, int pnfs)
 +{
-+	struct file *file;
-+
-+	file = fget(fd);
-+	if (file == NULL)
-+		return -EIO;
++	struct nfs_page *first = nfs_list_entry(head->next);
++	struct inode *inode = first->wb_context->path.dentry->d_inode;
 +
-+	memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh));
-+	fput(file);
-+	return 0;
-+}
-diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
-index 322518c..2536d06 100644
---- a/fs/nfsd/state.h
-+++ b/fs/nfsd/state.h
-@@ -241,6 +241,12 @@ struct nfs4_client {
- 	u32			cl_cb_seq_nr;
- 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
- 						/* wait here for slots */
-+#if defined(CONFIG_PNFSD)
-+	struct list_head	cl_layouts;	/* outstanding layouts */
-+	struct list_head	cl_layoutrecalls; /* outstanding layoutrecall
-+						     callbacks */
-+	atomic_t		cl_deviceref;	/* Num outstanding devs */
-+#endif /* CONFIG_PNFSD */
- };
+ 	/* Set up the RPC argument and reply structs
+ 	 * NB: take care not to mess about with data->commit et al. */
  
- static inline void
-@@ -357,6 +363,14 @@ struct nfs4_file {
- 	u32                     fi_id;      /* used with stateowner->so_id 
- 					     * for stateid_hashtbl hash */
- 	bool			fi_had_conflict;
-+#if defined(CONFIG_PNFSD)
-+	struct list_head	fi_layouts;
-+	struct list_head	fi_layout_states;
-+	/* used by layoutget / layoutrecall */
-+	struct nfs4_fsid	fi_fsid;
-+	u32			fi_fhlen;
-+	u8			fi_fhval[NFS4_FHSIZE];
-+#endif /* CONFIG_PNFSD */
- };
+ 	list_splice_init(head, &data->pages);
  
- /* XXX: for first cut may fall back on returning file that doesn't work
-@@ -385,6 +399,15 @@ static inline struct file *find_any_file(struct nfs4_file *f)
- 		return f->fi_fds[O_RDONLY];
- }
+ 	data->inode	  = inode;
+-	data->cred	  = msg.rpc_cred;
++	data->cred	  = first->wb_context->cred;
  
-+#if defined(CONFIG_PNFSD)
-+/* pNFS Metadata server state */
-+
-+struct pnfs_ds_dev_entry {
-+	struct list_head	dd_dev_entry; /* st_pnfs_ds_id entry */
-+	u32			dd_dsid;
-+};
-+#endif /* CONFIG_PNFSD */
-+
- /*
- * nfs4_stateid can either be an open stateid or (eventually) a lock stateid
- *
-@@ -407,6 +430,9 @@ struct nfs4_stateid {
- 	struct list_head              st_perfile;
- 	struct list_head              st_perstateowner;
- 	struct list_head              st_lockowners;
-+#if defined(CONFIG_PNFSD)
-+	struct list_head              st_pnfs_ds_id;
-+#endif /* CONFIG_PNFSD */
- 	struct nfs4_stateowner      * st_stateowner;
- 	struct nfs4_file            * st_file;
- 	stateid_t                     st_stateid;
-@@ -457,6 +483,34 @@ extern void nfsd4_recdir_purge_old(void);
- extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
- extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
- extern void release_session_client(struct nfsd4_session *);
-+extern void nfsd4_free_slab(struct kmem_cache **);
-+extern struct nfs4_file *find_file(struct inode *);
-+extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *);
-+extern void put_nfs4_file(struct nfs4_file *);
-+extern void get_nfs4_file(struct nfs4_file *);
-+extern struct nfs4_client *find_confirmed_client(clientid_t *);
-+extern struct nfs4_stateid *find_stateid(stateid_t *, int flags);
-+extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *);
-+extern __be32 nfs4_check_stateid(stateid_t *);
-+extern void expire_client_lock(struct nfs4_client *);
-+extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *);
-+
-+#if defined(CONFIG_PNFSD)
-+extern int nfsd4_init_pnfs_slabs(void);
-+extern void nfsd4_free_pnfs_slabs(void);
-+extern void pnfs_expire_client(struct nfs4_client *);
-+extern void release_pnfs_ds_dev_list(struct nfs4_stateid *);
-+extern void nfs4_pnfs_state_init(void);
-+extern void nfs4_pnfs_state_shutdown(void);
-+extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
-+extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *);
-+#else /* CONFIG_PNFSD */
-+static inline void nfsd4_free_pnfs_slabs(void) {}
-+static inline int nfsd4_init_pnfs_slabs(void) { return 0; }
-+static inline void pnfs_expire_client(struct nfs4_client *clp) {}
-+static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {}
-+static inline void nfs4_pnfs_state_shutdown(void) {}
-+#endif /* CONFIG_PNFSD */
+ 	data->args.fh     = NFS_FH(data->inode);
+ 	/* Note: we always request a commit of the entire inode */
+@@ -1283,45 +1389,47 @@ static int nfs_commit_rpcsetup(struct li
+ 	data->res.fattr   = &data->fattr;
+ 	data->res.verf    = &data->verf;
+ 	nfs_fattr_init(&data->fattr);
++	kref_init(&data->refcount);
++	data->parent      = NULL;
++	data->args.context = first->wb_context;  /* used by commit done */
+ 
+-	/* Set up the initial task struct.  */
+-	NFS_PROTO(inode)->commit_setup(data, &msg);
++	return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops,
++				    how, pnfs);
++}
+ 
+-	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
++/* Handle memory error during commit */
++void nfs_mark_list_commit(struct list_head *head)
++{
++	struct nfs_page         *req;
+ 
+-	task = rpc_run_task(&task_setup_data);
+-	if (IS_ERR(task))
+-		return PTR_ERR(task);
+-	rpc_put_task(task);
+-	return 0;
++	while (!list_empty(head)) {
++		req = nfs_list_entry(head->next);
++		nfs_list_remove_request(req);
++		nfs_mark_request_commit(req);
++		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
++		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
++				BDI_RECLAIMABLE);
++		nfs_clear_page_tag_locked(req);
++	}
+ }
++EXPORT_SYMBOL(nfs_mark_list_commit);
  
- static inline void
- nfs4_put_stateowner(struct nfs4_stateowner *so)
-@@ -470,4 +524,24 @@ nfs4_get_stateowner(struct nfs4_stateowner *so)
- 	kref_get(&so->so_ref);
+ /*
+  * Commit dirty pages
+  */
+ static int
+-nfs_commit_list(struct inode *inode, struct list_head *head, int how)
++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs)
+ {
+ 	struct nfs_write_data	*data;
+-	struct nfs_page         *req;
+ 
+ 	data = nfs_commitdata_alloc();
+-
+ 	if (!data)
+ 		goto out_bad;
+ 
+ 	/* Set up the argument struct */
+-	return nfs_commit_rpcsetup(head, data, how);
++	return nfs_commit_rpcsetup(head, data, how, pnfs);
+  out_bad:
+-	while (!list_empty(head)) {
+-		req = nfs_list_entry(head->next);
+-		nfs_list_remove_request(req);
+-		nfs_mark_request_commit(req);
+-		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+-		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+-				BDI_RECLAIMABLE);
+-		nfs_clear_page_tag_locked(req);
+-	}
++	nfs_mark_list_commit(head);
+ 	nfs_commit_clear_lock(NFS_I(inode));
+ 	return -ENOMEM;
+ }
+@@ -1341,6 +1449,19 @@ static void nfs_commit_done(struct rpc_t
+ 		return;
  }
  
-+static inline u64
-+end_offset(u64 start, u64 len)
-+{
-+	u64 end;
-+
-+	end = start + len;
-+	return end >= start ? end : NFS4_MAX_UINT64;
-+}
-+
-+/* last octet in a range */
-+static inline u64
-+last_byte_offset(u64 start, u64 len)
++static inline void nfs_commit_cleanup(struct kref *kref)
 +{
-+	u64 end;
++	struct nfs_write_data *data;
 +
-+	BUG_ON(!len);
-+	end = start + len;
-+	return end > start ? end - 1 : NFS4_MAX_UINT64;
++	data = container_of(kref, struct nfs_write_data, refcount);
++	/* Clear lock only when all cloned commits are finished */
++	if (data->parent)
++		kref_put(&data->parent->refcount, nfs_commit_cleanup);
++	else
++		nfs_commit_clear_lock(NFS_I(data->inode));
++	nfs_commitdata_release(data);
 +}
 +
- #endif   /* NFSD4_STATE_H */
-diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
-index 661a6cf..ed3a1b7 100644
---- a/fs/nfsd/vfs.c
-+++ b/fs/nfsd/vfs.c
-@@ -37,7 +37,12 @@
- #ifdef CONFIG_NFSD_V4
- #include <linux/nfs4_acl.h>
- #include <linux/nfsd_idmap.h>
-+#include <linux/security.h>
-+#include <linux/nfsd4_spnfs.h>
- #endif /* CONFIG_NFSD_V4 */
-+#if defined(CONFIG_SPNFS_BLOCK)
-+#include <linux/nfsd4_block.h>
-+#endif
- 
- #include "nfsd.h"
- #include "vfs.h"
-@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
- 					NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
- 			if (err)
- 				goto out;
-+#if defined(CONFIG_SPNFS_BLOCK)
-+			if (pnfs_block_enabled(inode, 0)) {
-+				err = bl_layoutrecall(inode, RETURN_FILE,
-+				    iap->ia_size, inode->i_size - iap->ia_size);
+ static void nfs_commit_release(void *calldata)
+ {
+ 	struct nfs_write_data	*data = calldata;
+@@ -1358,6 +1479,11 @@ static void nfs_commit_release(void *cal
+ 			req->wb_bytes,
+ 			(long long)req_offset(req));
+ 		if (status < 0) {
++			if (req->wb_lseg) {
++				nfs_mark_request_nopnfs(req);
++				nfs_mark_request_dirty(req);
++				goto next;
 +			}
-+#endif /* CONFIG_SPNFS_BLOCK */
+ 			nfs_context_set_write_error(req->wb_context, status);
+ 			nfs_inode_remove_request(req);
+ 			dprintk(", error = %d\n", status);
+@@ -1374,12 +1500,12 @@ static void nfs_commit_release(void *cal
  		}
+ 		/* We have a mismatch. Write the page again */
+ 		dprintk(" mismatch\n");
++		nfs_mark_request_nopnfs(req);
+ 		nfs_mark_request_dirty(req);
+ 	next:
+ 		nfs_clear_page_tag_locked(req);
+ 	}
+-	nfs_commit_clear_lock(NFS_I(data->inode));
+-	nfs_commitdata_release(calldata);
++	kref_put(&data->refcount, nfs_commit_cleanup);
+ }
  
- 		/*
-@@ -1716,6 +1727,11 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
- 	struct inode	*fdir, *tdir;
- 	__be32		err;
- 	int		host_err;
-+#ifdef CONFIG_SPNFS
-+	unsigned long ino = 0;
-+	unsigned long generation = 0;
-+	unsigned int nlink = 0;
-+#endif /* CONFIG_SPNFS */
+ static const struct rpc_call_ops nfs_commit_ops = {
+@@ -1395,21 +1521,22 @@ int nfs_commit_inode(struct inode *inode
+ 	LIST_HEAD(head);
+ 	int may_wait = how & FLUSH_SYNC;
+ 	int res = 0;
++	int use_pnfs = 0;
  
- 	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
- 	if (err)
-@@ -1779,7 +1795,26 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
- 	if (host_err)
- 		goto out_dput_new;
+ 	if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
+ 		goto out_mark_dirty;
+ 	spin_lock(&inode->i_lock);
+-	res = nfs_scan_commit(inode, &head, 0, 0);
++	res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs);
+ 	spin_unlock(&inode->i_lock);
+ 	if (res) {
+-		int error = nfs_commit_list(inode, &head, how);
++		int error = nfs_commit_list(inode, &head, how, use_pnfs);
+ 		if (error < 0)
+ 			return error;
+-		if (may_wait)
++		if (may_wait) {
+ 			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
+ 					nfs_wait_bit_killable,
+ 					TASK_KILLABLE);
+-		else
++		} else
+ 			goto out_mark_dirty;
+ 	} else
+ 		nfs_commit_clear_lock(NFS_I(inode));
+@@ -1464,7 +1591,18 @@ static int nfs_commit_unstable_pages(str
  
-+#ifdef CONFIG_SPNFS
-+	/*
-+	 * if the target is a preexisting regular file, remember the
-+	 * inode number and generation so we can delete the stripes;
-+	 * save the link count as well so that the stripes only get
-+	 * get deleted when the last link is deleted
-+	 */
-+	if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) {
-+		ino = ndentry->d_inode->i_ino;
-+		generation = ndentry->d_inode->i_generation;
-+		nlink = ndentry->d_inode->i_nlink;
-+	}
-+#endif /* CONFIG_SPNFS */
-+
- 	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
-+#ifdef CONFIG_SPNFS
-+	if (spnfs_enabled() && (!host_err && ino && nlink == 1))
-+		spnfs_remove(ino, generation);
-+#endif /* CONFIG_SPNFS */
+ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+ {
+-	return nfs_commit_unstable_pages(inode, wbc);
++	int ret;
++	ret = nfs_commit_unstable_pages(inode, wbc);
++	if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) {
++		int err, sync = wbc->sync_mode;
 +
- 	if (!host_err) {
- 		host_err = commit_metadata(tfhp);
- 		if (!host_err)
-@@ -1820,6 +1855,11 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
- 	struct inode	*dirp;
- 	__be32		err;
- 	int		host_err;
-+#if defined(CONFIG_SPNFS)
-+	unsigned long	ino;
-+	unsigned long	generation;
-+	unsigned int	nlink;
-+#endif /* defined(CONFIG_SPNFS) */
++		if (wbc->nonblocking || wbc->for_background)
++			sync = 0;
++		err = pnfs_layoutcommit_inode(inode, sync);
++		if (err < 0)
++			ret = err;
++	}
++	return ret;
+ }
  
- 	err = nfserr_acces;
- 	if (!flen || isdotent(fname, flen))
-@@ -1843,6 +1883,17 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
- 		goto out;
- 	}
+ /*
+diff -up linux-2.6.37.noarch/include/linux/exportfs.h.orig linux-2.6.37.noarch/include/linux/exportfs.h
+--- linux-2.6.37.noarch/include/linux/exportfs.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/include/linux/exportfs.h	2011-01-28 09:43:53.375767416 -0500
+@@ -2,6 +2,7 @@
+ #define LINUX_EXPORTFS_H 1
  
-+#if defined(CONFIG_SPNFS)
-+	/*
-+	 * Remember the inode number to communicate to the spnfsd
-+	 * for removal of stripes; save the link count as well so that
-+	 * the stripes only get get deleted when the last link is deleted
-+	 */
-+	ino = rdentry->d_inode->i_ino;
-+	generation = rdentry->d_inode->i_generation;
-+	nlink = rdentry->d_inode->i_nlink;
-+#endif /* defined(CONFIG_SPNFS) */
-+
- 	if (!type)
- 		type = rdentry->d_inode->i_mode & S_IFMT;
+ #include <linux/types.h>
++#include <linux/exp_xdr.h>
  
-@@ -1867,6 +1918,29 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
- 	if (!host_err)
- 		host_err = commit_metadata(fhp);
+ struct dentry;
+ struct inode;
+@@ -188,4 +189,62 @@ extern struct dentry *generic_fh_to_pare
+ 	struct fid *fid, int fh_len, int fh_type,
+ 	struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
  
-+#if defined(CONFIG_SPNFS)
-+	/*
-+	 * spnfs: notify spnfsd of removal to destroy stripes
-+	 */
-+/*
-+	sb = current_fh->fh_dentry->d_inode->i_sb;
-+	if (sb->s_export_op->spnfs_remove) {
-+*/
-+	dprintk("%s check if spnfs_enabled\n", __FUNCTION__);
-+	if (spnfs_enabled() && nlink == 1) {
-+		BUG_ON(ino == 0);
-+		dprintk("%s calling spnfs_remove inumber=%ld\n",
-+			__FUNCTION__, ino);
-+		if (spnfs_remove(ino, generation) == 0) {
-+			dprintk("%s spnfs_remove success\n", __FUNCTION__);
-+		} else {
-+			/* XXX How do we make this atomic? */
-+			printk(KERN_WARNING "nfsd: pNFS could not "
-+				"remove stripes for inode: %ld\n", ino);
-+		}
-+	}
-+#endif /* defined(CONFIG_SPNFS) */
++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
++struct pnfs_filelayout_device;
++struct pnfs_filelayout_layout;
++
++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr,
++				     const struct pnfs_filelayout_device *fdev);
++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr,
++				      const struct pnfs_filelayout_layout *flp);
++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
++
++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
++struct list_head;
 +
- 	mnt_drop_write(fhp->fh_export->ex_path.mnt);
- out_nfserr:
- 	err = nfserrno(host_err);
-diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
-index 4d476ff..4cc7558 100644
---- a/fs/nfsd/xdr4.h
-+++ b/fs/nfsd/xdr4.h
-@@ -37,6 +37,8 @@
- #ifndef _LINUX_NFSD_XDR4_H
- #define _LINUX_NFSD_XDR4_H
- 
-+#include <linux/nfsd/nfsd4_pnfs.h>
++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr,
++				      const struct list_head *volumes);
 +
- #include "state.h"
- #include "nfsd.h"
- 
-@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete {
- 	u32 rca_one_fs;
- };
- 
-+struct nfsd4_pnfs_getdevinfo {
-+	struct nfsd4_pnfs_deviceid gd_devid;	/* request */
-+	u32			gd_layout_type;	/* request */
-+	u32			gd_maxcount;	/* request */
-+	u32			gd_notify_types;/* request */
-+	struct super_block	*gd_sb;
-+};
++extern enum nfsstat4 blocklayout_encode_layout(struct exp_xdr_stream *xdr,
++					       const struct list_head *layouts);
++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
 +
-+struct nfsd4_pnfs_getdevlist {
-+	u32             gd_layout_type;	/* request */
-+	u32		gd_maxdevices;	/* request */
-+	u64		gd_cookie;	/* request - response */
-+	u64		gd_verf;	/* request - response */
-+	struct svc_fh 	*gd_fhp;	/* response */
-+	u32		gd_eof;		/* response */
-+};
++#if defined(CONFIG_PNFSD)
++#include <linux/module.h>
 +
-+struct nfsd4_pnfs_layoutget {
-+	u64			lg_minlength;	/* request */
-+	u32			lg_signal;	/* request */
-+	u32			lg_maxcount;	/* request */
-+	struct svc_fh		*lg_fhp;	/* request */
-+	stateid_t		lg_sid;		/* request/response */
-+	struct nfsd4_layout_seg	lg_seg;		/* request/response */
-+	u32			lg_roc;		/* response */
-+};
++struct pnfsd_cb_operations;
 +
-+struct nfsd4_pnfs_layoutcommit {
-+	struct nfsd4_pnfs_layoutcommit_arg args;
-+	stateid_t		lc_sid;		/* request */
-+	struct nfsd4_pnfs_layoutcommit_res res;
++struct pnfsd_cb_ctl {
++	spinlock_t lock;
++	struct module *module;
++	const struct pnfsd_cb_operations *cb_op;
 +};
 +
-+enum layoutreturn_flags {
-+	LR_FLAG_INTERN = 1 << 0,	/* internal return */
-+	LR_FLAG_EXPIRE = 1 << 1,	/* return on client expiration */
-+};
++/* in expfs.c so that file systems can depend on it */
++extern struct pnfsd_cb_ctl pnfsd_cb_ctl;
 +
-+struct nfsd4_pnfs_layoutreturn {
-+	struct nfsd4_pnfs_layoutreturn_arg args;
-+	u32			lr_flags;
-+	stateid_t		lr_sid;		/* request/resopnse */
-+	u32			lrs_present;	/* response */
-+};
++static inline int
++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl)
++{
++	int ret = -ENOENT;
 +
- struct nfsd4_op {
- 	int					opnum;
- 	__be32					status;
-@@ -426,6 +473,13 @@ struct nfsd4_op {
- 		struct nfsd4_destroy_session	destroy_session;
- 		struct nfsd4_sequence		sequence;
- 		struct nfsd4_reclaim_complete	reclaim_complete;
-+#if defined(CONFIG_PNFSD)
-+		struct nfsd4_pnfs_getdevlist	pnfs_getdevlist;
-+		struct nfsd4_pnfs_getdevinfo	pnfs_getdevinfo;
-+		struct nfsd4_pnfs_layoutget	pnfs_layoutget;
-+		struct nfsd4_pnfs_layoutcommit	pnfs_layoutcommit;
-+		struct nfsd4_pnfs_layoutreturn	pnfs_layoutreturn;
++	spin_lock(&pnfsd_cb_ctl.lock);
++	if (!pnfsd_cb_ctl.cb_op)
++		goto out;
++	if (!try_module_get(pnfsd_cb_ctl.module))
++		goto out;
++	ctl->cb_op = pnfsd_cb_ctl.cb_op;
++	ctl->module = pnfsd_cb_ctl.module;
++	ret = 0;
++out:
++	spin_unlock(&pnfsd_cb_ctl.lock);
++	return ret;
++}
++
++static inline void
++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl)
++{
++	module_put(ctl->module);
++}
 +#endif /* CONFIG_PNFSD */
- 	} u;
- 	struct nfs4_replay *			replay;
- };
-diff --git a/include/linux/exp_xdr.h b/include/linux/exp_xdr.h
-new file mode 100644
-index 0000000..b69c309
---- /dev/null
-+++ b/include/linux/exp_xdr.h
+ #endif /* LINUX_EXPORTFS_H */
+diff -up linux-2.6.37.noarch/include/linux/exp_xdr.h.orig linux-2.6.37.noarch/include/linux/exp_xdr.h
+--- linux-2.6.37.noarch/include/linux/exp_xdr.h.orig	2011-01-28 09:43:53.373767706 -0500
++++ linux-2.6.37.noarch/include/linux/exp_xdr.h	2011-01-28 09:43:53.374767561 -0500
 @@ -0,0 +1,141 @@
 +#ifndef _LINUX_EXP_XDR_H
 +#define _LINUX_EXP_XDR_H
@@ -26471,86 +25001,10 @@ index 0000000..b69c309
 +	return lenp + 1 + exp_xdr_qwords(nbytes);
 +}
 +#endif /* _LINUX_EXP_XDR_H */
-diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
-index a9cd507..225560d 100644
---- a/include/linux/exportfs.h
-+++ b/include/linux/exportfs.h
-@@ -2,6 +2,7 @@
- #define LINUX_EXPORTFS_H 1
- 
- #include <linux/types.h>
-+#include <linux/exp_xdr.h>
- 
- struct dentry;
- struct inode;
-@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_parent(struct super_block *sb,
- 	struct fid *fid, int fh_len, int fh_type,
- 	struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
- 
-+#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
-+struct pnfs_filelayout_device;
-+struct pnfs_filelayout_layout;
-+
-+extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr,
-+				     const struct pnfs_filelayout_device *fdev);
-+extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr,
-+				      const struct pnfs_filelayout_layout *flp);
-+#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
-+
-+#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
-+struct list_head;
-+
-+extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr,
-+				      const struct list_head *volumes);
-+
-+extern enum nfsstat4 blocklayout_encode_layout(struct exp_xdr_stream *xdr,
-+					       const struct list_head *layouts);
-+#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
-+
-+#if defined(CONFIG_PNFSD)
-+#include <linux/module.h>
-+
-+struct pnfsd_cb_operations;
-+
-+struct pnfsd_cb_ctl {
-+	spinlock_t lock;
-+	struct module *module;
-+	const struct pnfsd_cb_operations *cb_op;
-+};
-+
-+/* in expfs.c so that file systems can depend on it */
-+extern struct pnfsd_cb_ctl pnfsd_cb_ctl;
-+
-+static inline int
-+pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl)
-+{
-+	int ret = -ENOENT;
-+
-+	spin_lock(&pnfsd_cb_ctl.lock);
-+	if (!pnfsd_cb_ctl.cb_op)
-+		goto out;
-+	if (!try_module_get(pnfsd_cb_ctl.module))
-+		goto out;
-+	ctl->cb_op = pnfsd_cb_ctl.cb_op;
-+	ctl->module = pnfsd_cb_ctl.module;
-+	ret = 0;
-+out:
-+	spin_unlock(&pnfsd_cb_ctl.lock);
-+	return ret;
-+}
-+
-+static inline void
-+pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl)
-+{
-+	module_put(ctl->module);
-+}
-+#endif /* CONFIG_PNFSD */
- #endif /* LINUX_EXPORTFS_H */
-diff --git a/include/linux/fs.h b/include/linux/fs.h
-index 63d069b..3a8601a 100644
---- a/include/linux/fs.h
-+++ b/include/linux/fs.h
-@@ -388,6 +388,7 @@ struct inodes_stat_t {
+diff -up linux-2.6.37.noarch/include/linux/fs.h.orig linux-2.6.37.noarch/include/linux/fs.h
+--- linux-2.6.37.noarch/include/linux/fs.h.orig	2011-01-28 09:37:32.791971306 -0500
++++ linux-2.6.37.noarch/include/linux/fs.h	2011-01-28 09:43:53.377767138 -0500
+@@ -399,6 +399,7 @@ struct inodes_stat_t {
  #include <asm/byteorder.h>
  
  struct export_operations;
@@ -26558,7 +25012,7 @@ index 63d069b..3a8601a 100644
  struct hd_geometry;
  struct iovec;
  struct nameidata;
-@@ -1327,6 +1328,7 @@ struct super_block {
+@@ -1367,6 +1368,7 @@ struct super_block {
  	const struct dquot_operations	*dq_op;
  	const struct quotactl_ops	*s_qcop;
  	const struct export_operations *s_export_op;
@@ -26566,25 +25020,20 @@ index 63d069b..3a8601a 100644
  	unsigned long		s_flags;
  	unsigned long		s_magic;
  	struct dentry		*s_root;
-diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
-index 07e40c6..df29296 100644
---- a/include/linux/nfs4.h
-+++ b/include/linux/nfs4.h
-@@ -17,7 +17,10 @@
+diff -up linux-2.6.37.noarch/include/linux/nfs4.h.orig linux-2.6.37.noarch/include/linux/nfs4.h
+--- linux-2.6.37.noarch/include/linux/nfs4.h.orig	2011-01-28 09:37:32.862968843 -0500
++++ linux-2.6.37.noarch/include/linux/nfs4.h	2011-01-28 09:43:53.379766874 -0500
+@@ -17,6 +17,7 @@
  
  #define NFS4_BITMAP_SIZE	2
  #define NFS4_VERIFIER_SIZE	8
--#define NFS4_STATEID_SIZE	16
 +#define NFS4_CLIENTID_SIZE	8
-+#define NFS4_STATEID_SEQID_SIZE 4
-+#define NFS4_STATEID_OTHER_SIZE 12
-+#define NFS4_STATEID_SIZE	(NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE)
- #define NFS4_FHSIZE		128
- #define NFS4_MAXPATHLEN		PATH_MAX
- #define NFS4_MAXNAMLEN		NAME_MAX
-@@ -119,6 +122,13 @@
- #define EXCHGID4_FLAG_MASK_A			0x40070003
- #define EXCHGID4_FLAG_MASK_R			0x80070003
+ #define NFS4_STATEID_SEQID_SIZE 4
+ #define NFS4_STATEID_OTHER_SIZE 12
+ #define NFS4_STATEID_SIZE	(NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE)
+@@ -131,6 +132,13 @@
+ #define EXCHGID4_FLAG_MASK_A			0x40070103
+ #define EXCHGID4_FLAG_MASK_R			0x80070103
  
 +static inline bool
 +is_ds_only_session(u32 exchange_flags)
@@ -26593,719 +25042,558 @@ index 07e40c6..df29296 100644
 +	return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS;
 +}
 +
- #define SEQ4_STATUS_CB_PATH_DOWN		0x00000001
- #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING	0x00000002
- #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED	0x00000004
-@@ -166,8 +176,23 @@ struct nfs4_acl {
- 	struct nfs4_ace	aces[0];
- };
- 
-+struct nfs4_fsid {
-+	u64	major;
-+	u64	minor;
-+};
+ #define SEQ4_STATUS_CB_PATH_DOWN		0x00000001
+ #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING	0x00000002
+ #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED	0x00000004
+@@ -181,7 +189,13 @@ struct nfs4_acl {
+ 	struct nfs4_ace	aces[0];
+ };
+ 
++struct nfs4_fsid {
++	u64	major;
++	u64	minor;
++};
++
+ typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid;
+ 
+ struct nfs41_stateid {
+ 	__be32 seqid;
+@@ -559,7 +573,12 @@ enum {
+ 	NFSPROC4_CLNT_GET_LEASE_TIME,
+ 	NFSPROC4_CLNT_RECLAIM_COMPLETE,
+ 	NFSPROC4_CLNT_LAYOUTGET,
++	NFSPROC4_CLNT_LAYOUTCOMMIT,
++	NFSPROC4_CLNT_LAYOUTRETURN,
++	NFSPROC4_CLNT_GETDEVICELIST,
+ 	NFSPROC4_CLNT_GETDEVICEINFO,
++	NFSPROC4_CLNT_PNFS_WRITE,
++	NFSPROC4_CLNT_PNFS_COMMIT,
+ };
+ 
+ /* nfs41 types */
+@@ -582,6 +601,8 @@ enum pnfs_layouttype {
+ 	LAYOUT_NFSV4_1_FILES  = 1,
+ 	LAYOUT_OSD2_OBJECTS = 2,
+ 	LAYOUT_BLOCK_VOLUME = 3,
++
++	NFS4_PNFS_PRIVATE_LAYOUT = 0x80000000
+ };
+ 
+ /* used for both layout return and recall */
+diff -up linux-2.6.37.noarch/include/linux/nfsd4_block.h.orig linux-2.6.37.noarch/include/linux/nfsd4_block.h
+--- linux-2.6.37.noarch/include/linux/nfsd4_block.h.orig	2011-01-28 09:43:53.392765435 -0500
++++ linux-2.6.37.noarch/include/linux/nfsd4_block.h	2011-01-28 09:43:53.392765435 -0500
+@@ -0,0 +1,101 @@
++#ifndef NFSD4_BLOCK
++#define NFSD4_BLOCK
++
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/svcauth.h>
++#include <linux/nfsd/nfsfh.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
++
++#define PNFS_BLOCK_SUCCESS		1
++#define PNFS_BLOCK_FAILURE		0
++
++#define PNFS_BLOCK_CTL_START		1
++#define PNFS_BLOCK_CTL_STOP		2
++#define PNFS_BLOCK_CTL_VERS		3 /* Allows daemon to request current
++					   * version from kernel via an upcall.
++					   */
++
++#define PNFS_UPCALL_MSG_STOP	0
++#define PNFS_UPCALL_MSG_GETSIG	1
++#define PNFS_UPCALL_MSG_GETSLICE	2
++#define PNFS_UPCALL_MSG_DMCHK	3	// See if dev_t is a DM volume
++#define PNFS_UPCALL_MSG_DMGET	4
++#define PNFS_UPCALL_MSG_VERS	5
++
++#define PNFS_UPCALL_VERS		8
++
++typedef struct stripe_dev {
++	int	major,
++		minor,
++		offset;
++} stripe_dev_t;
++
++typedef struct bl_comm_res {
++	int				res_status;
++	union {
++		struct {
++			long long	start,
++					length;
++		} slice;
++		struct {
++			int		num_stripes,
++					stripe_size;
++			stripe_dev_t	devs[];
++		} stripe;
++		struct {
++			long long	sector;
++			int		offset,
++					len;
++			char		sig[];
++		} sig;
++		int			vers,
++					dm_vol;
++	} u;
++} bl_comm_res_t;
++
++typedef struct bl_comm_msg {
++	int		msg_type,
++			msg_status;
++	union {
++		dev_t	msg_dev;
++		int	msg_vers;
++	} u;
++	bl_comm_res_t	*msg_res;
++} bl_comm_msg_t;
 +
- typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
--typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid;
-+typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid;
++#ifdef __KERNEL__
 +
-+struct nfs41_stateid {
-+	__be32 seqid;
-+	char other[NFS4_STATEID_OTHER_SIZE];
-+} __attribute__ ((packed));
-+
-+typedef union {
-+	char data[NFS4_STATEID_SIZE];
-+	struct nfs41_stateid stateid;
-+} nfs4_stateid;
- 
- enum nfs_opnum4 {
- 	OP_ACCESS = 3,
-@@ -471,6 +496,8 @@ enum lock_type4 {
- #define FATTR4_WORD1_TIME_MODIFY        (1UL << 21)
- #define FATTR4_WORD1_TIME_MODIFY_SET    (1UL << 22)
- #define FATTR4_WORD1_MOUNTED_ON_FILEID  (1UL << 23)
-+#define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
-+#define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
- 
- #define NFSPROC4_NULL 0
- #define NFSPROC4_COMPOUND 1
-@@ -532,6 +559,13 @@ enum {
- 	NFSPROC4_CLNT_SEQUENCE,
- 	NFSPROC4_CLNT_GET_LEASE_TIME,
- 	NFSPROC4_CLNT_RECLAIM_COMPLETE,
-+	NFSPROC4_CLNT_LAYOUTGET,
-+	NFSPROC4_CLNT_LAYOUTCOMMIT,
-+	NFSPROC4_CLNT_LAYOUTRETURN,
-+	NFSPROC4_CLNT_GETDEVICELIST,
-+	NFSPROC4_CLNT_GETDEVICEINFO,
-+	NFSPROC4_CLNT_PNFS_WRITE,
-+	NFSPROC4_CLNT_PNFS_COMMIT,
- };
- 
- /* nfs41 types */
-@@ -550,6 +584,51 @@ enum state_protect_how4 {
- 	SP4_SSV		= 2
- };
- 
-+enum pnfs_layouttype {
-+	LAYOUT_NFSV4_1_FILES  = 1,
-+	LAYOUT_OSD2_OBJECTS = 2,
-+	LAYOUT_BLOCK_VOLUME = 3,
++typedef struct bl_comm {
++	/* ---- protects access to this structure ---- */
++	struct mutex		lock;
++	/* ---- protects access to rpc pipe ---- */
++	struct mutex		pipe_lock;
++	struct dentry		*pipe_dentry;
++	wait_queue_head_t	pipe_wq;
++	bl_comm_msg_t		msg;
++} bl_comm_t;
 +
-+	NFS4_PNFS_PRIVATE_LAYOUT = 0x80000000
-+};
++int pnfs_block_enabled(struct inode *, int);
++int bl_layout_type(struct super_block *sb);
++int bl_getdeviceiter(struct super_block *, u32 layout_type,
++		     struct nfsd4_pnfs_dev_iter_res *);
++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *,
++		     u32 layout_type,
++		     const struct nfsd4_pnfs_deviceid *);
++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *,
++			   const struct nfsd4_pnfs_layoutget_arg *,
++			   struct nfsd4_pnfs_layoutget_res *);
++int bl_layoutcommit(struct inode *,
++		    const struct nfsd4_pnfs_layoutcommit_arg *,
++		    struct nfsd4_pnfs_layoutcommit_res *);
++int bl_layoutreturn(struct inode *,
++		    const struct nfsd4_pnfs_layoutreturn_arg *);
++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len);
++int bl_init_proc(void);
++int bl_upcall(bl_comm_t *, bl_comm_msg_t *, bl_comm_res_t **);
 +
-+/* used for both layout return and recall */
-+enum pnfs_layoutreturn_type {
-+	RETURN_FILE = 1,
-+	RETURN_FSID = 2,
-+	RETURN_ALL  = 3
-+};
++extern bl_comm_t	*bl_comm_global;	// Ugly...
++#endif /* __KERNEL__ */
 +
-+enum pnfs_iomode {
-+	IOMODE_READ = 1,
-+	IOMODE_RW = 2,
-+	IOMODE_ANY = 3,
-+};
++#endif /* NFSD4_BLOCK */
 +
-+enum pnfs_notify_deviceid_type4 {
-+	NOTIFY_DEVICEID4_CHANGE = 1 << 1,
-+	NOTIFY_DEVICEID4_DELETE = 1 << 2,
-+};
+diff -up linux-2.6.37.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.37.noarch/include/linux/nfsd4_spnfs.h
+--- linux-2.6.37.noarch/include/linux/nfsd4_spnfs.h.orig	2011-01-28 09:43:53.394765249 -0500
++++ linux-2.6.37.noarch/include/linux/nfsd4_spnfs.h	2011-01-28 09:43:53.394765249 -0500
+@@ -0,0 +1,345 @@
++/*
++ * include/linux/nfsd4_spnfs.h
++ *
++ * spNFS - simple pNFS implementation with userspace daemon
++ *
++ */
 +
-+#define NFL4_UFLG_MASK			0x0000003F
-+#define NFL4_UFLG_DENSE			0x00000001
-+#define NFL4_UFLG_COMMIT_THRU_MDS	0x00000002
-+#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK	0xFFFFFFC0
-+
-+/* Encoded in the loh_body field of type layouthint4 */
-+enum filelayout_hint_care4 {
-+	NFLH4_CARE_DENSE		= NFL4_UFLG_DENSE,
-+	NFLH4_CARE_COMMIT_THRU_MDS	= NFL4_UFLG_COMMIT_THRU_MDS,
-+	NFLH4_CARE_STRIPE_UNIT_SIZE	= 0x00000040,
-+	NFLH4_CARE_STRIPE_COUNT		= 0x00000080
-+};
++/******************************************************************************
 +
-+#define NFS4_DEVICEID4_SIZE 16
++(c) 2007 Network Appliance, Inc.  All Rights Reserved.
 +
-+struct nfs4_deviceid {
-+	char data[NFS4_DEVICEID4_SIZE];
-+};
++Network Appliance provides this source code under the GPL v2 License.
++The GPL v2 license is available at
++http://opensource.org/licenses/gpl-license.php.
 +
- #endif
- #endif
- 
-diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
-index 508f8cf..27c45cc 100644
---- a/include/linux/nfs_fs.h
-+++ b/include/linux/nfs_fs.h
-@@ -188,6 +188,10 @@ struct nfs_inode {
- 	struct nfs_delegation	*delegation;
- 	fmode_t			 delegation_state;
- 	struct rw_semaphore	rwsem;
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 +
-+	/* pNFS layout information */
-+	struct rpc_wait_queue lo_rpcwaitq;
-+	struct pnfs_layout_hdr *layout;
- #endif /* CONFIG_NFS_V4*/
- #ifdef CONFIG_NFS_FSCACHE
- 	struct fscache_cookie	*fscache;
-@@ -490,8 +494,12 @@ extern void nfs_unblock_sillyrename(struct dentry *dentry);
- extern int  nfs_congestion_kb;
- extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
- extern int  nfs_writepages(struct address_space *, struct writeback_control *);
--extern int  nfs_flush_incompatible(struct file *file, struct page *page);
--extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
-+struct pnfs_layout_segment;
-+extern int  nfs_flush_incompatible(struct file *file, struct page *page,
-+				   struct pnfs_layout_segment *lseg);
-+extern int  nfs_updatepage(struct file *, struct page *,
-+			   unsigned int offset, unsigned int count,
-+			   struct pnfs_layout_segment *lseg, void *fsdata);
- extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
- 
- /*
-@@ -613,6 +621,8 @@ extern void * nfs_root_data(void);
- #define NFSDBG_CLIENT		0x0200
- #define NFSDBG_MOUNT		0x0400
- #define NFSDBG_FSCACHE		0x0800
-+#define NFSDBG_PNFS		0x1000
-+#define NFSDBG_PNFS_LD		0x2000
- #define NFSDBG_ALL		0xFFFF
- 
- #ifdef __KERNEL__
-diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
-index c82ee7c..81121d0 100644
---- a/include/linux/nfs_fs_sb.h
-+++ b/include/linux/nfs_fs_sb.h
-@@ -82,6 +82,8 @@ struct nfs_client {
- 	/* The flags used for obtaining the clientid during EXCHANGE_ID */
- 	u32			cl_exchange_flags;
- 	struct nfs4_session	*cl_session; 	/* sharred session */
-+	struct list_head	cl_layouts;
-+	struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
- #endif /* CONFIG_NFS_V4_1 */
- 
- #ifdef CONFIG_NFS_FSCACHE
-@@ -89,6 +91,16 @@ struct nfs_client {
- #endif
- };
- 
-+static inline bool
-+is_ds_only_client(struct nfs_client *clp)
-+{
-+#ifdef CONFIG_NFS_V4_1
-+	return is_ds_only_session(clp->cl_exchange_flags);
++******************************************************************************/
++
++#ifndef NFS_SPNFS_H
++#define NFS_SPNFS_H
++
++
++#ifdef __KERNEL__
++#include "exportfs.h"
++#include "sunrpc/svc.h"
++#include "nfsd/nfsfh.h"
 +#else
-+	return false;
-+#endif
-+}
++#include <sys/types.h>
++#endif /* __KERNEL__ */
 +
- /*
-  * NFS client parameters stored in the superblock.
-  */
-@@ -133,7 +145,7 @@ struct nfs_server {
- #endif
- 
- #ifdef CONFIG_NFS_V4
--	u32			attr_bitmask[2];/* V4 bitmask representing the set
-+	u32			attr_bitmask[3];/* V4 bitmask representing the set
- 						   of attributes supported on this
- 						   filesystem */
- 	u32			cache_consistency_bitmask[2];
-@@ -144,6 +156,11 @@ struct nfs_server {
- 	u32			acl_bitmask;	/* V4 bitmask representing the ACEs
- 						   that are supported on this
- 						   filesystem */
-+	struct pnfs_layoutdriver_type  *pnfs_curr_ld; /* Active layout driver */
-+	void			       *pnfs_ld_data; /* Per-mount data */
-+	unsigned int			ds_rsize;  /* Data server read size */
-+	unsigned int			ds_wsize;  /* Data server write size */
-+	u32				pnfs_blksize; /* layout_blksize attr */
- #endif
- 	void (*destroy)(struct nfs_server *);
- 
-diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h
-index 68b10f5..f9b5f44 100644
---- a/include/linux/nfs_iostat.h
-+++ b/include/linux/nfs_iostat.h
-@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters {
- 	NFSIOS_SHORTREAD,
- 	NFSIOS_SHORTWRITE,
- 	NFSIOS_DELAY,
-+	NFSIOS_PNFS_READ,
-+	NFSIOS_PNFS_WRITE,
-+	NFSIOS_PNFS_COMMIT,
- 	__NFSIOS_COUNTSMAX,
- };
- 
-diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
-index f8b60e7..6fa43c7 100644
---- a/include/linux/nfs_page.h
-+++ b/include/linux/nfs_page.h
-@@ -48,6 +48,7 @@ struct nfs_page {
- 	struct kref		wb_kref;	/* reference count */
- 	unsigned long		wb_flags;
- 	struct nfs_writeverf	wb_verf;	/* Commit cookie */
-+	struct pnfs_layout_segment *wb_lseg;	/* Pnfs layout info */
- };
- 
- struct nfs_pageio_descriptor {
-@@ -61,6 +62,11 @@ struct nfs_pageio_descriptor {
- 	int			(*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int);
- 	int 			pg_ioflags;
- 	int			pg_error;
-+	struct pnfs_layout_segment *pg_lseg;
-+#ifdef CONFIG_NFS_V4_1
-+	int			pg_iswrite;
-+	int			(*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
-+#endif /* CONFIG_NFS_V4_1 */
- };
- 
- #define NFS_WBACK_BUSY(req)	(test_bit(PG_BUSY,&(req)->wb_flags))
-@@ -69,13 +75,15 @@ extern	struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
- 					    struct inode *inode,
- 					    struct page *page,
- 					    unsigned int offset,
--					    unsigned int count);
-+					    unsigned int count,
-+					    struct pnfs_layout_segment *lseg);
- extern	void nfs_clear_request(struct nfs_page *req);
- extern	void nfs_release_request(struct nfs_page *req);
- 
- 
- extern	int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
--			  pgoff_t idx_start, unsigned int npages, int tag);
-+			  pgoff_t idx_start, unsigned int npages, int tag,
-+			  int *use_pnfs);
- extern	void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
- 			     struct inode *inode,
- 			     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
-diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
-index fc46192..63c0301 100644
---- a/include/linux/nfs_xdr.h
-+++ b/include/linux/nfs_xdr.h
-@@ -3,6 +3,8 @@
- 
- #include <linux/nfsacl.h>
- #include <linux/nfs3.h>
-+#include <linux/nfs4.h>
-+#include <linux/sunrpc/sched.h>
- 
- /*
-  * To change the maximum rsize and wsize supported by the NFS client, adjust
-@@ -10,7 +12,7 @@
-  * support a megabyte or more.  The default is left at 4096 bytes, which is
-  * reasonable for NFS over UDP.
-  */
--#define NFS_MAX_FILE_IO_SIZE	(1048576U)
-+#define NFS_MAX_FILE_IO_SIZE	(4U * 1048576U)
- #define NFS_DEF_FILE_IO_SIZE	(4096U)
- #define NFS_MIN_FILE_IO_SIZE	(1024U)
- 
-@@ -113,6 +115,8 @@ struct nfs_fsinfo {
- 	__u32			dtpref;	/* pref. readdir transfer size */
- 	__u64			maxfilesize;
- 	__u32			lease_time; /* in seconds */
-+	__u32			layouttype; /* supported pnfs layout driver */
-+	__u32			blksize; /* preferred pnfs io block size */
- };
- 
- struct nfs_fsstat {
-@@ -185,6 +189,123 @@ struct nfs4_get_lease_time_res {
- 	struct nfs4_sequence_res	lr_seq_res;
- };
- 
-+#define PNFS_LAYOUT_MAXSIZE 4096
++#define SPNFS_STATUS_INVALIDMSG		0x01
++#define SPNFS_STATUS_AGAIN		0x02
++#define SPNFS_STATUS_FAIL		0x04
++#define SPNFS_STATUS_SUCCESS		0x08
++
++#define SPNFS_TYPE_LAYOUTGET		0x01
++#define SPNFS_TYPE_LAYOUTCOMMIT		0x02
++#define SPNFS_TYPE_LAYOUTRETURN		0x03
++#define SPNFS_TYPE_GETDEVICEITER	0x04
++#define SPNFS_TYPE_GETDEVICEINFO	0x05
++#define SPNFS_TYPE_SETATTR		0x06
++#define SPNFS_TYPE_OPEN			0x07
++#define	SPNFS_TYPE_CLOSE		0x08
++#define SPNFS_TYPE_CREATE		0x09
++#define SPNFS_TYPE_REMOVE		0x0a
++#define SPNFS_TYPE_COMMIT		0x0b
++#define SPNFS_TYPE_READ			0x0c
++#define SPNFS_TYPE_WRITE		0x0d
++
++#define	SPNFS_MAX_DEVICES		1
++#define	SPNFS_MAX_DATA_SERVERS		16
++#define SPNFS_MAX_IO			512
 +
-+struct nfs4_layoutdriver_data {
-+	__u32 len;
-+	void *buf;
++/* layout */
++struct spnfs_msg_layoutget_args {
++	unsigned long inode;
++	unsigned long generation;
 +};
 +
-+struct pnfs_layout_range {
-+	u32 iomode;
-+	u64 offset;
-+	u64 length;
++struct spnfs_filelayout_list {
++	u_int32_t       fh_len;
++	unsigned char   fh_val[128]; /* DMXXX fix this const */
 +};
 +
-+struct nfs4_layoutget_args {
-+	__u32 type;
-+	struct pnfs_layout_range range;
-+	__u64 minlength;
-+	__u32 maxcount;
-+	struct inode *inode;
-+	struct nfs_open_context *ctx;
-+	struct nfs4_sequence_args seq_args;
++struct spnfs_msg_layoutget_res {
++	int status;
++	u_int64_t devid;
++	u_int64_t stripe_size;
++	u_int32_t stripe_type;
++	u_int32_t stripe_count;
++	struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS];
 +};
 +
-+struct nfs4_layoutget_res {
-+	__u32 return_on_close;
-+	struct pnfs_layout_range range;
-+	__u32 type;
-+	nfs4_stateid stateid;
-+	struct nfs4_layoutdriver_data layout;
-+	struct nfs4_sequence_res seq_res;
++/* layoutcommit */
++struct spnfs_msg_layoutcommit_args {
++	unsigned long inode;
++	unsigned long generation;
++	u_int64_t file_size;
 +};
 +
-+struct nfs4_layoutget {
-+	struct nfs4_layoutget_args args;
-+	struct nfs4_layoutget_res res;
-+	struct pnfs_layout_segment **lsegpp;
++struct spnfs_msg_layoutcommit_res {
 +	int status;
 +};
 +
-+struct nfs4_layoutcommit_args {
-+	nfs4_stateid stateid;
-+	__u64 lastbytewritten;
-+	__u32 time_modify_changed;
-+	struct timespec time_modify;
-+	const u32 *bitmask;
-+	struct nfs_fh *fh;
-+	struct inode *inode;
-+
-+	/* Values set by layout driver */
-+	struct pnfs_layout_range range;
-+	__u32 layout_type;
-+	void *layoutdriver_data;
-+	struct nfs4_sequence_args seq_args;
++/* layoutreturn */
++/* No op for the daemon */
++/*
++struct spnfs_msg_layoutreturn_args {
 +};
 +
-+struct nfs4_layoutcommit_res {
-+	__u32 sizechanged;
-+	__u64 newsize;
-+	struct nfs_fattr *fattr;
-+	const struct nfs_server *server;
-+	struct nfs4_sequence_res seq_res;
++struct spnfs_msg_layoutreturn_res {
 +};
++*/
 +
-+struct nfs4_layoutcommit_data {
-+	struct rpc_task task;
-+	struct rpc_cred *cred;
-+	struct nfs_fattr fattr;
-+	struct nfs4_layoutcommit_args args;
-+	struct nfs4_layoutcommit_res res;
-+	int status;
++/* getdeviceiter */
++struct spnfs_msg_getdeviceiter_args {
++	unsigned long inode;
++	u_int64_t cookie;
++	u_int64_t verf;
 +};
 +
-+struct nfs4_layoutreturn_args {
-+	__u32   reclaim;
-+	__u32   layout_type;
-+	__u32   return_type;
-+	struct pnfs_layout_range range;
-+	struct inode *inode;
-+	struct nfs4_sequence_args seq_args;
++struct spnfs_msg_getdeviceiter_res {
++	int status;
++	u_int64_t devid;
++	u_int64_t cookie;
++	u_int64_t verf;
++	u_int32_t eof;
 +};
 +
-+struct nfs4_layoutreturn_res {
-+	struct nfs4_sequence_res seq_res;
-+	bool valid;	/* internal, true if received reply */
-+	u32 lrs_present;
-+	nfs4_stateid stateid;
++/* getdeviceinfo */
++struct spnfs_data_server {
++	u_int32_t dsid;
++	char netid[5];
++	char addr[29];
 +};
 +
-+struct nfs4_layoutreturn {
-+	struct nfs4_layoutreturn_args args;
-+	struct nfs4_layoutreturn_res res;
-+	struct rpc_cred *cred;
-+	const nfs4_stateid *stateid;
-+	int rpc_status;
++struct spnfs_device {
++	u_int64_t devid;
++	int dscount;
++	struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS];
 +};
 +
-+struct nfs4_getdevicelist_args {
-+	const struct nfs_fh *fh;
-+	u32 layoutclass;
-+	struct nfs4_sequence_args seq_args;
++struct spnfs_msg_getdeviceinfo_args {
++	u_int64_t devid;
 +};
 +
-+struct nfs4_getdevicelist_res {
-+	struct pnfs_devicelist *devlist;
-+	struct nfs4_sequence_res seq_res;
++struct spnfs_msg_getdeviceinfo_res {
++	int status;
++	struct spnfs_device devinfo;
 +};
 +
-+struct nfs4_getdeviceinfo_args {
-+	struct pnfs_device *pdev;
-+	struct nfs4_sequence_args seq_args;
++/* setattr */
++struct spnfs_msg_setattr_args {
++	unsigned long inode;
++	unsigned long generation;
++	int file_size;
 +};
 +
-+struct nfs4_getdeviceinfo_res {
-+	struct pnfs_device *pdev;
-+	struct nfs4_sequence_res seq_res;
++struct spnfs_msg_setattr_res {
++	int status;
 +};
 +
- /*
-  * Arguments to the open call.
-  */
-@@ -854,7 +975,7 @@ struct nfs4_server_caps_arg {
- };
- 
- struct nfs4_server_caps_res {
--	u32				attr_bitmask[2];
-+	u32				attr_bitmask[3];
- 	u32				acl_bitmask;
- 	u32				has_links;
- 	u32				has_symlinks;
-@@ -969,6 +1090,30 @@ struct nfs_page;
- 
- #define NFS_PAGEVEC_SIZE	(8U)
- 
-+#if defined(CONFIG_NFS_V4_1)
-+
-+/* pnfsflag values */
-+enum pnfs_flags {
-+	PNFS_NO_RPC = 1 << 0,	/* non rpc result callback switch */
++/* open */
++struct spnfs_msg_open_args {
++	unsigned long inode;
++	unsigned long generation;
++	int create;
++	int createmode;
++	int truncate;
 +};
 +
-+/* pnfs-specific data needed for read, write, and commit calls */
-+struct pnfs_call_data {
-+	struct pnfs_layout_segment *lseg;
-+	const struct rpc_call_ops *call_ops;
-+	u32			orig_count;	/* for retry via MDS */
-+	int			pnfs_error;
-+	u8			pnfsflags;
-+	u8			how;		/* for FLUSH_STABLE */
++struct spnfs_msg_open_res {
++	int status;
 +};
 +
-+/* files layout-type specific data for read, write, and commit */
-+struct pnfs_fl_call_data {
-+	struct nfs_client	*ds_nfs_client;
-+	__u64			orig_offset;
++/* close */
++/* No op for daemon */
++struct spnfs_msg_close_args {
++	int x;
 +};
-+#endif /* CONFIG_NFS_V4_1 */
-+
- struct nfs_read_data {
- 	int			flags;
- 	struct rpc_task		task;
-@@ -984,10 +1129,16 @@ struct nfs_read_data {
- #ifdef CONFIG_NFS_V4
- 	unsigned long		timestamp;	/* For lease renewal */
- #endif
-+#if defined(CONFIG_NFS_V4_1)
-+	struct pnfs_call_data	pdata;
-+	struct pnfs_fl_call_data fldata;
-+#endif /* CONFIG_NFS_V4_1 */
- 	struct page		*page_array[NFS_PAGEVEC_SIZE];
- };
- 
- struct nfs_write_data {
-+	struct kref		refcount;	/* For pnfs commit splitting */
-+	struct nfs_write_data	*parent;	/* For pnfs commit splitting */
- 	int			flags;
- 	struct rpc_task		task;
- 	struct inode		*inode;
-@@ -1003,6 +1154,10 @@ struct nfs_write_data {
- #ifdef CONFIG_NFS_V4
- 	unsigned long		timestamp;	/* For lease renewal */
- #endif
-+#if defined(CONFIG_NFS_V4_1)
-+	struct pnfs_call_data	pdata;
-+	struct pnfs_fl_call_data fldata;
-+#endif /* CONFIG_NFS_V4_1 */
- 	struct page		*page_array[NFS_PAGEVEC_SIZE];
- };
- 
-diff --git a/include/linux/nfsd/const.h b/include/linux/nfsd/const.h
-index 323f8cf..520fcfb 100644
---- a/include/linux/nfsd/const.h
-+++ b/include/linux/nfsd/const.h
-@@ -29,6 +29,7 @@
- #ifdef __KERNEL__
- 
- #include <linux/sunrpc/msg_prot.h>
-+#include <linux/sunrpc/svc.h>
- 
- /*
-  * Largest number of bytes we need to allocate for an NFS
-diff --git a/include/linux/nfsd/debug.h b/include/linux/nfsd/debug.h
-index ee4aa91..aad7013 100644
---- a/include/linux/nfsd/debug.h
-+++ b/include/linux/nfsd/debug.h
-@@ -32,6 +32,8 @@
- #define NFSDDBG_REPCACHE	0x0080
- #define NFSDDBG_XDR		0x0100
- #define NFSDDBG_LOCKD		0x0200
-+#define NFSDDBG_PNFS		0x0400
-+#define NFSDDBG_FILELAYOUT	0x0800
- #define NFSDDBG_ALL		0x7FFF
- #define NFSDDBG_NOCHANGE	0xFFFF
- 
-diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
-index 8ae78a6..933ab78 100644
---- a/include/linux/nfsd/export.h
-+++ b/include/linux/nfsd/export.h
-@@ -100,6 +100,7 @@ struct svc_export {
- 	uid_t			ex_anon_uid;
- 	gid_t			ex_anon_gid;
- 	int			ex_fsid;
-+	int			ex_pnfs;
- 	unsigned char *		ex_uuid; /* 16 byte fsid */
- 	struct nfsd4_fs_locations ex_fslocs;
- 	int			ex_nflavors;
-diff --git a/include/linux/nfsd/nfs4layoutxdr.h b/include/linux/nfsd/nfs4layoutxdr.h
-new file mode 100644
-index 0000000..b02d96a
---- /dev/null
-+++ b/include/linux/nfsd/nfs4layoutxdr.h
-@@ -0,0 +1,132 @@
-+/*
-+ *  Copyright (c) 2006 The Regents of the University of Michigan.
-+ *  All rights reserved.
-+ *
-+ *  Andy Adamson <andros at umich.edu>
-+ *
-+ *  Redistribution and use in source and binary forms, with or without
-+ *  modification, are permitted provided that the following conditions
-+ *  are met:
-+ *
-+ *  1. Redistributions of source code must retain the above copyright
-+ *     notice, this list of conditions and the following disclaimer.
-+ *  2. Redistributions in binary form must reproduce the above copyright
-+ *     notice, this list of conditions and the following disclaimer in the
-+ *     documentation and/or other materials provided with the distribution.
-+ *  3. Neither the name of the University nor the names of its
-+ *     contributors may be used to endorse or promote products derived
-+ *     from this software without specific prior written permission.
-+ *
-+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
-+ */
-+
-+#ifndef NFSD_NFS4LAYOUTXDR_H
-+#define NFSD_NFS4LAYOUTXDR_H
 +
-+#include <linux/sunrpc/xdr.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
++struct spnfs_msg_close_res {
++	int y;
++};
 +
-+/* the nfsd4_pnfs_devlist dev_addr for the file layout type */
-+struct pnfs_filelayout_devaddr {
-+	struct xdr_netobj	r_netid;
-+	struct xdr_netobj	r_addr;
++/* create */
++/*
++struct spnfs_msg_create_args {
++	int x;
 +};
 +
-+/* list of multipath servers */
-+struct pnfs_filelayout_multipath {
-+	u32				fl_multipath_length;
-+	struct pnfs_filelayout_devaddr 	*fl_multipath_list;
++struct spnfs_msg_create_res {
++	int y;
 +};
++*/
 +
-+struct pnfs_filelayout_device {
-+	u32					fl_stripeindices_length;
-+	u32       		 		*fl_stripeindices_list;
-+	u32					fl_device_length;
-+	struct pnfs_filelayout_multipath 	*fl_device_list;
++/* remove */
++struct spnfs_msg_remove_args {
++	unsigned long inode;
++	unsigned long generation;
 +};
 +
-+struct pnfs_filelayout_layout {
-+	u32                             lg_layout_type; /* response */
-+	u32                             lg_stripe_type; /* response */
-+	u32                             lg_commit_through_mds; /* response */
-+	u64                             lg_stripe_unit; /* response */
-+	u64                             lg_pattern_offset; /* response */
-+	u32                             lg_first_stripe_index;	/* response */
-+	struct nfsd4_pnfs_deviceid	device_id;		/* response */
-+	u32                             lg_fh_length;		/* response */
-+	struct knfsd_fh                 *lg_fh_list;		/* response */
++struct spnfs_msg_remove_res {
++	int status;
 +};
 +
-+enum stripetype4 {
-+	STRIPE_SPARSE = 1,
-+	STRIPE_DENSE = 2
++/* commit */
++/*
++struct spnfs_msg_commit_args {
++	int x;
 +};
 +
-+enum pnfs_block_extent_state4 {
-+        PNFS_BLOCK_READWRITE_DATA       = 0,
-+        PNFS_BLOCK_READ_DATA            = 1,
-+        PNFS_BLOCK_INVALID_DATA         = 2,
-+        PNFS_BLOCK_NONE_DATA            = 3
++struct spnfs_msg_commit_res {
++	int y;
 +};
++*/
 +
-+enum pnfs_block_volume_type4 {
-+        PNFS_BLOCK_VOLUME_SIMPLE = 0,
-+        PNFS_BLOCK_VOLUME_SLICE = 1,
-+        PNFS_BLOCK_VOLUME_CONCAT = 2,
-+        PNFS_BLOCK_VOLUME_STRIPE = 3,
++/* read */
++struct spnfs_msg_read_args {
++	unsigned long inode;
++	unsigned long generation;
++	loff_t offset;
++	unsigned long len;
 +};
-+typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4;
 +
-+enum bl_cache_state {
-+	BLOCK_LAYOUT_NEW	= 0,
-+	BLOCK_LAYOUT_CACHE	= 1,
-+	BLOCK_LAYOUT_UPDATE	= 2,
++struct spnfs_msg_read_res {
++	int status;
++	char data[SPNFS_MAX_IO];
 +};
 +
-+typedef struct pnfs_blocklayout_layout {
-+        struct list_head                bll_list;
-+        struct nfsd4_pnfs_deviceid      bll_vol_id;
-+        u64                             bll_foff;	// file offset
-+        u64                             bll_len;
-+        u64                             bll_soff;	// storage offset
-+	int				bll_recalled;
-+        enum pnfs_block_extent_state4   bll_es;
-+	enum bl_cache_state		bll_cache_state;
-+} pnfs_blocklayout_layout_t;
++/* write */
++struct spnfs_msg_write_args {
++	unsigned long inode;
++	unsigned long generation;
++	loff_t offset;
++	unsigned long len;
++	char data[SPNFS_MAX_IO];
++};
 +
-+typedef struct pnfs_blocklayout_devinfo {
-+        struct list_head                bld_list;
-+        pnfs_block_volume_type4         bld_type;
-+        struct nfsd4_pnfs_deviceid      bld_devid;
-+        int                             bld_index_loc;
-+        union {
-+                struct {
-+                        u64             bld_offset;
-+                        u32             bld_sig_len,
-+                                        *bld_sig;
-+                } simple;
-+                struct {
-+                        u64             bld_start,
-+                                        bld_len;
-+                        u32             bld_index;      /* Index of Simple Volume */
-+                } slice;
-+                struct {
-+                        u32             bld_stripes;
-+                        u64             bld_chunk_size;
-+                        u32             *bld_stripe_indexs;
-+                } stripe;
-+        } u;
-+} pnfs_blocklayout_devinfo_t;
++struct spnfs_msg_write_res {
++	int status;
++};
 +
-+#endif /* NFSD_NFS4LAYOUTXDR_H */
-diff --git a/include/linux/nfsd/nfs4pnfsdlm.h b/include/linux/nfsd/nfs4pnfsdlm.h
-new file mode 100644
-index 0000000..eb31123
---- /dev/null
-+++ b/include/linux/nfsd/nfs4pnfsdlm.h
-@@ -0,0 +1,54 @@
-+/******************************************************************************
-+ *
-+ * (c) 2007 Network Appliance, Inc.  All Rights Reserved.
-+ * (c) 2009 NetApp.  All Rights Reserved.
-+ *
-+ * NetApp provides this source code under the GPL v2 License.
-+ * The GPL v2 license is available at
-+ * http://opensource.org/licenses/gpl-license.php.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
-+ ******************************************************************************/
-+#include <linux/genhd.h>
++/* bundle args and responses */
++union spnfs_msg_args {
++	struct spnfs_msg_layoutget_args		layoutget_args;
++	struct spnfs_msg_layoutcommit_args	layoutcommit_args;
++/*
++	struct spnfs_msg_layoutreturn_args	layoutreturn_args;
++*/
++	struct spnfs_msg_getdeviceiter_args     getdeviceiter_args;
++	struct spnfs_msg_getdeviceinfo_args     getdeviceinfo_args;
++	struct spnfs_msg_setattr_args		setattr_args;
++	struct spnfs_msg_open_args		open_args;
++	struct spnfs_msg_close_args		close_args;
++/*
++	struct spnfs_msg_create_args		create_args;
++*/
++	struct spnfs_msg_remove_args		remove_args;
++/*
++	struct spnfs_msg_commit_args		commit_args;
++*/
++	struct spnfs_msg_read_args		read_args;
++	struct spnfs_msg_write_args		write_args;
++};
 +
++union spnfs_msg_res {
++	struct spnfs_msg_layoutget_res		layoutget_res;
++	struct spnfs_msg_layoutcommit_res	layoutcommit_res;
 +/*
-+ * Length of comma separated pnfs data server IPv4 addresses. Enough room for
-+ * 32 addresses.
-+ */
-+#define NFSD_DLM_DS_LIST_MAX   512
++	struct spnfs_msg_layoutreturn_res	layoutreturn_res;
++*/
++	struct spnfs_msg_getdeviceiter_res      getdeviceiter_res;
++	struct spnfs_msg_getdeviceinfo_res      getdeviceinfo_res;
++	struct spnfs_msg_setattr_res		setattr_res;
++	struct spnfs_msg_open_res		open_res;
++	struct spnfs_msg_close_res		close_res;
 +/*
-+ * Length of colon separated pnfs dlm device of the form
-+ * disk_name:comma separated data server IPv4 address
-+ */
-+#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1)
++	struct spnfs_msg_create_res		create_res;
++*/
++	struct spnfs_msg_remove_res		remove_res;
++/*
++	struct spnfs_msg_commit_res		commit_res;
++*/
++	struct spnfs_msg_read_res		read_res;
++	struct spnfs_msg_write_res		write_res;
++};
 +
-+#ifdef CONFIG_PNFSD
++/* a spnfs message, args and response */
++struct spnfs_msg {
++	unsigned char		im_type;
++	unsigned char		im_status;
++	union spnfs_msg_args	im_args;
++	union spnfs_msg_res	im_res;
++};
 +
-+/* For use by DLM cluster file systems exported by pNFSD */
-+extern const struct pnfs_export_operations pnfs_dlm_export_ops;
++/* spnfs configuration info */
++struct spnfs_config {
++	unsigned char		dense_striping;
++	int			stripe_size;
++	int			num_ds;
++	char			ds_dir[SPNFS_MAX_DATA_SERVERS][80];  /* XXX */
++};
 +
-+int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len);
++#if defined(__KERNEL__) && defined(CONFIG_SPNFS)
 +
-+void nfsd4_pnfs_dlm_shutdown(void);
++#include <linux/nfsd/nfsd4_pnfs.h>
 +
-+ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen);
++/* pipe mgmt structure.  messages flow through here */
++struct spnfs {
++	struct dentry		*spnfs_dentry;    /* dentry for pipe */
++	wait_queue_head_t	spnfs_wq;
++	struct spnfs_msg	spnfs_im;         /* spnfs message */
++	struct mutex		spnfs_lock;       /* Serializes upcalls */
++	struct mutex		spnfs_plock;
++};
 +
-+#else /* CONFIG_PNFSD */
++struct nfsd4_open;
++
++int spnfs_layout_type(struct super_block *);
++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr,
++			      const struct nfsd4_pnfs_layoutget_arg *,
++			      struct nfsd4_pnfs_layoutget_res *);
++int spnfs_layoutcommit(void);
++int spnfs_layoutreturn(struct inode *,
++		       const struct nfsd4_pnfs_layoutreturn_arg *);
++int spnfs_getdeviceiter(struct super_block *,
++			u32 layout_type,
++			struct nfsd4_pnfs_dev_iter_res *);
++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *,
++			u32 layout_type,
++			const struct nfsd4_pnfs_deviceid *);
++int spnfs_setattr(void);
++int spnfs_open(struct inode *, struct nfsd4_open *);
++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *);
++int spnfs_remove(unsigned long, unsigned long);
++__be32 spnfs_read(struct inode *, loff_t, unsigned long *,
++		  int, struct svc_rqst *);
++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *);
++int spnfs_getfh(int, struct nfs_fh *);
++int spnfs_test_layoutrecall(char *, u64, u64);
++int spnfs_layoutrecall(struct inode *, int, u64, u64);
++
++int nfsd_spnfs_new(void);
++void nfsd_spnfs_delete(void);
++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *);
++int spnfs_enabled(void);
++int spnfs_init_proc(void);
++
++extern struct spnfs_config *spnfs_config;
++
++#endif /* __KERNEL__ && CONFIG_SPNFS */
 +
-+static inline void nfsd4_pnfs_dlm_shutdown(void)
-+{
-+	return;
-+}
++#endif /* NFS_SPNFS_H */
+diff -up linux-2.6.37.noarch/include/linux/nfsd/const.h.orig linux-2.6.37.noarch/include/linux/nfsd/const.h
+--- linux-2.6.37.noarch/include/linux/nfsd/const.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/include/linux/nfsd/const.h	2011-01-28 09:43:53.387765940 -0500
+@@ -29,6 +29,7 @@
+ #ifdef __KERNEL__
+ 
+ #include <linux/sunrpc/msg_prot.h>
++#include <linux/sunrpc/svc.h>
+ 
+ /*
+  * Largest number of bytes we need to allocate for an NFS
+diff -up linux-2.6.37.noarch/include/linux/nfsd/debug.h.orig linux-2.6.37.noarch/include/linux/nfsd/debug.h
+--- linux-2.6.37.noarch/include/linux/nfsd/debug.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/include/linux/nfsd/debug.h	2011-01-28 09:43:53.388765835 -0500
+@@ -32,6 +32,8 @@
+ #define NFSDDBG_REPCACHE	0x0080
+ #define NFSDDBG_XDR		0x0100
+ #define NFSDDBG_LOCKD		0x0200
++#define NFSDDBG_PNFS		0x0400
++#define NFSDDBG_FILELAYOUT	0x0800
+ #define NFSDDBG_ALL		0x7FFF
+ #define NFSDDBG_NOCHANGE	0xFFFF
+ 
+diff -up linux-2.6.37.noarch/include/linux/nfsd/export.h.orig linux-2.6.37.noarch/include/linux/nfsd/export.h
+--- linux-2.6.37.noarch/include/linux/nfsd/export.h.orig	2011-01-28 09:37:32.865968740 -0500
++++ linux-2.6.37.noarch/include/linux/nfsd/export.h	2011-01-28 09:43:53.388765835 -0500
+@@ -79,6 +79,20 @@ struct nfsd4_fs_locations {
+ };
+ 
+ /*
++ * Callbacks
++ */
++struct nfsd4_callback {
++	void *cb_op;
++	struct nfs4_client *cb_clp;
++	struct list_head cb_per_client;
++	u32 cb_minorversion;
++	struct rpc_message cb_msg;
++	const struct rpc_call_ops *cb_ops;
++	struct work_struct cb_work;
++	bool cb_done;
++};
 +
-+#endif /* CONFIG_PNFSD */
-diff --git a/include/linux/nfsd/nfsd4_pnfs.h b/include/linux/nfsd/nfsd4_pnfs.h
-new file mode 100644
-index 0000000..2e66837
---- /dev/null
-+++ b/include/linux/nfsd/nfsd4_pnfs.h
-@@ -0,0 +1,271 @@
++/*
+  * We keep an array of pseudoflavors with the export, in order from most
+  * to least preferred.  For the forseeable future, we don't expect more
+  * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3,
+@@ -100,6 +114,7 @@ struct svc_export {
+ 	uid_t			ex_anon_uid;
+ 	gid_t			ex_anon_gid;
+ 	int			ex_fsid;
++	int			ex_pnfs;
+ 	unsigned char *		ex_uuid; /* 16 byte fsid */
+ 	struct nfsd4_fs_locations ex_fslocs;
+ 	int			ex_nflavors;
+diff -up linux-2.6.37.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.37.noarch/include/linux/nfsd/nfs4layoutxdr.h
+--- linux-2.6.37.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig	2011-01-28 09:43:53.389765732 -0500
++++ linux-2.6.37.noarch/include/linux/nfsd/nfs4layoutxdr.h	2011-01-28 09:43:53.389765732 -0500
+@@ -0,0 +1,132 @@
 +/*
 + *  Copyright (c) 2006 The Regents of the University of Michigan.
 + *  All rights reserved.
@@ -27339,747 +25627,788 @@ index 0000000..2e66837
 + *
 + */
 +
-+#ifndef _LINUX_NFSD_NFSD4_PNFS_H
-+#define _LINUX_NFSD_NFSD4_PNFS_H
-+
-+#include <linux/exportfs.h>
-+#include <linux/exp_xdr.h>
-+#include <linux/nfs_xdr.h>
-+
-+struct nfsd4_pnfs_deviceid {
-+	u64	sbid;			/* per-superblock unique ID */
-+	u64	devid;			/* filesystem-wide unique device ID */
-+};
-+
-+struct nfsd4_pnfs_dev_iter_res {
-+	u64		gd_cookie;	/* request/repsonse */
-+	u64		gd_verf;	/* request/repsonse */
-+	u64		gd_devid;	/* response */
-+	u32		gd_eof;		/* response */
-+};
-+
-+/* Arguments for set_device_notify */
-+struct pnfs_devnotify_arg {
-+	struct nfsd4_pnfs_deviceid dn_devid;	/* request */
-+	u32 dn_layout_type;			/* request */
-+	u32 dn_notify_types;			/* request/response */
-+};
-+
-+struct nfsd4_layout_seg {
-+	u64	clientid;
-+	u32	layout_type;
-+	u32	iomode;
-+	u64	offset;
-+	u64	length;
-+};
-+
-+/* Used by layout_get to encode layout (loc_body var in spec)
-+ * Args:
-+ * minlength - min number of accessible bytes given by layout
-+ * fsid - Major part of struct pnfs_deviceid.  File system uses this
-+ * to build the deviceid returned in the layout.
-+ * fh - fs can modify the file handle for use on data servers
-+ * seg - layout info requested and layout info returned
-+ * xdr - xdr info
-+ * return_on_close - true if layout to be returned on file close
-+ */
-+
-+struct nfsd4_pnfs_layoutget_arg {
-+	u64			lg_minlength;
-+	u64			lg_sbid;
-+	const struct knfsd_fh	*lg_fh;
-+};
-+
-+struct nfsd4_pnfs_layoutget_res {
-+	struct nfsd4_layout_seg	lg_seg;	/* request/resopnse */
-+	u32			lg_return_on_close;
-+};
-+
-+struct nfsd4_pnfs_layoutcommit_arg {
-+	struct nfsd4_layout_seg	lc_seg;		/* request */
-+	u32			lc_reclaim;	/* request */
-+	u32			lc_newoffset;	/* request */
-+	u64			lc_last_wr;	/* request */
-+	struct nfstime4		lc_mtime;	/* request */
-+	u32			lc_up_len;	/* layout length */
-+	void			*lc_up_layout;	/* decoded by callback */
-+};
-+
-+struct nfsd4_pnfs_layoutcommit_res {
-+	u32			lc_size_chg;	/* boolean for response */
-+	u64			lc_newsize;	/* response */
-+};
-+
-+#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */
++#ifndef NFSD_NFS4LAYOUTXDR_H
++#define NFSD_NFS4LAYOUTXDR_H
 +
-+struct nfsd4_pnfs_layoutreturn_arg {
-+	u32			lr_return_type;	/* request */
-+	struct nfsd4_layout_seg	lr_seg;		/* request */
-+	u32			lr_reclaim;	/* request */
-+	u32			lrf_body_len;	/* request */
-+	void			*lrf_body;	/* request */
-+	void			*lr_cookie;	/* fs private */
-+};
++#include <linux/sunrpc/xdr.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
 +
-+/* pNFS Metadata to Data server state communication */
-+struct pnfs_get_state {
-+	u32			dsid;    /* request */
-+	u64			ino;      /* request */
-+	nfs4_stateid		stid;     /* request;response */
-+	nfs4_clientid		clid;     /* response */
-+	u32			access;    /* response */
-+	u32			stid_gen;    /* response */
-+	u32			verifier[2]; /* response */
++/* the nfsd4_pnfs_devlist dev_addr for the file layout type */
++struct pnfs_filelayout_devaddr {
++	struct xdr_netobj	r_netid;
++	struct xdr_netobj	r_addr;
 +};
 +
-+/*
-+ * pNFS export operations vector.
-+ *
-+ * The filesystem must implement the following methods:
-+ *   layout_type
-+ *   get_device_info
-+ *   layout_get
-+ *
-+ * All other methods are optional and can be set to NULL if not implemented.
-+ */
-+struct pnfs_export_operations {
-+	/* Returns the supported pnfs_layouttype4. */
-+	int (*layout_type) (struct super_block *);
-+
-+	/* Encode device info onto the xdr stream. */
-+	int (*get_device_info) (struct super_block *,
-+				struct exp_xdr_stream *,
-+				u32 layout_type,
-+				const struct nfsd4_pnfs_deviceid *);
-+
-+	/* Retrieve all available devices via an iterator.
-+	 * arg->cookie == 0 indicates the beginning of the list,
-+	 * otherwise arg->verf is used to verify that the list hasn't changed
-+	 * while retrieved.
-+	 *
-+	 * On output, the filesystem sets the devid based on the current cookie
-+	 * and sets res->cookie and res->verf corresponding to the next entry.
-+	 * When the last entry in the list is retrieved, res->eof is set to 1.
-+	 */
-+	int (*get_device_iter) (struct super_block *,
-+				u32 layout_type,
-+				struct nfsd4_pnfs_dev_iter_res *);
-+
-+	int (*set_device_notify) (struct super_block *,
-+				  struct pnfs_devnotify_arg *);
-+
-+	/* Retrieve and encode a layout for inode onto the xdr stream.
-+	 * arg->minlength is the minimum number of accessible bytes required
-+	 *   by the client.
-+	 * The maximum number of bytes to encode the layout is given by
-+	 *   the xdr stream end pointer.
-+	 * arg->fsid contains the major part of struct pnfs_deviceid.
-+	 *   The file system uses this to build the deviceid returned
-+	 *   in the layout.
-+	 * res->seg - layout segment requested and layout info returned.
-+	 * res->fh can be modified the file handle for use on data servers
-+	 * res->return_on_close - true if layout to be returned on file close
-+	 *
-+	 * return one of the following nfs errors:
-+	 * NFS_OK			Success
-+	 * NFS4ERR_ACCESS		Permission error
-+	 * NFS4ERR_BADIOMODE		Server does not support requested iomode
-+	 * NFS4ERR_BADLAYOUT		No layout matching loga_minlength rules
-+	 * NFS4ERR_INVAL		Parameter other than layout is invalid
-+	 * NFS4ERR_IO			I/O error
-+	 * NFS4ERR_LAYOUTTRYLATER	Layout may be retrieved later
-+	 * NFS4ERR_LAYOUTUNAVAILABLE	Layout unavailable for this file
-+	 * NFS4ERR_LOCKED		Lock conflict
-+	 * NFS4ERR_NOSPC		Out-of-space error occured
-+	 * NFS4ERR_RECALLCONFLICT	Layout currently unavialable due to
-+	 *				a conflicting CB_LAYOUTRECALL
-+	 * NFS4ERR_SERVERFAULT		Server went bezerk
-+	 * NFS4ERR_TOOSMALL		loga_maxcount too small to fit layout
-+	 * NFS4ERR_WRONG_TYPE		Wrong file type (not a regular file)
-+	 */
-+	enum nfsstat4 (*layout_get) (struct inode *,
-+				     struct exp_xdr_stream *xdr,
-+				     const struct nfsd4_pnfs_layoutget_arg *,
-+				     struct nfsd4_pnfs_layoutget_res *);
-+
-+	/* Commit changes to layout */
-+	int (*layout_commit) (struct inode *,
-+			      const struct nfsd4_pnfs_layoutcommit_arg *,
-+			      struct nfsd4_pnfs_layoutcommit_res *);
-+
-+	/* Returns the layout */
-+	int (*layout_return) (struct inode *,
-+			      const struct nfsd4_pnfs_layoutreturn_arg *);
-+
-+	/* Can layout segments be merged for this layout type? */
-+	int (*can_merge_layouts) (u32 layout_type);
-+
-+	/* pNFS Files layout specific operations */
-+
-+	/* Get the write verifier for DS (called on MDS only) */
-+	void (*get_verifier) (struct super_block *, u32 *p);
-+	/* Call fs on DS only */
-+	int (*get_state) (struct inode *, struct knfsd_fh *,
-+			  struct pnfs_get_state *);
++/* list of multipath servers */
++struct pnfs_filelayout_multipath {
++	u32				fl_multipath_length;
++	struct pnfs_filelayout_devaddr 	*fl_multipath_list;
 +};
 +
-+struct nfsd4_pnfs_cb_layout {
-+	u32			cbl_recall_type;	/* request */
-+	struct nfsd4_layout_seg cbl_seg;		/* request */
-+	u32			cbl_layoutchanged;	/* request */
-+	nfs4_stateid		cbl_sid;		/* request */
-+	struct nfs4_fsid	cbl_fsid;
-+	void			*cbl_cookie;		/* fs private */
++struct pnfs_filelayout_device {
++	u32					fl_stripeindices_length;
++	u32       		 		*fl_stripeindices_list;
++	u32					fl_device_length;
++	struct pnfs_filelayout_multipath 	*fl_device_list;
 +};
 +
-+/* layoutrecall request (from exported filesystem) */
-+struct nfs4_layoutrecall {
-+	struct kref			clr_ref;
-+	struct nfsd4_pnfs_cb_layout	cb;	/* request */
-+	struct list_head		clr_perclnt; /* on cl_layoutrecalls */
-+	struct nfs4_client	       *clr_client;
-+	struct nfs4_file	       *clr_file;
-+	struct timespec			clr_time;	/* last activity */
-+	struct super_block 		*clr_sb; /* We might not have a file */
-+	struct nfs4_layoutrecall	*parent; /* The initiating recall */
-+
-+	void				*clr_args;	/* nfsd internal */
++struct pnfs_filelayout_layout {
++	u32                             lg_layout_type; /* response */
++	u32                             lg_stripe_type; /* response */
++	u32                             lg_commit_through_mds; /* response */
++	u64                             lg_stripe_unit; /* response */
++	u64                             lg_pattern_offset; /* response */
++	u32                             lg_first_stripe_index;	/* response */
++	struct nfsd4_pnfs_deviceid	device_id;		/* response */
++	u32                             lg_fh_length;		/* response */
++	struct knfsd_fh                 *lg_fh_list;		/* response */
 +};
 +
-+struct nfsd4_pnfs_cb_dev_item {
-+	u32			cbd_notify_type;	/* request */
-+	u32			cbd_layout_type;	/* request */
-+	struct nfsd4_pnfs_deviceid cbd_devid;		/* request */
-+	u32			cbd_immediate;		/* request */
++enum stripetype4 {
++	STRIPE_SPARSE = 1,
++	STRIPE_DENSE = 2
 +};
 +
-+struct nfsd4_pnfs_cb_dev_list {
-+	u32				cbd_len;  /* request */
-+	struct nfsd4_pnfs_cb_dev_item  *cbd_list; /* request */
++enum pnfs_block_extent_state4 {
++        PNFS_BLOCK_READWRITE_DATA       = 0,
++        PNFS_BLOCK_READ_DATA            = 1,
++        PNFS_BLOCK_INVALID_DATA         = 2,
++        PNFS_BLOCK_NONE_DATA            = 3
 +};
 +
-+/*
-+ * callbacks provided by the nfsd
-+ */
-+struct pnfsd_cb_operations {
-+	/* Generic callbacks */
-+	int (*cb_layout_recall) (struct super_block *, struct inode *,
-+				 struct nfsd4_pnfs_cb_layout *);
-+	int (*cb_device_notify) (struct super_block *,
-+				 struct nfsd4_pnfs_cb_dev_list *);
-+
-+	/* pNFS Files layout specific callbacks */
-+
-+	/* Callback from fs on MDS only */
-+	int (*cb_get_state) (struct super_block *, struct pnfs_get_state *);
-+	/* Callback from fs on DS only */
-+	int (*cb_change_state) (struct pnfs_get_state *);
++enum pnfs_block_volume_type4 {
++        PNFS_BLOCK_VOLUME_SIMPLE = 0,
++        PNFS_BLOCK_VOLUME_SLICE = 1,
++        PNFS_BLOCK_VOLUME_CONCAT = 2,
++        PNFS_BLOCK_VOLUME_STRIPE = 3,
 +};
++typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4;
 +
-+#endif /* _LINUX_NFSD_NFSD4_PNFS_H */
-diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h
-index 812bc1e..df667d0 100644
---- a/include/linux/nfsd/syscall.h
-+++ b/include/linux/nfsd/syscall.h
-@@ -29,6 +29,7 @@
- /*#define NFSCTL_GETFH		6	/ * get an fh by ino DISCARDED */
- #define NFSCTL_GETFD		7	/* get an fh by path (used by mountd) */
- #define	NFSCTL_GETFS		8	/* get an fh by path with max FH len */
-+#define	NFSCTL_FD2FH		9	/* get a fh from a fd */
- 
- /* SVC */
- struct nfsctl_svc {
-@@ -71,6 +72,11 @@ struct nfsctl_fsparm {
- 	int			gd_maxlen;
- };
- 
-+/* FD2FH */
-+struct nfsctl_fd2fh {
-+	int			fd;
++enum bl_cache_state {
++	BLOCK_LAYOUT_NEW	= 0,
++	BLOCK_LAYOUT_CACHE	= 1,
++	BLOCK_LAYOUT_UPDATE	= 2,
 +};
 +
- /*
-  * This is the argument union.
-  */
-@@ -82,6 +88,7 @@ struct nfsctl_arg {
- 		struct nfsctl_export	u_export;
- 		struct nfsctl_fdparm	u_getfd;
- 		struct nfsctl_fsparm	u_getfs;
-+		struct nfsctl_fd2fh	u_fd2fh;
- 		/*
- 		 * The following dummy member is needed to preserve binary compatibility
- 		 * on platforms where alignof(void*)>alignof(int).  It's needed because
-@@ -95,6 +102,7 @@ struct nfsctl_arg {
- #define ca_export	u.u_export
- #define ca_getfd	u.u_getfd
- #define	ca_getfs	u.u_getfs
-+#define	ca_fd2fh	u.u_fd2fh
- };
- 
- union nfsctl_res {
-diff --git a/include/linux/nfsd4_block.h b/include/linux/nfsd4_block.h
-new file mode 100644
-index 0000000..b0d5177
---- /dev/null
-+++ b/include/linux/nfsd4_block.h
-@@ -0,0 +1,101 @@
-+#ifndef NFSD4_BLOCK
-+#define NFSD4_BLOCK
-+
-+#include <linux/sunrpc/svc.h>
-+#include <linux/sunrpc/svcauth.h>
-+#include <linux/nfsd/nfsfh.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+
-+#define PNFS_BLOCK_SUCCESS		1
-+#define PNFS_BLOCK_FAILURE		0
-+
-+#define PNFS_BLOCK_CTL_START		1
-+#define PNFS_BLOCK_CTL_STOP		2
-+#define PNFS_BLOCK_CTL_VERS		3 /* Allows daemon to request current
-+					   * version from kernel via an upcall.
-+					   */
++typedef struct pnfs_blocklayout_layout {
++        struct list_head                bll_list;
++        struct nfsd4_pnfs_deviceid      bll_vol_id;
++        u64                             bll_foff;	// file offset
++        u64                             bll_len;
++        u64                             bll_soff;	// storage offset
++	int				bll_recalled;
++        enum pnfs_block_extent_state4   bll_es;
++	enum bl_cache_state		bll_cache_state;
++} pnfs_blocklayout_layout_t;
 +
-+#define PNFS_UPCALL_MSG_STOP	0
-+#define PNFS_UPCALL_MSG_GETSIG	1
-+#define PNFS_UPCALL_MSG_GETSLICE	2
-+#define PNFS_UPCALL_MSG_DMCHK	3	// See if dev_t is a DM volume
-+#define PNFS_UPCALL_MSG_DMGET	4
-+#define PNFS_UPCALL_MSG_VERS	5
++typedef struct pnfs_blocklayout_devinfo {
++        struct list_head                bld_list;
++        pnfs_block_volume_type4         bld_type;
++        struct nfsd4_pnfs_deviceid      bld_devid;
++        int                             bld_index_loc;
++        union {
++                struct {
++                        u64             bld_offset;
++                        u32             bld_sig_len,
++                                        *bld_sig;
++                } simple;
++                struct {
++                        u64             bld_start,
++                                        bld_len;
++                        u32             bld_index;      /* Index of Simple Volume */
++                } slice;
++                struct {
++                        u32             bld_stripes;
++                        u64             bld_chunk_size;
++                        u32             *bld_stripe_indexs;
++                } stripe;
++        } u;
++} pnfs_blocklayout_devinfo_t;
 +
-+#define PNFS_UPCALL_VERS		8
++#endif /* NFSD_NFS4LAYOUTXDR_H */
+diff -up linux-2.6.37.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.37.noarch/include/linux/nfsd/nfs4pnfsdlm.h
+--- linux-2.6.37.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig	2011-01-28 09:43:53.389765732 -0500
++++ linux-2.6.37.noarch/include/linux/nfsd/nfs4pnfsdlm.h	2011-01-28 09:43:53.389765732 -0500
+@@ -0,0 +1,54 @@
++/******************************************************************************
++ *
++ * (c) 2007 Network Appliance, Inc.  All Rights Reserved.
++ * (c) 2009 NetApp.  All Rights Reserved.
++ *
++ * NetApp provides this source code under the GPL v2 License.
++ * The GPL v2 license is available at
++ * http://opensource.org/licenses/gpl-license.php.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ ******************************************************************************/
++#include <linux/genhd.h>
 +
-+typedef struct stripe_dev {
-+	int	major,
-+		minor,
-+		offset;
-+} stripe_dev_t;
++/*
++ * Length of comma separated pnfs data server IPv4 addresses. Enough room for
++ * 32 addresses.
++ */
++#define NFSD_DLM_DS_LIST_MAX   512
++/*
++ * Length of colon separated pnfs dlm device of the form
++ * disk_name:comma separated data server IPv4 address
++ */
++#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1)
 +
-+typedef struct bl_comm_res {
-+	int				res_status;
-+	union {
-+		struct {
-+			long long	start,
-+					length;
-+		} slice;
-+		struct {
-+			int		num_stripes,
-+					stripe_size;
-+			stripe_dev_t	devs[];
-+		} stripe;
-+		struct {
-+			long long	sector;
-+			int		offset,
-+					len;
-+			char		sig[];
-+		} sig;
-+		int			vers,
-+					dm_vol;
-+	} u;
-+} bl_comm_res_t;
++#ifdef CONFIG_PNFSD
 +
-+typedef struct bl_comm_msg {
-+	int		msg_type,
-+			msg_status;
-+	union {
-+		dev_t	msg_dev;
-+		int	msg_vers;
-+	} u;
-+	bl_comm_res_t	*msg_res;
-+} bl_comm_msg_t;
++/* For use by DLM cluster file systems exported by pNFSD */
++extern const struct pnfs_export_operations pnfs_dlm_export_ops;
 +
-+#ifdef __KERNEL__
++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len);
 +
-+typedef struct bl_comm {
-+	/* ---- protects access to this structure ---- */
-+	struct mutex		lock;
-+	/* ---- protects access to rpc pipe ---- */
-+	struct mutex		pipe_lock;
-+	struct dentry		*pipe_dentry;
-+	wait_queue_head_t	pipe_wq;
-+	bl_comm_msg_t		msg;
-+} bl_comm_t;
++void nfsd4_pnfs_dlm_shutdown(void);
 +
-+int pnfs_block_enabled(struct inode *, int);
-+int bl_layout_type(struct super_block *sb);
-+int bl_getdeviceiter(struct super_block *, u32 layout_type,
-+		     struct nfsd4_pnfs_dev_iter_res *);
-+int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *,
-+		     u32 layout_type,
-+		     const struct nfsd4_pnfs_deviceid *);
-+enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *,
-+			   const struct nfsd4_pnfs_layoutget_arg *,
-+			   struct nfsd4_pnfs_layoutget_res *);
-+int bl_layoutcommit(struct inode *,
-+		    const struct nfsd4_pnfs_layoutcommit_arg *,
-+		    struct nfsd4_pnfs_layoutcommit_res *);
-+int bl_layoutreturn(struct inode *,
-+		    const struct nfsd4_pnfs_layoutreturn_arg *);
-+int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len);
-+int bl_init_proc(void);
-+int bl_upcall(bl_comm_t *, bl_comm_msg_t *, bl_comm_res_t **);
++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen);
 +
-+extern bl_comm_t	*bl_comm_global;	// Ugly...
-+#endif /* __KERNEL__ */
++#else /* CONFIG_PNFSD */
 +
-+#endif /* NFSD4_BLOCK */
++static inline void nfsd4_pnfs_dlm_shutdown(void)
++{
++	return;
++}
 +
-diff --git a/include/linux/nfsd4_spnfs.h b/include/linux/nfsd4_spnfs.h
-new file mode 100644
-index 0000000..ea828e4
---- /dev/null
-+++ b/include/linux/nfsd4_spnfs.h
-@@ -0,0 +1,345 @@
++#endif /* CONFIG_PNFSD */
+diff -up linux-2.6.37.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.37.noarch/include/linux/nfsd/nfsd4_pnfs.h
+--- linux-2.6.37.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig	2011-01-28 09:43:53.390765631 -0500
++++ linux-2.6.37.noarch/include/linux/nfsd/nfsd4_pnfs.h	2011-01-28 09:43:53.390765631 -0500
+@@ -0,0 +1,273 @@
 +/*
-+ * include/linux/nfsd4_spnfs.h
++ *  Copyright (c) 2006 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Andy Adamson <andros at umich.edu>
 + *
-+ * spNFS - simple pNFS implementation with userspace daemon
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 + *
 + */
 +
-+/******************************************************************************
-+
-+(c) 2007 Network Appliance, Inc.  All Rights Reserved.
-+
-+Network Appliance provides this source code under the GPL v2 License.
-+The GPL v2 license is available at
-+http://opensource.org/licenses/gpl-license.php.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++#ifndef _LINUX_NFSD_NFSD4_PNFS_H
++#define _LINUX_NFSD_NFSD4_PNFS_H
 +
-+******************************************************************************/
++#include <linux/exportfs.h>
++#include <linux/exp_xdr.h>
++#include <linux/nfs_xdr.h>
++#include <linux/nfsd/export.h>
 +
-+#ifndef NFS_SPNFS_H
-+#define NFS_SPNFS_H
++struct nfsd4_pnfs_deviceid {
++	u64	sbid;			/* per-superblock unique ID */
++	u64	devid;			/* filesystem-wide unique device ID */
++};
 +
++struct nfsd4_pnfs_dev_iter_res {
++	u64		gd_cookie;	/* request/response */
++	u64		gd_verf;	/* request/response */
++	u64		gd_devid;	/* response */
++	u32		gd_eof;		/* response */
++};
 +
-+#ifdef __KERNEL__
-+#include "exportfs.h"
-+#include "sunrpc/svc.h"
-+#include "nfsd/nfsfh.h"
-+#else
-+#include <sys/types.h>
-+#endif /* __KERNEL__ */
++/* Arguments for set_device_notify */
++struct pnfs_devnotify_arg {
++	struct nfsd4_pnfs_deviceid dn_devid;	/* request */
++	u32 dn_layout_type;			/* request */
++	u32 dn_notify_types;			/* request/response */
++};
 +
-+#define SPNFS_STATUS_INVALIDMSG		0x01
-+#define SPNFS_STATUS_AGAIN		0x02
-+#define SPNFS_STATUS_FAIL		0x04
-+#define SPNFS_STATUS_SUCCESS		0x08
++struct nfsd4_layout_seg {
++	u64	clientid;
++	u32	layout_type;
++	u32	iomode;
++	u64	offset;
++	u64	length;
++};
 +
-+#define SPNFS_TYPE_LAYOUTGET		0x01
-+#define SPNFS_TYPE_LAYOUTCOMMIT		0x02
-+#define SPNFS_TYPE_LAYOUTRETURN		0x03
-+#define SPNFS_TYPE_GETDEVICEITER	0x04
-+#define SPNFS_TYPE_GETDEVICEINFO	0x05
-+#define SPNFS_TYPE_SETATTR		0x06
-+#define SPNFS_TYPE_OPEN			0x07
-+#define	SPNFS_TYPE_CLOSE		0x08
-+#define SPNFS_TYPE_CREATE		0x09
-+#define SPNFS_TYPE_REMOVE		0x0a
-+#define SPNFS_TYPE_COMMIT		0x0b
-+#define SPNFS_TYPE_READ			0x0c
-+#define SPNFS_TYPE_WRITE		0x0d
++/* Used by layout_get to encode layout (loc_body var in spec)
++ * Args:
++ * minlength - min number of accessible bytes given by layout
++ * fsid - Major part of struct pnfs_deviceid.  File system uses this
++ * to build the deviceid returned in the layout.
++ * fh - fs can modify the file handle for use on data servers
++ * seg - layout info requested and layout info returned
++ * xdr - xdr info
++ * return_on_close - true if layout to be returned on file close
++ */
 +
-+#define	SPNFS_MAX_DEVICES		1
-+#define	SPNFS_MAX_DATA_SERVERS		16
-+#define SPNFS_MAX_IO			512
++struct nfsd4_pnfs_layoutget_arg {
++	u64			lg_minlength;
++	u64			lg_sbid;
++	const struct knfsd_fh	*lg_fh;
++};
 +
-+/* layout */
-+struct spnfs_msg_layoutget_args {
-+	unsigned long inode;
-+	unsigned long generation;
++struct nfsd4_pnfs_layoutget_res {
++	struct nfsd4_layout_seg	lg_seg;	/* request/response */
++	u32			lg_return_on_close;
 +};
 +
-+struct spnfs_filelayout_list {
-+	u_int32_t       fh_len;
-+	unsigned char   fh_val[128]; /* DMXXX fix this const */
++struct nfsd4_pnfs_layoutcommit_arg {
++	struct nfsd4_layout_seg	lc_seg;		/* request */
++	u32			lc_reclaim;	/* request */
++	u32			lc_newoffset;	/* request */
++	u64			lc_last_wr;	/* request */
++	struct nfstime4		lc_mtime;	/* request */
++	u32			lc_up_len;	/* layout length */
++	void			*lc_up_layout;	/* decoded by callback */
 +};
 +
-+struct spnfs_msg_layoutget_res {
-+	int status;
-+	u_int64_t devid;
-+	u_int64_t stripe_size;
-+	u_int32_t stripe_type;
-+	u_int32_t stripe_count;
-+	struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS];
++struct nfsd4_pnfs_layoutcommit_res {
++	u32			lc_size_chg;	/* boolean for response */
++	u64			lc_newsize;	/* response */
 +};
 +
-+/* layoutcommit */
-+struct spnfs_msg_layoutcommit_args {
-+	unsigned long inode;
-+	unsigned long generation;
-+	u_int64_t file_size;
++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */
++
++struct nfsd4_pnfs_layoutreturn_arg {
++	u32			lr_return_type;	/* request */
++	struct nfsd4_layout_seg	lr_seg;		/* request */
++	u32			lr_reclaim;	/* request */
++	u32			lrf_body_len;	/* request */
++	void			*lrf_body;	/* request */
++	void			*lr_cookie;	/* fs private */
 +};
 +
-+struct spnfs_msg_layoutcommit_res {
-+	int status;
++/* pNFS Metadata to Data server state communication */
++struct pnfs_get_state {
++	u32			dsid;    /* request */
++	u64			ino;      /* request */
++	nfs4_stateid		stid;     /* request;response */
++	nfs4_clientid		clid;     /* response */
++	u32			access;    /* response */
++	u32			stid_gen;    /* response */
++	u32			verifier[2]; /* response */
 +};
 +
-+/* layoutreturn */
-+/* No op for the daemon */
 +/*
-+struct spnfs_msg_layoutreturn_args {
-+};
++ * pNFS export operations vector.
++ *
++ * The filesystem must implement the following methods:
++ *   layout_type
++ *   get_device_info
++ *   layout_get
++ *
++ * All other methods are optional and can be set to NULL if not implemented.
++ */
++struct pnfs_export_operations {
++	/* Returns the supported pnfs_layouttype4. */
++	int (*layout_type) (struct super_block *);
 +
-+struct spnfs_msg_layoutreturn_res {
-+};
-+*/
++	/* Encode device info onto the xdr stream. */
++	int (*get_device_info) (struct super_block *,
++				struct exp_xdr_stream *,
++				u32 layout_type,
++				const struct nfsd4_pnfs_deviceid *);
 +
-+/* getdeviceiter */
-+struct spnfs_msg_getdeviceiter_args {
-+	unsigned long inode;
-+	u_int64_t cookie;
-+	u_int64_t verf;
-+};
++	/* Retrieve all available devices via an iterator.
++	 * arg->cookie == 0 indicates the beginning of the list,
++	 * otherwise arg->verf is used to verify that the list hasn't changed
++	 * while retrieved.
++	 *
++	 * On output, the filesystem sets the devid based on the current cookie
++	 * and sets res->cookie and res->verf corresponding to the next entry.
++	 * When the last entry in the list is retrieved, res->eof is set to 1.
++	 */
++	int (*get_device_iter) (struct super_block *,
++				u32 layout_type,
++				struct nfsd4_pnfs_dev_iter_res *);
 +
-+struct spnfs_msg_getdeviceiter_res {
-+	int status;
-+	u_int64_t devid;
-+	u_int64_t cookie;
-+	u_int64_t verf;
-+	u_int32_t eof;
-+};
++	int (*set_device_notify) (struct super_block *,
++				  struct pnfs_devnotify_arg *);
++
++	/* Retrieve and encode a layout for inode onto the xdr stream.
++	 * arg->minlength is the minimum number of accessible bytes required
++	 *   by the client.
++	 * The maximum number of bytes to encode the layout is given by
++	 *   the xdr stream end pointer.
++	 * arg->fsid contains the major part of struct pnfs_deviceid.
++	 *   The file system uses this to build the deviceid returned
++	 *   in the layout.
++	 * res->seg - layout segment requested and layout info returned.
++	 * res->fh - fs can modify the file handle for use on data servers
++	 * res->return_on_close - true if layout to be returned on file close
++	 *
++	 * return one of the following nfs errors:
++	 * NFS_OK			Success
++	 * NFS4ERR_ACCESS		Permission error
++	 * NFS4ERR_BADIOMODE		Server does not support requested iomode
++	 * NFS4ERR_BADLAYOUT		No layout matching loga_minlength rules
++	 * NFS4ERR_INVAL		Parameter other than layout is invalid
++	 * NFS4ERR_IO			I/O error
++	 * NFS4ERR_LAYOUTTRYLATER	Layout may be retrieved later
++	 * NFS4ERR_LAYOUTUNAVAILABLE	Layout unavailable for this file
++	 * NFS4ERR_LOCKED		Lock conflict
++	 * NFS4ERR_NOSPC		Out-of-space error occurred
++	 * NFS4ERR_RECALLCONFLICT	Layout currently unavailable due to
++	 *				a conflicting CB_LAYOUTRECALL
++	 * NFS4ERR_SERVERFAULT		Server went berserk
++	 * NFS4ERR_TOOSMALL		loga_maxcount too small to fit layout
++	 * NFS4ERR_WRONG_TYPE		Wrong file type (not a regular file)
++	 */
++	enum nfsstat4 (*layout_get) (struct inode *,
++				     struct exp_xdr_stream *xdr,
++				     const struct nfsd4_pnfs_layoutget_arg *,
++				     struct nfsd4_pnfs_layoutget_res *);
 +
-+/* getdeviceinfo */
-+struct spnfs_data_server {
-+	u_int32_t dsid;
-+	char netid[5];
-+	char addr[29];
-+};
++	/* Commit changes to layout */
++	int (*layout_commit) (struct inode *,
++			      const struct nfsd4_pnfs_layoutcommit_arg *,
++			      struct nfsd4_pnfs_layoutcommit_res *);
 +
-+struct spnfs_device {
-+	u_int64_t devid;
-+	int dscount;
-+	struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS];
-+};
++	/* Returns the layout */
++	int (*layout_return) (struct inode *,
++			      const struct nfsd4_pnfs_layoutreturn_arg *);
 +
-+struct spnfs_msg_getdeviceinfo_args {
-+	u_int64_t devid;
-+};
++	/* Can layout segments be merged for this layout type? */
++	int (*can_merge_layouts) (u32 layout_type);
 +
-+struct spnfs_msg_getdeviceinfo_res {
-+	int status;
-+	struct spnfs_device devinfo;
-+};
++	/* pNFS Files layout specific operations */
 +
-+/* setattr */
-+struct spnfs_msg_setattr_args {
-+	unsigned long inode;
-+	unsigned long generation;
-+	int file_size;
++	/* Get the write verifier for DS (called on MDS only) */
++	void (*get_verifier) (struct super_block *, u32 *p);
++	/* Call fs on DS only */
++	int (*get_state) (struct inode *, struct knfsd_fh *,
++			  struct pnfs_get_state *);
 +};
 +
-+struct spnfs_msg_setattr_res {
-+	int status;
++struct nfsd4_pnfs_cb_layout {
++	u32			cbl_recall_type;	/* request */
++	struct nfsd4_layout_seg cbl_seg;		/* request */
++	u32			cbl_layoutchanged;	/* request */
++	nfs4_stateid		cbl_sid;		/* request */
++	struct nfs4_fsid	cbl_fsid;
++	void			*cbl_cookie;		/* fs private */
 +};
 +
-+/* open */
-+struct spnfs_msg_open_args {
-+	unsigned long inode;
-+	unsigned long generation;
-+	int create;
-+	int createmode;
-+	int truncate;
-+};
++/* layoutrecall request (from exported filesystem) */
++struct nfs4_layoutrecall {
++	struct kref			clr_ref;
++	struct nfsd4_pnfs_cb_layout	cb;	/* request */
++	struct list_head		clr_perclnt; /* on cl_layoutrecalls */
++	struct nfs4_client	       *clr_client;
++	struct nfs4_file	       *clr_file;
++	struct timespec			clr_time;	/* last activity */
++	struct super_block 		*clr_sb; /* We might not have a file */
++	struct nfs4_layoutrecall	*parent; /* The initiating recall */
 +
-+struct spnfs_msg_open_res {
-+	int status;
++	/* nfsd internal */
++	struct nfsd4_callback		clr_recall;
 +};
 +
-+/* close */
-+/* No op for daemon */
-+struct spnfs_msg_close_args {
-+	int x;
++struct nfsd4_pnfs_cb_dev_item {
++	u32			cbd_notify_type;	/* request */
++	u32			cbd_layout_type;	/* request */
++	struct nfsd4_pnfs_deviceid cbd_devid;		/* request */
++	u32			cbd_immediate;		/* request */
 +};
 +
-+struct spnfs_msg_close_res {
-+	int y;
++struct nfsd4_pnfs_cb_dev_list {
++	u32				cbd_len;  /* request */
++	struct nfsd4_pnfs_cb_dev_item  *cbd_list; /* request */
 +};
 +
-+/* create */
 +/*
-+struct spnfs_msg_create_args {
-+	int x;
++ * callbacks provided by the nfsd
++ */
++struct pnfsd_cb_operations {
++	/* Generic callbacks */
++	int (*cb_layout_recall) (struct super_block *, struct inode *,
++				 struct nfsd4_pnfs_cb_layout *);
++	int (*cb_device_notify) (struct super_block *,
++				 struct nfsd4_pnfs_cb_dev_list *);
++
++	/* pNFS Files layout specific callbacks */
++
++	/* Callback from fs on MDS only */
++	int (*cb_get_state) (struct super_block *, struct pnfs_get_state *);
++	/* Callback from fs on DS only */
++	int (*cb_change_state) (struct pnfs_get_state *);
 +};
 +
-+struct spnfs_msg_create_res {
-+	int y;
++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */
+diff -up linux-2.6.37.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.37.noarch/include/linux/nfsd/syscall.h
+--- linux-2.6.37.noarch/include/linux/nfsd/syscall.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/include/linux/nfsd/syscall.h	2011-01-28 09:43:53.391765532 -0500
+@@ -29,6 +29,7 @@
+ /*#define NFSCTL_GETFH		6	/ * get an fh by ino DISCARDED */
+ #define NFSCTL_GETFD		7	/* get an fh by path (used by mountd) */
+ #define	NFSCTL_GETFS		8	/* get an fh by path with max FH len */
++#define	NFSCTL_FD2FH		9	/* get a fh from a fd */
+ 
+ /* SVC */
+ struct nfsctl_svc {
+@@ -71,6 +72,11 @@ struct nfsctl_fsparm {
+ 	int			gd_maxlen;
+ };
+ 
++/* FD2FH */
++struct nfsctl_fd2fh {
++	int			fd;
 +};
-+*/
 +
-+/* remove */
-+struct spnfs_msg_remove_args {
-+	unsigned long inode;
-+	unsigned long generation;
+ /*
+  * This is the argument union.
+  */
+@@ -82,6 +88,7 @@ struct nfsctl_arg {
+ 		struct nfsctl_export	u_export;
+ 		struct nfsctl_fdparm	u_getfd;
+ 		struct nfsctl_fsparm	u_getfs;
++		struct nfsctl_fd2fh	u_fd2fh;
+ 		/*
+ 		 * The following dummy member is needed to preserve binary compatibility
+ 		 * on platforms where alignof(void*)>alignof(int).  It's needed because
+@@ -95,6 +102,7 @@ struct nfsctl_arg {
+ #define ca_export	u.u_export
+ #define ca_getfd	u.u_getfd
+ #define	ca_getfs	u.u_getfs
++#define	ca_fd2fh	u.u_fd2fh
+ };
+ 
+ union nfsctl_res {
+diff -up linux-2.6.37.noarch/include/linux/nfs_fs.h.orig linux-2.6.37.noarch/include/linux/nfs_fs.h
+--- linux-2.6.37.noarch/include/linux/nfs_fs.h.orig	2011-01-28 09:37:32.863968809 -0500
++++ linux-2.6.37.noarch/include/linux/nfs_fs.h	2011-01-28 09:43:53.381766626 -0500
+@@ -190,6 +190,8 @@ struct nfs_inode {
+ 	struct rw_semaphore	rwsem;
+ 
+ 	/* pNFS layout information */
++	struct rpc_wait_queue lo_rpcwaitq;
++	struct rpc_wait_queue	lo_rpcwaitq_stateid;
+ 	struct pnfs_layout_hdr *layout;
+ #endif /* CONFIG_NFS_V4*/
+ #ifdef CONFIG_NFS_FSCACHE
+@@ -499,8 +501,12 @@ extern int  nfs_sillyrename(struct inode
+ extern int  nfs_congestion_kb;
+ extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
+ extern int  nfs_writepages(struct address_space *, struct writeback_control *);
+-extern int  nfs_flush_incompatible(struct file *file, struct page *page);
+-extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
++struct pnfs_layout_segment;
++extern int  nfs_flush_incompatible(struct file *file, struct page *page,
++				   struct pnfs_layout_segment *lseg);
++extern int  nfs_updatepage(struct file *, struct page *,
++			   unsigned int offset, unsigned int count,
++			   struct pnfs_layout_segment *lseg, void *fsdata);
+ extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
+ 
+ /*
+diff -up linux-2.6.37.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.37.noarch/include/linux/nfs_fs_sb.h
+--- linux-2.6.37.noarch/include/linux/nfs_fs_sb.h.orig	2011-01-28 09:37:32.863968809 -0500
++++ linux-2.6.37.noarch/include/linux/nfs_fs_sb.h	2011-01-28 09:43:53.382766505 -0500
+@@ -79,6 +79,12 @@ struct nfs_client {
+ 	u32			cl_exchange_flags;
+ 	struct nfs4_session	*cl_session; 	/* sharred session */
+ 	struct list_head	cl_layouts;
++	atomic_t		cl_recall_count; /* no. of lsegs in recall */
++	struct list_head	cl_layoutrecalls;
++	unsigned long		cl_cb_lrecall_count;
++#define PNFS_MAX_CB_LRECALLS (64)
++	atomic_t		*cl_drain_notification[PNFS_MAX_CB_LRECALLS];
++	struct rpc_wait_queue	cl_rpcwaitq_recall;
+ 	struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
+ #endif /* CONFIG_NFS_V4_1 */
+ 
+@@ -87,6 +93,16 @@ struct nfs_client {
+ #endif
+ };
+ 
++static inline bool
++is_ds_only_client(struct nfs_client *clp)
++{
++#ifdef CONFIG_NFS_V4_1
++	return is_ds_only_session(clp->cl_exchange_flags);
++#else
++	return false;
++#endif
++}
++
+ /*
+  * NFS client parameters stored in the superblock.
+  */
+@@ -132,7 +148,7 @@ struct nfs_server {
+ #endif
+ 
+ #ifdef CONFIG_NFS_V4
+-	u32			attr_bitmask[2];/* V4 bitmask representing the set
++	u32			attr_bitmask[3];/* V4 bitmask representing the set
+ 						   of attributes supported on this
+ 						   filesystem */
+ 	u32			cache_consistency_bitmask[2];
+@@ -144,7 +160,10 @@ struct nfs_server {
+ 						   that are supported on this
+ 						   filesystem */
+ 	struct pnfs_layoutdriver_type  *pnfs_curr_ld; /* Active layout driver */
+-	struct rpc_wait_queue	roc_rpcwaitq;
++	void			       *pnfs_ld_data; /* Per-mount data */
++	unsigned int			ds_rsize;  /* Data server read size */
++	unsigned int			ds_wsize;  /* Data server write size */
++	u32				pnfs_blksize; /* layout_blksize attr */
+ 
+ 	/* the following fields are protected by nfs_client->cl_lock */
+ 	struct rb_root		state_owners;
+diff -up linux-2.6.37.noarch/include/linux/nfs_iostat.h.orig linux-2.6.37.noarch/include/linux/nfs_iostat.h
+--- linux-2.6.37.noarch/include/linux/nfs_iostat.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/include/linux/nfs_iostat.h	2011-01-28 09:43:53.384766270 -0500
+@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters {
+ 	NFSIOS_SHORTREAD,
+ 	NFSIOS_SHORTWRITE,
+ 	NFSIOS_DELAY,
++	NFSIOS_PNFS_READ,
++	NFSIOS_PNFS_WRITE,
++	NFSIOS_PNFS_COMMIT,
+ 	__NFSIOS_COUNTSMAX,
+ };
+ 
+diff -up linux-2.6.37.noarch/include/linux/nfs_page.h.orig linux-2.6.37.noarch/include/linux/nfs_page.h
+--- linux-2.6.37.noarch/include/linux/nfs_page.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/include/linux/nfs_page.h	2011-01-28 09:43:53.385766158 -0500
+@@ -49,6 +49,7 @@ struct nfs_page {
+ 	struct kref		wb_kref;	/* reference count */
+ 	unsigned long		wb_flags;
+ 	struct nfs_writeverf	wb_verf;	/* Commit cookie */
++	struct pnfs_layout_segment *wb_lseg;	/* Pnfs layout info */
+ };
+ 
+ struct nfs_pageio_descriptor {
+@@ -62,6 +63,11 @@ struct nfs_pageio_descriptor {
+ 	int			(*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int);
+ 	int 			pg_ioflags;
+ 	int			pg_error;
++	struct pnfs_layout_segment *pg_lseg;
++#ifdef CONFIG_NFS_V4_1
++	int			pg_iswrite;
++	int			(*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
++#endif /* CONFIG_NFS_V4_1 */
+ };
+ 
+ #define NFS_WBACK_BUSY(req)	(test_bit(PG_BUSY,&(req)->wb_flags))
+@@ -70,13 +76,15 @@ extern	struct nfs_page *nfs_create_reque
+ 					    struct inode *inode,
+ 					    struct page *page,
+ 					    unsigned int offset,
+-					    unsigned int count);
++					    unsigned int count,
++					    struct pnfs_layout_segment *lseg);
+ extern	void nfs_clear_request(struct nfs_page *req);
+ extern	void nfs_release_request(struct nfs_page *req);
+ 
+ 
+ extern	int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
+-			  pgoff_t idx_start, unsigned int npages, int tag);
++			  pgoff_t idx_start, unsigned int npages, int tag,
++			  int *use_pnfs);
+ extern	void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
+ 			     struct inode *inode,
+ 			     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
+diff -up linux-2.6.37.noarch/include/linux/nfs_xdr.h.orig linux-2.6.37.noarch/include/linux/nfs_xdr.h
+--- linux-2.6.37.noarch/include/linux/nfs_xdr.h.orig	2011-01-28 09:37:32.864968775 -0500
++++ linux-2.6.37.noarch/include/linux/nfs_xdr.h	2011-01-28 09:43:53.387765940 -0500
+@@ -3,6 +3,8 @@
+ 
+ #include <linux/nfsacl.h>
+ #include <linux/nfs3.h>
++#include <linux/nfs4.h>
++#include <linux/sunrpc/sched.h>
+ 
+ /*
+  * To change the maximum rsize and wsize supported by the NFS client, adjust
+@@ -10,7 +12,7 @@
+  * support a megabyte or more.  The default is left at 4096 bytes, which is
+  * reasonable for NFS over UDP.
+  */
+-#define NFS_MAX_FILE_IO_SIZE	(1048576U)
++#define NFS_MAX_FILE_IO_SIZE	(4U * 1048576U)
+ #define NFS_DEF_FILE_IO_SIZE	(4096U)
+ #define NFS_MIN_FILE_IO_SIZE	(1024U)
+ 
+@@ -115,6 +117,7 @@ struct nfs_fsinfo {
+ 	struct timespec		time_delta; /* server time granularity */
+ 	__u32			lease_time; /* in seconds */
+ 	__u32			layouttype; /* supported pnfs layout driver */
++	__u32			blksize; /* preferred pnfs io block size */
+ };
+ 
+ struct nfs_fsstat {
+@@ -226,6 +229,73 @@ struct nfs4_layoutget {
+ 	struct pnfs_layout_segment **lsegpp;
+ };
+ 
++struct nfs4_layoutcommit_args {
++	nfs4_stateid stateid;
++	__u64 lastbytewritten;
++	__u32 time_modify_changed;
++	struct timespec time_modify;
++	const u32 *bitmask;
++	struct nfs_fh *fh;
++	struct inode *inode;
++
++	/* Values set by layout driver */
++	struct pnfs_layout_range range;
++	__u32 layout_type;
++	void *layoutdriver_data;
++	struct nfs4_sequence_args seq_args;
 +};
 +
-+struct spnfs_msg_remove_res {
++struct nfs4_layoutcommit_res {
++	__u32 sizechanged;
++	__u64 newsize;
++	struct nfs_fattr *fattr;
++	const struct nfs_server *server;
++	struct nfs4_sequence_res seq_res;
 +	int status;
 +};
 +
-+/* commit */
-+/*
-+struct spnfs_msg_commit_args {
-+	int x;
-+};
-+
-+struct spnfs_msg_commit_res {
-+	int y;
++struct nfs4_layoutcommit_data {
++	struct rpc_task task;
++	struct rpc_cred *cred;
++	struct nfs_fattr fattr;
++	struct nfs4_layoutcommit_args args;
++	struct nfs4_layoutcommit_res res;
 +};
-+*/
 +
-+/* read */
-+struct spnfs_msg_read_args {
-+	unsigned long inode;
-+	unsigned long generation;
-+	loff_t offset;
-+	unsigned long len;
++struct nfs4_layoutreturn_args {
++	__u32   reclaim;
++	__u32   layout_type;
++	__u32   return_type;
++	struct pnfs_layout_range range;
++	struct inode *inode;
++	struct nfs4_sequence_args seq_args;
 +};
 +
-+struct spnfs_msg_read_res {
-+	int status;
-+	char data[SPNFS_MAX_IO];
++struct nfs4_layoutreturn_res {
++	struct nfs4_sequence_res seq_res;
++	u32 lrs_present;
++	nfs4_stateid stateid;
 +};
 +
-+/* write */
-+struct spnfs_msg_write_args {
-+	unsigned long inode;
-+	unsigned long generation;
-+	loff_t offset;
-+	unsigned long len;
-+	char data[SPNFS_MAX_IO];
++struct nfs4_layoutreturn {
++	struct nfs4_layoutreturn_args args;
++	struct nfs4_layoutreturn_res res;
++	struct rpc_cred *cred;
++	struct nfs_client *clp;
++	int rpc_status;
 +};
 +
-+struct spnfs_msg_write_res {
-+	int status;
++struct nfs4_getdevicelist_args {
++	const struct nfs_fh *fh;
++	u32 layoutclass;
++	struct nfs4_sequence_args seq_args;
 +};
 +
-+/* bundle args and responses */
-+union spnfs_msg_args {
-+	struct spnfs_msg_layoutget_args		layoutget_args;
-+	struct spnfs_msg_layoutcommit_args	layoutcommit_args;
-+/*
-+	struct spnfs_msg_layoutreturn_args	layoutreturn_args;
-+*/
-+	struct spnfs_msg_getdeviceiter_args     getdeviceiter_args;
-+	struct spnfs_msg_getdeviceinfo_args     getdeviceinfo_args;
-+	struct spnfs_msg_setattr_args		setattr_args;
-+	struct spnfs_msg_open_args		open_args;
-+	struct spnfs_msg_close_args		close_args;
-+/*
-+	struct spnfs_msg_create_args		create_args;
-+*/
-+	struct spnfs_msg_remove_args		remove_args;
-+/*
-+	struct spnfs_msg_commit_args		commit_args;
-+*/
-+	struct spnfs_msg_read_args		read_args;
-+	struct spnfs_msg_write_args		write_args;
++struct nfs4_getdevicelist_res {
++	struct pnfs_devicelist *devlist;
++	struct nfs4_sequence_res seq_res;
 +};
 +
-+union spnfs_msg_res {
-+	struct spnfs_msg_layoutget_res		layoutget_res;
-+	struct spnfs_msg_layoutcommit_res	layoutcommit_res;
-+/*
-+	struct spnfs_msg_layoutreturn_res	layoutreturn_res;
-+*/
-+	struct spnfs_msg_getdeviceiter_res      getdeviceiter_res;
-+	struct spnfs_msg_getdeviceinfo_res      getdeviceinfo_res;
-+	struct spnfs_msg_setattr_res		setattr_res;
-+	struct spnfs_msg_open_res		open_res;
-+	struct spnfs_msg_close_res		close_res;
-+/*
-+	struct spnfs_msg_create_res		create_res;
-+*/
-+	struct spnfs_msg_remove_res		remove_res;
-+/*
-+	struct spnfs_msg_commit_res		commit_res;
-+*/
-+	struct spnfs_msg_read_res		read_res;
-+	struct spnfs_msg_write_res		write_res;
-+};
+ struct nfs4_getdeviceinfo_args {
+ 	struct pnfs_device *pdev;
+ 	struct nfs4_sequence_args seq_args;
+@@ -889,7 +959,7 @@ struct nfs4_server_caps_arg {
+ };
+ 
+ struct nfs4_server_caps_res {
+-	u32				attr_bitmask[2];
++	u32				attr_bitmask[3];
+ 	u32				acl_bitmask;
+ 	u32				has_links;
+ 	u32				has_symlinks;
+@@ -1004,6 +1074,30 @@ struct nfs_page;
+ 
+ #define NFS_PAGEVEC_SIZE	(8U)
+ 
++#if defined(CONFIG_NFS_V4_1)
 +
-+/* a spnfs message, args and response */
-+struct spnfs_msg {
-+	unsigned char		im_type;
-+	unsigned char		im_status;
-+	union spnfs_msg_args	im_args;
-+	union spnfs_msg_res	im_res;
++/* pnfsflag values */
++enum pnfs_flags {
++	PNFS_NO_RPC = 1 << 0,	/* non rpc result callback switch */
 +};
 +
-+/* spnfs configuration info */
-+struct spnfs_config {
-+	unsigned char		dense_striping;
-+	int			stripe_size;
-+	int			num_ds;
-+	char			ds_dir[SPNFS_MAX_DATA_SERVERS][80];  /* XXX */
++/* pnfs-specific data needed for read, write, and commit calls */
++struct pnfs_call_data {
++	struct pnfs_layout_segment *lseg;
++	const struct rpc_call_ops *call_ops;
++	u32			orig_count;	/* for retry via MDS */
++	int			pnfs_error;
++	u8			pnfsflags;
++	u8			how;		/* for FLUSH_STABLE */
 +};
 +
-+#if defined(__KERNEL__) && defined(CONFIG_SPNFS)
-+
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+
-+/* pipe mgmt structure.  messages flow through here */
-+struct spnfs {
-+	struct dentry		*spnfs_dentry;    /* dentry for pipe */
-+	wait_queue_head_t	spnfs_wq;
-+	struct spnfs_msg	spnfs_im;         /* spnfs message */
-+	struct mutex		spnfs_lock;       /* Serializes upcalls */
-+	struct mutex		spnfs_plock;
++/* files layout-type specific data for read, write, and commit */
++struct pnfs_fl_call_data {
++	struct nfs_client	*ds_nfs_client;
++	__u64			orig_offset;
 +};
++#endif /* CONFIG_NFS_V4_1 */
 +
-+struct nfsd4_open;
-+
-+int spnfs_layout_type(struct super_block *);
-+enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr,
-+			      const struct nfsd4_pnfs_layoutget_arg *,
-+			      struct nfsd4_pnfs_layoutget_res *);
-+int spnfs_layoutcommit(void);
-+int spnfs_layoutreturn(struct inode *,
-+		       const struct nfsd4_pnfs_layoutreturn_arg *);
-+int spnfs_getdeviceiter(struct super_block *,
-+			u32 layout_type,
-+			struct nfsd4_pnfs_dev_iter_res *);
-+int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *,
-+			u32 layout_type,
-+			const struct nfsd4_pnfs_deviceid *);
-+int spnfs_setattr(void);
-+int spnfs_open(struct inode *, struct nfsd4_open *);
-+int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *);
-+int spnfs_remove(unsigned long, unsigned long);
-+__be32 spnfs_read(struct inode *, loff_t, unsigned long *,
-+		  int, struct svc_rqst *);
-+__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *);
-+int spnfs_getfh(int, struct nfs_fh *);
-+int spnfs_test_layoutrecall(char *, u64, u64);
-+int spnfs_layoutrecall(struct inode *, int, u64, u64);
-+
-+int nfsd_spnfs_new(void);
-+void nfsd_spnfs_delete(void);
-+int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *);
-+int spnfs_enabled(void);
-+int spnfs_init_proc(void);
-+
-+extern struct spnfs_config *spnfs_config;
-+
-+#endif /* __KERNEL__ && CONFIG_SPNFS */
-+
-+#endif /* NFS_SPNFS_H */
-diff --git a/include/linux/panfs_shim_api.h b/include/linux/panfs_shim_api.h
-new file mode 100644
-index 0000000..3b44e19
---- /dev/null
-+++ b/include/linux/panfs_shim_api.h
+ struct nfs_read_data {
+ 	int			flags;
+ 	struct rpc_task		task;
+@@ -1019,10 +1113,16 @@ struct nfs_read_data {
+ #ifdef CONFIG_NFS_V4
+ 	unsigned long		timestamp;	/* For lease renewal */
+ #endif
++#if defined(CONFIG_NFS_V4_1)
++	struct pnfs_call_data	pdata;
++	struct pnfs_fl_call_data fldata;
++#endif /* CONFIG_NFS_V4_1 */
+ 	struct page		*page_array[NFS_PAGEVEC_SIZE];
+ };
+ 
+ struct nfs_write_data {
++	struct kref		refcount;	/* For pnfs commit splitting */
++	struct nfs_write_data	*parent;	/* For pnfs commit splitting */
+ 	int			flags;
+ 	struct rpc_task		task;
+ 	struct inode		*inode;
+@@ -1038,6 +1138,10 @@ struct nfs_write_data {
+ #ifdef CONFIG_NFS_V4
+ 	unsigned long		timestamp;	/* For lease renewal */
+ #endif
++#if defined(CONFIG_NFS_V4_1)
++	struct pnfs_call_data	pdata;
++	struct pnfs_fl_call_data fldata;
++#endif /* CONFIG_NFS_V4_1 */
+ 	struct page		*page_array[NFS_PAGEVEC_SIZE];
+ };
+ 
+diff -up linux-2.6.37.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.37.noarch/include/linux/panfs_shim_api.h
+--- linux-2.6.37.noarch/include/linux/panfs_shim_api.h.orig	2011-01-28 09:43:53.395765159 -0500
++++ linux-2.6.37.noarch/include/linux/panfs_shim_api.h	2011-01-28 09:43:53.395765159 -0500
 @@ -0,0 +1,57 @@
 +#ifndef _PANFS_SHIM_API_H
 +#define _PANFS_SHIM_API_H
@@ -28138,11 +26467,9 @@ index 0000000..3b44e19
 +panfs_shim_unregister(void);
 +
 +#endif /* _PANFS_SHIM_API_H */
-diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h
-new file mode 100644
-index 0000000..b404f33
---- /dev/null
-+++ b/include/linux/pnfs_osd_xdr.h
+diff -up linux-2.6.37.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.37.noarch/include/linux/pnfs_osd_xdr.h
+--- linux-2.6.37.noarch/include/linux/pnfs_osd_xdr.h.orig	2011-01-28 09:43:53.397764982 -0500
++++ linux-2.6.37.noarch/include/linux/pnfs_osd_xdr.h	2011-01-28 09:43:53.397764982 -0500
 @@ -0,0 +1,439 @@
 +/*
 + *  pnfs_osd_xdr.h
@@ -28583,10 +26910,9 @@ index 0000000..b404f33
 +pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p);
 +
 +#endif /* __PNFS_OSD_XDR_H__ */
-diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
-index 6760816..fc3d2fc 100644
---- a/include/linux/posix_acl.h
-+++ b/include/linux/posix_acl.h
+diff -up linux-2.6.37.noarch/include/linux/posix_acl.h.orig linux-2.6.37.noarch/include/linux/posix_acl.h
+--- linux-2.6.37.noarch/include/linux/posix_acl.h.orig	2011-01-28 09:37:32.880968218 -0500
++++ linux-2.6.37.noarch/include/linux/posix_acl.h	2011-01-28 09:43:53.398764895 -0500
 @@ -8,6 +8,7 @@
  #ifndef __LINUX_POSIX_ACL_H
  #define __LINUX_POSIX_ACL_H
@@ -28595,10 +26921,9 @@ index 6760816..fc3d2fc 100644
  #include <linux/slab.h>
  
  #define ACL_UNDEFINED_ID	(-1)
-diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
-index 77e6248..1b26fff 100644
---- a/include/linux/sunrpc/msg_prot.h
-+++ b/include/linux/sunrpc/msg_prot.h
+diff -up linux-2.6.37.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.37.noarch/include/linux/sunrpc/msg_prot.h
+--- linux-2.6.37.noarch/include/linux/sunrpc/msg_prot.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/include/linux/sunrpc/msg_prot.h	2011-01-28 09:43:53.399764812 -0500
 @@ -14,6 +14,8 @@
  /* size of an XDR encoding unit in bytes, i.e. 32bit */
  #define XDR_UNIT	(4)
@@ -28608,10 +26933,9 @@ index 77e6248..1b26fff 100644
  /* spec defines authentication flavor as an unsigned 32 bit integer */
  typedef u32	rpc_authflavor_t;
  
-diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
-index cf14db9..2177d50 100644
---- a/include/linux/sunrpc/rpc_pipe_fs.h
-+++ b/include/linux/sunrpc/rpc_pipe_fs.h
+diff -up linux-2.6.37.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.37.noarch/include/linux/sunrpc/rpc_pipe_fs.h
+--- linux-2.6.37.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/include/linux/sunrpc/rpc_pipe_fs.h	2011-01-28 09:43:53.399764812 -0500
 @@ -3,6 +3,7 @@
  
  #ifdef __KERNEL__
@@ -28631,11 +26955,9 @@ index cf14db9..2177d50 100644
  };
  
  struct rpc_pipe_ops {
-diff --git a/include/linux/sunrpc/simple_rpc_pipefs.h b/include/linux/sunrpc/simple_rpc_pipefs.h
-new file mode 100644
-index 0000000..f6a1227
---- /dev/null
-+++ b/include/linux/sunrpc/simple_rpc_pipefs.h
+diff -up linux-2.6.37.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.37.noarch/include/linux/sunrpc/simple_rpc_pipefs.h
+--- linux-2.6.37.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig	2011-01-28 09:43:53.400764729 -0500
++++ linux-2.6.37.noarch/include/linux/sunrpc/simple_rpc_pipefs.h	2011-01-28 09:43:53.400764729 -0500
 @@ -0,0 +1,105 @@
 +/*
 + *  Copyright (c) 2008 The Regents of the University of Michigan.
@@ -28742,11 +27064,10 @@ index 0000000..f6a1227
 +extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg);
 +
 +#endif /* _SIMPLE_RPC_PIPEFS_H_ */
-diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
-index 5f4e18b..f7a0358 100644
---- a/include/linux/sunrpc/svc_xprt.h
-+++ b/include/linux/sunrpc/svc_xprt.h
-@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(const struct sockaddr *addr,
+diff -up linux-2.6.37.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.37.noarch/include/linux/sunrpc/svc_xprt.h
+--- linux-2.6.37.noarch/include/linux/sunrpc/svc_xprt.h.orig	2011-01-28 09:37:32.915967004 -0500
++++ linux-2.6.37.noarch/include/linux/sunrpc/svc_xprt.h	2011-01-28 09:43:53.400764729 -0500
+@@ -205,4 +205,41 @@ static inline char *__svc_print_addr(con
  
  	return buf;
  }
@@ -28788,25 +27109,10 @@ index 5f4e18b..f7a0358 100644
 +	return len;
 +}
  #endif /* SUNRPC_SVC_XPRT_H */
-diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
-index 35cf2e8..bb5f3fd 100644
---- a/include/linux/sunrpc/xdr.h
-+++ b/include/linux/sunrpc/xdr.h
-@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp)
- 	return p + 2;
- }
- 
-+static inline __be32 *
-+xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len)
-+{
-+	memcpy(ptr, p, len);
-+	return p + XDR_QUADLEN(len);
-+}
-+
- /*
-  * Adjust kvec to reflect end of xdr'ed data (RPC client XDR)
-  */
-@@ -197,6 +204,7 @@ struct xdr_stream {
+diff -up linux-2.6.37.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.37.noarch/include/linux/sunrpc/xdr.h
+--- linux-2.6.37.noarch/include/linux/sunrpc/xdr.h.orig	2011-01-28 09:37:32.916966969 -0500
++++ linux-2.6.37.noarch/include/linux/sunrpc/xdr.h	2011-01-28 09:43:53.401764649 -0500
+@@ -213,6 +213,7 @@ typedef int	(*kxdrdproc_t)(void *rqstp, 
  
  extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
  extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
@@ -28814,11 +27120,10 @@ index 35cf2e8..bb5f3fd 100644
  extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
  		unsigned int base, unsigned int len);
  extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
-diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
-index 9d2fca5..e102040 100644
---- a/net/sunrpc/Makefile
-+++ b/net/sunrpc/Makefile
-@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
+diff -up linux-2.6.37.noarch/net/sunrpc/Makefile.orig linux-2.6.37.noarch/net/sunrpc/Makefile
+--- linux-2.6.37.noarch/net/sunrpc/Makefile.orig	2011-01-04 19:50:19.000000000 -0500
++++ linux-2.6.37.noarch/net/sunrpc/Makefile	2011-01-28 09:43:53.402764570 -0500
+@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt
  	    svc.o svcsock.o svcauth.o svcauth_unix.o \
  	    addr.o rpcb_clnt.o timer.o xdr.o \
  	    sunrpc_syms.o cache.o rpc_pipe.o \
@@ -28827,11 +27132,9 @@ index 9d2fca5..e102040 100644
  sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o
  sunrpc-$(CONFIG_PROC_FS) += stats.o
  sunrpc-$(CONFIG_SYSCTL) += sysctl.o
-diff --git a/net/sunrpc/simple_rpc_pipefs.c b/net/sunrpc/simple_rpc_pipefs.c
-new file mode 100644
-index 0000000..24af0a1
---- /dev/null
-+++ b/net/sunrpc/simple_rpc_pipefs.c
+diff -up linux-2.6.37.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.37.noarch/net/sunrpc/simple_rpc_pipefs.c
+--- linux-2.6.37.noarch/net/sunrpc/simple_rpc_pipefs.c.orig	2011-01-28 09:43:53.403764492 -0500
++++ linux-2.6.37.noarch/net/sunrpc/simple_rpc_pipefs.c	2011-01-28 09:43:53.403764492 -0500
 @@ -0,0 +1,423 @@
 +/*
 + *  net/sunrpc/simple_rpc_pipefs.c
@@ -29256,50 +27559,10 @@ index 0000000..24af0a1
 +		kfree(rpcmsg);
 +}
 +EXPORT_SYMBOL(pipefs_generic_destroy_msg);
-diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
-index a1f82a8..98a59f6 100644
---- a/net/sunrpc/xdr.c
-+++ b/net/sunrpc/xdr.c
-@@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
- {
- 	struct kvec *tail;
- 	size_t copy;
--	char *p;
- 	unsigned int pglen = buf->page_len;
-+	unsigned int tailbuf_len;
- 
- 	tail = buf->tail;
- 	BUG_ON (len > pglen);
- 
-+	tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
-+
- 	/* Shift the tail first */
--	if (tail->iov_len != 0) {
--		p = (char *)tail->iov_base + len;
-+	if (tailbuf_len != 0) {
-+		unsigned int free_space = tailbuf_len - tail->iov_len;
-+
-+		if (len < free_space)
-+			free_space = len;
-+		tail->iov_len += free_space;
-+
-+		copy = len;
- 		if (tail->iov_len > len) {
--			copy = tail->iov_len - len;
--			memmove(p, tail->iov_base, copy);
-+			char *p = (char *)tail->iov_base + len;
-+			memmove(p, tail->iov_base, tail->iov_len - len);
- 		} else
--			buf->buflen -= len;
--		/* Copy from the inlined pages into the tail */
--		copy = len;
--		if (copy > tail->iov_len)
- 			copy = tail->iov_len;
-+		/* Copy from the inlined pages into the tail */
- 		_copy_from_pages((char *)tail->iov_base,
- 				buf->pages, buf->page_base + pglen - len,
- 				copy);
-@@ -496,6 +501,27 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
+diff -up linux-2.6.37.noarch/net/sunrpc/xdr.c.orig linux-2.6.37.noarch/net/sunrpc/xdr.c
+--- linux-2.6.37.noarch/net/sunrpc/xdr.c.orig	2011-01-28 09:37:33.428949202 -0500
++++ linux-2.6.37.noarch/net/sunrpc/xdr.c	2011-01-28 09:43:53.404764414 -0500
+@@ -518,6 +518,27 @@ __be32 * xdr_reserve_space(struct xdr_st
  EXPORT_SYMBOL_GPL(xdr_reserve_space);
  
  /**


More information about the scm-commits mailing list