[kernel/rawhide/user/steved/pnfs-rawhide: 61/73] Updated to the latest pNFS tag: pnfs-all-2.6.36-2010-11-03
Steve Dickson
steved at fedoraproject.org
Wed Mar 23 15:25:14 UTC 2011
commit 657e72ee0cc0af1a16253855ec9009830994d1d0
Author: Steve Dickson <steved at redhat.com>
Date: Mon Nov 8 13:08:25 2010 -0500
Updated to the latest pNFS tag: pnfs-all-2.6.36-2010-11-03
Signed-off-by: Steve Dickson <steved at redhat.com>
kernel.spec | 9 +-
...6-f15.patch => pnfs-all-2.6.36-2010-11-03.patch |40685 ++++++++++----------
pnfs-all-2.6.36-rc3-2010-08-30.patch |30007 ---------------
3 files changed, 20396 insertions(+), 50305 deletions(-)
---
diff --git a/kernel.spec b/kernel.spec
index 7f45ae4..83a78a7 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -23,7 +23,7 @@ Summary: The Linux kernel
#
# (Uncomment the '#' and both spaces below to set the buildid.)
#
-%define buildid .pnfs_latest.rc6
+%define buildid .pnfs_all_2010_11_03
###################################################################
# The buildid can also be specified on the rpmbuild command line
@@ -727,7 +727,7 @@ Patch12303: dmar-disable-when-ricoh-multifunction.patch
Patch12305: xhci_hcd-suspend-resume.patch
-Patch30000: pnfs-all-latest.v2.6.36-rc6-f15.patch
+Patch30000: pnfs-all-2.6.36-2010-11-03.patch
Patch30001: linux-2.6-pnfs-compile.patch
Patch30002: linux-2.6.35-inline.patch
@@ -1356,7 +1356,7 @@ ApplyPatch dmar-disable-when-ricoh-multifunction.patch
ApplyPatch xhci_hcd-suspend-resume.patch
-ApplyPatch pnfs-all-latest.v2.6.36-rc6-f15.patch
+ApplyPatch pnfs-all-2.6.36-2010-11-03.patch
ApplyPatch linux-2.6-pnfs-compile.patch
ApplyPatch linux-2.6.35-inline.patch
@@ -2423,6 +2423,9 @@ fi
- Switch to tracking git snapshots of what will become 2.6.37.
- Fix context rejects in utrace and a few other patches.
+* Mon Nov 8 2010 Steve Dickson <steved at redhat.com>
+- Updated to the latest pNFS tag: pnfs-all-2.6.36-2010-11-03
+
* Wed Oct 20 2010 Chuck Ebbert <cebbert at redhat.com> 2.6.36-1
- Linux 2.6.36
diff --git a/pnfs-all-latest.v2.6.36-rc6-f15.patch b/pnfs-all-2.6.36-2010-11-03.patch
similarity index 93%
rename from pnfs-all-latest.v2.6.36-rc6-f15.patch
rename to pnfs-all-2.6.36-2010-11-03.patch
index d55cb8e..852037a 100644
--- a/pnfs-all-latest.v2.6.36-rc6-f15.patch
+++ b/pnfs-all-2.6.36-2010-11-03.patch
@@ -1,47 +1,7 @@
-diff -up linux-2.6.35.noarch/arch/um/drivers/net_kern.c.orig linux-2.6.35.noarch/arch/um/drivers/net_kern.c
---- linux-2.6.35.noarch/arch/um/drivers/net_kern.c.orig 2010-09-30 12:22:11.308040000 -0400
-+++ linux-2.6.35.noarch/arch/um/drivers/net_kern.c 2010-09-30 12:25:08.063271000 -0400
-@@ -255,18 +255,6 @@ static void uml_net_tx_timeout(struct ne
- netif_wake_queue(dev);
- }
-
--static int uml_net_set_mac(struct net_device *dev, void *addr)
--{
-- struct uml_net_private *lp = netdev_priv(dev);
-- struct sockaddr *hwaddr = addr;
--
-- spin_lock_irq(&lp->lock);
-- eth_mac_addr(dev, hwaddr->sa_data);
-- spin_unlock_irq(&lp->lock);
--
-- return 0;
--}
--
- static int uml_net_change_mtu(struct net_device *dev, int new_mtu)
- {
- dev->mtu = new_mtu;
-@@ -373,7 +361,7 @@ static const struct net_device_ops uml_n
- .ndo_start_xmit = uml_net_start_xmit,
- .ndo_set_multicast_list = uml_net_set_multicast_list,
- .ndo_tx_timeout = uml_net_tx_timeout,
-- .ndo_set_mac_address = uml_net_set_mac,
-+ .ndo_set_mac_address = eth_mac_addr,
- .ndo_change_mtu = uml_net_change_mtu,
- .ndo_validate_addr = eth_validate_addr,
- };
-@@ -472,7 +460,8 @@ static void eth_configure(int n, void *i
- ((*transport->user->init)(&lp->user, dev) != 0))
- goto out_unregister;
-
-- eth_mac_addr(dev, device->mac);
-+ /* don't use eth_mac_addr, it will not work here */
-+ memcpy(dev->dev_addr, device->mac, ETH_ALEN);
- dev->mtu = transport->user->mtu;
- dev->netdev_ops = &uml_netdev_ops;
- dev->ethtool_ops = &uml_net_ethtool_ops;
-diff -up linux-2.6.35.noarch/Documentation/filesystems/nfs/00-INDEX.orig linux-2.6.35.noarch/Documentation/filesystems/nfs/00-INDEX
---- linux-2.6.35.noarch/Documentation/filesystems/nfs/00-INDEX.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/Documentation/filesystems/nfs/00-INDEX 2010-09-30 12:25:08.029273000 -0400
+diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
+index 2f68cd6..e474827 100644
+--- a/Documentation/filesystems/nfs/00-INDEX
++++ b/Documentation/filesystems/nfs/00-INDEX
@@ -12,5 +12,7 @@ nfs-rdma.txt
- how to install and setup the Linux NFS/RDMA client and server software
nfsroot.txt
@@ -50,9 +10,11 @@ diff -up linux-2.6.35.noarch/Documentation/filesystems/nfs/00-INDEX.orig linux-2
+ - short explanation of some of the internals of the pnfs client code
rpc-cache.txt
- introduction to the caching mechanisms in the sunrpc layer.
-diff -up linux-2.6.35.noarch/Documentation/filesystems/nfs/pnfs.txt.orig linux-2.6.35.noarch/Documentation/filesystems/nfs/pnfs.txt
---- linux-2.6.35.noarch/Documentation/filesystems/nfs/pnfs.txt.orig 2010-09-30 12:25:08.032273000 -0400
-+++ linux-2.6.35.noarch/Documentation/filesystems/nfs/pnfs.txt 2010-09-30 12:25:08.034271000 -0400
+diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
+new file mode 100644
+index 0000000..bc0b9cf
+--- /dev/null
++++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -0,0 +1,48 @@
+Reference counting in pnfs:
+==========================
@@ -102,9 +64,11 @@ diff -up linux-2.6.35.noarch/Documentation/filesystems/nfs/pnfs.txt.orig linux-2
+file driver devices refer to data servers, which are kept in a module
+level cache. Its reference is held over the lifetime of the deviceid
+pointing to it.
-diff -up linux-2.6.35.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.35.noarch/Documentation/filesystems/spnfs.txt
---- linux-2.6.35.noarch/Documentation/filesystems/spnfs.txt.orig 2010-09-30 12:25:08.036270000 -0400
-+++ linux-2.6.35.noarch/Documentation/filesystems/spnfs.txt 2010-09-30 12:25:08.038270000 -0400
+diff --git a/Documentation/filesystems/spnfs.txt b/Documentation/filesystems/spnfs.txt
+new file mode 100644
+index 0000000..e1d2864
+--- /dev/null
++++ b/Documentation/filesystems/spnfs.txt
@@ -0,0 +1,211 @@
+(c) 2007 Network Appliance Inc.
+
@@ -317,10 +281,11 @@ diff -up linux-2.6.35.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.
+Bugs, enhancements, compliments, complaints to: dmuntz at netapp.com
+
+
-diff -up linux-2.6.35.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.35.noarch/drivers/md/dm-ioctl.c
---- linux-2.6.35.noarch/drivers/md/dm-ioctl.c.orig 2010-09-30 12:22:19.666288000 -0400
-+++ linux-2.6.35.noarch/drivers/md/dm-ioctl.c 2010-09-30 12:25:08.084271000 -0400
-@@ -663,6 +663,12 @@ static int dev_create(struct dm_ioctl *p
+diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
+index 3e39193..92ce1fb 100644
+--- a/drivers/md/dm-ioctl.c
++++ b/drivers/md/dm-ioctl.c
+@@ -663,6 +663,12 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
return 0;
}
@@ -333,7 +298,7 @@ diff -up linux-2.6.35.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.35.noarch/driv
/*
* Always use UUID for lookups if it's present, otherwise use name or dev.
*/
-@@ -758,6 +764,12 @@ static int dev_remove(struct dm_ioctl *p
+@@ -758,6 +764,12 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
return 0;
}
@@ -346,7 +311,7 @@ diff -up linux-2.6.35.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.35.noarch/driv
/*
* Check a string doesn't overrun the chunk of
* memory we copied from userland.
-@@ -937,6 +949,12 @@ static int do_resume(struct dm_ioctl *pa
+@@ -937,6 +949,12 @@ static int do_resume(struct dm_ioctl *param)
return r;
}
@@ -372,10 +337,11 @@ diff -up linux-2.6.35.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.35.noarch/driv
static int table_clear(struct dm_ioctl *param, size_t param_size)
{
struct hash_cell *hc;
-diff -up linux-2.6.35.noarch/drivers/scsi/hosts.c.orig linux-2.6.35.noarch/drivers/scsi/hosts.c
---- linux-2.6.35.noarch/drivers/scsi/hosts.c.orig 2010-09-30 12:22:33.073684000 -0400
-+++ linux-2.6.35.noarch/drivers/scsi/hosts.c 2010-09-30 12:25:08.090272000 -0400
-@@ -50,10 +50,11 @@ static void scsi_host_cls_release(struct
+diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
+index 8a8f803..7958885 100644
+--- a/drivers/scsi/hosts.c
++++ b/drivers/scsi/hosts.c
+@@ -50,10 +50,11 @@ static void scsi_host_cls_release(struct device *dev)
put_device(&class_to_shost(dev)->shost_gendev);
}
@@ -388,9 +354,68 @@ diff -up linux-2.6.35.noarch/drivers/scsi/hosts.c.orig linux-2.6.35.noarch/drive
/**
* scsi_host_set_state - Take the given host through the host state model.
-diff -up linux-2.6.35.noarch/fs/exofs/exofs.h.orig linux-2.6.35.noarch/fs/exofs/exofs.h
---- linux-2.6.35.noarch/fs/exofs/exofs.h.orig 2010-09-30 12:22:43.889004000 -0400
-+++ linux-2.6.35.noarch/fs/exofs/exofs.h 2010-09-30 12:25:08.109274000 -0400
+diff --git a/fs/Kconfig b/fs/Kconfig
+index 3d18530..82b6696 100644
+--- a/fs/Kconfig
++++ b/fs/Kconfig
+@@ -224,6 +224,31 @@ config LOCKD_V4
+ config EXPORTFS
+ tristate
+
++config EXPORTFS_FILE_LAYOUT
++ bool
++ depends on PNFSD && EXPORTFS
++ help
++ Exportfs support for the NFSv4.1 files layout type.
++ Must be automatically selected by supporting filesystems.
++
++config EXPORTFS_OSD_LAYOUT
++ bool
++ depends on PNFSD && EXPORTFS
++ help
++ Exportfs support for the NFSv4.1 objects layout type.
++ Must be automatically selected by supporting osd
++ filesystems.
++
++ If unsure, say N.
++
++config EXPORTFS_BLOCK_LAYOUT
++ bool
++ depends on PNFSD && EXPORTFS
++ help
++ Exportfs support for the NFSv4.1 blocks layout type.
++ Must be automatically selected by supporting filesystems.
++
++
+ config NFS_ACL_SUPPORT
+ tristate
+ select FS_POSIX_ACL
+diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
+index 2d0f757..5458546 100644
+--- a/fs/exofs/Kbuild
++++ b/fs/exofs/Kbuild
+@@ -13,4 +13,5 @@
+ #
+
+ exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
++exofs-$(CONFIG_PNFSD) += export.o
+ obj-$(CONFIG_EXOFS_FS) += exofs.o
+diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
+index 86194b2f..77c677f 100644
+--- a/fs/exofs/Kconfig
++++ b/fs/exofs/Kconfig
+@@ -1,6 +1,7 @@
+ config EXOFS_FS
+ tristate "exofs: OSD based file system support"
+ depends on SCSI_OSD_ULD
++ select EXPORTFS_OSD_LAYOUT if PNFSD
+ help
+ EXOFS is a file system that uses an OSD storage device,
+ as its backing storage.
+diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
+index 2dc925f..3c03f2d 100644
+--- a/fs/exofs/exofs.h
++++ b/fs/exofs/exofs.h
@@ -36,13 +36,9 @@
#include <linux/fs.h>
#include <linux/time.h>
@@ -414,7 +439,7 @@ diff -up linux-2.6.35.noarch/fs/exofs/exofs.h.orig linux-2.6.35.noarch/fs/exofs/
unsigned long i_flags; /* various atomic flags */
uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
uint32_t i_dir_start_lookup; /* which page to start lookup */
-@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si
+@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_size(unsigned numdevs)
*/
#define OBJ_2BCREATED 0 /* object will be created soon*/
#define OBJ_CREATED 1 /* object has been created on the osd*/
@@ -424,7 +449,7 @@ diff -up linux-2.6.35.noarch/fs/exofs/exofs.h.orig linux-2.6.35.noarch/fs/exofs/
static inline int obj_2bcreated(struct exofs_i_info *oi)
{
-@@ -303,4 +303,21 @@ extern const struct inode_operations exo
+@@ -303,4 +303,21 @@ extern const struct inode_operations exofs_special_inode_operations;
extern const struct inode_operations exofs_symlink_inode_operations;
extern const struct inode_operations exofs_fast_symlink_inode_operations;
@@ -446,9 +471,11 @@ diff -up linux-2.6.35.noarch/fs/exofs/exofs.h.orig linux-2.6.35.noarch/fs/exofs/
+#endif
+
#endif
-diff -up linux-2.6.35.noarch/fs/exofs/export.c.orig linux-2.6.35.noarch/fs/exofs/export.c
---- linux-2.6.35.noarch/fs/exofs/export.c.orig 2010-09-30 12:25:08.113272000 -0400
-+++ linux-2.6.35.noarch/fs/exofs/export.c 2010-09-30 12:25:08.115273000 -0400
+diff --git a/fs/exofs/export.c b/fs/exofs/export.c
+new file mode 100644
+index 0000000..69bce46
+--- /dev/null
++++ b/fs/exofs/export.c
@@ -0,0 +1,396 @@
+/*
+ * export.c - Implementation of the pnfs_export_operations
@@ -846,10 +873,11 @@ diff -up linux-2.6.35.noarch/fs/exofs/export.c.orig linux-2.6.35.noarch/fs/exofs
+{
+ sb->s_pnfs_op = &exofs_pnfs_ops;
+}
-diff -up linux-2.6.35.noarch/fs/exofs/inode.c.orig linux-2.6.35.noarch/fs/exofs/inode.c
---- linux-2.6.35.noarch/fs/exofs/inode.c.orig 2010-09-30 12:22:43.898009000 -0400
-+++ linux-2.6.35.noarch/fs/exofs/inode.c 2010-09-30 12:25:08.121273000 -0400
-@@ -820,8 +820,9 @@ static inline int exofs_inode_is_fast_sy
+diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
+index 3eadd97..1cf2286 100644
+--- a/fs/exofs/inode.c
++++ b/fs/exofs/inode.c
+@@ -826,8 +826,9 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode)
const struct osd_attr g_attr_logical_length = ATTR_DEF(
OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
@@ -860,7 +888,7 @@ diff -up linux-2.6.35.noarch/fs/exofs/inode.c.orig linux-2.6.35.noarch/fs/exofs/
struct exofs_i_info *oi = exofs_i(inode);
int ret;
-@@ -858,7 +859,8 @@ int exofs_setattr(struct dentry *dentry,
+@@ -864,7 +865,8 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
@@ -870,7 +898,7 @@ diff -up linux-2.6.35.noarch/fs/exofs/inode.c.orig linux-2.6.35.noarch/fs/exofs/
if (unlikely(error))
return error;
}
-@@ -971,6 +973,7 @@ static void __oi_init(struct exofs_i_inf
+@@ -977,6 +979,7 @@ static void __oi_init(struct exofs_i_info *oi)
{
init_waitqueue_head(&oi->i_wq);
oi->i_flags = 0;
@@ -878,30 +906,62 @@ diff -up linux-2.6.35.noarch/fs/exofs/inode.c.orig linux-2.6.35.noarch/fs/exofs/
}
/*
* Fill in an inode read from the OSD and set it up for use
-diff -up linux-2.6.35.noarch/fs/exofs/Kbuild.orig linux-2.6.35.noarch/fs/exofs/Kbuild
---- linux-2.6.35.noarch/fs/exofs/Kbuild.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/exofs/Kbuild 2010-09-30 12:25:08.099276000 -0400
-@@ -13,4 +13,5 @@
- #
-
- exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
-+exofs-$(CONFIG_PNFSD) += export.o
- obj-$(CONFIG_EXOFS_FS) += exofs.o
-diff -up linux-2.6.35.noarch/fs/exofs/Kconfig.orig linux-2.6.35.noarch/fs/exofs/Kconfig
---- linux-2.6.35.noarch/fs/exofs/Kconfig.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/exofs/Kconfig 2010-09-30 12:25:08.104275000 -0400
-@@ -1,6 +1,7 @@
- config EXOFS_FS
- tristate "exofs: OSD based file system support"
- depends on SCSI_OSD_ULD
-+ select EXPORTFS_OSD_LAYOUT if PNFSD
- help
- EXOFS is a file system that uses an OSD storage device,
- as its backing storage.
-diff -up linux-2.6.35.noarch/fs/exofs/super.c.orig linux-2.6.35.noarch/fs/exofs/super.c
---- linux-2.6.35.noarch/fs/exofs/super.c.orig 2010-09-30 12:22:43.908005000 -0400
-+++ linux-2.6.35.noarch/fs/exofs/super.c 2010-09-30 12:25:08.132273000 -0400
-@@ -620,6 +620,7 @@ static int exofs_fill_super(struct super
+diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
+deleted file mode 100644
+index c52e988..0000000
+--- a/fs/exofs/pnfs.h
++++ /dev/null
+@@ -1,45 +0,0 @@
+-/*
+- * Copyright (C) 2008, 2009
+- * Boaz Harrosh <bharrosh at panasas.com>
+- *
+- * This file is part of exofs.
+- *
+- * exofs is free software; you can redistribute it and/or modify it under the
+- * terms of the GNU General Public License version 2 as published by the Free
+- * Software Foundation.
+- *
+- */
+-
+-/* FIXME: Remove this file once pnfs hits mainline */
+-
+-#ifndef __EXOFS_PNFS_H__
+-#define __EXOFS_PNFS_H__
+-
+-#if ! defined(__PNFS_OSD_XDR_H__)
+-
+-enum pnfs_iomode {
+- IOMODE_READ = 1,
+- IOMODE_RW = 2,
+- IOMODE_ANY = 3,
+-};
+-
+-/* Layout Structure */
+-enum pnfs_osd_raid_algorithm4 {
+- PNFS_OSD_RAID_0 = 1,
+- PNFS_OSD_RAID_4 = 2,
+- PNFS_OSD_RAID_5 = 3,
+- PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
+-};
+-
+-struct pnfs_osd_data_map {
+- u32 odm_num_comps;
+- u64 odm_stripe_unit;
+- u32 odm_group_width;
+- u32 odm_group_depth;
+- u32 odm_mirror_cnt;
+- u32 odm_raid_algorithm;
+-};
+-
+-#endif /* ! defined(__PNFS_OSD_XDR_H__) */
+-
+-#endif /* __EXOFS_PNFS_H__ */
+diff --git a/fs/exofs/super.c b/fs/exofs/super.c
+index 047e92f..623aa55 100644
+--- a/fs/exofs/super.c
++++ b/fs/exofs/super.c
+@@ -620,6 +620,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = sbi;
sb->s_op = &exofs_sops;
sb->s_export_op = &exofs_export_ops;
@@ -909,9 +969,23 @@ diff -up linux-2.6.35.noarch/fs/exofs/super.c.orig linux-2.6.35.noarch/fs/exofs/
root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
if (IS_ERR(root)) {
EXOFS_ERR("ERROR: exofs_iget failed\n");
-diff -up linux-2.6.35.noarch/fs/exportfs/expfs.c.orig linux-2.6.35.noarch/fs/exportfs/expfs.c
---- linux-2.6.35.noarch/fs/exportfs/expfs.c.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/exportfs/expfs.c 2010-09-30 12:25:08.153277000 -0400
+diff --git a/fs/exportfs/Makefile b/fs/exportfs/Makefile
+index d7c5d4d..51e8ee4 100644
+--- a/fs/exportfs/Makefile
++++ b/fs/exportfs/Makefile
+@@ -3,4 +3,7 @@
+
+ obj-$(CONFIG_EXPORTFS) += exportfs.o
+
+-exportfs-objs := expfs.o
++exportfs-y := expfs.o
++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o
++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o
++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o
+diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
+index e9e1759..a10949a 100644
+--- a/fs/exportfs/expfs.c
++++ b/fs/exportfs/expfs.c
@@ -16,6 +16,13 @@
#include <linux/namei.h>
#include <linux/sched.h>
@@ -926,21 +1000,11 @@ diff -up linux-2.6.35.noarch/fs/exportfs/expfs.c.orig linux-2.6.35.noarch/fs/exp
#define dprintk(fmt, args...) do{}while(0)
-diff -up linux-2.6.35.noarch/fs/exportfs/Makefile.orig linux-2.6.35.noarch/fs/exportfs/Makefile
---- linux-2.6.35.noarch/fs/exportfs/Makefile.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/exportfs/Makefile 2010-09-30 12:25:08.148273000 -0400
-@@ -3,4 +3,7 @@
-
- obj-$(CONFIG_EXPORTFS) += exportfs.o
-
--exportfs-objs := expfs.o
-+exportfs-y := expfs.o
-+exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o
-+exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o
-+exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o
-diff -up linux-2.6.35.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.35.noarch/fs/exportfs/nfs4blocklayoutxdr.c
---- linux-2.6.35.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-09-30 12:25:08.156275000 -0400
-+++ linux-2.6.35.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-09-30 12:25:08.158274000 -0400
+diff --git a/fs/exportfs/nfs4blocklayoutxdr.c b/fs/exportfs/nfs4blocklayoutxdr.c
+new file mode 100644
+index 0000000..439e647
+--- /dev/null
++++ b/fs/exportfs/nfs4blocklayoutxdr.c
@@ -0,0 +1,158 @@
+/*
+ * linux/fs/nfsd/nfs4blocklayoutxdr.c
@@ -1100,9 +1164,11 @@ diff -up linux-2.6.35.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.35.
+ return NFS4_OK;
+}
+EXPORT_SYMBOL_GPL(blocklayout_encode_layout);
-diff -up linux-2.6.35.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.35.noarch/fs/exportfs/nfs4filelayoutxdr.c
---- linux-2.6.35.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-09-30 12:25:08.161273000 -0400
-+++ linux-2.6.35.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-09-30 12:25:08.162279000 -0400
+diff --git a/fs/exportfs/nfs4filelayoutxdr.c b/fs/exportfs/nfs4filelayoutxdr.c
+new file mode 100644
+index 0000000..f63c311
+--- /dev/null
++++ b/fs/exportfs/nfs4filelayoutxdr.c
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2006 The Regents of the University of Michigan.
@@ -1322,9 +1388,11 @@ diff -up linux-2.6.35.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.35.n
+ return nfserr;
+}
+EXPORT_SYMBOL(filelayout_encode_layout);
-diff -up linux-2.6.35.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.35.noarch/fs/exportfs/pnfs_osd_xdr_srv.c
---- linux-2.6.35.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-09-30 12:25:08.165275000 -0400
-+++ linux-2.6.35.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-09-30 12:25:08.167274000 -0400
+diff --git a/fs/exportfs/pnfs_osd_xdr_srv.c b/fs/exportfs/pnfs_osd_xdr_srv.c
+new file mode 100644
+index 0000000..60df0df
+--- /dev/null
++++ b/fs/exportfs/pnfs_osd_xdr_srv.c
@@ -0,0 +1,289 @@
+/*
+ * pnfs_osd_xdr_enc.c
@@ -1615,9 +1683,10 @@ diff -up linux-2.6.35.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.35.no
+ return p;
+}
+EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr);
-diff -up linux-2.6.35.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.35.noarch/fs/gfs2/ops_fstype.c
---- linux-2.6.35.noarch/fs/gfs2/ops_fstype.c.orig 2010-09-30 12:22:44.411020000 -0400
-+++ linux-2.6.35.noarch/fs/gfs2/ops_fstype.c 2010-09-30 12:25:08.184275000 -0400
+diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
+index 4d4b1e8..efb0a44 100644
+--- a/fs/gfs2/ops_fstype.c
++++ b/fs/gfs2/ops_fstype.c
@@ -18,6 +18,7 @@
#include <linux/mount.h>
#include <linux/gfs2_ondisk.h>
@@ -1626,7 +1695,7 @@ diff -up linux-2.6.35.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.35.noarch/fs/gf
#include "gfs2.h"
#include "incore.h"
-@@ -1166,6 +1167,9 @@ static int fill_super(struct super_block
+@@ -1166,6 +1167,9 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
sb->s_magic = GFS2_MAGIC;
sb->s_op = &gfs2_super_ops;
sb->s_export_op = &gfs2_export_ops;
@@ -1636,44 +1705,123 @@ diff -up linux-2.6.35.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.35.noarch/fs/gf
sb->s_xattr = gfs2_xattr_handlers;
sb->s_qcop = &gfs2_quotactl_ops;
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
-diff -up linux-2.6.35.noarch/fs/Kconfig.orig linux-2.6.35.noarch/fs/Kconfig
---- linux-2.6.35.noarch/fs/Kconfig.orig 2010-09-30 12:22:42.941976000 -0400
-+++ linux-2.6.35.noarch/fs/Kconfig 2010-09-30 12:25:08.094276000 -0400
-@@ -224,6 +224,31 @@ config LOCKD_V4
- config EXPORTFS
- tristate
+diff --git a/fs/inode.c b/fs/inode.c
+index 8646433..e415be4 100644
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -172,15 +172,21 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
+ mapping->writeback_index = 0;
-+config EXPORTFS_FILE_LAYOUT
-+ bool
-+ depends on PNFSD && EXPORTFS
+ /*
+- * If the block_device provides a backing_dev_info for client
+- * inodes then use that. Otherwise the inode share the bdev's
+- * backing_dev_info.
++ * If the filesystem provides a backing_dev_info for client inodes
++ * then use that. Otherwise inodes share default_backing_dev_info.
+ */
+- if (sb->s_bdev) {
+- struct backing_dev_info *bdi;
+-
+- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+- mapping->backing_dev_info = bdi;
++ if (sb->s_bdi && sb->s_bdi != &noop_backing_dev_info) {
++ /*
++ * Catch cases where filesystem might be bitten by using s_bdi
++ * instead of sb->s_bdev. Can be removed in 2.6.38.
++ */
++ if (sb->s_bdev) {
++ struct backing_dev_info *bdi =
++ sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
++ WARN(bdi != sb->s_bdi, "s_bdev bdi %s != s_bdi %s\n",
++ bdi->name, sb->s_bdi->name);
++ }
++ mapping->backing_dev_info = sb->s_bdi;
+ }
+ inode->i_private = NULL;
+ inode->i_mapping = mapping;
+diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
+index f7e13db..0da8d28 100644
+--- a/fs/nfs/Kconfig
++++ b/fs/nfs/Kconfig
+@@ -76,10 +76,42 @@ config NFS_V4
+
+ config NFS_V4_1
+ bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
+- depends on NFS_V4 && EXPERIMENTAL
++ depends on NFS_FS && NFS_V4 && EXPERIMENTAL
++ select PNFS_FILE_LAYOUT
+ help
+ This option enables support for minor version 1 of the NFSv4 protocol
+- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
++ (RFC 5661) in the kernel's NFS client.
++
++ If unsure, say N.
++
++config PNFS_FILE_LAYOUT
++ tristate
++
++config PNFS_OBJLAYOUT
++ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
++ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
+ help
-+ Exportfs support for the NFSv4.1 files layout type.
-+ Must be automatically selected by supporting filesystems.
++ Say M here if you want your pNFS client to support the Objects Layout Driver.
++ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
++ upper level driver (SCSI_OSD_ULD).
+
-+config EXPORTFS_OSD_LAYOUT
-+ bool
-+ depends on PNFSD && EXPORTFS
++ If unsure, say N.
++
++config PNFS_PANLAYOUT
++ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
++ depends on PNFS_OBJLAYOUT
+ help
-+ Exportfs support for the NFSv4.1 objects layout type.
-+ Must be automatically selected by supporting osd
-+ filesystems.
++ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver.
+
+ If unsure, say N.
+
-+config EXPORTFS_BLOCK_LAYOUT
-+ bool
-+ depends on PNFSD && EXPORTFS
++config PNFS_BLOCK
++ tristate "Provide a pNFS block client (EXPERIMENTAL)"
++ depends on NFS_FS && NFS_V4_1
++ select MD
++ select BLK_DEV_DM
+ help
-+ Exportfs support for the NFSv4.1 blocks layout type.
-+ Must be automatically selected by supporting filesystems.
++ Say M or y here if you want your pNfs client to support the block protocol
+
+ If unsure, say N.
+
+diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
+index da7fda6..e68c498 100644
+--- a/fs/nfs/Makefile
++++ b/fs/nfs/Makefile
+@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
+ delegation.o idmap.o \
+ callback.o callback_xdr.o callback_proc.o \
+ nfs4namespace.o
++nfs-$(CONFIG_NFS_V4_1) += pnfs.o
+ nfs-$(CONFIG_SYSCTL) += sysctl.o
+ nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+
++obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
++nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
+
- config NFS_ACL_SUPPORT
- tristate
- select FS_POSIX_ACL
-diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.35.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c
---- linux-2.6.35.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-09-30 12:25:08.211275000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-09-30 12:25:08.212285000 -0400
++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
+diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
+new file mode 100644
+index 0000000..5a4bf3d
+--- /dev/null
++++ b/fs/nfs/blocklayout/Makefile
+@@ -0,0 +1,6 @@
++#
++# Makefile for the pNFS block layout driver kernel module
++#
++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \
++ extents.o block-device-discovery-pipe.o
+diff --git a/fs/nfs/blocklayout/block-device-discovery-pipe.c b/fs/nfs/blocklayout/block-device-discovery-pipe.c
+new file mode 100644
+index 0000000..e4c199f
+--- /dev/null
++++ b/fs/nfs/blocklayout/block-device-discovery-pipe.c
@@ -0,0 +1,66 @@
+#include <linux/module.h>
+#include <linux/uaccess.h>
@@ -1741,10 +1889,12 @@ diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.or
+ pipefs_closepipe(bl_device_pipe);
+ return;
+}
-diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c
---- linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-09-30 12:25:08.216275000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c 2010-09-30 12:25:08.218276000 -0400
-@@ -0,0 +1,1152 @@
+diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
+new file mode 100644
+index 0000000..b3ab4cb
+--- /dev/null
++++ b/fs/nfs/blocklayout/blocklayout.c
+@@ -0,0 +1,1146 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.c
+ *
@@ -2424,7 +2574,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.35.
+ }
+}
+
-+/* This is mostly copied form the filelayout's get_device_info function.
++/* This is mostly copied from the filelayout's get_device_info function.
+ * It seems much of this should be at the generic pnfs level.
+ */
+static struct pnfs_block_dev *
@@ -2543,8 +2693,10 @@ diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.35.
+ bdev = nfs4_blk_get_deviceinfo(server, fh,
+ &dlist->dev_id[i],
+ &block_disklist);
-+ if (!bdev)
++ if (!bdev) {
++ status = -ENODEV;
+ goto out_error;
++ }
+ spin_lock(&b_mt_id->bm_lock);
+ list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
+ spin_unlock(&b_mt_id->bm_lock);
@@ -2830,13 +2982,6 @@ diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.35.
+ fsdata->private = NULL;
+}
+
-+static ssize_t
-+bl_get_stripesize(struct pnfs_layout_hdr *lo)
-+{
-+ dprintk("%s enter\n", __func__);
-+ return 0;
-+}
-+
+/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request.
+ * Should return False if there is a reason requests can not be coalesced,
+ * otherwise, should default to returning True.
@@ -2870,7 +3015,6 @@ diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.35.
+ .cleanup_layoutcommit = bl_cleanup_layoutcommit,
+ .initialize_mountpoint = bl_initialize_mountpoint,
+ .uninitialize_mountpoint = bl_uninitialize_mountpoint,
-+ .get_stripesize = bl_get_stripesize,
+ .pg_test = bl_pg_test,
+};
+
@@ -2897,14 +3041,16 @@ diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.35.
+
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
-diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdev.c
---- linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-09-30 12:25:08.225277000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-09-30 12:25:08.227276000 -0400
-@@ -0,0 +1,335 @@
+diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
+new file mode 100644
+index 0000000..9e7bd62
+--- /dev/null
++++ b/fs/nfs/blocklayout/blocklayout.h
+@@ -0,0 +1,302 @@
+/*
-+ * linux/fs/nfs/blocklayout/blocklayoutdev.c
++ * linux/fs/nfs/blocklayout/blocklayout.h
+ *
-+ * Device operations for the pnfs nfs4 file layout driver.
++ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
@@ -2932,324 +3078,293 @@ diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
-+#include <linux/module.h>
-+#include <linux/buffer_head.h> /* __bread */
++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
++#define FS_NFS_NFS4BLOCKLAYOUT_H
+
-+#include <linux/genhd.h>
-+#include <linux/blkdev.h>
-+#include <linux/hash.h>
++#include <linux/nfs_fs.h>
++#include <linux/dm-ioctl.h> /* Needed for struct dm_ioctl*/
++#include "../pnfs.h"
+
-+#include "blocklayout.h"
++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9)
+
-+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++#define PG_pnfserr PG_owner_priv_1
++#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags)
++#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags)
++#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags)
+
-+uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes)
-+{
-+ uint32_t *q = p + XDR_QUADLEN(nbytes);
-+ if (unlikely(q > end || q < p))
-+ return NULL;
-+ return p;
-+}
-+EXPORT_SYMBOL(blk_overflow);
++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */
++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */
++extern int dm_do_resume(struct dm_ioctl *param);
++extern int dm_table_load(struct dm_ioctl *param, size_t param_size);
+
-+/* Open a block_device by device number. */
-+struct block_device *nfs4_blkdev_get(dev_t dev)
-+{
-+ struct block_device *bd;
++struct block_mount_id {
++ spinlock_t bm_lock; /* protects list */
++ struct list_head bm_devlist; /* holds pnfs_block_dev */
++};
+
-+ dprintk("%s enter\n", __func__);
-+ bd = open_by_devnum(dev, FMODE_READ);
-+ if (IS_ERR(bd))
-+ goto fail;
-+ return bd;
-+fail:
-+ dprintk("%s failed to open device : %ld\n",
-+ __func__, PTR_ERR(bd));
-+ return NULL;
-+}
++struct pnfs_block_dev {
++ struct list_head bm_node;
++ struct nfs4_deviceid bm_mdevid; /* associated devid */
++ struct block_device *bm_mdev; /* meta device itself */
++};
+
-+/*
-+ * Release the block device
-+ */
-+int nfs4_blkdev_put(struct block_device *bdev)
-+{
-+ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
-+ MINOR(bdev->bd_dev));
-+ bd_release(bdev);
-+ return blkdev_put(bdev, FMODE_READ);
-+}
++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */
++struct visible_block_device {
++ struct list_head vi_node;
++ struct block_device *vi_bdev;
++ int vi_mapped;
++ int vi_put_done;
++};
+
-+/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded
-+ * in dev->dev_addr_buf.
-+ */
-+struct pnfs_block_dev *
-+nfs4_blk_decode_device(struct nfs_server *server,
-+ struct pnfs_device *dev,
-+ struct list_head *sdlist)
-+{
-+ struct pnfs_block_dev *rv = NULL;
-+ struct block_device *bd = NULL;
-+ struct pipefs_hdr *msg = NULL, *reply = NULL;
-+ uint32_t major, minor;
++enum blk_vol_type {
++ PNFS_BLOCK_VOLUME_SIMPLE = 0, /* maps to a single LU */
++ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */
++ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */
++ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */
++};
+
-+ dprintk("%s enter\n", __func__);
++/* All disk offset/lengths are stored in 512-byte sectors */
++struct pnfs_blk_volume {
++ uint32_t bv_type;
++ sector_t bv_size;
++ struct pnfs_blk_volume **bv_vols;
++ int bv_vol_n;
++ union {
++ dev_t bv_dev;
++ sector_t bv_stripe_unit;
++ sector_t bv_offset;
++ };
++};
+
-+ if (IS_ERR(bl_device_pipe))
-+ return NULL;
-+ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
-+ dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
-+ dev->mincount);
-+ msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area,
-+ dev->mincount);
-+ if (IS_ERR(msg)) {
-+ dprintk("ERROR: couldn't make pipefs message.\n");
-+ goto out_err;
-+ }
-+ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
-+ msg->status = BL_DEVICE_REQUEST_INIT;
++/* Since components need not be aligned, cannot use sector_t */
++struct pnfs_blk_sig_comp {
++ int64_t bs_offset; /* In bytes */
++ uint32_t bs_length; /* In bytes */
++ char *bs_string;
++};
+
-+ dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
-+ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
-+ &bl_device_list, 0, 0);
++/* Maximum number of signature components in a simple volume */
++# define PNFS_BLOCK_MAX_SIG_COMP 16
+
-+ if (IS_ERR(reply)) {
-+ dprintk("ERROR: upcall_waitreply failed\n");
-+ goto out_err;
-+ }
-+ if (reply->status != BL_DEVICE_REQUEST_PROC) {
-+ dprintk("%s failed to open device: %ld\n",
-+ __func__, PTR_ERR(bd));
-+ goto out_err;
-+ }
-+ memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t));
-+ memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)),
-+ sizeof(uint32_t));
-+ bd = nfs4_blkdev_get(MKDEV(major, minor));
-+ if (IS_ERR(bd)) {
-+ dprintk("%s failed to open device : %ld\n",
-+ __func__, PTR_ERR(bd));
-+ goto out_err;
-+ }
++struct pnfs_blk_sig {
++ int si_num_comps;
++ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP];
++};
+
-+ rv = kzalloc(sizeof(*rv), GFP_KERNEL);
-+ if (!rv)
-+ goto out_err;
++enum exstate4 {
++ PNFS_BLOCK_READWRITE_DATA = 0,
++ PNFS_BLOCK_READ_DATA = 1,
++ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
++ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
++};
+
-+ rv->bm_mdev = bd;
-+ memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
-+ dprintk("%s Created device %s with bd_block_size %u\n",
-+ __func__,
-+ bd->bd_disk->disk_name,
-+ bd->bd_block_size);
-+ kfree(reply);
-+ kfree(msg);
-+ return rv;
++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
+
-+out_err:
-+ kfree(rv);
-+ if (!IS_ERR(reply))
-+ kfree(reply);
-+ if (!IS_ERR(msg))
-+ kfree(msg);
-+ return NULL;
-+}
++struct my_tree_t {
++ sector_t mtt_step_size; /* Internal sector alignment */
++ struct list_head mtt_stub; /* Should be a radix tree */
++};
+
-+/* Map deviceid returned by the server to constructed block_device */
-+static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
-+ struct nfs4_deviceid *id)
-+{
-+ struct block_device *rv = NULL;
-+ struct block_mount_id *mid;
-+ struct pnfs_block_dev *dev;
++struct pnfs_inval_markings {
++ spinlock_t im_lock;
++ struct my_tree_t im_tree; /* Sectors that need LAYOUTCOMMIT */
++ sector_t im_block_size; /* Server blocksize in sectors */
++};
+
-+ dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
-+ mid = BLK_ID(lo);
-+ spin_lock(&mid->bm_lock);
-+ list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
-+ if (memcmp(id->data, dev->bm_mdevid.data,
-+ NFS4_DEVICEID4_SIZE) == 0) {
-+ rv = dev->bm_mdev;
-+ goto out;
-+ }
-+ }
-+ out:
-+ spin_unlock(&mid->bm_lock);
-+ dprintk("%s returning %p\n", __func__, rv);
-+ return rv;
-+}
++struct pnfs_inval_tracking {
++ struct list_head it_link;
++ int it_sector;
++ int it_tags;
++};
+
-+/* Tracks info needed to ensure extents in layout obey constraints of spec */
-+struct layout_verification {
-+ u32 mode; /* R or RW */
-+ u64 start; /* Expected start of next non-COW extent */
-+ u64 inval; /* Start of INVAL coverage */
-+ u64 cowread; /* End of COW read coverage */
++/* sector_t fields are all in 512-byte sectors */
++struct pnfs_block_extent {
++ struct kref be_refcnt;
++ struct list_head be_node; /* link into lseg list */
++ struct nfs4_deviceid be_devid; /* STUB - removable??? */
++ struct block_device *be_mdev;
++ sector_t be_f_offset; /* the starting offset in the file */
++ sector_t be_length; /* the size of the extent */
++ sector_t be_v_offset; /* the starting offset in the volume */
++ enum exstate4 be_state; /* the state of this extent */
++ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+};
+
-+/* Verify the extent meets the layout requirements of the pnfs-block draft,
-+ * section 2.3.1.
-+ */
-+static int verify_extent(struct pnfs_block_extent *be,
-+ struct layout_verification *lv)
++/* Shortened extent used by LAYOUTCOMMIT */
++struct pnfs_block_short_extent {
++ struct list_head bse_node;
++ struct nfs4_deviceid bse_devid; /* STUB - removable??? */
++ struct block_device *bse_mdev;
++ sector_t bse_f_offset; /* the starting offset in the file */
++ sector_t bse_length; /* the size of the extent */
++};
++
++static inline void
++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
+{
-+ if (lv->mode == IOMODE_READ) {
-+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
-+ be->be_state == PNFS_BLOCK_INVALID_DATA)
-+ return -EIO;
-+ if (be->be_f_offset != lv->start)
-+ return -EIO;
-+ lv->start += be->be_length;
-+ return 0;
-+ }
-+ /* lv->mode == IOMODE_RW */
-+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
-+ if (be->be_f_offset != lv->start)
-+ return -EIO;
-+ if (lv->cowread > lv->start)
-+ return -EIO;
-+ lv->start += be->be_length;
-+ lv->inval = lv->start;
-+ return 0;
-+ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
-+ if (be->be_f_offset != lv->start)
-+ return -EIO;
-+ lv->start += be->be_length;
-+ return 0;
-+ } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
-+ if (be->be_f_offset > lv->start)
-+ return -EIO;
-+ if (be->be_f_offset < lv->inval)
-+ return -EIO;
-+ if (be->be_f_offset < lv->cowread)
-+ return -EIO;
-+ /* It looks like you might want to min this with lv->start,
-+ * but you really don't.
-+ */
-+ lv->inval = lv->inval + be->be_length;
-+ lv->cowread = be->be_f_offset + be->be_length;
-+ return 0;
-+ } else
-+ return -EIO;
++ spin_lock_init(&marks->im_lock);
++ INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
++ marks->im_block_size = blocksize;
++ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
++ blocksize);
+}
+
-+/* XDR decode pnfs_block_layout4 structure */
-+int
-+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
-+ struct nfs4_layoutget_res *lgr)
-+{
-+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
-+ uint32_t *p = (uint32_t *)lgr->layout.buf;
-+ uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len);
-+ int i, status = -EIO;
-+ uint32_t count;
-+ struct pnfs_block_extent *be = NULL, *save;
-+ uint64_t tmp; /* Used by READSECTOR */
-+ struct layout_verification lv = {
-+ .mode = lgr->range.iomode,
-+ .start = lgr->range.offset >> 9,
-+ .inval = lgr->range.offset >> 9,
-+ .cowread = lgr->range.offset >> 9,
-+ };
++enum extentclass4 {
++ RW_EXTENT = 0, /* READWRITE and INVAL */
++ RO_EXTENT = 1, /* READ and NONE */
++ EXTENT_LISTS = 2,
++};
+
-+ LIST_HEAD(extents);
++static inline int choose_list(enum exstate4 state)
++{
++ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
++ return RO_EXTENT;
++ else
++ return RW_EXTENT;
++}
+
-+ BLK_READBUF(p, end, 4);
-+ READ32(count);
++struct pnfs_block_layout {
++ struct pnfs_layout_hdr bl_layout;
++ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
++ spinlock_t bl_ext_lock; /* Protects list manipulation */
++ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
++ struct list_head bl_commit; /* Needs layout commit */
++ unsigned int bl_count; /* entries in bl_commit */
++ sector_t bl_blocksize; /* Server blocksize in sectors */
++};
+
-+ dprintk("%s enter, number of extents %i\n", __func__, count);
-+ BLK_READBUF(p, end, (28 + NFS4_DEVICEID4_SIZE) * count);
++/* this struct is communicated between:
++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit
++ */
++struct bl_layoutupdate_data {
++ struct list_head ranges;
++};
+
-+ /* Decode individual extents, putting them in temporary
-+ * staging area until whole layout is decoded to make error
-+ * recovery easier.
-+ */
-+ for (i = 0; i < count; i++) {
-+ be = alloc_extent();
-+ if (!be) {
-+ status = -ENOMEM;
-+ goto out_err;
-+ }
-+ READ_DEVID(&be->be_devid);
-+ be->be_mdev = translate_devid(lo, &be->be_devid);
-+ if (!be->be_mdev)
-+ goto out_err;
-+ /* The next three values are read in as bytes,
-+ * but stored as 512-byte sector lengths
-+ */
-+ READ_SECTOR(be->be_f_offset);
-+ READ_SECTOR(be->be_length);
-+ READ_SECTOR(be->be_v_offset);
-+ READ32(be->be_state);
-+ if (be->be_state == PNFS_BLOCK_INVALID_DATA)
-+ be->be_inval = &bl->bl_inval;
-+ if (verify_extent(be, &lv)) {
-+ dprintk("%s verify failed\n", __func__);
-+ goto out_err;
-+ }
-+ list_add_tail(&be->be_node, &extents);
-+ }
-+ if (p != end) {
-+ dprintk("%s Undecoded cruft at end of opaque\n", __func__);
-+ be = NULL;
-+ goto out_err;
-+ }
-+ if (lgr->range.offset + lgr->range.length != lv.start << 9) {
-+ dprintk("%s Final length mismatch\n", __func__);
-+ be = NULL;
-+ goto out_err;
-+ }
-+ if (lv.start < lv.cowread) {
-+ dprintk("%s Final uncovered COW extent\n", __func__);
-+ be = NULL;
-+ goto out_err;
-+ }
-+ /* Extents decoded properly, now try to merge them in to
-+ * existing layout extents.
-+ */
-+ spin_lock(&bl->bl_ext_lock);
-+ list_for_each_entry_safe(be, save, &extents, be_node) {
-+ list_del(&be->be_node);
-+ status = add_and_merge_extent(bl, be);
-+ if (status) {
-+ spin_unlock(&bl->bl_ext_lock);
-+ /* This is a fairly catastrophic error, as the
-+ * entire layout extent lists are now corrupted.
-+ * We should have some way to distinguish this.
-+ */
-+ be = NULL;
-+ goto out_err;
-+ }
-+ }
-+ spin_unlock(&bl->bl_ext_lock);
-+ status = 0;
-+ out:
-+ dprintk("%s returns %i\n", __func__, status);
-+ return status;
++#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->inode)->pnfs_ld_data))
+
-+ out_err:
-+ put_extent(be);
-+ while (!list_empty(&extents)) {
-+ be = list_first_entry(&extents, struct pnfs_block_extent,
-+ be_node);
-+ list_del(&be->be_node);
-+ put_extent(be);
-+ }
-+ goto out;
++static inline struct pnfs_block_layout *
++BLK_LO2EXT(struct pnfs_layout_hdr *lo)
++{
++ return container_of(lo, struct pnfs_block_layout, bl_layout);
+}
-diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdm.c
---- linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-09-30 12:25:08.229279000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-09-30 12:25:08.231276000 -0400
-@@ -0,0 +1,120 @@
++
++static inline struct pnfs_block_layout *
++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
++{
++ return BLK_LO2EXT(lseg->layout);
++}
++
++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes);
++
++#define BLK_READBUF(p, e, nbytes) do { \
++ p = blk_overflow(p, e, nbytes); \
++ if (!p) { \
++ printk(KERN_WARNING \
++ "%s: reply buffer overflowed in line %d.\n", \
++ __func__, __LINE__); \
++ goto out_err; \
++ } \
++} while (0)
++
++#define READ32(x) (x) = ntohl(*p++)
++#define READ64(x) do { \
++ (x) = (uint64_t)ntohl(*p++) << 32; \
++ (x) |= ntohl(*p++); \
++} while (0)
++#define COPYMEM(x, nbytes) do { \
++ memcpy((x), p, nbytes); \
++ p += XDR_QUADLEN(nbytes); \
++} while (0)
++#define READ_DEVID(x) COPYMEM((x)->data, NFS4_DEVICEID4_SIZE)
++#define READ_SECTOR(x) do { \
++ READ64(tmp); \
++ if (tmp & 0x1ff) { \
++ printk(KERN_WARNING \
++ "%s Value not 512-byte aligned at line %d\n", \
++ __func__, __LINE__); \
++ goto out_err; \
++ } \
++ (x) = tmp >> 9; \
++} while (0)
++
++#define WRITE32(n) do { \
++ *p++ = htonl(n); \
++ } while (0)
++#define WRITE64(n) do { \
++ *p++ = htonl((uint32_t)((n) >> 32)); \
++ *p++ = htonl((uint32_t)(n)); \
++} while (0)
++#define WRITEMEM(ptr, nbytes) do { \
++ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \
++} while (0)
++#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_DEVICEID4_SIZE)
++
++/* blocklayoutdev.c */
++struct block_device *nfs4_blkdev_get(dev_t dev);
++int nfs4_blkdev_put(struct block_device *bdev);
++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
++ struct pnfs_device *dev,
++ struct list_head *sdlist);
++int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
++ struct nfs4_layoutget_res *lgr);
++int nfs4_blk_create_block_disk_list(struct list_head *);
++void nfs4_blk_destroy_disk_list(struct list_head *);
++/* blocklayoutdm.c */
++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *);
++void free_block_dev(struct pnfs_block_dev *bdev);
++/* extents.c */
++struct pnfs_block_extent *
++find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
++ struct pnfs_block_extent **cow_read);
++int mark_initialized_sectors(struct pnfs_inval_markings *marks,
++ sector_t offset, sector_t length,
++ sector_t **pages);
++void put_extent(struct pnfs_block_extent *be);
++struct pnfs_block_extent *alloc_extent(void);
++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be);
++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect);
++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
++ struct xdr_stream *xdr,
++ const struct nfs4_layoutcommit_args *arg);
++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
++ const struct nfs4_layoutcommit_args *arg,
++ int status);
++int add_and_merge_extent(struct pnfs_block_layout *bl,
++ struct pnfs_block_extent *new);
++int mark_for_commit(struct pnfs_block_extent *be,
++ sector_t offset, sector_t length);
++
++#include <linux/sunrpc/simple_rpc_pipefs.h>
++
++extern struct pipefs_list bl_device_list;
++extern struct dentry *bl_device_pipe;
++
++int bl_pipe_init(void);
++void bl_pipe_exit(void);
++
++#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
++#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
++#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
++#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
++#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
++
++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
+diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
+new file mode 100644
+index 0000000..17bd25a
+--- /dev/null
++++ b/fs/nfs/blocklayout/blocklayoutdev.c
+@@ -0,0 +1,335 @@
+/*
-+ * linux/fs/nfs/blocklayout/blocklayoutdm.c
++ * linux/fs/nfs/blocklayout/blocklayoutdev.c
+ *
-+ * Module for the NFSv4.1 pNFS block layout driver.
++ * Device operations for the pnfs nfs4 block layout driver.
+ *
-+ * Copyright (c) 2007 The Regents of the University of Michigan.
++ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
-+ * Fred Isaman <iisaman at umich.edu>
+ * Andy Adamson <andros at citi.umich.edu>
++ * Fred Isaman <iisaman at umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
@@ -3271,404 +3386,441 @@ diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.3
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
++#include <linux/module.h>
++#include <linux/buffer_head.h> /* __bread */
+
-+#include <linux/genhd.h> /* gendisk - used in a dprintk*/
-+#include <linux/sched.h>
++#include <linux/genhd.h>
++#include <linux/blkdev.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
-+/* Defines used for calculating memory usage in nfs4_blk_flatten() */
-+#define ARGSIZE 24 /* Max bytes needed for linear target arg string */
-+#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE)
-+#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE)
-+#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \
-+ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE)
-+#define roundup8(x) (((x)+7) & ~7)
-+#define sizeof8(x) roundup8(sizeof(x))
-+
-+static int dev_remove(dev_t dev)
++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes)
+{
-+ int ret = 1;
-+ struct pipefs_hdr *msg = NULL, *reply = NULL;
-+ uint64_t bl_dev;
-+ uint32_t major = MAJOR(dev), minor = MINOR(dev);
++ uint32_t *q = p + XDR_QUADLEN(nbytes);
++ if (unlikely(q > end || q < p))
++ return NULL;
++ return p;
++}
++EXPORT_SYMBOL(blk_overflow);
+
-+ dprintk("Entering %s\n", __func__);
++/* Open a block_device by device number. */
++struct block_device *nfs4_blkdev_get(dev_t dev)
++{
++ struct block_device *bd;
+
-+ if (IS_ERR(bl_device_pipe))
-+ return ret;
++ dprintk("%s enter\n", __func__);
++ bd = open_by_devnum(dev, FMODE_READ);
++ if (IS_ERR(bd))
++ goto fail;
++ return bd;
++fail:
++ dprintk("%s failed to open device : %ld\n",
++ __func__, PTR_ERR(bd));
++ return NULL;
++}
+
-+ memcpy((void *)&bl_dev, &major, sizeof(uint32_t));
-+ memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t));
-+ msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev,
-+ sizeof(uint64_t));
++/*
++ * Release the block device
++ */
++int nfs4_blkdev_put(struct block_device *bdev)
++{
++ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
++ MINOR(bdev->bd_dev));
++ bd_release(bdev);
++ return blkdev_put(bdev, FMODE_READ);
++}
++
++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded
++ * in dev->dev_addr_buf.
++ */
++struct pnfs_block_dev *
++nfs4_blk_decode_device(struct nfs_server *server,
++ struct pnfs_device *dev,
++ struct list_head *sdlist)
++{
++ struct pnfs_block_dev *rv = NULL;
++ struct block_device *bd = NULL;
++ struct pipefs_hdr *msg = NULL, *reply = NULL;
++ uint32_t major, minor;
++
++ dprintk("%s enter\n", __func__);
++
++ if (IS_ERR(bl_device_pipe))
++ return NULL;
++ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
++ dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
++ dev->mincount);
++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area,
++ dev->mincount);
+ if (IS_ERR(msg)) {
+ dprintk("ERROR: couldn't make pipefs message.\n");
-+ goto out;
++ goto out_err;
+ }
+ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
+ msg->status = BL_DEVICE_REQUEST_INIT;
+
++ dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
+ &bl_device_list, 0, 0);
++
+ if (IS_ERR(reply)) {
+ dprintk("ERROR: upcall_waitreply failed\n");
-+ goto out;
++ goto out_err;
++ }
++ if (reply->status != BL_DEVICE_REQUEST_PROC) {
++ dprintk("%s failed to open device: %ld\n",
++ __func__, PTR_ERR(bd));
++ goto out_err;
++ }
++ memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t));
++ memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)),
++ sizeof(uint32_t));
++ bd = nfs4_blkdev_get(MKDEV(major, minor));
++ if (IS_ERR(bd)) {
++ dprintk("%s failed to open device : %ld\n",
++ __func__, PTR_ERR(bd));
++ goto out_err;
+ }
+
-+ if (reply->status == BL_DEVICE_REQUEST_PROC)
-+ ret = 0; /*TODO: what to return*/
-+out:
++ rv = kzalloc(sizeof(*rv), GFP_KERNEL);
++ if (!rv)
++ goto out_err;
++
++ rv->bm_mdev = bd;
++ memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
++ dprintk("%s Created device %s with bd_block_size %u\n",
++ __func__,
++ bd->bd_disk->disk_name,
++ bd->bd_block_size);
++ kfree(reply);
++ kfree(msg);
++ return rv;
++
++out_err:
++ kfree(rv);
+ if (!IS_ERR(reply))
+ kfree(reply);
+ if (!IS_ERR(msg))
+ kfree(msg);
-+ return ret;
++ return NULL;
+}
+
-+/*
-+ * Release meta device
-+ */
-+static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
++/* Map deviceid returned by the server to constructed block_device */
++static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
++ struct nfs4_deviceid *id)
+{
-+ int rv;
-+
-+ dprintk("%s Releasing\n", __func__);
-+ /* XXX Check return? */
-+ rv = nfs4_blkdev_put(bdev->bm_mdev);
-+ dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv);
++ struct block_device *rv = NULL;
++ struct block_mount_id *mid;
++ struct pnfs_block_dev *dev;
+
-+ rv = dev_remove(bdev->bm_mdev->bd_dev);
-+ dprintk("%s Returns %d\n", __func__, rv);
++ dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
++ mid = BLK_ID(lo);
++ spin_lock(&mid->bm_lock);
++ list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
++ if (memcmp(id->data, dev->bm_mdevid.data,
++ NFS4_DEVICEID4_SIZE) == 0) {
++ rv = dev->bm_mdev;
++ goto out;
++ }
++ }
++ out:
++ spin_unlock(&mid->bm_lock);
++ dprintk("%s returning %p\n", __func__, rv);
+ return rv;
+}
+
-+void free_block_dev(struct pnfs_block_dev *bdev)
++/* Tracks info needed to ensure extents in layout obey constraints of spec */
++struct layout_verification {
++ u32 mode; /* R or RW */
++ u64 start; /* Expected start of next non-COW extent */
++ u64 inval; /* Start of INVAL coverage */
++ u64 cowread; /* End of COW read coverage */
++};
++
++/* Verify the extent meets the layout requirements of the pnfs-block draft,
++ * section 2.3.1.
++ */
++static int verify_extent(struct pnfs_block_extent *be,
++ struct layout_verification *lv)
+{
-+ if (bdev) {
-+ if (bdev->bm_mdev) {
-+ dprintk("%s Removing DM device: %d:%d\n",
-+ __func__,
-+ MAJOR(bdev->bm_mdev->bd_dev),
-+ MINOR(bdev->bm_mdev->bd_dev));
-+ /* XXX Check status ?? */
-+ nfs4_blk_metadev_release(bdev);
-+ }
-+ kfree(bdev);
++ if (lv->mode == IOMODE_READ) {
++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
++ be->be_state == PNFS_BLOCK_INVALID_DATA)
++ return -EIO;
++ if (be->be_f_offset != lv->start)
++ return -EIO;
++ lv->start += be->be_length;
++ return 0;
+ }
++ /* lv->mode == IOMODE_RW */
++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
++ if (be->be_f_offset != lv->start)
++ return -EIO;
++ if (lv->cowread > lv->start)
++ return -EIO;
++ lv->start += be->be_length;
++ lv->inval = lv->start;
++ return 0;
++ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
++ if (be->be_f_offset != lv->start)
++ return -EIO;
++ lv->start += be->be_length;
++ return 0;
++ } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
++ if (be->be_f_offset > lv->start)
++ return -EIO;
++ if (be->be_f_offset < lv->inval)
++ return -EIO;
++ if (be->be_f_offset < lv->cowread)
++ return -EIO;
++ /* It looks like you might want to min this with lv->start,
++ * but you really don't.
++ */
++ lv->inval = lv->inval + be->be_length;
++ lv->cowread = be->be_f_offset + be->be_length;
++ return 0;
++ } else
++ return -EIO;
+}
-diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.h
---- linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-09-30 12:25:08.221275000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.h 2010-09-30 12:25:08.222281000 -0400
-@@ -0,0 +1,302 @@
-+/*
-+ * linux/fs/nfs/blocklayout/blocklayout.h
-+ *
-+ * Module for the NFSv4.1 pNFS block layout driver.
-+ *
-+ * Copyright (c) 2006 The Regents of the University of Michigan.
-+ * All rights reserved.
-+ *
-+ * Andy Adamson <andros at citi.umich.edu>
-+ * Fred Isaman <iisaman at umich.edu>
-+ *
-+ * permission is granted to use, copy, create derivative works and
-+ * redistribute this software and such derivative works for any purpose,
-+ * so long as the name of the university of michigan is not used in
-+ * any advertising or publicity pertaining to the use or distribution
-+ * of this software without specific, written prior authorization. if
-+ * the above copyright notice or any other identification of the
-+ * university of michigan is included in any copy of any portion of
-+ * this software, then the disclaimer below must also be included.
-+ *
-+ * this software is provided as is, without representation from the
-+ * university of michigan as to its fitness for any purpose, and without
-+ * warranty by the university of michigan of any kind, either express
-+ * or implied, including without limitation the implied warranties of
-+ * merchantability and fitness for a particular purpose. the regents
-+ * of the university of michigan shall not be liable for any damages,
-+ * including special, indirect, incidental, or consequential damages,
-+ * with respect to any claim arising out or in connection with the use
-+ * of the software, even if it has been or is hereafter advised of the
-+ * possibility of such damages.
-+ */
-+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
-+#define FS_NFS_NFS4BLOCKLAYOUT_H
-+
-+#include <linux/nfs_fs.h>
-+#include <linux/dm-ioctl.h> /* Needed for struct dm_ioctl*/
-+#include "../pnfs.h"
-+
-+#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9)
+
-+#define PG_pnfserr PG_owner_priv_1
-+#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags)
-+#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags)
-+#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags)
-+
-+extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */
-+extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */
-+extern int dm_do_resume(struct dm_ioctl *param);
-+extern int dm_table_load(struct dm_ioctl *param, size_t param_size);
++/* XDR decode pnfs_block_layout4 structure */
++int
++nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
++ struct nfs4_layoutget_res *lgr)
++{
++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
++ uint32_t *p = (uint32_t *)lgr->layout.buf;
++ uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len);
++ int i, status = -EIO;
++ uint32_t count;
++ struct pnfs_block_extent *be = NULL, *save;
++ uint64_t tmp; /* Used by READ_SECTOR */
++ struct layout_verification lv = {
++ .mode = lgr->range.iomode,
++ .start = lgr->range.offset >> 9,
++ .inval = lgr->range.offset >> 9,
++ .cowread = lgr->range.offset >> 9,
++ };
+
-+struct block_mount_id {
-+ spinlock_t bm_lock; /* protects list */
-+ struct list_head bm_devlist; /* holds pnfs_block_dev */
-+};
++ LIST_HEAD(extents);
+
-+struct pnfs_block_dev {
-+ struct list_head bm_node;
-+ struct nfs4_deviceid bm_mdevid; /* associated devid */
-+ struct block_device *bm_mdev; /* meta device itself */
-+};
++ BLK_READBUF(p, end, 4);
++ READ32(count);
+
-+/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */
-+struct visible_block_device {
-+ struct list_head vi_node;
-+ struct block_device *vi_bdev;
-+ int vi_mapped;
-+ int vi_put_done;
-+};
++ dprintk("%s enter, number of extents %i\n", __func__, count);
++ BLK_READBUF(p, end, (28 + NFS4_DEVICEID4_SIZE) * count);
+
-+enum blk_vol_type {
-+ PNFS_BLOCK_VOLUME_SIMPLE = 0, /* maps to a single LU */
-+ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */
-+ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */
-+ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */
-+};
++ /* Decode individual extents, putting them in temporary
++ * staging area until whole layout is decoded to make error
++ * recovery easier.
++ */
++ for (i = 0; i < count; i++) {
++ be = alloc_extent();
++ if (!be) {
++ status = -ENOMEM;
++ goto out_err;
++ }
++ READ_DEVID(&be->be_devid);
++ be->be_mdev = translate_devid(lo, &be->be_devid);
++ if (!be->be_mdev)
++ goto out_err;
++ /* The next three values are read in as bytes,
++ * but stored as 512-byte sector lengths
++ */
++ READ_SECTOR(be->be_f_offset);
++ READ_SECTOR(be->be_length);
++ READ_SECTOR(be->be_v_offset);
++ READ32(be->be_state);
++ if (be->be_state == PNFS_BLOCK_INVALID_DATA)
++ be->be_inval = &bl->bl_inval;
++ if (verify_extent(be, &lv)) {
++ dprintk("%s verify failed\n", __func__);
++ goto out_err;
++ }
++ list_add_tail(&be->be_node, &extents);
++ }
++ if (p != end) {
++ dprintk("%s Undecoded cruft at end of opaque\n", __func__);
++ be = NULL;
++ goto out_err;
++ }
++ if (lgr->range.offset + lgr->range.length != lv.start << 9) {
++ dprintk("%s Final length mismatch\n", __func__);
++ be = NULL;
++ goto out_err;
++ }
++ if (lv.start < lv.cowread) {
++ dprintk("%s Final uncovered COW extent\n", __func__);
++ be = NULL;
++ goto out_err;
++ }
++ /* Extents decoded properly, now try to merge them in to
++ * existing layout extents.
++ */
++ spin_lock(&bl->bl_ext_lock);
++ list_for_each_entry_safe(be, save, &extents, be_node) {
++ list_del(&be->be_node);
++ status = add_and_merge_extent(bl, be);
++ if (status) {
++ spin_unlock(&bl->bl_ext_lock);
++ /* This is a fairly catastrophic error, as the
++ * entire layout extent lists are now corrupted.
++ * We should have some way to distinguish this.
++ */
++ be = NULL;
++ goto out_err;
++ }
++ }
++ spin_unlock(&bl->bl_ext_lock);
++ status = 0;
++ out:
++ dprintk("%s returns %i\n", __func__, status);
++ return status;
+
-+/* All disk offset/lengths are stored in 512-byte sectors */
-+struct pnfs_blk_volume {
-+ uint32_t bv_type;
-+ sector_t bv_size;
-+ struct pnfs_blk_volume **bv_vols;
-+ int bv_vol_n;
-+ union {
-+ dev_t bv_dev;
-+ sector_t bv_stripe_unit;
-+ sector_t bv_offset;
-+ };
-+};
++ out_err:
++ put_extent(be);
++ while (!list_empty(&extents)) {
++ be = list_first_entry(&extents, struct pnfs_block_extent,
++ be_node);
++ list_del(&be->be_node);
++ put_extent(be);
++ }
++ goto out;
++}
+diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
+new file mode 100644
+index 0000000..097dd05
+--- /dev/null
++++ b/fs/nfs/blocklayout/blocklayoutdm.c
+@@ -0,0 +1,120 @@
++/*
++ * linux/fs/nfs/blocklayout/blocklayoutdm.c
++ *
++ * Module for the NFSv4.1 pNFS block layout driver.
++ *
++ * Copyright (c) 2007 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Fred Isaman <iisaman at umich.edu>
++ * Andy Adamson <andros at citi.umich.edu>
++ *
++ * permission is granted to use, copy, create derivative works and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the university of michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. if
++ * the above copyright notice or any other identification of the
++ * university of michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ *
++ * this software is provided as is, without representation from the
++ * university of michigan as to its fitness for any purpose, and without
++ * warranty by the university of michigan of any kind, either express
++ * or implied, including without limitation the implied warranties of
++ * merchantability and fitness for a particular purpose. the regents
++ * of the university of michigan shall not be liable for any damages,
++ * including special, indirect, incidental, or consequential damages,
++ * with respect to any claim arising out or in connection with the use
++ * of the software, even if it has been or is hereafter advised of the
++ * possibility of such damages.
++ */
+
-+/* Since components need not be aligned, cannot use sector_t */
-+struct pnfs_blk_sig_comp {
-+ int64_t bs_offset; /* In bytes */
-+ uint32_t bs_length; /* In bytes */
-+ char *bs_string;
-+};
++#include <linux/genhd.h> /* gendisk - used in a dprintk*/
++#include <linux/sched.h>
++#include <linux/hash.h>
+
-+/* Maximum number of signatures components in a simple volume */
-+# define PNFS_BLOCK_MAX_SIG_COMP 16
++#include "blocklayout.h"
+
-+struct pnfs_blk_sig {
-+ int si_num_comps;
-+ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP];
-+};
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
-+enum exstate4 {
-+ PNFS_BLOCK_READWRITE_DATA = 0,
-+ PNFS_BLOCK_READ_DATA = 1,
-+ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
-+ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
-+};
++/* Defines used for calculating memory usage in nfs4_blk_flatten() */
++#define ARGSIZE 24 /* Max bytes needed for linear target arg string */
++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE)
++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE)
++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \
++ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE)
++#define roundup8(x) (((x)+7) & ~7)
++#define sizeof8(x) roundup8(sizeof(x))
+
-+#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
++static int dev_remove(dev_t dev)
++{
++ int ret = 1;
++ struct pipefs_hdr *msg = NULL, *reply = NULL;
++ uint64_t bl_dev;
++ uint32_t major = MAJOR(dev), minor = MINOR(dev);
+
-+struct my_tree_t {
-+ sector_t mtt_step_size; /* Internal sector alignment */
-+ struct list_head mtt_stub; /* Should be a radix tree */
-+};
++ dprintk("Entering %s\n", __func__);
+
-+struct pnfs_inval_markings {
-+ spinlock_t im_lock;
-+ struct my_tree_t im_tree; /* Sectors that need LAYOUTCOMMIT */
-+ sector_t im_block_size; /* Server blocksize in sectors */
-+};
++ if (IS_ERR(bl_device_pipe))
++ return ret;
+
-+struct pnfs_inval_tracking {
-+ struct list_head it_link;
-+ int it_sector;
-+ int it_tags;
-+};
++ memcpy((void *)&bl_dev, &major, sizeof(uint32_t));
++ memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t));
++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev,
++ sizeof(uint64_t));
++ if (IS_ERR(msg)) {
++ dprintk("ERROR: couldn't make pipefs message.\n");
++ goto out;
++ }
++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
++ msg->status = BL_DEVICE_REQUEST_INIT;
+
-+/* sector_t fields are all in 512-byte sectors */
-+struct pnfs_block_extent {
-+ struct kref be_refcnt;
-+ struct list_head be_node; /* link into lseg list */
-+ struct nfs4_deviceid be_devid; /* STUB - remevable??? */
-+ struct block_device *be_mdev;
-+ sector_t be_f_offset; /* the starting offset in the file */
-+ sector_t be_length; /* the size of the extent */
-+ sector_t be_v_offset; /* the starting offset in the volume */
-+ enum exstate4 be_state; /* the state of this extent */
-+ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
-+};
++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
++ &bl_device_list, 0, 0);
++ if (IS_ERR(reply)) {
++ dprintk("ERROR: upcall_waitreply failed\n");
++ goto out;
++ }
+
-+/* Shortened extent used by LAYOUTCOMMIT */
-+struct pnfs_block_short_extent {
-+ struct list_head bse_node;
-+ struct nfs4_deviceid bse_devid; /* STUB - removable??? */
-+ struct block_device *bse_mdev;
-+ sector_t bse_f_offset; /* the starting offset in the file */
-+ sector_t bse_length; /* the size of the extent */
-+};
++ if (reply->status == BL_DEVICE_REQUEST_PROC)
++ ret = 0; /*TODO: what to return*/
++out:
++ if (!IS_ERR(reply))
++ kfree(reply);
++ if (!IS_ERR(msg))
++ kfree(msg);
++ return ret;
++}
+
-+static inline void
-+INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
++/*
++ * Release meta device
++ */
++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
+{
-+ spin_lock_init(&marks->im_lock);
-+ INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
-+ marks->im_block_size = blocksize;
-+ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
-+ blocksize);
-+}
++ int rv;
+
-+enum extentclass4 {
-+ RW_EXTENT = 0, /* READWRTE and INVAL */
-+ RO_EXTENT = 1, /* READ and NONE */
-+ EXTENT_LISTS = 2,
-+};
++ dprintk("%s Releasing\n", __func__);
++ /* XXX Check return? */
++ rv = nfs4_blkdev_put(bdev->bm_mdev);
++ dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv);
+
-+static inline int choose_list(enum exstate4 state)
++ rv = dev_remove(bdev->bm_mdev->bd_dev);
++ dprintk("%s Returns %d\n", __func__, rv);
++ return rv;
++}
++
++void free_block_dev(struct pnfs_block_dev *bdev)
+{
-+ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
-+ return RO_EXTENT;
-+ else
-+ return RW_EXTENT;
-+}
-+
-+struct pnfs_block_layout {
-+ struct pnfs_layout_hdr bl_layout;
-+ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
-+ spinlock_t bl_ext_lock; /* Protects list manipulation */
-+ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
-+ struct list_head bl_commit; /* Needs layout commit */
-+ unsigned int bl_count; /* entries in bl_commit */
-+ sector_t bl_blocksize; /* Server blocksize in sectors */
-+};
-+
-+/* this struct is comunicated between:
-+ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit
-+ */
-+struct bl_layoutupdate_data {
-+ struct list_head ranges;
-+};
-+
-+#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->inode)->pnfs_ld_data))
-+
-+static inline struct pnfs_block_layout *
-+BLK_LO2EXT(struct pnfs_layout_hdr *lo)
-+{
-+ return container_of(lo, struct pnfs_block_layout, bl_layout);
-+}
-+
-+static inline struct pnfs_block_layout *
-+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
-+{
-+ return BLK_LO2EXT(lseg->layout);
++ if (bdev) {
++ if (bdev->bm_mdev) {
++ dprintk("%s Removing DM device: %d:%d\n",
++ __func__,
++ MAJOR(bdev->bm_mdev->bd_dev),
++ MINOR(bdev->bm_mdev->bd_dev));
++ /* XXX Check status ?? */
++ nfs4_blk_metadev_release(bdev);
++ }
++ kfree(bdev);
++ }
+}
-+
-+uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes);
-+
-+#define BLK_READBUF(p, e, nbytes) do { \
-+ p = blk_overflow(p, e, nbytes); \
-+ if (!p) { \
-+ printk(KERN_WARNING \
-+ "%s: reply buffer overflowed in line %d.\n", \
-+ __func__, __LINE__); \
-+ goto out_err; \
-+ } \
-+} while (0)
-+
-+#define READ32(x) (x) = ntohl(*p++)
-+#define READ64(x) do { \
-+ (x) = (uint64_t)ntohl(*p++) << 32; \
-+ (x) |= ntohl(*p++); \
-+} while (0)
-+#define COPYMEM(x, nbytes) do { \
-+ memcpy((x), p, nbytes); \
-+ p += XDR_QUADLEN(nbytes); \
-+} while (0)
-+#define READ_DEVID(x) COPYMEM((x)->data, NFS4_DEVICEID4_SIZE)
-+#define READ_SECTOR(x) do { \
-+ READ64(tmp); \
-+ if (tmp & 0x1ff) { \
-+ printk(KERN_WARNING \
-+ "%s Value not 512-byte aligned at line %d\n", \
-+ __func__, __LINE__); \
-+ goto out_err; \
-+ } \
-+ (x) = tmp >> 9; \
-+} while (0)
-+
-+#define WRITE32(n) do { \
-+ *p++ = htonl(n); \
-+ } while (0)
-+#define WRITE64(n) do { \
-+ *p++ = htonl((uint32_t)((n) >> 32)); \
-+ *p++ = htonl((uint32_t)(n)); \
-+} while (0)
-+#define WRITEMEM(ptr, nbytes) do { \
-+ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \
-+} while (0)
-+#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_DEVICEID4_SIZE)
-+
-+/* blocklayoutdev.c */
-+struct block_device *nfs4_blkdev_get(dev_t dev);
-+int nfs4_blkdev_put(struct block_device *bdev);
-+struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
-+ struct pnfs_device *dev,
-+ struct list_head *sdlist);
-+int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
-+ struct nfs4_layoutget_res *lgr);
-+int nfs4_blk_create_block_disk_list(struct list_head *);
-+void nfs4_blk_destroy_disk_list(struct list_head *);
-+/* blocklayoutdm.c */
-+int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *);
-+void free_block_dev(struct pnfs_block_dev *bdev);
-+/* extents.c */
-+struct pnfs_block_extent *
-+find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
-+ struct pnfs_block_extent **cow_read);
-+int mark_initialized_sectors(struct pnfs_inval_markings *marks,
-+ sector_t offset, sector_t length,
-+ sector_t **pages);
-+void put_extent(struct pnfs_block_extent *be);
-+struct pnfs_block_extent *alloc_extent(void);
-+struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be);
-+int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect);
-+int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-+ struct xdr_stream *xdr,
-+ const struct nfs4_layoutcommit_args *arg);
-+void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
-+ const struct nfs4_layoutcommit_args *arg,
-+ int status);
-+int add_and_merge_extent(struct pnfs_block_layout *bl,
-+ struct pnfs_block_extent *new);
-+int mark_for_commit(struct pnfs_block_extent *be,
-+ sector_t offset, sector_t length);
-+
-+#include <linux/sunrpc/simple_rpc_pipefs.h>
-+
-+extern struct pipefs_list bl_device_list;
-+extern struct dentry *bl_device_pipe;
-+
-+int bl_pipe_init(void);
-+void bl_pipe_exit(void);
-+
-+#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
-+#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
-+#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
-+#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
-+#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
-+
-+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
-diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.35.noarch/fs/nfs/blocklayout/extents.c
---- linux-2.6.35.noarch/fs/nfs/blocklayout/extents.c.orig 2010-09-30 12:25:08.234277000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/blocklayout/extents.c 2010-09-30 12:25:08.236277000 -0400
+diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
+new file mode 100644
+index 0000000..40dff82
+--- /dev/null
++++ b/fs/nfs/blocklayout/extents.c
@@ -0,0 +1,948 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.h
@@ -4618,19 +4770,10 @@ diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.35.noar
+ }
+ }
+}
-diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.35.noarch/fs/nfs/blocklayout/Makefile
---- linux-2.6.35.noarch/fs/nfs/blocklayout/Makefile.orig 2010-09-30 12:25:08.207275000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/blocklayout/Makefile 2010-09-30 12:25:08.208278000 -0400
-@@ -0,0 +1,6 @@
-+#
-+# Makefile for the pNFS block layout driver kernel module
-+#
-+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-+blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \
-+ extents.o block-device-discovery-pipe.o
-diff -up linux-2.6.35.noarch/fs/nfs/callback.h.orig linux-2.6.35.noarch/fs/nfs/callback.h
---- linux-2.6.35.noarch/fs/nfs/callback.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/callback.h 2010-09-30 12:25:08.241277000 -0400
+diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
+index 85a7cfd..1f92ceb 100644
+--- a/fs/nfs/callback.h
++++ b/fs/nfs/callback.h
@@ -8,6 +8,8 @@
#ifndef __LINUX_FS_NFS_CALLBACK_H
#define __LINUX_FS_NFS_CALLBACK_H
@@ -4640,7 +4783,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback.h.orig linux-2.6.35.noarch/fs/nfs/c
#define NFS4_CALLBACK 0x40000000
#define NFS4_CALLBACK_XDRSIZE 2048
#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
-@@ -111,6 +113,13 @@ extern int nfs41_validate_delegation_sta
+@@ -111,6 +113,13 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
#define RCA4_TYPE_MASK_RDATA_DLG 0
#define RCA4_TYPE_MASK_WDATA_DLG 1
@@ -4694,9 +4837,10 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback.h.orig linux-2.6.35.noarch/fs/nfs/c
#endif /* CONFIG_NFS_V4_1 */
extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
-diff -up linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig linux-2.6.35.noarch/fs/nfs/callback_proc.c
---- linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig 2010-09-30 12:22:45.088040000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/callback_proc.c 2010-09-30 12:25:08.247277000 -0400
+diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
+index 930d10f..28816ab 100644
+--- a/fs/nfs/callback_proc.c
++++ b/fs/nfs/callback_proc.c
@@ -8,10 +8,14 @@
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
@@ -4712,7 +4856,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig linux-2.6.35.noarch/fs/
#ifdef NFS_DEBUG
#define NFSDBG_FACILITY NFSDBG_CALLBACK
-@@ -113,16 +117,349 @@ int nfs4_validate_delegation_stateid(str
+@@ -113,16 +117,338 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
#if defined(CONFIG_NFS_V4_1)
@@ -4903,8 +5047,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig linux-2.6.35.noarch/fs/
+
+ init_completion(&data.started);
+ __module_get(THIS_MODULE);
-+ if (!atomic_inc_not_zero(&clp->cl_count))
-+ goto out_put_no_client;
++ atomic_inc(&clp->cl_count);
+
+ t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
+ if (IS_ERR(t)) {
@@ -4919,7 +5062,6 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig linux-2.6.35.noarch/fs/
+ return data.result;
+out_module_put:
+ nfs_put_client(clp);
-+out_put_no_client:
+ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
+ module_put(THIS_MODULE);
+ return status;
@@ -4969,25 +5111,16 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig linux-2.6.35.noarch/fs/
+ /* the callback must come from the MDS personality */
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
+ goto loop;
-+ if (args->cbl_recall_type == RETURN_FILE) {
-+ inode = nfs_layoutrecall_find_inode(clp, args);
-+ if (inode != NULL) {
-+ status = pnfs_async_return_layout(clp, inode,
-+ args);
-+ if (status)
-+ res = cpu_to_be32(NFS4ERR_DELAY);
-+ iput(inode);
-+ }
-+ } else { /* _ALL or _FSID */
-+ /* we need the inode to get the nfs_server struct */
-+ inode = nfs_layoutrecall_find_inode(clp, args);
-+ if (!inode)
-+ goto loop;
-+ status = pnfs_async_return_layout(clp, inode, args);
-+ if (status)
-+ res = cpu_to_be32(NFS4ERR_DELAY);
-+ iput(inode);
-+ }
++ /* In the _ALL or _FSID case, we need the inode to get
++ * the nfs_server struct.
++ */
++ inode = nfs_layoutrecall_find_inode(clp, args);
++ if (!inode)
++ goto loop;
++ status = pnfs_async_return_layout(clp, inode, args);
++ if (status)
++ res = cpu_to_be32(NFS4ERR_DELAY);
++ iput(inode);
+loop:
+ clp = nfs_find_client_next(prev);
+ nfs_put_client(prev);
@@ -5066,7 +5199,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig linux-2.6.35.noarch/fs/
return 0;
return 1;
-@@ -324,13 +661,37 @@ out:
+@@ -324,13 +650,37 @@ out:
return status;
}
@@ -5105,14 +5238,14 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig linux-2.6.35.noarch/fs/
clp = nfs_find_client(args->craa_addr, 4);
if (clp == NULL)
goto out;
-@@ -338,16 +699,25 @@ __be32 nfs4_callback_recallany(struct cb
+@@ -338,16 +688,27 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
dprintk("NFS: RECALL_ANY callback request from %s\n",
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+ status = cpu_to_be32(NFS4ERR_INVAL);
+ if (!validate_bitmap_values((const unsigned long *)
+ &args->craa_type_mask))
-+ return status;
++ goto out_put;
+
+ status = cpu_to_be32(NFS4_OK);
if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
@@ -5129,12 +5262,15 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig linux-2.6.35.noarch/fs/
if (flags)
nfs_expire_all_delegation_types(clp, flags);
- status = htonl(NFS4_OK);
++out_put:
++ nfs_put_client(clp);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
-diff -up linux-2.6.35.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.35.noarch/fs/nfs/callback_xdr.c
---- linux-2.6.35.noarch/fs/nfs/callback_xdr.c.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/callback_xdr.c 2010-09-30 12:25:08.253277000 -0400
+diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
+index 05af212..fbfa2b9 100644
+--- a/fs/nfs/callback_xdr.c
++++ b/fs/nfs/callback_xdr.c
@@ -22,6 +22,8 @@
#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -5292,7 +5428,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.35.noarch/fs/n
static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
-@@ -574,11 +717,11 @@ preprocess_nfs41_op(int nop, unsigned in
+@@ -574,11 +717,11 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_SEQUENCE:
case OP_CB_RECALL_ANY:
case OP_CB_RECALL_SLOT:
@@ -5306,7 +5442,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.35.noarch/fs/n
case OP_CB_NOTIFY:
case OP_CB_PUSH_DELEG:
case OP_CB_RECALLABLE_OBJ_AVAIL:
-@@ -739,6 +882,18 @@ static struct callback_op callback_ops[]
+@@ -739,6 +882,18 @@ static struct callback_op callback_ops[] = {
.res_maxsize = CB_OP_RECALL_RES_MAXSZ,
},
#if defined(CONFIG_NFS_V4_1)
@@ -5325,9 +5461,10 @@ diff -up linux-2.6.35.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.35.noarch/fs/n
[OP_CB_SEQUENCE] = {
.process_op = (callback_process_op_t)nfs4_callback_sequence,
.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
-diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/client.c
---- linux-2.6.35.noarch/fs/nfs/client.c.orig 2010-09-30 12:22:45.093040000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/client.c 2010-09-30 12:25:08.259280000 -0400
+diff --git a/fs/nfs/client.c b/fs/nfs/client.c
+index e734072..9e1135e 100644
+--- a/fs/nfs/client.c
++++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
#include "iostat.h"
#include "internal.h"
@@ -5336,7 +5473,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/cli
#define NFSDBG_FACILITY NFSDBG_CLIENT
-@@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_clie
+@@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
cred = rpc_lookup_machine_cred();
if (!IS_ERR(cred))
clp->cl_machine_cred = cred;
@@ -5347,7 +5484,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/cli
nfs_fscache_get_client_cookie(clp);
return clp;
-@@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *c
+@@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
nfs_free_client(clp);
}
}
@@ -5355,7 +5492,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/cli
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
/*
-@@ -344,7 +348,7 @@ static int nfs_sockaddr_match_ipaddr(con
+@@ -344,7 +348,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
* Test if two socket addresses represent the same actual socket,
* by comparing (only) relevant fields, including the port number.
*/
@@ -5364,7 +5501,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/cli
const struct sockaddr *sa2)
{
if (sa1->sa_family != sa2->sa_family)
-@@ -358,6 +362,7 @@ static int nfs_sockaddr_cmp(const struct
+@@ -358,6 +362,7 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
}
return 0;
}
@@ -5372,7 +5509,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/cli
/*
* Find a client by IP address and protocol version
-@@ -549,6 +554,7 @@ int nfs4_check_client_ready(struct nfs_c
+@@ -549,6 +554,7 @@ int nfs4_check_client_ready(struct nfs_client *clp)
return -EPROTONOSUPPORT;
return 0;
}
@@ -5389,7 +5526,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/cli
{
unsigned long max_rpc_payload;
-@@ -898,6 +904,10 @@ static void nfs_server_set_fsinfo(struct
+@@ -898,6 +904,10 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
server->wsize = NFS_MAX_FILE_IO_SIZE;
server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -5400,7 +5537,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/cli
server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
-@@ -934,12 +944,13 @@ static int nfs_probe_fsinfo(struct nfs_s
+@@ -934,12 +944,13 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
goto out_error;
}
@@ -5415,7 +5552,7 @@ diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/cli
/* Get some general file system info */
if (server->namelen == 0) {
-@@ -1017,6 +1028,7 @@ void nfs_free_server(struct nfs_server *
+@@ -1017,6 +1028,7 @@ void nfs_free_server(struct nfs_server *server)
{
dprintk("--> nfs_free_server()\n");
@@ -5440,8044 +5577,5949 @@ diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/cli
/*
-diff -up linux-2.6.35.noarch/fs/nfsd/bl_com.c.orig linux-2.6.35.noarch/fs/nfsd/bl_com.c
---- linux-2.6.35.noarch/fs/nfsd/bl_com.c.orig 2010-09-30 12:25:08.480284000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/bl_com.c 2010-09-30 12:25:08.482284000 -0400
-@@ -0,0 +1,292 @@
-+#if defined(CONFIG_SPNFS_BLOCK)
-+
-+#include <linux/module.h>
-+#include <linux/mutex.h>
-+#include <linux/init.h>
-+#include <linux/types.h>
-+#include <linux/slab.h>
-+#include <linux/socket.h>
-+#include <linux/in.h>
-+#include <linux/sched.h>
-+#include <linux/exportfs.h>
-+#include <linux/namei.h>
-+#include <linux/mount.h>
-+#include <linux/path.h>
-+#include <linux/sunrpc/clnt.h>
-+#include <linux/workqueue.h>
-+#include <linux/sunrpc/rpc_pipe_fs.h>
-+#include <linux/proc_fs.h>
-+#include <linux/nfs_fs.h>
-+
-+#include <linux/nfsd/debug.h>
-+#include <linux/nfsd4_block.h>
-+
-+#define NFSDDBG_FACILITY NFSDDBG_PNFS
-+
-+static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-+ char __user *, size_t);
-+static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
-+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-+
-+static struct rpc_pipe_ops bl_upcall_ops = {
-+ .upcall = bl_pipe_upcall,
-+ .downcall = bl_pipe_downcall,
-+ .destroy_msg = bl_pipe_destroy_msg,
-+};
-+
-+bl_comm_t *bl_comm_global;
-+
-+int
-+nfsd_bl_start(void)
+diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
+index 064a809..43786c2 100644
+--- a/fs/nfs/direct.c
++++ b/fs/nfs/direct.c
+@@ -271,6 +271,38 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
+ .rpc_release = nfs_direct_read_release,
+ };
+
++static long nfs_direct_read_execute(struct nfs_read_data *data,
++ struct rpc_task_setup *task_setup_data,
++ struct rpc_message *msg)
+{
-+ bl_comm_t *bl_comm = NULL;
-+ struct path path;
-+ struct nameidata nd;
-+ int rc;
++ struct inode *inode = data->inode;
++ struct rpc_task *task;
+
-+ dprintk("%s: starting pipe\n", __func__);
-+ if (bl_comm_global)
-+ return -EEXIST;
++ nfs_fattr_init(&data->fattr);
++ msg->rpc_argp = &data->args;
++ msg->rpc_resp = &data->res;
+
-+ path.mnt = rpc_get_mount();
-+ if (IS_ERR(path.mnt))
-+ return PTR_ERR(path.mnt);
++ task_setup_data->task = &data->task;
++ task_setup_data->callback_data = data;
++ NFS_PROTO(inode)->read_setup(data, msg);
+
-+ /* FIXME: do not abuse rpc_pipefs/nfs */
-+ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
-+ if (rc)
-+ goto err;
++ task = rpc_run_task(task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
+
-+ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL);
-+ if (!bl_comm) {
-+ rc = -ENOMEM;
-+ goto err;
-+ }
++ rpc_put_task(task);
+
-+ /* FIXME: rename to "spnfs_block" */
-+ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm,
-+ &bl_upcall_ops, 0);
-+ if (IS_ERR(bl_comm->pipe_dentry)) {
-+ rc = -EPIPE;
-+ goto err;
-+ }
-+ mutex_init(&bl_comm->lock);
-+ mutex_init(&bl_comm->pipe_lock);
-+ init_waitqueue_head(&bl_comm->pipe_wq);
++ dprintk("NFS: %5u initiated direct read call "
++ "(req %s/%lld, %u bytes @ offset %llu)\n",
++ data->task.tk_pid,
++ inode->i_sb->s_id,
++ (long long)NFS_FILEID(inode),
++ data->args.count,
++ (unsigned long long)data->args.offset);
+
-+ bl_comm_global = bl_comm;
+ return 0;
-+err:
-+ rpc_put_mount();
-+ kfree(bl_comm);
-+ return rc;
+}
+
-+void
-+nfsd_bl_stop(void)
+ /*
+ * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
+ * operation. If nfs_readdata_alloc() or get_user_pages() fails,
+@@ -287,7 +319,6 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
+ unsigned long user_addr = (unsigned long)iov->iov_base;
+ size_t count = iov->iov_len;
+ size_t rsize = NFS_SERVER(inode)->rsize;
+- struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_cred = ctx->cred,
+ };
+@@ -348,26 +379,9 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
+ data->res.fattr = &data->fattr;
+ data->res.eof = 0;
+ data->res.count = bytes;
+- nfs_fattr_init(&data->fattr);
+- msg.rpc_argp = &data->args;
+- msg.rpc_resp = &data->res;
+
+- task_setup_data.task = &data->task;
+- task_setup_data.callback_data = data;
+- NFS_PROTO(inode)->read_setup(data, &msg);
+-
+- task = rpc_run_task(&task_setup_data);
+- if (IS_ERR(task))
++ if (nfs_direct_read_execute(data, &task_setup_data, &msg))
+ break;
+- rpc_put_task(task);
+-
+- dprintk("NFS: %5u initiated direct read call "
+- "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
+- data->task.tk_pid,
+- inode->i_sb->s_id,
+- (long long)NFS_FILEID(inode),
+- bytes,
+- (unsigned long long)data->args.offset);
+
+ started += bytes;
+ user_addr += bytes;
+@@ -457,12 +471,15 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
+ }
+
+ #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
++static long nfs_direct_write_execute(struct nfs_write_data *data,
++ struct rpc_task_setup *task_setup_data,
++ struct rpc_message *msg);
++
+ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
+ {
+ struct inode *inode = dreq->inode;
+ struct list_head *p;
+ struct nfs_write_data *data;
+- struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_cred = dreq->ctx->cred,
+ };
+@@ -496,25 +513,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
+ * Reuse data->task; data->args should not have changed
+ * since the original request was sent.
+ */
+- task_setup_data.task = &data->task;
+- task_setup_data.callback_data = data;
+- msg.rpc_argp = &data->args;
+- msg.rpc_resp = &data->res;
+- NFS_PROTO(inode)->write_setup(data, &msg);
+-
+- /*
+- * We're called via an RPC callback, so BKL is already held.
+- */
+- task = rpc_run_task(&task_setup_data);
+- if (!IS_ERR(task))
+- rpc_put_task(task);
+-
+- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+- data->task.tk_pid,
+- inode->i_sb->s_id,
+- (long long)NFS_FILEID(inode),
+- data->args.count,
+- (unsigned long long)data->args.offset);
++ nfs_direct_write_execute(data, &task_setup_data, &msg);
+ }
+
+ if (put_dreq(dreq))
+@@ -557,10 +556,31 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
+ .rpc_release = nfs_direct_commit_release,
+ };
+
++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq,
++ struct nfs_write_data *data,
++ struct rpc_task_setup *task_setup_data,
++ struct rpc_message *msg)
+{
-+ bl_comm_t *c = bl_comm_global;
++ struct rpc_task *task;
+
-+ dprintk("%s: stopping pipe\n", __func__);
-+ if (!c)
-+ return;
-+ rpc_unlink(c->pipe_dentry);
-+ rpc_put_mount();
-+ bl_comm_global = NULL;
-+ kfree(c);
-+}
++ NFS_PROTO(data->inode)->commit_setup(data, msg);
+
-+static ssize_t
-+bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst,
-+ size_t buflen)
-+{
-+ char *data = (char *)msg->data + msg->copied;
-+ ssize_t mlen = msg->len - msg->copied,
-+ left;
++ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
++ dreq->commit_data = NULL;
+
-+ if (mlen > buflen)
-+ mlen = buflen;
++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+
-+ left = copy_to_user(dst, data, mlen);
-+ if (left < 0) {
-+ msg->errno = left;
-+ return left;
-+ }
-+ mlen -= left;
-+ msg->copied += mlen;
-+ msg->errno = 0;
++ task = rpc_run_task(task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
+
-+ return mlen;
++ rpc_put_task(task);
++ return 0;
+}
+
-+static ssize_t
-+bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
+ {
+ struct nfs_write_data *data = dreq->commit_data;
+- struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+@@ -589,16 +609,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
+ data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
+
+- NFS_PROTO(data->inode)->commit_setup(data, &msg);
+-
+- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
+- dreq->commit_data = NULL;
+-
+- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+-
+- task = rpc_run_task(&task_setup_data);
+- if (!IS_ERR(task))
+- rpc_put_task(task);
++ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg);
+ }
+
+ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+@@ -700,6 +711,36 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
+ .rpc_release = nfs_direct_write_release,
+ };
+
++static long nfs_direct_write_execute(struct nfs_write_data *data,
++ struct rpc_task_setup *task_setup_data,
++ struct rpc_message *msg)
+{
-+ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
-+ bl_comm_t *bc = (bl_comm_t *)rpci->private;
-+ bl_comm_msg_t *im = &bc->msg;
-+ int ret;
-+ bl_comm_res_t *res;
-+
++ struct inode *inode = data->inode;
++ struct rpc_task *task;
+
-+ if (mlen == 0) {
-+ im->msg_status = PNFS_BLOCK_FAILURE;
-+ im->msg_res = NULL;
-+ wake_up(&bc->pipe_wq);
-+ return -EFAULT;
-+ }
-+
-+ if ((res = kmalloc(mlen, GFP_KERNEL)) == NULL)
-+ return -ENOMEM;
-+
-+ if (copy_from_user(res, src, mlen)) {
-+ kfree(res);
-+ return -EFAULT;
-+ }
-+
-+ mutex_lock(&bc->pipe_lock);
-+
-+ ret = mlen;
-+ im->msg_status = res->res_status;
-+ im->msg_res = res;
-+
-+ wake_up(&bc->pipe_wq);
-+ mutex_unlock(&bc->pipe_lock);
-+ return ret;
-+}
++ task_setup_data->task = &data->task;
++ task_setup_data->callback_data = data;
++ msg->rpc_argp = &data->args;
++ msg->rpc_resp = &data->res;
++ NFS_PROTO(inode)->write_setup(data, msg);
+
-+static void
-+bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
-+{
-+ bl_comm_msg_t *im = msg->data;
-+ bl_comm_t *bc = container_of(im, struct bl_comm, msg);
-+
-+ if (msg->errno >= 0)
-+ return;
++ task = rpc_run_task(task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
+
-+ mutex_lock(&bc->pipe_lock);
-+ im->msg_status = PNFS_BLOCK_FAILURE;
-+ wake_up(&bc->pipe_wq);
-+ mutex_unlock(&bc->pipe_lock);
++ rpc_put_task(task);
++
++ dprintk("NFS: %5u initiated direct write call "
++ "(req %s/%lld, %u bytes @ offset %llu)\n",
++ data->task.tk_pid,
++ inode->i_sb->s_id,
++ (long long)NFS_FILEID(inode),
++ data->args.count,
++ (unsigned long long)data->args.offset);
++
++ return 0;
+}
+
-+int
-+bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res)
-+{
-+ struct rpc_pipe_msg msg;
-+ DECLARE_WAITQUEUE(wq, current);
-+ int rval = 1;
-+ bl_comm_msg_t *m = &bc->msg;
-+
-+ if (bc == NULL) {
-+ dprintk("%s: No pNFS block daemon available\n", __func__);
-+ return 1;
-+ }
-+
-+ mutex_lock(&bc->lock);
-+ mutex_lock(&bc->pipe_lock);
-+
-+ memcpy(m, upmsg, sizeof (*m));
-+
-+ memset(&msg, 0, sizeof (msg));
-+ msg.data = m;
-+ msg.len = sizeof (*m);
-+
-+ add_wait_queue(&bc->pipe_wq, &wq);
-+ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg);
-+ if (rval < 0) {
-+ remove_wait_queue(&bc->pipe_wq, &wq);
-+ goto out;
-+ }
-+
-+ set_current_state(TASK_UNINTERRUPTIBLE);
-+ mutex_unlock(&bc->pipe_lock);
-+ schedule();
-+ __set_current_state(TASK_RUNNING);
-+ remove_wait_queue(&bc->pipe_wq, &wq);
-+ mutex_lock(&bc->pipe_lock);
-+
-+ if (m->msg_status == PNFS_BLOCK_SUCCESS) {
-+ *res = m->msg_res;
-+ rval = 0;
-+ } else
-+ rval = 1;
-+
-+out:
-+ mutex_unlock(&bc->pipe_lock);
-+ mutex_unlock(&bc->lock);
-+ return rval;
-+}
-+
-+static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len,
-+ loff_t *offset)
-+{
-+ int cmd,
-+ rc;
-+ bl_comm_t *bc = bl_comm_global;
-+ bl_comm_msg_t msg;
-+ bl_comm_res_t *res;
-+
-+ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int)))
-+ return -EFAULT;
-+ switch (cmd) {
-+ case PNFS_BLOCK_CTL_STOP:
-+ msg.msg_type = PNFS_UPCALL_MSG_STOP;
-+ (void) bl_upcall(bc, &msg, &res);
-+ kfree(res);
-+ nfsd_bl_stop();
-+ break;
-+
-+ case PNFS_BLOCK_CTL_START:
-+ rc = nfsd_bl_start();
-+ if (rc != 0)
-+ return rc;
-+ break;
-+
-+ case PNFS_BLOCK_CTL_VERS:
-+ msg.msg_type = PNFS_UPCALL_MSG_VERS;
-+ msg.u.msg_vers = PNFS_UPCALL_VERS;
-+ if (bl_upcall(bc, &msg, &res)) {
-+ dprintk("%s: Failed to contact pNFS block daemon\n",
-+ __func__);
-+ return 0;
-+ }
-+ kfree(res);
-+ break;
-+
-+ default:
-+ dprintk("%s: unknown ctl command %d\n", __func__, cmd);
-+ break;
-+ }
-+ return len;
-+}
-+
-+static struct file_operations ctl_ops = {
-+ .write = ctl_write,
-+};
-+
-+/*
-+ * bl_init_proc -- set up proc interfaces
-+ *
-+ * Creating a pnfs_block directory isn't really required at this point
-+ * since we've only got a single node in that directory. If the need for
-+ * more nodes doesn't present itself shortly this code should revert
-+ * to a single top level node. McNeal 11-Aug-2008.
-+ */
-+int
-+bl_init_proc(void)
-+{
-+ struct proc_dir_entry *e;
-+
-+ e = proc_mkdir("fs/pnfs_block", NULL);
-+ if (!e)
-+ return -ENOMEM;
-+
-+ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL);
-+ if (!e)
-+ return -ENOMEM;
-+ e->proc_fops = &ctl_ops;
-+
-+ return 0;
-+}
-+#endif /* CONFIG_SPNFS_BLOCK */
-diff -up linux-2.6.35.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.35.noarch/fs/nfsd/bl_ops.c
---- linux-2.6.35.noarch/fs/nfsd/bl_ops.c.orig 2010-09-30 12:25:08.485284000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/bl_ops.c 2010-09-30 12:25:08.487284000 -0400
-@@ -0,0 +1,1672 @@
-+/*
-+ * bl_ops.c
-+ * spNFS
-+ *
-+ * Created by Rick McNeal on 4/1/08.
-+ * Copyright 2008 __MyCompanyName__. All rights reserved.
-+ *
-+ */
-+
-+/*
-+ * Block layout operations.
-+ *
-+ * These functions, with the exception of pnfs_block_enabled, are assigned to
-+ * the super block s_export_op structure.
-+ */
-+#if defined(CONFIG_SPNFS_BLOCK)
-+
-+#include <linux/module.h>
-+#include <linux/genhd.h>
-+#include <linux/fs.h>
-+#include <linux/exportfs.h>
-+#include <linux/nfsd4_spnfs.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
-+#include <linux/nfsd/export.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+#include <linux/nfsd/debug.h>
-+#include <linux/spinlock_types.h>
-+#include <linux/dm-ioctl.h>
-+#include <asm/uaccess.h>
-+#include <linux/falloc.h>
-+#include <linux/nfsd4_block.h>
-+
-+#include "pnfsd.h"
-+
-+#define NFSDDBG_FACILITY NFSDDBG_PNFS
-+
-+#define MIN(a, b) ((a) < (b) ? (a) : (b))
-+
-+#define BL_LAYOUT_HASH_BITS 4
-+#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS)
-+#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1)
-+#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256)
-+
-+#define bl_layout_hashval(id) \
-+ ((id) & BL_LAYOUT_HASH_MASK)
-+
-+#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len)
-+#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len)
-+#define _2SECTS(v) ((v) >> 9)
-+
-+#ifndef READ32
-+#define READ32(x) (x) = ntohl(*p++)
-+#define READ64(x) do { \
-+(x) = (u64)ntohl(*p++) << 32; \
-+(x) |= ntohl(*p++); \
-+} while (0)
-+#endif
-+
-+
-+typedef enum {True, False} boolean_t;
-+/* ---- block layoutget and commit structure ---- */
-+typedef struct bl_layout_rec {
-+ struct list_head blr_hash,
-+ blr_layouts;
-+ dev_t blr_rdev;
-+ struct inode *blr_inode;
-+ int blr_recalled; // debug
-+ u64 blr_orig_size,
-+ blr_commit_size,
-+ blr_ext_size;
-+ spinlock_t blr_lock; // Protects blr_layouts
-+} bl_layout_rec_t;
-+
-+static struct list_head layout_hash;
-+static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE];
-+static spinlock_t layout_hashtbl_lock;
-+
-+/* ---- prototypes ---- */
-+static boolean_t device_slice(dev_t devid);
-+static boolean_t device_dm(dev_t devid);
-+static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **);
-+static bl_layout_rec_t *layout_inode_find(struct inode *i);
-+static void layout_inode_del(struct inode *i);
-+static char *map_state2name(enum pnfs_block_extent_state4 s);
-+static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type);
-+static void bld_free(pnfs_blocklayout_devinfo_t *bld);
-+static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head *volumes,
-+ dev_t devid, int local_index);
-+static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes,
-+ dev_t devid, int my_loc, int idx);
-+static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h,
-+ struct nfsd4_layout_seg *seg);
-+struct list_head *layout_cache_iter(bl_layout_rec_t *r,
-+ struct list_head *bl_possible, struct nfsd4_layout_seg *seg);
-+static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h);
-+static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h);
-+static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg);
-+static void print_bll(pnfs_blocklayout_layout_t *b, char *);
-+static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r,
-+ struct list_head *h, struct nfsd4_layout_seg *seg);
-+static inline void bll_collapse(bl_layout_rec_t *r,
-+ pnfs_blocklayout_layout_t *c);
-+static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len,
-+ enum bl_cache_state state, struct list_head *h);
-+static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b,
-+ enum bl_cache_state c, struct list_head *h);
-+static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode,
-+ enum pnfs_block_extent_state4 *s);
-+static void extents_setup(struct fiemap_extent_info *fei);
-+static void extents_count(struct fiemap_extent_info *fei, struct inode *i,
-+ u64 foff, u64 len);
-+static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i,
-+ u64 foff, u64 len);
-+static boolean_t extents_process(struct fiemap_extent_info *fei,
-+ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev,
-+ pnfs_blocklayout_layout_t *b);
-+static void extents_cleanup(struct fiemap_extent_info *fei);
-+
-+void
-+nfsd_bl_init(void)
-+{
-+ int i;
-+ dprintk("%s loaded\n", __func__);
-+
-+ spin_lock_init(&layout_hashtbl_lock);
-+ INIT_LIST_HEAD(&layout_hash);
-+ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++)
-+ INIT_LIST_HEAD(&layout_hashtbl[i]);
-+ bl_init_proc();
-+}
-+
-+/*
-+ * pnfs_block_enabled -- check to see if this file system should be export as
-+ * block pnfs
-+ */
-+int
-+pnfs_block_enabled(struct inode *inode, int ex_flags)
-+{
-+ bl_comm_msg_t msg;
-+ bl_comm_res_t *res = NULL;
-+ static int bl_comm_once = 0;
-+
-+ dprintk("--> %s\n", __func__);
-+ /*
-+ * FIXME: Figure out method to determine if this file system should
-+ * be exported. The following areas need to be checked.
-+ * (1) Validate that this file system was exported as a pNFS
-+ * block-layout
-+ * (2) Has there been successful communication with the
-+ * volume daemon?
-+ */
-+ /* Check #1 */
-+#ifdef notyet
-+ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) {
-+ dprintk("%s: pnfs_block not set in export\n", __func__);
-+ return 0;
-+ }
-+#endif
-+
-+ /* Check #1 */
-+ if (!bl_comm_once) {
-+ msg.msg_type = PNFS_UPCALL_MSG_VERS;
-+ msg.u.msg_vers = PNFS_UPCALL_VERS;
-+ if (bl_upcall(bl_comm_global, &msg, &res)) {
-+ dprintk("%s: Failed to contact pNFS block daemon\n",
-+ __func__);
-+ return 0;
-+ }
-+ if (msg.u.msg_vers != res->u.vers) {
-+ dprintk("%s: vers mismatch, kernel != daemon\n",
-+ __func__);
-+ kfree(res);
-+ return 0;
-+ }
-+ }
-+ bl_comm_once = 1;
-+
-+ kfree(res);
-+
-+ dprintk("<-- %s okay\n", __func__);
-+ return 1;
-+}
-+
-+int
-+bl_layout_type(struct super_block *sb)
-+{
-+ return LAYOUT_BLOCK_VOLUME;
-+}
-+
-+int
-+bl_getdeviceiter(struct super_block *sb,
-+ u32 layout_type,
-+ struct nfsd4_pnfs_dev_iter_res *res)
-+{
-+ res->gd_eof = 1;
-+ if (res->gd_cookie)
-+ return -ENOENT;
-+ res->gd_devid = sb->s_dev;
-+ res->gd_verf = 1;
-+ res->gd_cookie = 1;
-+ return 0;
-+}
-+
-+static int
-+bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr,
-+ const struct nfsd4_pnfs_deviceid *devid)
-+{
-+ pnfs_blocklayout_devinfo_t *bld_slice_p,
-+ *bld_simple_p,
-+ *bld;
-+ int status = -EIO,
-+ location = 0;
-+ struct list_head volumes;
-+
-+ dprintk("--> %s\n", __func__);
-+ INIT_LIST_HEAD(&volumes);
-+
-+ bld_simple_p = bld_simple(&volumes, devid->devid,
-+ location++);
-+ if (!bld_simple_p)
-+ goto out;
-+ bld_slice_p = bld_slice(&volumes, devid->devid, location++,
-+ bld_simple_p->bld_index_loc);
-+
-+ if (!bld_slice_p)
-+ goto out;
-+
-+ status = blocklayout_encode_devinfo(xdr, &volumes);
-+
-+out:
-+ while (!list_empty(&volumes)) {
-+ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
-+ bld_list);
-+ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE)
-+ kfree(bld->u.simple.bld_sig);
-+ bld_free(bld);
-+ }
-+
-+ dprintk("<-- %s (rval %d)\n", __func__, status);
-+ return status;
-+}
-+
-+static int
-+bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr,
-+ const struct nfsd4_pnfs_deviceid *devid)
-+{
-+ pnfs_blocklayout_devinfo_t *bld = NULL;
-+ int status = -EIO, // default to error
-+ i,
-+ location = 0;
-+ struct list_head volumes;
-+ bl_comm_msg_t msg;
-+ bl_comm_res_t *res;
-+
-+ dprintk("--> %s\n", __func__);
-+ INIT_LIST_HEAD(&volumes);
-+
-+ msg.msg_type = PNFS_UPCALL_MSG_DMGET;
-+ msg.u.msg_dev = devid->devid;
-+ if (bl_upcall(bl_comm_global, &msg, &res)) {
-+ dprintk("%s: upcall for DMGET failed\n", __func__);
-+ goto out;
-+ }
-+
-+ /*
-+ * Don't use bld_alloc() here. If used this will be the first volume
-+ * type added to the list whereas the protocol requires it to be the
-+ * last.
-+ */
-+ bld = kmalloc(sizeof (*bld), GFP_KERNEL);
-+ if (!bld)
-+ goto out;
-+ memset(bld, 0, sizeof (*bld));
-+ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE;
-+ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes;
-+ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL;
-+ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__,
-+ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL);
-+
-+ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes *
-+ sizeof (int), GFP_KERNEL);
-+ if (!bld->u.stripe.bld_stripe_indexs)
-+ goto out;
-+
-+ for (i = 0; i < bld->u.stripe.bld_stripes; i++) {
-+ dev_t dev;
-+ pnfs_blocklayout_devinfo_t *bldp;
-+
-+ dev = MKDEV(res->u.stripe.devs[i].major,
-+ res->u.stripe.devs[i].minor);
-+ if (dev == 0)
-+ goto out;
-+
-+ bldp = bld_simple(&volumes, dev, location++);
-+ if (!bldp) {
-+ dprintk("%s: bld_simple failed\n", __func__);
-+ goto out;
-+ }
-+ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc);
-+
-+ if (!bldp) {
-+ dprintk("%s: bld_slice failed\n", __func__);
-+ goto out;
-+ }
-+ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc;
-+
-+ }
-+ list_add_tail(&bld->bld_list, &volumes);
-+ status = blocklayout_encode_devinfo(xdr, &volumes);
-+
-+out:
-+ while (!list_empty(&volumes)) {
-+ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
-+ bld_list);
-+ switch (bld->bld_type) {
-+ case PNFS_BLOCK_VOLUME_SLICE:
-+ case PNFS_BLOCK_VOLUME_CONCAT:
-+ // No memory to release for these
-+ break;
-+ case PNFS_BLOCK_VOLUME_SIMPLE:
-+ kfree(bld->u.simple.bld_sig);
-+ break;
-+ case PNFS_BLOCK_VOLUME_STRIPE:
-+ kfree(bld->u.stripe.bld_stripe_indexs);
-+ break;
-+ }
-+ bld_free(bld);
-+ }
-+ kfree(res);
-+ dprintk("<-- %s (rval %d)\n", __func__, status);
-+ return status;
-+}
-+
-+/*
-+ * bl_getdeviceinfo -- determine device tree for requested devid
-+ */
-+int
-+bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
-+ u32 layout_type,
-+ const struct nfsd4_pnfs_deviceid *devid)
-+{
-+ if (device_slice(devid->devid) == True)
-+ return bl_getdeviceinfo_slice(sb, xdr, devid);
-+ else if (device_dm(devid->devid) == True)
-+ return bl_getdeviceinfo_dm(sb, xdr, devid);
-+ return -EINVAL;
-+}
-+
-+enum nfsstat4
-+bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr,
-+ const struct nfsd4_pnfs_layoutget_arg *arg,
-+ struct nfsd4_pnfs_layoutget_res *res)
-+{
-+ pnfs_blocklayout_layout_t *b;
-+ bl_layout_rec_t *r;
-+ struct list_head bl_possible,
-+ *bl_candidates = NULL;
-+ boolean_t del_on_error = False;
-+ int adj;
-+ enum nfsstat4 nfserr = NFS4_OK;
-+
-+ dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n",
-+ __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset),
-+ _2SECTS(res->lg_seg.length), res->lg_seg.iomode);
-+
-+ if (res->lg_seg.length == 0) {
-+ printk("%s: request length of 0, error condition\n", __func__);
-+ return NFS4ERR_BADLAYOUT;
-+ }
-+
-+ /*
-+ * Adjust the length as required per spec.
-+ * - First case is were the length is set to (u64)-1. Cheap means to
-+ * define the end of the file.
-+ * - Second case is were the I/O mode is read-only, but the request is
-+ * past the end of the file so the request needs to be trimed.
-+ */
-+ if ((res->lg_seg.length == NFS4_MAX_UINT64) ||
-+ (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) &&
-+ (res->lg_seg.iomode == IOMODE_READ)))
-+ res->lg_seg.length = i->i_size - res->lg_seg.offset;
-+
-+ adj = (res->lg_seg.offset & 511) ? res->lg_seg.offset & 511 : 0;
-+ res->lg_seg.offset -= adj;
-+ res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511;
-+
-+ if (res->lg_seg.iomode != IOMODE_READ)
-+ if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE,
-+ res->lg_seg.offset, res->lg_seg.length))
-+ return NFS4ERR_IO;
-+
-+ INIT_LIST_HEAD(&bl_possible);
-+
-+ if ((r = layout_inode_find(i)) == NULL) {
-+ if (layout_inode_add(i, &r) == False) {
-+ printk("%s: layout_inode_add failed\n", __func__);
-+ return NFS4ERR_IO;
-+ }
-+ del_on_error = True;
-+ }
-+ BUG_ON(!r);
-+
-+ spin_lock(&r->blr_lock);
-+
-+ if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) {
-+ /*
-+ * This will send LAYOUTTRYAGAIN error to the client.
-+ */
-+ dprintk("%s: layout_cache_fill_from() failed\n", __func__);
-+ nfserr = NFS4ERR_LAYOUTTRYLATER;
-+ goto layoutget_cleanup;
-+ }
-+
-+ res->lg_return_on_close = 1;
-+ res->lg_seg.length = 0;
-+
-+ bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg);
-+ if (!bl_candidates) {
-+ nfserr = NFS4ERR_LAYOUTTRYLATER;
-+ goto layoutget_cleanup;
-+ }
-+
-+ layout_cache_merge(r, bl_candidates);
-+ if (layout_cache_update(r, bl_candidates)) {
-+ /* ---- Failed to allocate memory. ---- */
-+ dprintk("%s: layout_cache_update() failed\n", __func__);
-+ nfserr = NFS4ERR_LAYOUTTRYLATER;
-+ goto layoutget_cleanup;
-+ }
-+
-+ nfserr = blocklayout_encode_layout(xdr, bl_candidates);
-+ if (nfserr)
-+ dprintk("%s: layoutget xdr routine failed\n", __func__);
-+
-+layoutget_cleanup:
-+ if (bl_candidates) {
-+ while (!list_empty(bl_candidates)) {
-+ b = list_entry(bl_candidates->next,
-+ struct pnfs_blocklayout_layout, bll_list);
-+ list_del(&b->bll_list);
-+ kfree(b);
-+ }
-+ }
-+
-+ spin_unlock(&r->blr_lock);
-+ if (unlikely(nfserr)) {
-+ if (del_on_error == True)
-+ layout_inode_del(i);
-+ res->lg_seg.length = 0;
-+ res->lg_seg.offset = 0;
-+ }
-+
-+ dprintk("<-- %s (rval %u)\n", __func__, nfserr);
-+ return nfserr;
-+}
-+
-+/*
-+ * bl_layoutcommit -- commit changes, especially size, to file systemj
-+ *
-+ * Currently this routine isn't called and everything is handled within
-+ * nfsd4_layoutcommit(). By not calling this routine the server doesn't
-+ * handle a partial return, a set of extents, of the layout. The extents
-+ * are decoded here, but nothing is done with them. If this routine is
-+ * be called the interface must change to pass the 'dentry' pointer such
-+ * that notify_change() can be called.
-+ */
-+int
-+bl_layoutcommit(struct inode *i,
-+ const struct nfsd4_pnfs_layoutcommit_arg *args,
-+ struct nfsd4_pnfs_layoutcommit_res *res)
-+{
-+ bl_layout_rec_t *r;
-+ int status = 0;
-+ u64 lw_plus;
-+
-+ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
-+ r = layout_inode_find(i);
-+ if (r) {
-+ lw_plus = args->lc_last_wr + 1;
-+ if (args->lc_newoffset) {
-+ dprintk(" lc_last_wr %Lu\n", lw_plus);
-+ if (r->blr_orig_size < lw_plus) {
-+ r->blr_orig_size = lw_plus;
-+ res->lc_size_chg = 1;
-+ res->lc_newsize = lw_plus;
-+ }
-+ }
-+
-+ if (args->lc_up_len) {
-+ int extents,
-+ i;
-+ struct pnfs_blocklayout_layout *b;
-+ __be32 *p = args->lc_up_layout;
-+
-+ /*
-+ * Client is returning a set of extents which
-+ * should/could be used to update the file system.
-+ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08
-+ */
-+ READ32(extents);
-+ dprintk(" Client returning %d extents: data size %d\n",
-+ extents, args->lc_up_len);
-+ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) *
-+ extents, GFP_KERNEL);
-+ if (b) {
-+ for (i = 0; i < extents; i++) {
-+ READ64(b[i].bll_vol_id.sbid);
-+ READ64(b[i].bll_vol_id.devid);
-+ READ64(b[i].bll_foff);
-+ READ64(b[i].bll_len);
-+ READ64(b[i].bll_soff);
-+ READ32(b[i].bll_es);
-+ dprintk(" %d: foff %Lu, len %Lu, soff %Lu "
-+ "state %s\n",
-+ i, _2SECTS(b[i].bll_foff),
-+ _2SECTS(b[i].bll_len),
-+ _2SECTS(b[i].bll_soff),
-+ map_state2name(b[i].bll_es));
-+ }
-+ kfree(b);
-+ } else {
-+ status = -ENOMEM;
-+ }
-+ }
-+ } else
-+ dprintk("%s: Unexpected commit to inode %p\n", __func__, i);
-+
-+ dprintk("<-- %s (rval %d)\n", __func__, status);
-+ return status;
-+}
-+
-+int
-+bl_layoutreturn(struct inode *i,
-+ const struct nfsd4_pnfs_layoutreturn_arg *args)
-+{
-+ int status = 0;
-+ bl_layout_rec_t *r;
-+
-+ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
-+
-+ r = layout_inode_find(i);
-+ if (r) {
-+ spin_lock(&r->blr_lock);
-+ layout_cache_del(r, &args->lr_seg);
-+ spin_unlock(&r->blr_lock);
-+ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n",
-+ r->blr_ext_size, i->i_size, r->blr_orig_size);
-+ }
-+
-+ layout_inode_del(i);
-+ dprintk("<-- %s (rval %d)\n", __func__, status);
-+ return status;
-+}
-+
-+int
-+bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
-+{
-+ struct super_block *sb;
-+ struct nfsd4_pnfs_cb_layout lr;
-+ bl_layout_rec_t *r;
-+ pnfs_blocklayout_layout_t *b;
-+ u64 adj;
-+
-+ dprintk("--> %s\n", __func__);
-+ BUG_ON(!len);
-+ switch (type) {
-+ case RETURN_FILE:
-+ sb = inode->i_sb;
-+ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n",
-+ inode->i_sb->s_dev, inode->i_ino,
-+ _2SECTS(offset), _2SECTS(len));
-+ break;
-+ case RETURN_FSID:
-+ sb = inode->i_sb;
-+ dprintk("%s: recalling layout for fsid x (unimplemented)\n",
-+ __func__);
-+ return 0;
-+ case RETURN_ALL:
-+ /*
-+ * XXX figure out how to get a sb since there's no
-+ * inode ptr
-+ */
-+ dprintk("%s: recalling all layouts (unimplemented)\n",
-+ __func__);
-+ return 0;
-+ default:
-+ return -EINVAL;
-+ }
-+
-+restart:
-+ r = layout_inode_find(inode);
-+ if (r && len && !r->blr_recalled) {
-+ spin_lock(&r->blr_lock);
-+ list_for_each_entry(b, &r->blr_layouts, bll_list) {
-+ if (!r->blr_recalled && !b->bll_recalled &&
-+ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) {
-+ b->bll_recalled = 1;
-+ lr.cbl_recall_type = type;
-+ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME;
-+ lr.cbl_seg.clientid = 0;
-+ lr.cbl_seg.offset = 0;
-+ lr.cbl_seg.length = NFS4_MAX_UINT64;
-+ r->blr_recalled = 1;
-+ dprintk(" FULL LAYOUTRECALL\n");
-+ lr.cbl_seg.iomode = IOMODE_ANY;
-+
-+ /*
-+ * Currently there are only two cases where the
-+ * layout is being returned.
-+ * (1) Someone is issuing a NFS_WRITE operation
-+ * to this layout.
-+ * (2) The file has been truncated which means
-+ * the layout is immediately made invalid.
-+ * In both cases the client must write any
-+ * uncommitted modifications to the server via
-+ * NFS_WRITE.
-+ */
-+ lr.cbl_layoutchanged = 1;
-+
-+ /*
-+ * Need to drop the lock because we'll get a
-+ * layoutreturn which will block waiting for
-+ * the lock. The request will come in on the
-+ * same thread which will cause a deadlock.
-+ */
-+ spin_unlock(&r->blr_lock);
-+ nfsd_layout_recall_cb(sb, inode, &lr);
-+ adj = MIN(b->bll_len - (offset - b->bll_foff),
-+ len);
-+ offset += adj;
-+ len -= adj;
-+ if (!len) {
-+ spin_lock(&r->blr_lock);
-+ break;
-+ }
-+ /*
-+ * Since layoutreturn will have been called we
-+ * can't assume blr_layouts is still valid,
-+ * so restart.
-+ */
-+ goto restart;
-+ }
-+ }
-+ spin_unlock(&r->blr_lock);
-+ }
-+
-+ dprintk("<-- %s\n", __func__);
-+ return 0;
-+}
-+
-+/*
-+ * []------------------------------------------------------------------[]
-+ * | Support functions from here on down. |
-+ * []------------------------------------------------------------------[]
-+ */
-+
-+/*
-+ * bld_simple -- given a dev_t build a simple volume structure
-+ *
-+ * Simple volume contains the device signature and offset to that data in
-+ * the storage volume.
-+ */
-+static pnfs_blocklayout_devinfo_t *
-+bld_simple(struct list_head *volumes, dev_t devid, int local_index)
-+{
-+ pnfs_blocklayout_devinfo_t *bld = NULL;
-+ bl_comm_msg_t msg;
-+ bl_comm_res_t *res = NULL;
-+
-+ msg.msg_type = PNFS_UPCALL_MSG_GETSIG;
-+ msg.u.msg_dev = devid;
-+ if (bl_upcall(bl_comm_global, &msg, &res)) {
-+ dprintk("%s: Failed to get signature information\n", __func__);
-+ goto error;
-+ }
-+
-+ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE);
-+ if (!bld)
-+ return NULL;
-+
-+ bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset;
-+ bld->u.simple.bld_sig_len = res->u.sig.len;
-+ bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL);
-+ if (!bld->u.simple.bld_sig)
-+ goto error;
-+
-+ memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len);
-+ kfree(res);
-+ return bld;
-+
-+error:
-+ if (bld)
-+ bld_free(bld);
-+ if (res)
-+ kfree(res);
-+ dprintk("%s: error in bld_simple\n", __func__);
-+ return NULL;
-+}
-+
-+/*
-+ * bld_slice -- given a dev_t build a slice volume structure
-+ *
-+ * A slice volume contains the length of the slice/partition and its offset
-+ * from the beginning of the storage volume. There's also a reference to
-+ * the "simple" volume which contains this slice.
-+ */
-+static pnfs_blocklayout_devinfo_t *
-+bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc)
-+{
-+ pnfs_blocklayout_devinfo_t *bld;
-+ bl_comm_msg_t msg;
-+ bl_comm_res_t *res;
-+
-+ dprintk("--> %s\n", __func__);
-+ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE);
-+ if (!bld)
-+ return NULL;
-+
-+ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE;
-+ msg.u.msg_dev = devid;
-+ if (bl_upcall(bl_comm_global, &msg, &res)) {
-+ dprintk("Upcall to get slice info failed\n");
-+ bld_free(bld);
-+ return NULL;
-+ }
-+
-+ bld->bld_devid.devid = devid;
-+ bld->bld_index_loc = my_loc;
-+ bld->u.slice.bld_start = res->u.slice.start * 512LL;
-+ bld->u.slice.bld_len = res->u.slice.length * 512LL;
-+ bld->u.slice.bld_index = simple_loc;
-+
-+ dprintk("%s: start %Lu, len %Lu\n", __func__,
-+ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL);
-+
-+ kfree(res);
-+ dprintk("<-- %s (rval %p)\n", __func__, bld);
-+ return bld;
-+}
-+
-+static int
-+layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h,
-+ struct nfsd4_layout_seg *seg)
-+{
-+ pnfs_blocklayout_layout_t *n;
-+
-+ dprintk("--> %s\n", __func__);
-+
-+ if (!list_empty(&r->blr_layouts))
-+ if (layout_cache_fill_from_list(r, h, seg) == False)
-+ return -EIO;
-+
-+ /*
-+ * This deals with two conditions.
-+ * (1) When blr_layouts is empty we need to create the first entry
-+ * (2) When the range requested falls past the end of any current
-+ * layout the residual must be taken care of.
-+ */
-+ if (seg->length) {
-+ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h);
-+ if (!n)
-+ return -ENOMEM;
-+ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff),
-+ _2SECTS(n->bll_len));
-+ }
-+
-+ dprintk("<-- %s\n", __func__);
-+ return 0;
-+}
-+
-+struct list_head *
-+layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible,
-+ struct nfsd4_layout_seg *seg)
-+{
-+ pnfs_blocklayout_layout_t *b,
-+ *n = NULL;
-+ struct list_head *bl_candidates = NULL;
-+ struct fiemap_extent_info fei;
-+ struct inode *i;
-+ dev_t dev;
-+
-+ dev = r->blr_rdev;
-+ i = r->blr_inode;
-+
-+ dprintk("--> %s\n", __func__);
-+ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL);
-+ if (!bl_candidates)
-+ return NULL;
-+ INIT_LIST_HEAD(bl_candidates);
-+ extents_setup(&fei);
-+
-+ list_for_each_entry(b, bl_possible, bll_list) {
-+ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
-+
-+ extents_count(&fei, i, b->bll_foff, b->bll_len);
-+ if (fei.fi_extents_mapped) {
-+
-+ /*
-+ * Common case here. Got a range which has
-+ * extents. Now get those extents and process
-+ * them into pNFS extents.
-+ */
-+ if (extents_get(&fei, i, b->bll_foff,
-+ b->bll_len) == False)
-+ goto cleanup;
-+ if (extents_process(&fei, bl_candidates,
-+ seg, dev, b) == False)
-+ goto cleanup;
-+ extents_cleanup(&fei);
-+
-+ } else if (seg->iomode == IOMODE_READ) {
-+
-+ /*
-+ * Found a hole in a file while reading. No
-+ * problem, just create a pNFS extent for the
-+ * range and let the client know there's no
-+ * backing store.
-+ */
-+ n = bll_alloc(b->bll_foff, b->bll_len,
-+ BLOCK_LAYOUT_NEW, bl_candidates);
-+ n->bll_es = PNFS_BLOCK_NONE_DATA;
-+ n->bll_vol_id.sbid = 0;
-+ n->bll_vol_id.devid = dev;
-+ seg->length += b->bll_len;
-+ } else {
-+
-+ /*
-+ * There's a problem here. Since the iomode
-+ * is read/write fallocate should have allocated
-+ * any necessary storage for the given range.
-+ */
-+ dprintk(" Extent count for RW is 0\n");
-+ goto cleanup;
-+ }
-+
-+ } else {
-+ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates);
-+ seg->length += n->bll_len;
-+ }
-+
-+ if (r->blr_ext_size < (b->bll_foff + b->bll_len))
-+ r->blr_ext_size = b->bll_foff + b->bll_len;
-+ }
-+
-+ while (!list_empty(bl_possible)) {
-+ b = list_entry(bl_possible->next,
-+ struct pnfs_blocklayout_layout, bll_list);
-+ list_del(&b->bll_list);
-+ kfree(b);
-+ }
-+
-+ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout,
-+ bll_list);
-+ seg->offset = b->bll_foff;
-+ dprintk("<-- %s okay\n", __func__);
-+ return bl_candidates;
-+
-+cleanup:
-+ extents_cleanup(&fei);
-+ if (bl_candidates)
-+ kfree(bl_candidates);
-+ dprintk("<-- %s, error occurred\n", __func__);
-+ return NULL;
-+}
-+
-+/*
-+ * layout_cache_merge -- collapse layouts which make up a contiguous range.
-+ */
-+static void
-+layout_cache_merge(bl_layout_rec_t *r, struct list_head *h)
-+{
-+ pnfs_blocklayout_layout_t *b,
-+ *p;
-+
-+ dprintk("--> %s\n", __func__);
-+restart:
-+ p = NULL;
-+ list_for_each_entry(b, h, bll_list) {
-+ if (p && (BLL_S_END(p) == b->bll_soff) &&
-+ (p->bll_es == b->bll_es) &&
-+ (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
-+ /*
-+ * We've got a condidate.
-+ */
-+#ifdef too_verbose
-+ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n",
-+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
-+ _2SECTS(b->bll_soff),
-+ _2SECTS(p->bll_foff), _2SECTS(p->bll_len),
-+ _2SECTS(b->bll_soff));
-+#endif
-+
-+ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE)
-+ p->bll_cache_state = BLOCK_LAYOUT_UPDATE;
-+ p->bll_len += b->bll_len;
-+ list_del(&b->bll_list);
-+ kfree(b);
-+ goto restart;
-+ } else if (p && (BLL_F_END(p) == b->bll_foff) &&
-+ (p->bll_es == b->bll_es) &&
-+ (b->bll_es == PNFS_BLOCK_NONE_DATA)) {
-+ p->bll_len += b->bll_len;
-+ list_del(&b->bll_list);
-+ kfree(b);
-+ goto restart;
-+ } else
-+ p = b;
-+ }
-+ dprintk("<-- %s\n", __func__);
-+}
-+
-+static int
-+layout_cache_update(bl_layout_rec_t *r, struct list_head *h)
-+{
-+ pnfs_blocklayout_layout_t *b,
-+ *c,
-+ *n;
-+ boolean_t status = 0;
-+
-+ dprintk("--> %s\n", __func__);
-+ if (list_empty(&r->blr_layouts)) {
-+ /* ---- Just add entries and return ---- */
-+ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev,
-+ r->blr_inode->i_ino);
-+ list_for_each_entry(b, h, bll_list) {
-+ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE,
-+ &r->blr_layouts);
-+ if (!c) {
-+ status = -ENOMEM;
-+ break;
-+ }
-+ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n",
-+ _2SECTS(c->bll_foff), _2SECTS(c->bll_len),
-+ _2SECTS(c->bll_soff), c->bll_es);
-+ }
-+ return status;
-+ }
-+
-+ list_for_each_entry(b, h, bll_list) {
-+ BUG_ON(!b->bll_vol_id.devid);
-+ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) {
-+ boolean_t found = False;
-+ list_for_each_entry(c, &r->blr_layouts, bll_list) {
-+ if ((b->bll_soff >= c->bll_soff) &&
-+ (b->bll_soff < BLL_S_END(c)) &&
-+ (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
-+ u64 u;
-+
-+ if ((b->bll_foff < c->bll_foff) ||
-+ (b->bll_foff > BLL_F_END(c)))
-+ BUG();
-+
-+ u = BLL_S_END(b) - BLL_S_END(c);
-+ /*
-+ * The updated cache entry has to be
-+ * different than the current.
-+ * Otherwise the cache state for 'b'
-+ * should be BLOCK_LAYOUT_CACHE.
-+ */
-+ BUG_ON(BLL_S_END(b) < BLL_S_END(c));
-+
-+ dprintk(" "
-+ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n",
-+ _2SECTS(c->bll_foff),
-+ _2SECTS(c->bll_len),
-+ _2SECTS(c->bll_soff),
-+ _2SECTS(c->bll_len + u));
-+ c->bll_len += u;
-+ bll_collapse(r, c);
-+ found = True;
-+ break;
-+ }
-+ }
-+
-+ if (found == False) {
-+ dprintk(" ERROR Expected to find"
-+ " %Lu(f):%Lu(l):%Lu(s), but didn't\n",
-+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
-+ _2SECTS(b->bll_soff));
-+ list_for_each_entry(c, &r->blr_layouts, bll_list)
-+ print_bll(c, "Cached");
-+ BUG();
-+ }
-+ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
-+
-+ c = list_first_entry(&r->blr_layouts,
-+ struct pnfs_blocklayout_layout, bll_list);
-+ if (b->bll_foff < c->bll_foff) {
-+ /*
-+ * Special case where new entry is before
-+ * first cached entry.
-+ */
-+ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL);
-+ list_add(&c->bll_list, &r->blr_layouts);
-+ dprintk(" new entry at head of list at %Lu, "
-+ "len %Lu\n",
-+ _2SECTS(c->bll_foff), _2SECTS(c->bll_len));
-+ } else {
-+ list_for_each_entry(c, &r->blr_layouts,
-+ bll_list) {
-+ n = list_entry(c->bll_list.next,
-+ struct pnfs_blocklayout_layout,
-+ bll_list);
-+ /*
-+ * This is ugly, but can't think of
-+ * another way to examine this case.
-+ * Consider the following. Need to
-+ * add an entry which starts at 40
-+ * and the cache has the following
-+ * entries:
-+ * Start Length
-+ * 10 5
-+ * 30 5
-+ * 50 5
-+ * So, need to look and see if the new
-+ * entry starts after the current
-+ * cache, but before the next one.
-+ * There's a catch in that the next
-+ * entry might not be valid as it's
-+ * really just a pointer to the list
-+ * head.
-+ */
-+ if (((b->bll_foff >=
-+ BLL_F_END(c)) &&
-+ (c->bll_list.next == &r->blr_layouts)) ||
-+ ((b->bll_foff >=
-+ BLL_F_END(c)) &&
-+ (b->bll_foff < n->bll_foff))) {
-+
-+ n = bll_alloc_dup(b,
-+ BLOCK_LAYOUT_CACHE, NULL);
-+ dprintk(" adding new %Lu:%Lu"
-+ " after %Lu:%Lu\n",
-+ _2SECTS(n->bll_foff),
-+ _2SECTS(n->bll_len),
-+ _2SECTS(c->bll_foff),
-+ _2SECTS(c->bll_len));
-+ list_add(&n->bll_list,
-+ &c->bll_list);
-+ break;
-+ }
-+ }
-+ }
-+ }
-+ }
-+ dprintk("<-- %s\n", __func__);
-+ return status;
-+}
-+
-+static void
-+layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in)
-+{
-+ struct pnfs_blocklayout_layout *b,
-+ *n;
-+ u64 len;
-+ struct nfsd4_layout_seg seg = *seg_in;
-+
-+ dprintk("--> %s\n", __func__);
-+ if (seg.length == NFS4_MAX_UINT64) {
-+ r->blr_recalled = 0;
-+ dprintk(" Fast return of all layouts\n");
-+ while (!list_empty(&r->blr_layouts)) {
-+ b = list_entry(r->blr_layouts.next,
-+ struct pnfs_blocklayout_layout, bll_list);
-+ dprintk(" foff %Lu, len %Lu, soff %Lu\n",
-+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
-+ _2SECTS(b->bll_soff));
-+ list_del(&b->bll_list);
-+ kfree(b);
-+ }
-+ dprintk("<-- %s\n", __func__);
-+ return;
-+ }
-+
-+restart:
-+ list_for_each_entry(b, &r->blr_layouts, bll_list) {
-+ if (seg.offset == b->bll_foff) {
-+ /*
-+ * This handle the following three cases:
-+ * (1) return layout matches entire cache layout
-+ * (2) return layout matches beginning portion of cache
-+ * (3) return layout matches entire cache layout and
-+ * into next entry. Varies from #1 in end case.
-+ */
-+ dprintk(" match on offsets, %Lu:%Lu\n",
-+ _2SECTS(seg.offset), _2SECTS(seg.length));
-+ len = MIN(seg.length, b->bll_len);
-+ b->bll_foff += len;
-+ b->bll_soff += len;
-+ b->bll_len -= len;
-+ seg.length -= len;
-+ seg.offset += len;
-+ if (!b->bll_len) {
-+ list_del(&b->bll_list);
-+ kfree(b);
-+ dprintk(" removing cache line\n");
-+ if (!seg.length) {
-+ dprintk(" also finished\n");
-+ goto complete;
-+ }
-+ /*
-+ * Since 'b' was freed we can't continue at the
-+ * next entry which is referenced as
-+ * b->bll_list.next by the list_for_each_entry
-+ * macro. Need to restart the loop.
-+ * TODO: Think about creating a dummy 'b' which
-+ * would keep list_for_each_entry() happy.
-+ */
-+ goto restart;
-+ }
-+ if (!seg.length) {
-+ dprintk(" finished, but cache line not"
-+ "empty\n");
-+ goto complete;
-+ }
-+ } else if ((seg.offset >= b->bll_foff) &&
-+ (seg.offset < BLL_F_END(b))) {
-+ /*
-+ * layout being returned is within this cache line.
-+ */
-+ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n",
-+ _2SECTS(seg.offset), _2SECTS(seg.length),
-+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
-+ BUG_ON(!seg.length);
-+ if ((seg.offset + seg.length) >= BLL_F_END(b)) {
-+ /*
-+ * Layout returned starts in the middle of
-+ * cache entry and just need to trim back
-+ * cache to shorter length.
-+ */
-+ dprintk(" trim back cache line\n");
-+ len = seg.offset - b->bll_foff;
-+ seg.offset += b->bll_len - len;
-+ seg.length -= b->bll_len - len;
-+ b->bll_len = len;
-+ if (!seg.length)
-+ return;
-+ } else {
-+ /*
-+ * Need to split current cache layout because
-+ * chunk is being removed from the middle.
-+ */
-+ dprintk(" split cache line\n");
-+ len = seg.offset + seg.length;
-+ n = bll_alloc(len,
-+ (b->bll_foff + b->bll_len) - len,
-+ BLOCK_LAYOUT_CACHE, NULL);
-+ n->bll_soff = b->bll_soff + len;
-+ list_add(&n->bll_list, &b->bll_list);
-+ b->bll_len = seg.offset - b->bll_foff;
-+ return;
-+ }
-+ }
-+ }
-+complete:
-+ if (list_empty(&r->blr_layouts))
-+ r->blr_recalled = 0;
-+ dprintk("<-- %s\n", __func__);
-+}
-+
-+/*
-+ * layout_cache_fill_from_list -- fills from cache list
-+ *
-+ * NOTE: This routine was only seperated out from layout_cache_file_from()
-+ * to reduce the indentation level which makes the code easier to read.
-+ */
-+static inline boolean_t
-+layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h,
-+ struct nfsd4_layout_seg *seg)
-+{
-+ pnfs_blocklayout_layout_t *b,
-+ *n;
-+ enum pnfs_block_extent_state4 s;
-+
-+ list_for_each_entry(b, &r->blr_layouts, bll_list) {
-+ if (seg->offset < b->bll_foff) {
-+ n = bll_alloc(seg->offset,
-+ MIN(seg->length, b->bll_foff - seg->offset),
-+ BLOCK_LAYOUT_NEW, NULL);
-+ if (!n)
-+ return False;
-+
-+ list_add(&n->bll_list, h->prev);
-+ dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n",
-+ _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
-+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
-+ seg->offset += n->bll_len;
-+ seg->length -= n->bll_len;
-+ if (!seg->length)
-+ break;
-+ }
-+
-+ if ((seg->offset >= b->bll_foff) &&
-+ (seg->offset < BLL_F_END(b))) {
-+ if (layout_conflict(b, seg->iomode, &s) == False) {
-+ dprintk(" CONFLICT FOUND: "
-+ "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n",
-+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
-+ _2SECTS(b->bll_soff), b->bll_es,
-+ seg->iomode);
-+ return False;
-+ }
-+ n = bll_alloc(seg->offset,
-+ MIN(seg->length, BLL_F_END(b) - seg->offset),
-+ BLOCK_LAYOUT_CACHE, h);
-+ dprintk(" CACHE hit: Found %Lu(f):%Lu(l): "
-+ "in %Lu(f):%Lu(l):%Lu(s):%d\n",
-+ _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
-+ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
-+ _2SECTS(b->bll_soff), b->bll_es);
-+ if (!n)
-+ return False;
-+
-+ n->bll_soff = b->bll_soff + seg->offset - b->bll_foff;
-+ n->bll_vol_id.sbid = 0;
-+ n->bll_vol_id.devid = b->bll_vol_id.devid;
-+ n->bll_es = s;
-+ seg->offset += n->bll_len;
-+ seg->length -= n->bll_len;
-+ if (!seg->length)
-+ break;
-+ }
-+ }
-+ return True;
-+}
-+
-+static u64
-+bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length,
-+ dev_t dev)
-+{
-+ pnfs_blocklayout_layout_t *n;
-+
-+ n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates);
-+ if (!n)
-+ return 0;
-+ n->bll_es = PNFS_BLOCK_NONE_DATA;
-+ n->bll_vol_id.sbid = 0;
-+ n->bll_vol_id.devid = dev;
-+
-+ return n->bll_len;
-+}
-+
-+static void
-+extents_setup(struct fiemap_extent_info *fei)
-+{
-+ fei->fi_extents_start = NULL;
-+}
-+
-+/*
-+ * extents_count -- Determine the number of extents for a given range.
-+ *
-+ * No need to call set_fs() here because the function
-+ * doesn't use copy_to_user() if it's only counting
-+ * the number of extents needed.
-+ */
-+static void
-+extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
-+{
-+ dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len));
-+ fei->fi_flags = FIEMAP_FLAG_SYNC;
-+ fei->fi_extents_max = 0;
-+ fei->fi_extents_start = NULL;
-+ fei->fi_extents_mapped = 0;
-+ i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1);
-+}
-+
-+/*
-+ * extents_get -- Get list of extents for range
-+ *
-+ * extents_count() must have been called before this routine such that
-+ * fi_extents_mapped is known.
-+ */
-+static boolean_t
-+extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
-+{
-+ int m_space,
-+ rval;
-+ struct fiemap_extent *fe;
-+ mm_segment_t old_fs = get_fs();
-+
-+ /*
-+ * Now malloc the correct amount of space
-+ * needed. It's possible for the file to have changed
-+ * between calls which would require more space for
-+ * the extents. If that occurs the last extent will
-+ * not have FIEMAP_EXTENT_LAST set and the error will
-+ * be caught in extents_process().
-+ */
-+ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent);
-+ fe = kmalloc(m_space, GFP_KERNEL);
-+ if (!fe)
-+ return False;
-+ memset(fe, 0, m_space);
-+
-+ fei->fi_extents_max = fei->fi_extents_mapped;
-+ fei->fi_extents_mapped = 0;
-+ fei->fi_extents_start = fe;
-+
-+ set_fs(KERNEL_DS);
-+ rval = i->i_op->fiemap(i, fei, foff, len +
-+ (1 << i->i_sb->s_blocksize_bits) - 1);
-+ set_fs(old_fs);
-+
-+ if (rval || !fei->fi_extents_mapped) {
-+ dprintk(" No extents. Wanted %d, got %d\n",
-+ fei->fi_extents_max, fei->fi_extents_mapped);
-+ kfree(fe);
-+ fei->fi_extents_start = NULL;
-+ return False;
-+ } else
-+ return True;
-+}
-+
-+/*
-+ * extents_process -- runs through the extent returned from the file system and
-+ * creates block layout entries.
-+ */
-+static boolean_t
-+extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates,
-+ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b)
-+{
-+ struct fiemap_extent *fep,
-+ *fep_last = NULL;
-+ int i;
-+ pnfs_blocklayout_layout_t *n;
-+ u64 last_end,
-+ rval;
-+
-+ dprintk("--> %s\n", __func__);
-+ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped;
-+ i++, fep++) {
-+
-+ BUG_ON(!fep->fe_physical);
-+ /*
-+ * Deal with corner cases of hoel-y files.
-+ */
-+ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) !=
-+ fep->fe_logical)) {
-+
-+ /*
-+ * If the last extent doesn't end logically
-+ * at the beginning of the current we've got
-+ * hole and need to create a pNFS extent.
-+ */
-+ dprintk(" Got a hole at %Ld:%Ld \n",
-+ _2SECTS(fep_last->fe_logical),
-+ _2SECTS(fep_last->fe_length));
-+ last_end = fep_last->fe_logical + fep_last->fe_length;
-+ rval = bll_alloc_holey(bl_candidates, last_end,
-+ fep->fe_logical - last_end, dev);
-+ if (!rval)
-+ return False;
-+ seg->length += rval;
-+ }
-+
-+ n = bll_alloc(fep->fe_logical, fep->fe_length,
-+ BLOCK_LAYOUT_NEW, bl_candidates);
-+ if (unlikely(n == NULL)) {
-+ dprintk("%s: bll_alloc failed\n", __func__);
-+ return False;
-+ }
-+
-+ n->bll_soff = fep->fe_physical;
-+ n->bll_es = seg->iomode == IOMODE_READ ?
-+ PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA;
-+ n->bll_vol_id.sbid = 0;
-+ n->bll_vol_id.devid = dev;
-+ seg->length += fep->fe_length;
-+ print_bll(n, "New extent");
-+ fep_last = fep;
-+ }
-+ dprintk("<-- %s (i=%d)\n", __func__, i);
-+
-+ return True;
-+}
-+
-+static void
-+extents_cleanup(struct fiemap_extent_info *fei)
-+{
-+ if (fei->fi_extents_start) {
-+ kfree(fei->fi_extents_start);
-+ fei->fi_extents_start = NULL;
-+ }
-+}
-+
-+/*
-+ * device_slice -- check to see if device is a slice or DM
-+ */
-+static boolean_t
-+device_slice(dev_t devid)
-+{
-+ struct block_device *bd = open_by_devnum(devid, FMODE_READ);
-+ boolean_t rval = False;
-+
-+ if (bd) {
-+ if (bd->bd_disk->minors > 1)
-+ rval = True;
-+ blkdev_put(bd, FMODE_READ);
-+ }
-+ return rval;
-+}
-+
-+/*
-+ * device_dm -- check to see if device is a Device Mapper volume.
-+ *
-+ * Returns 1 for DM or 0 if not
-+ */
-+static boolean_t
-+device_dm(dev_t devid)
-+{
-+ boolean_t rval = False;
-+ bl_comm_msg_t msg;
-+ bl_comm_res_t *res;
-+
-+ msg.msg_type = PNFS_UPCALL_MSG_DMCHK;
-+ msg.u.msg_dev = devid;
-+ if (bl_upcall(bl_comm_global, &msg, &res)) {
-+ dprintk("Failed upcall to check on DM status\n");
-+ } else if (res->u.dm_vol) {
-+ rval = True;
-+ dprintk("Device is DM volume\n");
-+ } else
-+ dprintk("Device is not DM volume\n");
-+ kfree(res);
-+
-+ return rval;
-+}
-+
-+static boolean_t
-+layout_inode_add(struct inode *i, bl_layout_rec_t **p)
-+{
-+ bl_layout_rec_t *r = NULL;
-+
-+ if (!i->i_op->fiemap || !i->i_op->fallocate) {
-+ printk("pNFS: file system doesn't support required fiemap or"
-+ "fallocate methods\n");
-+ return False;
-+ }
-+
-+ r = kmalloc(sizeof (*r), GFP_KERNEL);
-+ if (!r)
-+ goto error;
-+
-+ r->blr_rdev = i->i_sb->s_dev;
-+ r->blr_inode = i;
-+ r->blr_orig_size = i->i_size;
-+ r->blr_ext_size = 0;
-+ r->blr_recalled = 0;
-+ INIT_LIST_HEAD(&r->blr_layouts);
-+ spin_lock_init(&r->blr_lock);
-+ spin_lock(&layout_hashtbl_lock);
-+ list_add_tail(&r->blr_hash, &layout_hash);
-+ spin_unlock(&layout_hashtbl_lock);
-+ *p = r;
-+ return True;
-+
-+error:
-+ if (r)
-+ kfree(r);
-+ return False;
-+}
-+
-+static bl_layout_rec_t *
-+__layout_inode_find(struct inode *i)
-+{
-+ bl_layout_rec_t *r;
-+
-+ if (!list_empty(&layout_hash)) {
-+ list_for_each_entry(r, &layout_hash, blr_hash) {
-+ if ((r->blr_inode->i_ino == i->i_ino) &&
-+ (r->blr_rdev == i->i_sb->s_dev)) {
-+ return r;
-+ }
-+ }
-+ }
-+ return NULL;
-+}
-+
-+static bl_layout_rec_t *
-+layout_inode_find(struct inode *i)
-+{
-+ bl_layout_rec_t *r;
-+
-+ spin_lock(&layout_hashtbl_lock);
-+ r = __layout_inode_find(i);
-+ spin_unlock(&layout_hashtbl_lock);
-+
-+ return r;
-+}
-+
-+static void
-+layout_inode_del(struct inode *i)
-+{
-+ bl_layout_rec_t *r;
-+
-+ spin_lock(&layout_hashtbl_lock);
-+ r = __layout_inode_find(i);
-+ if (r) {
-+ spin_lock(&r->blr_lock);
-+ if (list_empty(&r->blr_layouts)) {
-+ list_del(&r->blr_hash);
-+ spin_unlock(&r->blr_lock);
-+ kfree(r);
-+ } else {
-+ spin_unlock(&r->blr_lock);
-+ }
-+ } else {
-+ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n",
-+ __func__, i->i_sb->s_dev, i->i_ino);
-+ }
-+ spin_unlock(&layout_hashtbl_lock);
-+}
-+
-+/*
-+ * map_state2name -- converts state in ascii string.
-+ *
-+ * Used for debug messages only.
-+ */
-+static char *
-+map_state2name(enum pnfs_block_extent_state4 s)
-+{
-+ switch (s) {
-+ case PNFS_BLOCK_READWRITE_DATA: return " RW";
-+ case PNFS_BLOCK_READ_DATA: return " RO";
-+ case PNFS_BLOCK_INVALID_DATA: return "INVALID";
-+ case PNFS_BLOCK_NONE_DATA: return " NONE";
-+ default:
-+ BUG();
-+ }
-+}
-+
-+static pnfs_blocklayout_devinfo_t *
-+bld_alloc(struct list_head *volumes, int type)
-+{
-+ pnfs_blocklayout_devinfo_t *bld;
-+
-+ bld = kmalloc(sizeof (*bld), GFP_KERNEL);
-+ if (!bld)
-+ return NULL;
-+
-+ memset(bld, 0, sizeof (*bld));
-+ bld->bld_type = type;
-+ list_add_tail(&bld->bld_list, volumes);
-+
-+ return bld;
-+}
-+
-+static void
-+bld_free(pnfs_blocklayout_devinfo_t *bld)
-+{
-+ list_del(&bld->bld_list);
-+ kfree(bld);
-+}
-+
-+static void
-+print_bll(pnfs_blocklayout_layout_t *b, char *text)
-+{
-+ dprintk(" BLL: %s\n", text);
-+ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n",
-+ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len),
-+ map_state2name(b->bll_es));
-+}
-+
-+static inline void
-+bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c)
-+{
-+ pnfs_blocklayout_layout_t *n;
-+ int dbg_count = 0;
-+ u64 endpoint;
-+
-+ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA);
-+ while (c->bll_list.next != &r->blr_layouts) {
-+ n = list_entry(c->bll_list.next,
-+ struct pnfs_blocklayout_layout, bll_list);
-+ endpoint = BLL_S_END(c);
-+ if ((n->bll_soff >= c->bll_soff) &&
-+ (n->bll_soff < endpoint)) {
-+ if (endpoint < BLL_S_END(n)) {
-+ /*
-+ * The following is possible.
-+ *
-+ *
-+ * Existing: +---+ +---+
-+ * New: +-----------------------+
-+ * The client request merge entries together
-+ * but didn't require picking up all of the
-+ * last entry. So, we still need to delete
-+ * the last entry and add the remaining space
-+ * to the new entry.
-+ */
-+ c->bll_len += BLL_S_END(n) - endpoint;
-+ }
-+ dbg_count++;
-+ list_del(&n->bll_list);
-+ kfree(n);
-+ } else {
-+ break;
-+ }
-+ }
-+ /* ---- Debug only, remove before integration ---- */
-+ if (dbg_count)
-+ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n",
-+ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c)));
-+}
-+
-+static pnfs_blocklayout_layout_t *
-+bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h)
-+{
-+ pnfs_blocklayout_layout_t *n = NULL;
-+
-+ n = kmalloc(sizeof (*n), GFP_KERNEL);
-+ if (n) {
-+ memset(n, 0, sizeof (*n));
-+ n->bll_foff = offset;
-+ n->bll_len = len;
-+ n->bll_cache_state = state;
-+ if (h)
-+ list_add_tail(&n->bll_list, h);
-+ }
-+ return n;
-+}
-+
-+static pnfs_blocklayout_layout_t *
-+bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c,
-+ struct list_head *h)
-+{
-+ pnfs_blocklayout_layout_t *n = NULL;
-+
-+ n = bll_alloc(b->bll_foff, b->bll_len, c, h);
-+ if (n) {
-+ n->bll_es = b->bll_es;
-+ n->bll_soff = b->bll_soff;
-+ n->bll_vol_id.devid = b->bll_vol_id.devid;
-+ }
-+ return n;
-+}
-+
-+static inline boolean_t
-+layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode,
-+ enum pnfs_block_extent_state4 *s)
-+{
-+ /* ---- Normal case ---- */
-+ *s = b->bll_es;
-+
-+ switch (b->bll_es) {
-+ case PNFS_BLOCK_READWRITE_DATA:
-+ if (iomode == IOMODE_READ)
-+ *s = PNFS_BLOCK_READ_DATA;
-+ /* ---- Any use is permitted. ---- */
-+ break;
-+ case PNFS_BLOCK_READ_DATA:
-+ /* ---- Committed as read only data. ---- */
-+ if (iomode == IOMODE_RW)
-+ return False;
-+ break;
-+ case PNFS_BLOCK_INVALID_DATA:
-+ /* ---- Blocks have been allocated, but not initialized ---- */
-+ if (iomode == IOMODE_READ)
-+ *s = PNFS_BLOCK_NONE_DATA;
-+ break;
-+ case PNFS_BLOCK_NONE_DATA:
-+ /* ---- Hole-y file. No backing store avail. ---- */
-+ if (iomode != IOMODE_READ)
-+ return False;
-+ break;
-+ default:
-+ BUG();
-+ }
-+ return True;
-+}
-+
-+#endif /* CONFIG_SPNFS_BLOCK */
-diff -up linux-2.6.35.noarch/fs/nfsd/export.c.orig linux-2.6.35.noarch/fs/nfsd/export.c
---- linux-2.6.35.noarch/fs/nfsd/export.c.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/export.c 2010-09-30 12:25:08.493284000 -0400
-@@ -17,11 +17,19 @@
- #include <linux/module.h>
- #include <linux/exportfs.h>
-
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+#if defined(CONFIG_SPNFS)
-+#include <linux/nfsd4_spnfs.h>
-+#if defined(CONFIG_SPNFS_BLOCK)
-+#include <linux/nfsd4_block.h>
-+#endif
-+#endif
- #include <linux/nfsd/syscall.h>
- #include <net/ipv6.h>
-
- #include "nfsd.h"
- #include "nfsfh.h"
-+#include "pnfsd.h"
-
- #define NFSDDBG_FACILITY NFSDDBG_EXPORT
-
-@@ -352,6 +360,40 @@ static int svc_export_upcall(struct cach
- return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
- }
-
-+#if defined(CONFIG_PNFSD)
-+static struct pnfsd_cb_operations pnfsd_cb_op = {
-+ .cb_layout_recall = nfsd_layout_recall_cb,
-+ .cb_device_notify = nfsd_device_notify_cb,
-+
-+ .cb_get_state = nfs4_pnfs_cb_get_state,
-+ .cb_change_state = nfs4_pnfs_cb_change_state,
-+};
-+
-+#if defined(CONFIG_SPNFS)
-+static struct pnfs_export_operations spnfs_export_ops = {
-+ .layout_type = spnfs_layout_type,
-+ .get_device_info = spnfs_getdeviceinfo,
-+ .get_device_iter = spnfs_getdeviceiter,
-+ .layout_get = spnfs_layoutget,
-+ .layout_return = spnfs_layoutreturn,
-+};
-+
-+static struct pnfs_export_operations spnfs_ds_export_ops = {
-+ .get_state = spnfs_get_state,
-+};
-+
-+#if defined(CONFIG_SPNFS_BLOCK)
-+static struct pnfs_export_operations bl_export_ops = {
-+ .layout_type = bl_layout_type,
-+ .get_device_info = bl_getdeviceinfo,
-+ .get_device_iter = bl_getdeviceiter,
-+ .layout_get = bl_layoutget,
-+ .layout_return = bl_layoutreturn,
-+};
-+#endif /* CONFIG_SPNFS_BLOCK */
-+#endif /* CONFIG_SPNFS */
-+#endif /* CONFIG_PNFSD */
-+
- static struct svc_export *svc_export_update(struct svc_export *new,
- struct svc_export *old);
- static struct svc_export *svc_export_lookup(struct svc_export *);
-@@ -395,6 +437,47 @@ static int check_export(struct inode *in
- return -EINVAL;
- }
-
-+#if !defined(CONFIG_SPNFS)
-+ if (inode->i_sb->s_pnfs_op &&
-+ (!inode->i_sb->s_pnfs_op->layout_type ||
-+ !inode->i_sb->s_pnfs_op->get_device_info ||
-+ !inode->i_sb->s_pnfs_op->layout_get)) {
-+ dprintk("exp_export: export of invalid fs pnfs export ops.\n");
-+ return -EINVAL;
-+ }
-+#endif /* CONFIG_SPNFS */
-+
-+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
-+ if (!inode->i_sb->s_pnfs_op)
-+ pnfsd_lexp_init(inode);
-+ return 0;
-+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
-+
-+#if defined(CONFIG_SPNFS)
-+#if defined(CONFIG_SPNFS_BLOCK)
-+ if (pnfs_block_enabled(inode, *flags)) {
-+ dprintk("set pnfs block export structure... \n");
-+ inode->i_sb->s_pnfs_op = &bl_export_ops;
-+ } else
-+#endif /* CONFIG_SPNFS_BLOCK */
-+ /*
-+ * spnfs_enabled() indicates we're an MDS.
-+ * XXX Better to check an export time option as well.
-+ */
-+ if (spnfs_enabled()) {
-+ dprintk("set spnfs export structure...\n");
-+ inode->i_sb->s_pnfs_op = &spnfs_export_ops;
-+ } else {
-+ dprintk("%s spnfs not in use\n", __func__);
-+
-+ /*
-+ * get_state is needed if we're a DS using spnfs.
-+ * XXX Better to check an export time option instead.
-+ */
-+ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops;
-+ }
-+#endif /* CONFIG_SPNFS */
-+
- return 0;
-
- }
-@@ -586,6 +669,8 @@ static int svc_export_parse(struct cache
- if (exp.ex_uuid == NULL)
- err = -ENOMEM;
- }
-+ } else if (strcmp(buf, "pnfs") == 0) {
-+ exp.ex_pnfs = 1;
- } else if (strcmp(buf, "secinfo") == 0)
- err = secinfo_parse(&mesg, buf, &exp);
- else
-@@ -660,6 +745,8 @@ static int svc_export_show(struct seq_fi
- seq_printf(m, "%02x", exp->ex_uuid[i]);
- }
- }
-+ if (exp->ex_pnfs)
-+ seq_puts(m, ",pnfs");
- show_secinfo(m, exp);
- }
- seq_puts(m, ")\n");
-@@ -687,6 +774,7 @@ static void svc_export_init(struct cache
- new->ex_fslocs.locations = NULL;
- new->ex_fslocs.locations_count = 0;
- new->ex_fslocs.migrated = 0;
-+ new->ex_pnfs = 0;
- }
-
- static void export_update(struct cache_head *cnew, struct cache_head *citem)
-@@ -699,6 +787,7 @@ static void export_update(struct cache_h
- new->ex_anon_uid = item->ex_anon_uid;
- new->ex_anon_gid = item->ex_anon_gid;
- new->ex_fsid = item->ex_fsid;
-+ new->ex_pnfs = item->ex_pnfs;
- new->ex_uuid = item->ex_uuid;
- item->ex_uuid = NULL;
- new->ex_pathname = item->ex_pathname;
-@@ -1635,8 +1724,17 @@ nfsd_export_init(void)
- if (rv)
- return rv;
- rv = cache_register(&svc_expkey_cache);
-- if (rv)
-+ if (rv) {
- cache_unregister(&svc_export_cache);
-+ goto out;
-+ }
-+#if defined(CONFIG_PNFSD)
-+ spin_lock(&pnfsd_cb_ctl.lock);
-+ pnfsd_cb_ctl.module = THIS_MODULE;
-+ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op;
-+ spin_unlock(&pnfsd_cb_ctl.lock);
-+#endif /* CONFIG_PNFSD */
-+out:
- return rv;
-
- }
-@@ -1664,6 +1762,12 @@ nfsd_export_shutdown(void)
-
- exp_writelock();
-
-+#if defined(CONFIG_PNFSD)
-+ spin_lock(&pnfsd_cb_ctl.lock);
-+ pnfsd_cb_ctl.module = NULL;
-+ pnfsd_cb_ctl.cb_op = NULL;
-+ spin_unlock(&pnfsd_cb_ctl.lock);
-+#endif /* CONFIG_PNFSD */
- cache_unregister(&svc_expkey_cache);
- cache_unregister(&svc_export_cache);
- svcauth_unix_purge();
-diff -up linux-2.6.35.noarch/fs/nfs/direct.c.orig linux-2.6.35.noarch/fs/nfs/direct.c
---- linux-2.6.35.noarch/fs/nfs/direct.c.orig 2010-09-30 12:22:45.113041000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/direct.c 2010-09-30 12:25:08.274277000 -0400
-@@ -271,6 +271,38 @@ static const struct rpc_call_ops nfs_rea
- .rpc_release = nfs_direct_read_release,
- };
-
-+static long nfs_direct_read_execute(struct nfs_read_data *data,
-+ struct rpc_task_setup *task_setup_data,
-+ struct rpc_message *msg)
-+{
-+ struct inode *inode = data->inode;
-+ struct rpc_task *task;
-+
-+ nfs_fattr_init(&data->fattr);
-+ msg->rpc_argp = &data->args;
-+ msg->rpc_resp = &data->res;
-+
-+ task_setup_data->task = &data->task;
-+ task_setup_data->callback_data = data;
-+ NFS_PROTO(inode)->read_setup(data, msg);
-+
-+ task = rpc_run_task(task_setup_data);
-+ if (IS_ERR(task))
-+ return PTR_ERR(task);
-+
-+ rpc_put_task(task);
-+
-+ dprintk("NFS: %5u initiated direct read call "
-+ "(req %s/%lld, %u bytes @ offset %llu)\n",
-+ data->task.tk_pid,
-+ inode->i_sb->s_id,
-+ (long long)NFS_FILEID(inode),
-+ data->args.count,
-+ (unsigned long long)data->args.offset);
-+
-+ return 0;
-+}
-+
- /*
- * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
- * operation. If nfs_readdata_alloc() or get_user_pages() fails,
-@@ -287,7 +319,6 @@ static ssize_t nfs_direct_read_schedule_
- unsigned long user_addr = (unsigned long)iov->iov_base;
- size_t count = iov->iov_len;
- size_t rsize = NFS_SERVER(inode)->rsize;
-- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_cred = ctx->cred,
- };
-@@ -348,26 +379,9 @@ static ssize_t nfs_direct_read_schedule_
- data->res.fattr = &data->fattr;
- data->res.eof = 0;
- data->res.count = bytes;
-- nfs_fattr_init(&data->fattr);
-- msg.rpc_argp = &data->args;
-- msg.rpc_resp = &data->res;
-
-- task_setup_data.task = &data->task;
-- task_setup_data.callback_data = data;
-- NFS_PROTO(inode)->read_setup(data, &msg);
--
-- task = rpc_run_task(&task_setup_data);
-- if (IS_ERR(task))
-- break;
-- rpc_put_task(task);
--
-- dprintk("NFS: %5u initiated direct read call "
-- "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
-- data->task.tk_pid,
-- inode->i_sb->s_id,
-- (long long)NFS_FILEID(inode),
-- bytes,
-- (unsigned long long)data->args.offset);
-+ if (nfs_direct_read_execute(data, &task_setup_data, &msg))
-+ break;
-
- started += bytes;
- user_addr += bytes;
-@@ -457,12 +471,15 @@ static void nfs_direct_free_writedata(st
- }
-
- #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-+static long nfs_direct_write_execute(struct nfs_write_data *data,
-+ struct rpc_task_setup *task_setup_data,
-+ struct rpc_message *msg);
-+
- static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
- {
- struct inode *inode = dreq->inode;
- struct list_head *p;
- struct nfs_write_data *data;
-- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_cred = dreq->ctx->cred,
- };
-@@ -496,25 +513,7 @@ static void nfs_direct_write_reschedule(
- * Reuse data->task; data->args should not have changed
- * since the original request was sent.
- */
-- task_setup_data.task = &data->task;
-- task_setup_data.callback_data = data;
-- msg.rpc_argp = &data->args;
-- msg.rpc_resp = &data->res;
-- NFS_PROTO(inode)->write_setup(data, &msg);
--
-- /*
-- * We're called via an RPC callback, so BKL is already held.
-- */
-- task = rpc_run_task(&task_setup_data);
-- if (!IS_ERR(task))
-- rpc_put_task(task);
--
-- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-- data->task.tk_pid,
-- inode->i_sb->s_id,
-- (long long)NFS_FILEID(inode),
-- data->args.count,
-- (unsigned long long)data->args.offset);
-+ nfs_direct_write_execute(data, &task_setup_data, &msg);
- }
-
- if (put_dreq(dreq))
-@@ -557,10 +556,31 @@ static const struct rpc_call_ops nfs_com
- .rpc_release = nfs_direct_commit_release,
- };
-
-+static long nfs_direct_commit_execute(struct nfs_direct_req *dreq,
-+ struct nfs_write_data *data,
-+ struct rpc_task_setup *task_setup_data,
-+ struct rpc_message *msg)
-+{
-+ struct rpc_task *task;
-+
-+ NFS_PROTO(data->inode)->commit_setup(data, msg);
-+
-+ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
-+ dreq->commit_data = NULL;
-+
-+ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
-+
-+ task = rpc_run_task(task_setup_data);
-+ if (IS_ERR(task))
-+ return PTR_ERR(task);
-+
-+ rpc_put_task(task);
-+ return 0;
-+}
-+
- static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
- {
- struct nfs_write_data *data = dreq->commit_data;
-- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_argp = &data->args,
- .rpc_resp = &data->res,
-@@ -589,16 +609,7 @@ static void nfs_direct_commit_schedule(s
- data->res.verf = &data->verf;
- nfs_fattr_init(&data->fattr);
-
-- NFS_PROTO(data->inode)->commit_setup(data, &msg);
--
-- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
-- dreq->commit_data = NULL;
--
-- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
--
-- task = rpc_run_task(&task_setup_data);
-- if (!IS_ERR(task))
-- rpc_put_task(task);
-+ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg);
- }
-
- static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
-@@ -700,6 +711,36 @@ static const struct rpc_call_ops nfs_wri
- .rpc_release = nfs_direct_write_release,
- };
-
-+static long nfs_direct_write_execute(struct nfs_write_data *data,
-+ struct rpc_task_setup *task_setup_data,
-+ struct rpc_message *msg)
-+{
-+ struct inode *inode = data->inode;
-+ struct rpc_task *task;
-+
-+ task_setup_data->task = &data->task;
-+ task_setup_data->callback_data = data;
-+ msg->rpc_argp = &data->args;
-+ msg->rpc_resp = &data->res;
-+ NFS_PROTO(inode)->write_setup(data, msg);
-+
-+ task = rpc_run_task(task_setup_data);
-+ if (IS_ERR(task))
-+ return PTR_ERR(task);
-+
-+ rpc_put_task(task);
-+
-+ dprintk("NFS: %5u initiated direct write call "
-+ "(req %s/%lld, %u bytes @ offset %llu)\n",
-+ data->task.tk_pid,
-+ inode->i_sb->s_id,
-+ (long long)NFS_FILEID(inode),
-+ data->args.count,
-+ (unsigned long long)data->args.offset);
-+
-+ return 0;
-+}
-+
- /*
- * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
- * operation. If nfs_writedata_alloc() or get_user_pages() fails,
-@@ -715,7 +756,6 @@ static ssize_t nfs_direct_write_schedule
- struct inode *inode = ctx->path.dentry->d_inode;
- unsigned long user_addr = (unsigned long)iov->iov_base;
- size_t count = iov->iov_len;
-- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_cred = ctx->cred,
- };
-@@ -782,24 +822,8 @@ static ssize_t nfs_direct_write_schedule
- data->res.verf = &data->verf;
- nfs_fattr_init(&data->fattr);
-
-- task_setup_data.task = &data->task;
-- task_setup_data.callback_data = data;
-- msg.rpc_argp = &data->args;
-- msg.rpc_resp = &data->res;
-- NFS_PROTO(inode)->write_setup(data, &msg);
--
-- task = rpc_run_task(&task_setup_data);
-- if (IS_ERR(task))
-- break;
-- rpc_put_task(task);
--
-- dprintk("NFS: %5u initiated direct write call "
-- "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
-- data->task.tk_pid,
-- inode->i_sb->s_id,
-- (long long)NFS_FILEID(inode),
-- bytes,
-- (unsigned long long)data->args.offset);
-+ if (nfs_direct_write_execute(data, &task_setup_data, &msg))
-+ break;
-
- started += bytes;
- user_addr += bytes;
-diff -up linux-2.6.35.noarch/fs/nfsd/Kconfig.orig linux-2.6.35.noarch/fs/nfsd/Kconfig
---- linux-2.6.35.noarch/fs/nfsd/Kconfig.orig 2010-09-30 12:22:45.252047000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/Kconfig 2010-09-30 12:25:08.472291000 -0400
-@@ -79,3 +79,52 @@ config NFSD_V4
- available from http://linux-nfs.org/.
-
- If unsure, say N.
-+
-+config PNFSD
-+ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)"
-+ depends on NFSD_V4 && EXPERIMENTAL
-+ select EXPORTFS_FILE_LAYOUT
-+ help
-+ This option enables support for the parallel NFS features of the
-+ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1)
-+ in the kernel's NFS server.
-+
-+ Unless you're an NFS developer, say N.
-+
-+config PNFSD_LOCAL_EXPORT
-+ bool "Enable pNFS support for exporting local filesystems for debugging purposes"
-+ depends on PNFSD
-+ help
-+ Say Y here if you want your pNFS server to export local file systems
-+ over the files layout type. With this option the MDS (metadata
-+ server) functions also as a single DS (data server). This is mostly
-+ useful for development and debugging purposes.
-+
-+ If unsure, say N.
-+
-+config SPNFS
-+ bool "Provide spNFS server support (EXPERIMENTAL)"
-+ depends on PNFSD
-+ select RPCSEC_GSS_KRB5
-+ help
-+ Say Y here if you want spNFS server support.
-+
-+ If unsure, say N.
-+
-+config SPNFS_LAYOUTSEGMENTS
-+ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)"
-+ depends on SPNFS
-+ select RPCSEC_GSS_KRB5
-+ help
-+ Say Y here if you want spNFS to be able to return layout segments.
-+
-+ If unsure, say N.
-+
-+config SPNFS_BLOCK
-+ bool "Provide Block Layout server support (EXPERIMENTAL)"
-+ depends on SPNFS
-+ select EXPORTFS_BLOCK_LAYOUT
-+ help
-+ Say Y here if you want spNFS block layout support
-+
-+ If unsure, say N.
-diff -up linux-2.6.35.noarch/fs/nfsd/Makefile.orig linux-2.6.35.noarch/fs/nfsd/Makefile
---- linux-2.6.35.noarch/fs/nfsd/Makefile.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/Makefile 2010-09-30 12:25:08.477293000 -0400
-@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs
- nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
- nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
- nfs4acl.o nfs4callback.o nfs4recover.o
-+nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o
-+nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o
-+nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o
-+nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o
-diff -up linux-2.6.35.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4callback.c
---- linux-2.6.35.noarch/fs/nfsd/nfs4callback.c.orig 2010-09-30 12:22:45.263045000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfs4callback.c 2010-09-30 12:25:08.499284000 -0400
-@@ -41,7 +41,6 @@
-
- #define NFSPROC4_CB_NULL 0
- #define NFSPROC4_CB_COMPOUND 1
--#define NFS4_STATEID_SIZE 16
-
- /* Index of predefined Linux callback client operations */
-
-@@ -49,11 +48,17 @@ enum {
- NFSPROC4_CLNT_CB_NULL = 0,
- NFSPROC4_CLNT_CB_RECALL,
- NFSPROC4_CLNT_CB_SEQUENCE,
-+#if defined(CONFIG_PNFSD)
-+ NFSPROC4_CLNT_CB_LAYOUT,
-+ NFSPROC4_CLNT_CB_DEVICE,
-+#endif
- };
-
- enum nfs_cb_opnum4 {
- OP_CB_RECALL = 4,
-+ OP_CB_LAYOUT = 5,
- OP_CB_SEQUENCE = 11,
-+ OP_CB_DEVICE = 14,
- };
-
- #define NFS4_MAXTAGLEN 20
-@@ -79,6 +84,19 @@ enum nfs_cb_opnum4 {
- #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
- cb_sequence_dec_sz + \
- op_dec_sz)
-+#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
-+ cb_sequence_enc_sz + \
-+ 1 + 3 + \
-+ enc_nfs4_fh_sz + 4)
-+#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
-+ cb_sequence_dec_sz + \
-+ op_dec_sz)
-+#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \
-+ cb_sequence_enc_sz + \
-+ 1 + 6)
-+#define NFS4_dec_cb_device_sz (cb_compound_dec_hdr_sz + \
-+ cb_sequence_dec_sz + \
-+ op_dec_sz)
-
- /*
- * Generic encode routines from fs/nfs/nfs4xdr.c
-@@ -95,6 +113,10 @@ xdr_writemem(__be32 *p, const void *ptr,
- }
-
- #define WRITE32(n) *p++ = htonl(n)
-+#define WRITE64(n) do { \
-+ *p++ = htonl((u32)((n) >> 32)); \
-+ *p++ = htonl((u32)(n)); \
-+} while (0)
- #define WRITEMEM(ptr,nbytes) do { \
- p = xdr_writemem(p, ptr, nbytes); \
- } while (0)
-@@ -268,6 +290,111 @@ encode_cb_sequence(struct xdr_stream *xd
- hdr->nops++;
- }
-
-+#if defined(CONFIG_PNFSD)
-+
-+#include "pnfsd.h"
-+
-+static void
-+encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr,
-+ struct nfs4_cb_compound_hdr *hdr)
-+{
-+ u32 *p;
-+
-+ BUG_ON(hdr->minorversion == 0);
-+
-+ RESERVE_SPACE(20);
-+ WRITE32(OP_CB_LAYOUT);
-+ WRITE32(clr->cb.cbl_seg.layout_type);
-+ WRITE32(clr->cb.cbl_seg.iomode);
-+ WRITE32(clr->cb.cbl_layoutchanged);
-+ WRITE32(clr->cb.cbl_recall_type);
-+ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) {
-+ struct nfs4_fsid fsid = clr->cb.cbl_fsid;
-+
-+ RESERVE_SPACE(16);
-+ WRITE64(fsid.major);
-+ WRITE64(fsid.minor);
-+ dprintk("%s: type %x iomode %d changed %d recall_type %d "
-+ "fsid 0x%llx-0x%llx\n",
-+ __func__, clr->cb.cbl_seg.layout_type,
-+ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
-+ clr->cb.cbl_recall_type, fsid.major, fsid.minor);
-+ } else if (clr->cb.cbl_recall_type == RETURN_FILE) {
-+ int len = clr->clr_file->fi_fhlen;
-+ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid;
-+
-+ RESERVE_SPACE(20 + len);
-+ WRITE32(len);
-+ WRITEMEM(clr->clr_file->fi_fhval, len);
-+ WRITE64(clr->cb.cbl_seg.offset);
-+ WRITE64(clr->cb.cbl_seg.length);
-+ encode_stateid(xdr, cbl_sid);
-+ dprintk("%s: type %x iomode %d changed %d recall_type %d "
-+ "offset %lld length %lld stateid " STATEID_FMT "\n",
-+ __func__, clr->cb.cbl_seg.layout_type,
-+ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
-+ clr->cb.cbl_recall_type,
-+ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length,
-+ STATEID_VAL(cbl_sid));
-+ } else {
-+ dprintk("%s: type %x iomode %d changed %d recall_type %d\n",
-+ __func__, clr->cb.cbl_seg.layout_type,
-+ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
-+ clr->cb.cbl_recall_type);
-+ }
-+ hdr->nops++;
-+}
-+
-+static void
-+encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd,
-+ struct nfs4_cb_compound_hdr *hdr)
-+{
-+ u32 *p;
-+ int i;
-+ int len = nd->nd_list->cbd_len;
-+ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list;
-+
-+ dprintk("NFSD %s: --> num %d\n", __func__, len);
-+
-+ BUG_ON(hdr->minorversion == 0);
-+
-+ RESERVE_SPACE(8);
-+ WRITE32(OP_CB_DEVICE);
-+
-+ /* notify4 cnda_changes<>; */
-+ WRITE32(len);
-+ for (i = 0; i < len; i++) {
-+ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n",
-+ __func__, cbd[i].cbd_notify_type,
-+ cbd[i].cbd_layout_type,
-+ cbd[i].cbd_devid.sbid,
-+ cbd[i].cbd_devid.devid,
-+ cbd[i].cbd_immediate, i);
-+
-+ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
-+ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE);
-+ RESERVE_SPACE(32);
-+ /* bitmap4 notify_mask; */
-+ WRITE32(1);
-+ WRITE32(cbd[i].cbd_notify_type);
-+ /* opaque notify_vals<>; */
-+ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
-+ WRITE32(24);
-+ else
-+ WRITE32(20);
-+ WRITE32(cbd[i].cbd_layout_type);
-+ WRITE64(cbd[i].cbd_devid.sbid);
-+ WRITE64(cbd[i].cbd_devid.devid);
-+
-+ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
-+ RESERVE_SPACE(4);
-+ WRITE32(cbd[i].cbd_immediate);
-+ }
-+ }
-+ hdr->nops++;
-+}
-+#endif /* CONFIG_PNFSD */
-+
- static int
- nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
- {
-@@ -297,6 +424,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *
- return 0;
- }
-
-+#if defined(CONFIG_PNFSD)
-+static int
-+nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p,
-+ struct nfs4_rpc_args *rpc_args)
-+{
-+ struct xdr_stream xdr;
-+ struct nfs4_layoutrecall *args = rpc_args->args_op;
-+ struct nfs4_cb_compound_hdr hdr = {
-+ .ident = 0,
-+ .minorversion = rpc_args->args_seq.cbs_minorversion,
-+ };
-+
-+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+ encode_cb_compound_hdr(&xdr, &hdr);
-+ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
-+ encode_cb_layout(&xdr, args, &hdr);
-+ encode_cb_nops(&hdr);
-+ return 0;
-+}
-+
-+static int
-+nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p,
-+ struct nfs4_rpc_args *rpc_args)
-+{
-+ struct xdr_stream xdr;
-+ struct nfs4_notify_device *args = rpc_args->args_op;
-+ struct nfs4_cb_compound_hdr hdr = {
-+ .ident = 0,
-+ .minorversion = rpc_args->args_seq.cbs_minorversion,
-+ };
-+
-+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+ encode_cb_compound_hdr(&xdr, &hdr);
-+ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
-+ encode_cb_device(&xdr, args, &hdr);
-+ encode_cb_nops(&hdr);
-+ return 0;
-+}
-+#endif /* CONFIG_PNFSD */
-
- static int
- decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
-@@ -413,6 +579,48 @@ out:
- return status;
- }
-
-+#if defined(CONFIG_PNFSD)
-+static int
-+nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p,
-+ struct nfsd4_cb_sequence *seq)
-+{
-+ struct xdr_stream xdr;
-+ struct nfs4_cb_compound_hdr hdr;
-+ int status;
-+
-+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+ status = decode_cb_compound_hdr(&xdr, &hdr);
-+ if (status)
-+ goto out;
-+ status = decode_cb_sequence(&xdr, seq, rqstp);
-+ if (status)
-+ goto out;
-+ status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT);
-+out:
-+ return status;
-+}
-+
-+static int
-+nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p,
-+ struct nfsd4_cb_sequence *seq)
-+{
-+ struct xdr_stream xdr;
-+ struct nfs4_cb_compound_hdr hdr;
-+ int status;
-+
-+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+ status = decode_cb_compound_hdr(&xdr, &hdr);
-+ if (status)
-+ goto out;
-+ status = decode_cb_sequence(&xdr, seq, rqstp);
-+ if (status)
-+ goto out;
-+ status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE);
-+out:
-+ return status;
-+}
-+#endif /* CONFIG_PNFSD */
-+
- /*
- * RPC procedure tables
- */
-@@ -430,6 +638,10 @@ out:
- static struct rpc_procinfo nfs4_cb_procedures[] = {
- PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null),
- PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall),
-+#if defined(CONFIG_PNFSD)
-+ PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout),
-+ PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device),
-+#endif
- };
-
- static struct rpc_version nfs_cb_version4 = {
-@@ -615,10 +827,9 @@ out:
- * TODO: cb_sequence should support referring call lists, cachethis, multiple
- * slots, and mark callback channel down on communication errors.
- */
--static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
-+static void nfsd4_cb_prepare_sequence(struct rpc_task *task,
-+ struct nfs4_client *clp)
- {
-- struct nfs4_delegation *dp = calldata;
-- struct nfs4_client *clp = dp->dl_client;
- struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
- u32 minorversion = clp->cl_cb_conn.cb_minorversion;
- int status = 0;
-@@ -638,11 +849,15 @@ static void nfsd4_cb_prepare(struct rpc_
- rpc_call_start(task);
- }
-
--static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
-+static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata)
- {
- struct nfs4_delegation *dp = calldata;
-- struct nfs4_client *clp = dp->dl_client;
-+ nfsd4_cb_prepare_sequence(task, dp->dl_client);
-+}
-
-+static void nfsd4_cb_done_sequence(struct rpc_task *task,
-+ struct nfs4_client *clp)
-+{
- dprintk("%s: minorversion=%d\n", __func__,
- clp->cl_cb_conn.cb_minorversion);
-
-@@ -666,7 +881,7 @@ static void nfsd4_cb_recall_done(struct
- struct nfs4_client *clp = dp->dl_client;
- struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
-
-- nfsd4_cb_done(task, calldata);
-+ nfsd4_cb_done_sequence(task, clp);
-
- if (current_rpc_client == NULL) {
- /* We're shutting down; give up. */
-@@ -713,7 +928,7 @@ static void nfsd4_cb_recall_release(void
- }
-
- static const struct rpc_call_ops nfsd4_cb_recall_ops = {
-- .rpc_call_prepare = nfsd4_cb_prepare,
-+ .rpc_call_prepare = nfsd4_cb_recall_prepare,
- .rpc_call_done = nfsd4_cb_recall_done,
- .rpc_release = nfsd4_cb_recall_release,
- };
-@@ -788,3 +1003,173 @@ void nfsd4_cb_recall(struct nfs4_delegat
- {
- queue_work(callback_wq, &dp->dl_recall.cb_work);
- }
-+
-+#if defined(CONFIG_PNFSD)
-+static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata)
-+{
-+ struct nfs4_layoutrecall *clr = calldata;
-+ nfsd4_cb_prepare_sequence(task, clr->clr_client);
-+}
-+
-+static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata)
-+{
-+ struct nfs4_layoutrecall *clr = calldata;
-+ struct nfs4_client *clp = clr->clr_client;
-+
-+ nfsd4_cb_done_sequence(task, clp);
-+
-+ if (!task->tk_status)
-+ return;
-+
-+ printk("%s: clp %p cb_client %p fp %p failed with status %d\n",
-+ __func__,
-+ clp,
-+ clp->cl_cb_client,
-+ clr->clr_file,
-+ task->tk_status);
-+
-+ switch (task->tk_status) {
-+ case -EIO:
-+ /* Network partition? */
-+ atomic_set(&clp->cl_cb_set, 0);
-+ warn_no_callback_path(clp, task->tk_status);
-+ /* FIXME:
-+ * The pnfs standard states that we need to only expire
-+ * the client after at-least "lease time" .eg lease-time * 2
-+ * when failing to communicate a recall
-+ */
-+ break;
-+ case -NFS4ERR_DELAY:
-+ /* Poll the client until it's done with the layout */
-+ rpc_delay(task, HZ/100); /* 10 mili-seconds */
-+ task->tk_status = 0;
-+ rpc_restart_call_prepare(task);
-+ break;
-+ case -NFS4ERR_NOMATCHING_LAYOUT:
-+ task->tk_status = 0;
-+ nomatching_layout(clr);
-+ }
-+}
-+
-+static void nfsd4_cb_layout_release(void *calldata)
-+{
-+ struct nfs4_layoutrecall *clr = calldata;
-+ kfree(clr->clr_args);
-+ clr->clr_args = NULL;
-+ put_layoutrecall(clr);
-+}
-+
-+static const struct rpc_call_ops nfsd4_cb_layout_ops = {
-+ .rpc_call_prepare = nfsd4_cb_layout_prepare,
-+ .rpc_call_done = nfsd4_cb_layout_done,
-+ .rpc_release = nfsd4_cb_layout_release,
-+};
-+
-+/*
-+ * Called with state lock.
-+ */
-+int
-+nfsd4_cb_layout(struct nfs4_layoutrecall *clr)
-+{
-+ struct nfs4_client *clp = clr->clr_client;
-+ struct rpc_clnt *clnt = clp->cl_cb_client;
-+ struct nfs4_rpc_args *args;
-+ struct rpc_message msg = {
-+ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT],
-+ .rpc_cred = callback_cred
-+ };
-+ int status;
-+
-+ args = kzalloc(sizeof(*args), GFP_KERNEL);
-+ if (!args) {
-+ status = -ENOMEM;
-+ goto out;
-+ }
-+ clr->clr_args = args;
-+ args->args_op = clr;
-+ msg.rpc_argp = args;
-+ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
-+ &nfsd4_cb_layout_ops, clr);
-+out:
-+ if (status) {
-+ kfree(args);
-+ put_layoutrecall(clr);
-+ }
-+ dprintk("NFSD: nfsd4_cb_layout: status %d\n", status);
-+ return status;
-+}
-+
-+static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata)
-+{
-+ struct nfs4_notify_device *cbnd = calldata;
-+ nfsd4_cb_prepare_sequence(task, cbnd->nd_client);
-+}
-+
-+static void nfsd4_cb_device_done(struct rpc_task *task, void *calldata)
-+{
-+ struct nfs4_notify_device *cbnd = calldata;
-+ struct nfs4_client *clp = cbnd->nd_client;
-+
-+ nfsd4_cb_done_sequence(task, clp);
-+
-+ dprintk("%s: clp %p cb_client %p: status %d\n",
-+ __func__,
-+ clp,
-+ clp->cl_cb_client,
-+ task->tk_status);
-+
-+ if (task->tk_status == -EIO) {
-+ /* Network partition? */
-+ atomic_set(&clp->cl_cb_set, 0);
-+ warn_no_callback_path(clp, task->tk_status);
-+ }
-+}
-+
-+static void nfsd4_cb_device_release(void *calldata)
-+{
-+ struct nfs4_notify_device *cbnd = calldata;
-+ kfree(cbnd->nd_args);
-+ cbnd->nd_args = NULL;
-+ kfree(cbnd);
-+}
-+
-+static const struct rpc_call_ops nfsd4_cb_device_ops = {
-+ .rpc_call_prepare = nfsd4_cb_device_prepare,
-+ .rpc_call_done = nfsd4_cb_device_done,
-+ .rpc_release = nfsd4_cb_device_release,
-+};
-+
-+/*
-+ * Called with state lock.
-+ */
-+int
-+nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd)
-+{
-+ struct nfs4_client *clp = cbnd->nd_client;
-+ struct rpc_clnt *clnt = clp->cl_cb_client;
-+ struct nfs4_rpc_args *args;
-+ struct rpc_message msg = {
-+ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE],
-+ .rpc_cred = callback_cred
-+ };
-+ int status = -EIO;
-+
-+ dprintk("%s: clp %p\n", __func__, clp);
-+
-+ args = kzalloc(sizeof(*args), GFP_KERNEL);
-+ if (!args) {
-+ status = -ENOMEM;
-+ goto out;
-+ }
-+ args->args_op = cbnd;
-+ msg.rpc_argp = args;
-+
-+ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
-+ &nfsd4_cb_device_ops, cbnd);
-+out:
-+ if (status)
-+ kfree(args);
-+ dprintk("%s: status %d\n", __func__, status);
-+ return status;
-+}
-+#endif /* CONFIG_PNFSD */
-diff -up linux-2.6.35.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4pnfsd.c
---- linux-2.6.35.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-09-30 12:25:08.503285000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfs4pnfsd.c 2010-09-30 12:25:08.505284000 -0400
-@@ -0,0 +1,1679 @@
-+/******************************************************************************
-+ *
-+ * (c) 2007 Network Appliance, Inc. All Rights Reserved.
-+ * (c) 2009 NetApp. All Rights Reserved.
-+ *
-+ * NetApp provides this source code under the GPL v2 License.
-+ * The GPL v2 license is available at
-+ * http://opensource.org/licenses/gpl-license.php.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
-+ *****************************************************************************/
-+
-+#include "pnfsd.h"
-+
-+#define NFSDDBG_FACILITY NFSDDBG_PROC
-+
-+/* Globals */
-+static u32 current_layoutid = 1;
-+
-+/*
-+ * Currently used for manipulating the layout state.
-+ */
-+static DEFINE_SPINLOCK(layout_lock);
-+
-+#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP)
-+# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock))
-+#else
-+# define BUG_ON_UNLOCKED_LAYOUT()
-+#endif
-+
-+/*
-+ * Layout state - NFSv4.1 pNFS
-+ */
-+static struct kmem_cache *pnfs_layout_slab;
-+static struct kmem_cache *pnfs_layoutrecall_slab;
-+
-+/* hash table for nfsd4_pnfs_deviceid.sbid */
-+#define SBID_HASH_BITS 8
-+#define SBID_HASH_SIZE (1 << SBID_HASH_BITS)
-+#define SBID_HASH_MASK (SBID_HASH_SIZE - 1)
-+
-+struct sbid_tracker {
-+ u64 id;
-+ struct super_block *sb;
-+ struct list_head hash;
-+};
-+
-+static u64 current_sbid;
-+static struct list_head sbid_hashtbl[SBID_HASH_SIZE];
-+
-+static inline unsigned long
-+sbid_hashval(struct super_block *sb)
-+{
-+ return hash_ptr(sb, SBID_HASH_BITS);
-+}
-+
-+static inline struct sbid_tracker *
-+alloc_sbid(void)
-+{
-+ return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL);
-+}
-+
-+static void
-+destroy_sbid(struct sbid_tracker *sbid)
-+{
-+ spin_lock(&layout_lock);
-+ list_del(&sbid->hash);
-+ spin_unlock(&layout_lock);
-+ kfree(sbid);
-+}
-+
-+void
-+nfsd4_free_pnfs_slabs(void)
-+{
-+ int i;
-+ struct sbid_tracker *sbid;
-+
-+ nfsd4_free_slab(&pnfs_layout_slab);
-+ nfsd4_free_slab(&pnfs_layoutrecall_slab);
-+
-+ for (i = 0; i < SBID_HASH_SIZE; i++) {
-+ while (!list_empty(&sbid_hashtbl[i])) {
-+ sbid = list_first_entry(&sbid_hashtbl[i],
-+ struct sbid_tracker,
-+ hash);
-+ destroy_sbid(sbid);
-+ }
-+ }
-+}
-+
-+int
-+nfsd4_init_pnfs_slabs(void)
-+{
-+ int i;
-+
-+ pnfs_layout_slab = kmem_cache_create("pnfs_layouts",
-+ sizeof(struct nfs4_layout), 0, 0, NULL);
-+ if (pnfs_layout_slab == NULL)
-+ return -ENOMEM;
-+ pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls",
-+ sizeof(struct nfs4_layoutrecall), 0, 0, NULL);
-+ if (pnfs_layoutrecall_slab == NULL)
-+ return -ENOMEM;
-+
-+ for (i = 0; i < SBID_HASH_SIZE; i++) {
-+ INIT_LIST_HEAD(&sbid_hashtbl[i]);
-+ }
-+
-+ return 0;
-+}
-+
-+/* XXX: Need to implement the notify types and track which
-+ * clients have which devices. */
-+void pnfs_set_device_notify(clientid_t *clid, unsigned int types)
-+{
-+ struct nfs4_client *clp;
-+ dprintk("%s: -->\n", __func__);
-+
-+ nfs4_lock_state();
-+ /* Indicate that client has a device so we can only notify
-+ * the correct clients */
-+ clp = find_confirmed_client(clid);
-+ if (clp) {
-+ atomic_inc(&clp->cl_deviceref);
-+ dprintk("%s: Incr device count (clnt %p) to %d\n",
-+ __func__, clp, atomic_read(&clp->cl_deviceref));
-+ }
-+ nfs4_unlock_state();
-+}
-+
-+/* Clear notifications for this client
-+ * XXX: Do we need to loop through a clean up all
-+ * krefs when nfsd cleans up the client? */
-+void pnfs_clear_device_notify(struct nfs4_client *clp)
-+{
-+ atomic_dec(&clp->cl_deviceref);
-+ dprintk("%s: Decr device count (clnt %p) to %d\n",
-+ __func__, clp, atomic_read(&clp->cl_deviceref));
-+}
-+
-+static struct nfs4_layout_state *
-+alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp,
-+ stateid_t *stateid)
-+{
-+ struct nfs4_layout_state *new;
-+
-+ /* FIXME: use a kmem_cache */
-+ new = kzalloc(sizeof(*new), GFP_KERNEL);
-+ if (!new)
-+ return new;
-+ get_nfs4_file(fp);
-+ INIT_LIST_HEAD(&new->ls_perfile);
-+ INIT_LIST_HEAD(&new->ls_layouts);
-+ kref_init(&new->ls_ref);
-+ new->ls_client = clp;
-+ new->ls_file = fp;
-+ new->ls_stateid.si_boot = stateid->si_boot;
-+ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */
-+ new->ls_stateid.si_generation = 1;
-+ spin_lock(&layout_lock);
-+ new->ls_stateid.si_fileid = current_layoutid++;
-+ list_add(&new->ls_perfile, &fp->fi_layout_states);
-+ spin_unlock(&layout_lock);
-+ return new;
-+}
-+
-+static inline void
-+get_layout_state(struct nfs4_layout_state *ls)
-+{
-+ kref_get(&ls->ls_ref);
-+}
-+
-+static void
-+destroy_layout_state_common(struct nfs4_layout_state *ls)
-+{
-+ struct nfs4_file *fp = ls->ls_file;
-+
-+ dprintk("pNFS %s: ls %p fp %p clp %p\n", __func__, ls, fp,
-+ ls->ls_client);
-+ BUG_ON(!list_empty(&ls->ls_layouts));
-+ kfree(ls);
-+ put_nfs4_file(fp);
-+}
-+
-+static void
-+destroy_layout_state(struct kref *kref)
-+{
-+ struct nfs4_layout_state *ls =
-+ container_of(kref, struct nfs4_layout_state, ls_ref);
-+
-+ spin_lock(&layout_lock);
-+ list_del(&ls->ls_perfile);
-+ spin_unlock(&layout_lock);
-+ destroy_layout_state_common(ls);
-+}
-+
-+static void
-+destroy_layout_state_locked(struct kref *kref)
-+{
-+ struct nfs4_layout_state *ls =
-+ container_of(kref, struct nfs4_layout_state, ls_ref);
-+
-+ list_del(&ls->ls_perfile);
-+ destroy_layout_state_common(ls);
-+}
-+
-+static inline void
-+put_layout_state(struct nfs4_layout_state *ls)
-+{
-+ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
-+ atomic_read(&ls->ls_ref.refcount));
-+ kref_put(&ls->ls_ref, destroy_layout_state);
-+}
-+
-+static inline void
-+put_layout_state_locked(struct nfs4_layout_state *ls)
-+{
-+ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
-+ atomic_read(&ls->ls_ref.refcount));
-+ kref_put(&ls->ls_ref, destroy_layout_state_locked);
-+}
-+
-+/*
-+ * Search the fp->fi_layout_state list for a layout state with the clientid.
-+ * If not found, then this is a 'first open/delegation/lock stateid' from
-+ * the client for this file.
-+ * Called under the layout_lock.
-+ */
-+static struct nfs4_layout_state *
-+find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp)
-+{
-+ struct nfs4_layout_state *ls;
+ /*
+ * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+ * operation. If nfs_writedata_alloc() or get_user_pages() fails,
+@@ -715,7 +756,6 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
+ struct inode *inode = ctx->path.dentry->d_inode;
+ unsigned long user_addr = (unsigned long)iov->iov_base;
+ size_t count = iov->iov_len;
+- struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_cred = ctx->cred,
+ };
+@@ -782,24 +822,8 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
+ data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
+
+- task_setup_data.task = &data->task;
+- task_setup_data.callback_data = data;
+- msg.rpc_argp = &data->args;
+- msg.rpc_resp = &data->res;
+- NFS_PROTO(inode)->write_setup(data, &msg);
+-
+- task = rpc_run_task(&task_setup_data);
+- if (IS_ERR(task))
++ if (nfs_direct_write_execute(data, &task_setup_data, &msg))
+ break;
+- rpc_put_task(task);
+-
+- dprintk("NFS: %5u initiated direct write call "
+- "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
+- data->task.tk_pid,
+- inode->i_sb->s_id,
+- (long long)NFS_FILEID(inode),
+- bytes,
+- (unsigned long long)data->args.offset);
+
+ started += bytes;
+ user_addr += bytes;
+diff --git a/fs/nfs/file.c b/fs/nfs/file.c
+index 05bf3c0..28d4aa3 100644
+--- a/fs/nfs/file.c
++++ b/fs/nfs/file.c
+@@ -36,6 +36,7 @@
+ #include "internal.h"
+ #include "iostat.h"
+ #include "fscache.h"
++#include "pnfs.h"
+
+ #define NFSDBG_FACILITY NFSDBG_FILE
+
+@@ -380,12 +381,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ int once_thru = 0;
++ struct pnfs_layout_segment *lseg;
+
+ dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ mapping->host->i_ino, len, (long long) pos);
+
++ lseg = pnfs_update_layout(mapping->host,
++ nfs_file_open_context(file),
++ pos, len, IOMODE_RW);
+ start:
+ /*
+ * Prevent starvation issues if someone is doing a consistency
+@@ -394,17 +399,22 @@ start:
+ ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (ret)
+- return ret;
++ goto out;
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+- if (!page)
+- return -ENOMEM;
++ if (!page) {
++ ret = -ENOMEM;
++ goto out;
++ }
+ *pagep = page;
+
+- ret = nfs_flush_incompatible(file, page);
++ ret = nfs_flush_incompatible(file, page, lseg);
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
++ *pagep = NULL;
++ *fsdata = NULL;
++ goto out;
+ } else if (!once_thru &&
+ nfs_want_read_modify_write(file, page, pos, len)) {
+ once_thru = 1;
+@@ -413,6 +423,12 @@ start:
+ if (!ret)
+ goto start;
+ }
++ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata);
++ out:
++ if (ret) {
++ put_lseg(lseg);
++ *fsdata = NULL;
++ }
+ return ret;
+ }
+
+@@ -422,6 +438,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
+ {
+ unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ int status;
++ struct pnfs_layout_segment *lseg;
+
+ dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+@@ -448,10 +465,17 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
+ zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+ }
+
+- status = nfs_updatepage(file, page, offset, copied);
++ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata);
++ status = pnfs_write_end(file, page, pos, len, copied, lseg);
++ if (status)
++ goto out;
++ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata);
+
++ out:
+ unlock_page(page);
+ page_cache_release(page);
++ pnfs_write_end_cleanup(file, fsdata);
++ put_lseg(lseg);
+
+ if (status < 0)
+ return status;
+@@ -562,6 +586,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+ /* make sure the cache has finished storing the page */
+ nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
+
++ /* XXX Do we want to call pnfs_update_layout here? */
+
-+ BUG_ON_UNLOCKED_LAYOUT();
-+ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) {
-+ if (ls->ls_client == clp) {
-+ dprintk("pNFS %s: before GET ls %p ls_ref %d\n",
-+ __func__, ls,
-+ atomic_read(&ls->ls_ref.refcount));
-+ get_layout_state(ls);
-+ return ls;
-+ }
+ lock_page(page);
+ mapping = page->mapping;
+ if (mapping != dentry->d_inode->i_mapping)
+@@ -572,11 +598,11 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+ if (pagelen == 0)
+ goto out_unlock;
+
+- ret = nfs_flush_incompatible(filp, page);
++ ret = nfs_flush_incompatible(filp, page, NULL);
+ if (ret != 0)
+ goto out_unlock;
+
+- ret = nfs_updatepage(filp, page, 0, pagelen);
++ ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL);
+ out_unlock:
+ if (!ret)
+ return VM_FAULT_LOCKED;
+diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
+index 7d2d6c7..437d9a6 100644
+--- a/fs/nfs/inode.c
++++ b/fs/nfs/inode.c
+@@ -48,6 +48,7 @@
+ #include "internal.h"
+ #include "fscache.h"
+ #include "dns_resolve.h"
++#include "pnfs.h"
+
+ #define NFSDBG_FACILITY NFSDBG_VFS
+
+@@ -648,6 +649,7 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
+ atomic_inc(&ctx->lock_context.count);
+ return ctx;
+ }
++EXPORT_SYMBOL(get_nfs_open_context);
+
+ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
+ {
+@@ -1000,6 +1002,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
+ fattr->time_start = jiffies;
+ fattr->gencount = nfs_inc_attr_generation_counter();
+ }
++EXPORT_SYMBOL(nfs_fattr_init);
+
+ struct nfs_fattr *nfs_alloc_fattr(void)
+ {
+@@ -1209,6 +1212,14 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+ server->fsid = fattr->fsid;
+
+ /*
++ * file needs layout commit, server attributes may be stale
++ */
++ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) {
++ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n",
++ __func__, inode->i_sb->s_id, inode->i_ino);
++ return 0;
+ }
-+ return NULL;
-+}
++ /*
+ * Update the read time so we don't revalidate too often.
+ */
+ nfsi->read_cache_jiffies = fattr->time_start;
+@@ -1407,11 +1418,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+ */
+ void nfs4_evict_inode(struct inode *inode)
+ {
++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true);
+ truncate_inode_pages(&inode->i_data, 0);
+ end_writeback(inode);
++ pnfs_destroy_layout(NFS_I(inode));
+ /* If we are holding a delegation, return it! */
+ nfs_inode_return_delegation_noreclaim(inode);
+- /* First call standard NFS clear_inode() code */
+ nfs_clear_inode(inode);
+ }
+ #endif
+@@ -1446,6 +1458,8 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
+ nfsi->delegation = NULL;
+ nfsi->delegation_state = 0;
+ init_rwsem(&nfsi->rwsem);
++ rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layout");
++ nfsi->layout = NULL;
+ #endif
+ }
+
+diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
+index c961bc9..4e7a4c9 100644
+--- a/fs/nfs/internal.h
++++ b/fs/nfs/internal.h
+@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
+ struct nfs_fattr *);
+ extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
+ extern int nfs4_check_client_ready(struct nfs_client *clp);
++extern int nfs_sockaddr_cmp(const struct sockaddr *sa1,
++ const struct sockaddr *sa2);
++extern int nfs4_set_client(struct nfs_server *server,
++ const char *hostname,
++ const struct sockaddr *addr,
++ const size_t addrlen,
++ const char *ip_addr,
++ rpc_authflavor_t authflavour,
++ int proto, const struct rpc_timeout *timeparms,
++ u32 minorversion);
+ #ifdef CONFIG_PROC_FS
+ extern int __init nfs_fs_proc_init(void);
+ extern void nfs_fs_proc_exit(void);
+@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead;
+ extern struct rpc_procinfo nfs4_procedures[];
+ #endif
+
++extern int nfs4_recover_expired_lease(struct nfs_client *clp);
+
-+static __be32
-+verify_stateid(struct nfs4_file *fp, stateid_t *stateid)
+ /* proc.c */
+ void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+
+@@ -249,10 +261,31 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
+ #endif
+
+ /* read.c */
++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops);
++extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops);
+ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
+
+ /* write.c */
++extern int nfs_initiate_write(struct nfs_write_data *data,
++ struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops,
++ int how);
++extern int pnfs_initiate_write(struct nfs_write_data *data,
++ struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops,
++ int how);
++extern int nfs_initiate_commit(struct nfs_write_data *data,
++ struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops,
++ int how);
++extern int pnfs_initiate_commit(struct nfs_write_data *data,
++ struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops,
++ int how, int pnfs);
+ extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
++extern void nfs_mark_list_commit(struct list_head *head);
+ #ifdef CONFIG_MIGRATION
+ extern int nfs_migrate_page(struct address_space *,
+ struct page *, struct page *);
+diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
+index 311e15c..cb390fb 100644
+--- a/fs/nfs/nfs4_fs.h
++++ b/fs/nfs/nfs4_fs.h
+@@ -46,6 +46,7 @@ enum nfs4_client_state {
+ NFS4CLNT_DELEGRETURN,
+ NFS4CLNT_SESSION_RESET,
+ NFS4CLNT_RECALL_SLOT,
++ NFS4CLNT_LAYOUT_RECALL,
+ };
+
+ enum nfs4_session_state {
+@@ -256,10 +257,12 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
+ }
+
+ extern int nfs4_setup_sequence(const struct nfs_server *server,
++ struct nfs4_session *ds_session,
+ struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+ int cache_reply, struct rpc_task *task);
+ extern void nfs4_destroy_session(struct nfs4_session *session);
+ extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *);
+ extern int nfs4_proc_create_session(struct nfs_client *);
+ extern int nfs4_proc_destroy_session(struct nfs4_session *);
+ extern int nfs4_init_session(struct nfs_server *server);
+@@ -272,6 +275,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
+ }
+
+ static inline int nfs4_setup_sequence(const struct nfs_server *server,
++ struct nfs4_session *ds_session,
+ struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+ int cache_reply, struct rpc_task *task)
+ {
+@@ -289,7 +293,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
+ extern const u32 nfs4_fattr_bitmap[2];
+ extern const u32 nfs4_statfs_bitmap[2];
+ extern const u32 nfs4_pathconf_bitmap[2];
+-extern const u32 nfs4_fsinfo_bitmap[2];
++extern const u32 nfs4_fsinfo_bitmap[3];
+ extern const u32 nfs4_fs_locations_bitmap[2];
+
+ /* nfs4renewd.c */
+@@ -299,13 +303,24 @@ extern void nfs4_kill_renewd(struct nfs_client *);
+ extern void nfs4_renew_state(struct work_struct *);
+
+ /* nfs4state.c */
++struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
+ struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
+ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
+ #if defined(CONFIG_NFS_V4_1)
+-struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
+ struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
+ #endif /* CONFIG_NFS_V4_1 */
+
++static inline struct rpc_cred *
++nfs4_get_machine_cred(struct nfs_client *clp)
+{
-+ struct nfs4_stateid *local = NULL;
-+ struct nfs4_delegation *temp = NULL;
++ struct rpc_cred *cred;
+
-+ /* check if open or lock stateid */
-+ local = find_stateid(stateid, RD_STATE);
-+ if (local)
-+ return 0;
-+ temp = find_delegation_stateid(fp->fi_inode, stateid);
-+ if (temp)
-+ return 0;
-+ return nfserr_bad_stateid;
++ spin_lock(&clp->cl_lock);
++ cred = nfs4_get_machine_cred_locked(clp);
++ spin_unlock(&clp->cl_lock);
++ return cred;
+}
+
+ extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
+ extern void nfs4_put_state_owner(struct nfs4_state_owner *);
+ extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
+diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
+new file mode 100644
+index 0000000..aaabe2f
+--- /dev/null
++++ b/fs/nfs/nfs4filelayout.c
+@@ -0,0 +1,679 @@
+/*
-+ * nfs4_preocess_layout_stateid ()
++ * Module for the pnfs nfs4 file layout driver.
++ * Defines all I/O and Policy interface operations, plus code
++ * to register itself with the pNFS client.
+ *
-+ * We have looked up the nfs4_file corresponding to the current_fh, and
-+ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op()
-+ * that make sense with a layout stateid.
++ * Copyright (c) 2002
++ * The Regents of the University of Michigan
++ * All Rights Reserved
+ *
-+ * Called with the state_lock held
-+ * Returns zero and stateid is updated, or error.
++ * Dean Hildebrand <dhildebz at umich.edu>
+ *
-+ * Note: the struct nfs4_layout_state pointer is only set by layoutget.
++ * Permission is granted to use, copy, create derivative works, and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the University of Michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. If
++ * the above copyright notice or any other identification of the
++ * University of Michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ *
++ * This software is provided as is, without representation or warranty
++ * of any kind either express or implied, including without limitation
++ * the implied warranties of merchantability, fitness for a particular
++ * purpose, or noninfringement. The Regents of the University of
++ * Michigan shall not be liable for any damages, including special,
++ * indirect, incidental, or consequential damages, with respect to any
++ * claim arising out of or in connection with the use of the software,
++ * even if it has been or is hereafter advised of the possibility of
++ * such damages.
+ */
-+static __be32
-+nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp,
-+ stateid_t *stateid, struct nfs4_layout_state **lsp)
-+{
-+ struct nfs4_layout_state *ls = NULL;
-+ __be32 status = 0;
-+
-+ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp);
-+
-+ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__,
-+ STATEID_VAL(stateid));
-+
-+ status = nfs4_check_stateid(stateid);
-+ if (status)
-+ goto out;
-+
-+ /* Is this the first use of this layout ? */
-+ spin_lock(&layout_lock);
-+ ls = find_get_layout_state(clp, fp);
-+ spin_unlock(&layout_lock);
-+ if (!ls) {
-+ /* Only alloc layout state on layoutget (which sets lsp). */
-+ if (!lsp) {
-+ dprintk("%s ERROR: Not layoutget & no layout stateid\n",
-+ __func__);
-+ status = nfserr_bad_stateid;
-+ goto out;
-+ }
-+ dprintk("%s Initial stateid for layout: file %p client %p\n",
-+ __func__, fp, clp);
-+
-+ /* verify input stateid */
-+ status = verify_stateid(fp, stateid);
-+ if (status) {
-+ dprintk("%s ERROR: invalid open/deleg/lock stateid\n",
-+ __func__);
-+ goto out;
-+ }
-+ ls = alloc_init_layout_state(clp, fp, stateid);
-+ if (!ls) {
-+ dprintk("%s pNFS ERROR: no memory for layout state\n",
-+ __func__);
-+ status = nfserr_resource;
-+ goto out;
-+ }
-+ } else {
-+ dprintk("%s Not initial stateid. Layout state %p file %p\n",
-+ __func__, ls, fp);
-+
-+ /* BAD STATEID */
-+ status = nfserr_bad_stateid;
-+ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque,
-+ sizeof(stateid_opaque_t)) != 0) {
-+
-+ /* if a LAYOUTGET operation and stateid is a valid
-+ * open/deleg/lock stateid, accept it as a parallel
-+ * initial layout stateid
-+ */
-+ if (lsp && ((verify_stateid(fp, stateid)) == 0)) {
-+ dprintk("%s parallel initial layout state\n",
-+ __func__);
-+ goto update;
-+ }
-+
-+ dprintk("%s ERROR bad opaque in stateid 1\n", __func__);
-+ goto out_put;
-+ }
-+
-+ /* stateid is a valid layout stateid for this file. */
-+ if (stateid->si_generation > ls->ls_stateid.si_generation) {
-+ dprintk("%s bad stateid 1\n", __func__);
-+ goto out_put;
-+ }
-+update:
-+ update_stateid(&ls->ls_stateid);
-+ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n",
-+ __func__, ls->ls_stateid.si_generation, ls);
-+ }
-+ status = 0;
-+ /* Set the stateid to be encoded */
-+ memcpy(stateid, &ls->ls_stateid, sizeof(stateid_t));
-+
-+ /* Return the layout state if requested */
-+ if (lsp) {
-+ get_layout_state(ls);
-+ *lsp = ls;
-+ }
-+ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__,
-+ STATEID_VAL(&ls->ls_stateid));
-+out_put:
-+ dprintk("%s PUT LO STATE:\n", __func__);
-+ put_layout_state(ls);
-+out:
-+ dprintk("<-- %s status %d\n", __func__, htonl(status));
-+
-+ return status;
-+}
+
-+static inline struct nfs4_layout *
-+alloc_layout(void)
-+{
-+ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL);
-+}
++#include <linux/nfs_fs.h>
+
-+static inline void
-+free_layout(struct nfs4_layout *lp)
-+{
-+ kmem_cache_free(pnfs_layout_slab, lp);
-+}
++#include "internal.h"
++#include "nfs4filelayout.h"
+
-+static void
-+init_layout(struct nfs4_layout_state *ls,
-+ struct nfs4_layout *lp,
-+ struct nfs4_file *fp,
-+ struct nfs4_client *clp,
-+ struct svc_fh *current_fh,
-+ struct nfsd4_layout_seg *seg)
-+{
-+ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__,
-+ ls, lp, clp, fp, fp->fi_inode);
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
-+ get_nfs4_file(fp);
-+ lp->lo_client = clp;
-+ lp->lo_file = fp;
-+ get_layout_state(ls);
-+ lp->lo_state = ls;
-+ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg));
-+ spin_lock(&layout_lock);
-+ list_add_tail(&lp->lo_perstate, &ls->ls_layouts);
-+ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts);
-+ list_add_tail(&lp->lo_perfile, &fp->fi_layouts);
-+ spin_unlock(&layout_lock);
-+ dprintk("pNFS %s end\n", __func__);
-+}
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("Dean Hildebrand <dhildebz at umich.edu>");
++MODULE_DESCRIPTION("The NFSv4 file layout driver");
+
-+static void
-+dequeue_layout(struct nfs4_layout *lp)
++int
++filelayout_initialize_mountpoint(struct nfs_server *nfss,
++ const struct nfs_fh *mntfh)
+{
-+ BUG_ON_UNLOCKED_LAYOUT();
-+ list_del(&lp->lo_perclnt);
-+ list_del(&lp->lo_perfile);
-+ list_del(&lp->lo_perstate);
++ int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
++ nfs4_fl_free_deviceid_callback);
++ if (status) {
++ printk(KERN_WARNING "%s: deviceid cache could not be "
++ "initialized\n", __func__);
++ return status;
++ }
++ dprintk("%s: deviceid cache has been initialized successfully\n",
++ __func__);
++ return 0;
+}
+
-+static void
-+destroy_layout(struct nfs4_layout *lp)
++/* Uninitialize a mountpoint by destroying its device list */
++int
++filelayout_uninitialize_mountpoint(struct nfs_server *nfss)
+{
-+ struct nfs4_client *clp;
-+ struct nfs4_file *fp;
-+ struct nfs4_layout_state *ls;
-+
-+ BUG_ON_UNLOCKED_LAYOUT();
-+ clp = lp->lo_client;
-+ fp = lp->lo_file;
-+ ls = lp->lo_state;
-+ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n",
-+ __func__, lp, clp, fp, fp->fi_inode,
-+ list_empty(&ls->ls_layouts));
++ dprintk("--> %s\n", __func__);
+
-+ kmem_cache_free(pnfs_layout_slab, lp);
-+ /* release references taken by init_layout */
-+ put_layout_state_locked(ls);
-+ put_nfs4_file(fp);
++ if (nfss->nfs_client->cl_devid_cache)
++ pnfs_put_deviceid_cache(nfss->nfs_client);
++ return 0;
+}
+
-+void fs_layout_return(struct super_block *sb, struct inode *ino,
-+ struct nfsd4_pnfs_layoutreturn *lrp, int flags,
-+ void *recall_cookie)
++/* This function is used by the layout driver to calculate the
++ * offset of the file on the dserver based on whether the
++ * layout type is STRIPE_DENSE or STRIPE_SPARSE
++ */
++static loff_t
++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
+{
-+ int ret;
-+
-+ if (unlikely(!sb->s_pnfs_op->layout_return))
-+ return;
++ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
-+ lrp->lr_flags = flags;
-+ lrp->args.lr_cookie = recall_cookie;
++ switch (flseg->stripe_type) {
++ case STRIPE_SPARSE:
++ return offset;
+
-+ if (!ino) /* FSID or ALL */
-+ ino = sb->s_root->d_inode;
++ case STRIPE_DENSE:
++ {
++ u32 stripe_width;
++ u64 tmp, off;
++ u32 unit = flseg->stripe_unit;
+
-+ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args);
-+ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx "
-+ "cookie = %p flags 0x%x status=%d\n",
-+ __func__, ino->i_ino, lrp->args.lr_seg.iomode,
-+ lrp->args.lr_seg.offset, lrp->args.lr_seg.length,
-+ recall_cookie, flags, ret);
++ stripe_width = unit * flseg->dsaddr->stripe_count;
++ tmp = off = offset - flseg->pattern_offset;
++ do_div(tmp, stripe_width);
++ return tmp * unit + do_div(off, unit);
++ }
++ default:
++ BUG();
++ }
++
++ /* We should never get here... just to stop the gcc warning */
++ return 0;
+}
+
-+static u64
-+alloc_init_sbid(struct super_block *sb)
++/*
++ * Call ops for the async read/write cases
++ * In the case of dense layouts, the offset needs to be reset to its
++ * original value.
++ */
++static void filelayout_read_call_done(struct rpc_task *task, void *data)
+{
-+ struct sbid_tracker *sbid;
-+ struct sbid_tracker *new = alloc_sbid();
-+ unsigned long hash_idx = sbid_hashval(sb);
-+ u64 id = 0;
-+
-+ if (likely(new)) {
-+ spin_lock(&layout_lock);
-+ id = ++current_sbid;
-+ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK);
-+ id = new->id;
-+ BUG_ON(id == 0);
-+ new->sb = sb;
++ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
-+ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash)
-+ if (sbid->sb == sb) {
-+ kfree(new);
-+ id = sbid->id;
-+ spin_unlock(&layout_lock);
-+ return id;
-+ }
-+ list_add(&new->hash, &sbid_hashtbl[hash_idx]);
-+ spin_unlock(&layout_lock);
++ if (rdata->fldata.orig_offset) {
++ dprintk("%s new off %llu orig offset %llu\n", __func__,
++ rdata->args.offset, rdata->fldata.orig_offset);
++ rdata->args.offset = rdata->fldata.orig_offset;
+ }
-+ return id;
++
++ /* Note this may cause RPC to be resent */
++ rdata->pdata.call_ops->rpc_call_done(task, data);
+}
+
-+struct super_block *
-+find_sbid_id(u64 id)
++static void filelayout_read_release(void *data)
+{
-+ struct sbid_tracker *sbid;
-+ struct super_block *sb = NULL;
-+ unsigned long hash_idx = id & SBID_HASH_MASK;
-+ int pos = 0;
++ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
-+ spin_lock(&layout_lock);
-+ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
-+ pos++;
-+ if (sbid->id != id)
-+ continue;
-+ if (pos > 1)
-+ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
-+ sb = sbid->sb;
-+ break;
-+ }
-+ spin_unlock(&layout_lock);
-+ return sb;
++ put_lseg(rdata->pdata.lseg);
++ rdata->pdata.lseg = NULL;
++ rdata->pdata.call_ops->rpc_release(data);
+}
+
-+u64
-+find_create_sbid(struct super_block *sb)
++static void filelayout_write_call_done(struct rpc_task *task, void *data)
+{
-+ struct sbid_tracker *sbid;
-+ unsigned long hash_idx = sbid_hashval(sb);
-+ int pos = 0;
-+ u64 id = 0;
++ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
-+ spin_lock(&layout_lock);
-+ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
-+ pos++;
-+ if (sbid->sb != sb)
-+ continue;
-+ if (pos > 1)
-+ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
-+ id = sbid->id;
-+ break;
++ if (wdata->fldata.orig_offset) {
++ dprintk("%s new off %llu orig offset %llu\n", __func__,
++ wdata->args.offset, wdata->fldata.orig_offset);
++ wdata->args.offset = wdata->fldata.orig_offset;
+ }
-+ spin_unlock(&layout_lock);
+
-+ if (!id)
-+ id = alloc_init_sbid(sb);
++ /* Note this may cause RPC to be resent */
++ wdata->pdata.call_ops->rpc_call_done(task, data);
++}
+
-+ return id;
++static void filelayout_write_release(void *data)
++{
++ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
++
++ put_lseg(wdata->pdata.lseg);
++ wdata->pdata.lseg = NULL;
++ wdata->pdata.call_ops->rpc_release(data);
+}
+
-+/*
-+ * Create a layoutrecall structure
-+ * An optional layoutrecall can be cloned (except for the layoutrecall lists)
++struct rpc_call_ops filelayout_read_call_ops = {
++ .rpc_call_prepare = nfs_read_prepare,
++ .rpc_call_done = filelayout_read_call_done,
++ .rpc_release = filelayout_read_release,
++};
++
++struct rpc_call_ops filelayout_write_call_ops = {
++ .rpc_call_prepare = nfs_write_prepare,
++ .rpc_call_done = filelayout_write_call_done,
++ .rpc_release = filelayout_write_release,
++};
++
++/* Perform sync or async reads.
++ *
++ * An optimization for the NFS file layout driver
++ * allows the original read/write data structs to be passed in the
++ * last argument.
++ *
++ * TODO: join with write_pagelist?
+ */
-+static struct nfs4_layoutrecall *
-+alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl,
-+ struct nfs4_client *clp,
-+ struct nfs4_file *lrfile)
++static enum pnfs_try_status
++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages)
+{
-+ struct nfs4_layoutrecall *clr;
++ struct pnfs_layout_segment *lseg = data->pdata.lseg;
++ struct nfs4_pnfs_ds *ds;
++ loff_t offset = data->args.offset;
++ u32 idx;
++ struct nfs_fh *fh;
+
-+ dprintk("NFSD %s\n", __func__);
-+ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL);
-+ if (clr == NULL)
-+ return clr;
++ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n",
++ __func__, data->inode->i_ino, nr_pages,
++ data->args.pgbase, (size_t)data->args.count, offset);
+
-+ dprintk("NFSD %s -->\n", __func__);
++ /* Retrieve the correct rpc_client for the byte range */
++ idx = nfs4_fl_calc_ds_index(lseg, offset);
++ ds = nfs4_fl_prepare_ds(lseg, idx);
++ if (!ds) {
++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
++ return PNFS_NOT_ATTEMPTED;
++ }
++ dprintk("%s USE DS:ip %x %hu\n", __func__,
++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
-+ memset(clr, 0, sizeof(*clr));
-+ if (lrfile)
-+ get_nfs4_file(lrfile);
-+ clr->clr_client = clp;
-+ clr->clr_file = lrfile;
-+ clr->cb = *cbl;
++ /* just try the first data server for the index..*/
++ data->fldata.ds_nfs_client = ds->ds_clp;
++ fh = nfs4_fl_select_ds_fh(lseg, offset);
++ if (fh)
++ data->args.fh = fh;
+
-+ kref_init(&clr->clr_ref);
-+ INIT_LIST_HEAD(&clr->clr_perclnt);
++ /*
++ * Now get the file offset on the dserver
++ * Set the read offset to this offset, and
++ * save the original offset in orig_offset
++ * In the case of async reads, the offset will be reset in the
++ * call_ops->rpc_call_done() routine.
++ */
++ data->args.offset = filelayout_get_dserver_offset(lseg, offset);
++ data->fldata.orig_offset = offset;
+
-+ dprintk("NFSD %s return %p\n", __func__, clr);
-+ return clr;
-+}
++ /* Perform an asynchronous read */
++ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
++ &filelayout_read_call_ops);
+
-+static void
-+get_layoutrecall(struct nfs4_layoutrecall *clr)
-+{
-+ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
-+ atomic_read(&clr->clr_ref.refcount));
-+ kref_get(&clr->clr_ref);
-+}
++ data->pdata.pnfs_error = 0;
+
-+static void
-+destroy_layoutrecall(struct kref *kref)
-+{
-+ struct nfs4_layoutrecall *clr =
-+ container_of(kref, struct nfs4_layoutrecall, clr_ref);
-+ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr,
-+ clr->clr_file, clr->clr_client);
-+ BUG_ON(!list_empty(&clr->clr_perclnt));
-+ if (clr->clr_file)
-+ put_nfs4_file(clr->clr_file);
-+ kmem_cache_free(pnfs_layoutrecall_slab, clr);
++ return PNFS_ATTEMPTED;
+}
+
-+int
-+put_layoutrecall(struct nfs4_layoutrecall *clr)
++/* Perform async writes. */
++static enum pnfs_try_status
++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync)
+{
-+ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
-+ atomic_read(&clr->clr_ref.refcount));
-+ return kref_put(&clr->clr_ref, destroy_layoutrecall);
-+}
++ struct pnfs_layout_segment *lseg = data->pdata.lseg;
++ struct nfs4_pnfs_ds *ds;
++ loff_t offset = data->args.offset;
++ u32 idx;
++ struct nfs_fh *fh;
+
-+void *
-+layoutrecall_done(struct nfs4_layoutrecall *clr)
-+{
-+ void *recall_cookie = clr->cb.cbl_cookie;
-+ struct nfs4_layoutrecall *parent = clr->parent;
++ /* Retrieve the correct rpc_client for the byte range */
++ idx = nfs4_fl_calc_ds_index(lseg, offset);
++ ds = nfs4_fl_prepare_ds(lseg, idx);
++ if (!ds) {
++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
++ return PNFS_NOT_ATTEMPTED;
++ }
++ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
++ data->inode->i_ino, sync, (size_t) data->args.count, offset,
++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
-+ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
-+ atomic_read(&clr->clr_ref.refcount));
-+ BUG_ON_UNLOCKED_LAYOUT();
-+ list_del_init(&clr->clr_perclnt);
-+ put_layoutrecall(clr);
++ data->fldata.ds_nfs_client = ds->ds_clp;
++ fh = nfs4_fl_select_ds_fh(lseg, offset);
++ if (fh)
++ data->args.fh = fh;
++ /*
++ * Get the file offset on the dserver. Set the write offset to
++ * this offset and save the original offset.
++ */
++ data->args.offset = filelayout_get_dserver_offset(lseg, offset);
++ data->fldata.orig_offset = offset;
+
-+ if (parent && !put_layoutrecall(parent))
-+ recall_cookie = NULL;
++ /*
++ * Perform an asynchronous write. The offset will be reset in the
++ * call_ops->rpc_call_done() routine
++ */
++ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
++ &filelayout_write_call_ops, sync);
+
-+ return recall_cookie;
++ data->pdata.pnfs_error = 0;
++ return PNFS_ATTEMPTED;
+}
+
+/*
-+ * get_state() and cb_get_state() are
++ * filelayout_check_layout()
++ *
++ * Make sure layout segment parameters are sane WRT the device.
++ * At this point no generic layer initialization of the lseg has occurred,
++ * and nothing has been added to the layout_hdr cache.
++ *
+ */
-+void
-+release_pnfs_ds_dev_list(struct nfs4_stateid *stp)
++static int
++filelayout_check_layout(struct pnfs_layout_hdr *lo,
++ struct nfs4_filelayout_segment *fl,
++ struct nfs4_layoutget_res *lgr,
++ struct nfs4_deviceid *id)
+{
-+ struct pnfs_ds_dev_entry *ddp;
++ struct nfs4_file_layout_dsaddr *dsaddr;
++ int status = -EINVAL;
++ struct nfs_server *nfss = NFS_SERVER(lo->inode);
+
-+ while (!list_empty(&stp->st_pnfs_ds_id)) {
-+ ddp = list_entry(stp->st_pnfs_ds_id.next,
-+ struct pnfs_ds_dev_entry, dd_dev_entry);
-+ list_del(&ddp->dd_dev_entry);
-+ kfree(ddp);
++ dprintk("--> %s\n", __func__);
++
++ if (fl->pattern_offset > lgr->range.offset) {
++ dprintk("%s pattern_offset %lld to large\n",
++ __func__, fl->pattern_offset);
++ goto out;
+ }
-+}
+
-+static int
-+nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid)
-+{
-+ struct pnfs_ds_dev_entry *ddp;
++ if (fl->stripe_unit % PAGE_SIZE) {
++ dprintk("%s Stripe unit (%u) not page aligned\n",
++ __func__, fl->stripe_unit);
++ goto out;
++ }
+
-+ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL);
-+ if (!ddp)
-+ return -ENOMEM;
++ /* find and reference the deviceid */
++ dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
++ if (dsaddr == NULL) {
++ dsaddr = get_device_info(lo->inode, id);
++ if (dsaddr == NULL)
++ goto out;
++ }
++ fl->dsaddr = dsaddr;
+
-+ INIT_LIST_HEAD(&ddp->dd_dev_entry);
-+ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id);
-+ ddp->dd_dsid = dsid;
-+ return 0;
-+}
++ if (fl->first_stripe_index < 0 ||
++ fl->first_stripe_index >= dsaddr->stripe_count) {
++ dprintk("%s Bad first_stripe_index %d\n",
++ __func__, fl->first_stripe_index);
++ goto out_put;
++ }
+
-+/*
-+ * are two octet ranges overlapping?
-+ * start1 last1
-+ * |-----------------|
-+ * start2 last2
-+ * |----------------|
-+ */
-+static inline int
-+lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
-+{
-+ u64 start1 = l1->offset;
-+ u64 last1 = last_byte_offset(start1, l1->length);
-+ u64 start2 = l2->offset;
-+ u64 last2 = last_byte_offset(start2, l2->length);
-+ int ret;
++ if ((fl->stripe_type == STRIPE_SPARSE &&
++ fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
++ (fl->stripe_type == STRIPE_DENSE &&
++ fl->num_fh != dsaddr->stripe_count)) {
++ dprintk("%s num_fh %u not valid for given packing\n",
++ __func__, fl->num_fh);
++ goto out_put;
++ }
+
-+ /* if last1 == start2 there's a single byte overlap */
-+ ret = (last2 >= start1) && (last1 >= start2);
-+ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__,
-+ l1->offset, l1->length, l2->offset, l2->length, ret);
-+ return ret;
++ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
++ dprintk("%s Stripe unit (%u) not aligned with rsize %u "
++ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
++ nfss->wsize);
++ }
++
++ status = 0;
++out:
++ dprintk("--> %s returns %d\n", __func__, status);
++ return status;
++out_put:
++ pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
++ goto out;
+}
+
-+static inline int
-+same_fsid_major(struct nfs4_fsid *fsid, u64 major)
++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
+{
-+ return fsid->major == major;
++ int i;
++
++ for (i = 0; i < fl->num_fh; i++) {
++ if (!fl->fh_array[i])
++ break;
++ kfree(fl->fh_array[i]);
++ }
++ kfree(fl->fh_array);
++ fl->fh_array = NULL;
+}
+
-+static inline int
-+same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh)
++static void
++_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
+{
-+ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid);
++ filelayout_free_fh_array(fl);
++ kfree(fl);
+}
+
-+/*
-+ * find a layout recall conflicting with the specified layoutget
-+ */
+static int
-+is_layout_recalled(struct nfs4_client *clp,
-+ struct svc_fh *current_fh,
-+ struct nfsd4_layout_seg *seg)
++filelayout_decode_layout(struct pnfs_layout_hdr *flo,
++ struct nfs4_filelayout_segment *fl,
++ struct nfs4_layoutget_res *lgr,
++ struct nfs4_deviceid *id)
+{
-+ struct nfs4_layoutrecall *clr;
++ uint32_t *p = (uint32_t *)lgr->layout.buf;
++ uint32_t nfl_util;
++ int i;
+
-+ spin_lock(&layout_lock);
-+ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) {
-+ if (clr->cb.cbl_seg.layout_type != seg->layout_type)
-+ continue;
-+ if (clr->cb.cbl_recall_type == RETURN_ALL)
-+ goto found;
-+ if (clr->cb.cbl_recall_type == RETURN_FSID) {
-+ if (same_fsid(&clr->cb.cbl_fsid, current_fh))
-+ goto found;
-+ else
-+ continue;
-+ }
-+ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE);
-+ if (clr->cb.cbl_seg.clientid == seg->clientid &&
-+ lo_seg_overlapping(&clr->cb.cbl_seg, seg))
-+ goto found;
-+ }
-+ spin_unlock(&layout_lock);
-+ return 0;
-+found:
-+ spin_unlock(&layout_lock);
-+ return 1;
-+}
++ dprintk("%s: set_layout_map Begin\n", __func__);
+
-+/*
-+ * are two octet ranges overlapping or adjacent?
-+ */
-+static inline int
-+lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
-+{
-+ u64 start1 = l1->offset;
-+ u64 end1 = end_offset(start1, l1->length);
-+ u64 start2 = l2->offset;
-+ u64 end2 = end_offset(start2, l2->length);
++ memcpy(id, p, sizeof(*id));
++ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
++ print_deviceid(id);
+
-+ /* is end1 == start2 ranges are adjacent */
-+ return (end2 >= start1) && (end1 >= start2);
-+}
++ nfl_util = be32_to_cpup(p++);
++ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
++ fl->commit_through_mds = 1;
++ if (nfl_util & NFL4_UFLG_DENSE)
++ fl->stripe_type = STRIPE_DENSE;
++ else
++ fl->stripe_type = STRIPE_SPARSE;
++ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
+
-+static void
-+extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg)
-+{
-+ u64 lo_start = lo->offset;
-+ u64 lo_end = end_offset(lo_start, lo->length);
-+ u64 lg_start = lg->offset;
-+ u64 lg_end = end_offset(lg_start, lg->length);
++ fl->first_stripe_index = be32_to_cpup(p++);
++ p = xdr_decode_hyper(p, &fl->pattern_offset);
++ fl->num_fh = be32_to_cpup(p++);
+
-+ /* lo already covers lg? */
-+ if (lo_start <= lg_start && lg_end <= lo_end)
-+ return;
++ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
++ __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
++ fl->pattern_offset);
+
-+ /* extend start offset */
-+ if (lo_start > lg_start)
-+ lo_start = lg_start;
++ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
++ GFP_KERNEL);
++ if (!fl->fh_array)
++ return -ENOMEM;
+
-+ /* extend end offset */
-+ if (lo_end < lg_end)
-+ lo_end = lg_end;
++ for (i = 0; i < fl->num_fh; i++) {
++ /* Do we want to use a mempool here? */
++ fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
++ if (!fl->fh_array[i]) {
++ filelayout_free_fh_array(fl);
++ return -ENOMEM;
++ }
++ fl->fh_array[i]->size = be32_to_cpup(p++);
++ if (sizeof(fl->fh_array[i]->data) < fl->fh_array[i]->size) {
++ printk(KERN_ERR "Too big fh %d received %d\n",
++ i, fl->fh_array[i]->size);
++ filelayout_free_fh_array(fl);
++ return -EIO;
++ }
++ memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
++ p += XDR_QUADLEN(fl->fh_array[i]->size);
++ dprintk("DEBUG: %s: fh len %d\n", __func__,
++ fl->fh_array[i]->size);
++ }
+
-+ lo->offset = lo_start;
-+ lo->length = (lo_end == NFS4_MAX_UINT64) ?
-+ lo_end : lo_end - lo_start;
++ return 0;
+}
+
-+static struct nfs4_layout *
-+merge_layout(struct nfs4_file *fp,
-+ struct nfs4_client *clp,
-+ struct nfsd4_layout_seg *seg)
++static struct pnfs_layout_segment *
++filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
++ struct nfs4_layoutget_res *lgr)
+{
-+ struct nfs4_layout *lp = NULL;
++ struct nfs4_filelayout_segment *fl;
++ int rc;
++ struct nfs4_deviceid id;
+
-+ spin_lock(&layout_lock);
-+ list_for_each_entry (lp, &fp->fi_layouts, lo_perfile)
-+ if (lp->lo_seg.layout_type == seg->layout_type &&
-+ lp->lo_seg.clientid == seg->clientid &&
-+ lp->lo_seg.iomode == seg->iomode &&
-+ lo_seg_mergeable(&lp->lo_seg, seg)) {
-+ extend_layout(&lp->lo_seg, seg);
-+ break;
-+ }
-+ spin_unlock(&layout_lock);
++ dprintk("--> %s\n", __func__);
++ fl = kzalloc(sizeof(*fl), GFP_KERNEL);
++ if (!fl)
++ return NULL;
+
-+ return lp;
++ rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
++ if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
++ _filelayout_free_lseg(fl);
++ return NULL;
++ }
++ return &fl->generic_hdr;
+}
+
-+__be32
-+nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp,
-+ struct exp_xdr_stream *xdr)
++static void
++filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
-+ u32 status;
-+ __be32 nfserr;
-+ struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode;
-+ struct super_block *sb = ino->i_sb;
-+ int can_merge;
-+ struct nfs4_file *fp;
-+ struct nfs4_client *clp;
-+ struct nfs4_layout *lp = NULL;
-+ struct nfs4_layout_state *ls = NULL;
-+ struct nfsd4_pnfs_layoutget_arg args = {
-+ .lg_minlength = lgp->lg_minlength,
-+ .lg_fh = &lgp->lg_fhp->fh_handle,
-+ };
-+ struct nfsd4_pnfs_layoutget_res res = {
-+ .lg_seg = lgp->lg_seg,
-+ };
++ struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
++ struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+
-+ dprintk("NFSD: %s Begin\n", __func__);
++ dprintk("--> %s\n", __func__);
++ pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
++ &fl->dsaddr->deviceid);
++ _filelayout_free_lseg(fl);
++}
+
-+ args.lg_sbid = find_create_sbid(sb);
-+ if (!args.lg_sbid) {
-+ nfserr = nfserr_layouttrylater;
++/* Allocate and initialize a per-call clone of @old; returns NULL on OOM */
++static struct nfs_write_data *
++filelayout_clone_write_data(struct nfs_write_data *old)
++{
++ struct nfs_write_data *new;
++
++ new = nfs_commitdata_alloc();
++ if (!new)
+ goto out;
-+ }
++ kref_init(&new->refcount);
++ new->parent = old;
++ kref_get(&old->refcount);
++ new->inode = old->inode;
++ new->cred = old->cred;
++ new->args.offset = 0;
++ new->args.count = 0;
++ new->res.count = 0;
++ new->res.fattr = &new->fattr;
++ nfs_fattr_init(&new->fattr);
++ new->res.verf = &new->verf;
++ new->args.context = get_nfs_open_context(old->args.context);
++ new->pdata.lseg = NULL;
++ new->pdata.call_ops = old->pdata.call_ops;
++ new->pdata.how = old->pdata.how;
++out:
++ return new;
++}
+
-+ can_merge = sb->s_pnfs_op->can_merge_layouts != NULL &&
-+ sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type);
++static void filelayout_commit_call_done(struct rpc_task *task, void *data)
++{
++ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
-+ nfs4_lock_state();
-+ fp = find_alloc_file(ino, lgp->lg_fhp);
-+ clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid);
-+ dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp);
-+ if (!fp || !clp) {
-+ nfserr = nfserr_inval;
-+ goto out_unlock;
-+ }
++ wdata->pdata.call_ops->rpc_call_done(task, data);
++}
+
-+ /* Check decoded layout stateid */
-+ nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls);
-+ if (nfserr)
-+ goto out_unlock;
++static struct rpc_call_ops filelayout_commit_call_ops = {
++ .rpc_call_prepare = nfs_write_prepare,
++ .rpc_call_done = filelayout_commit_call_done,
++ .rpc_release = filelayout_write_release,
++};
+
-+ if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) {
-+ nfserr = nfserr_recallconflict;
-+ goto out;
-+ }
++/*
++ * Execute a COMMIT op to the MDS or to each data server on which a page
++ * in 'pages' exists.
++ * Invoke the pnfs_commit_complete callback.
++ */
++enum pnfs_try_status
++filelayout_commit(struct nfs_write_data *data, int sync)
++{
++ LIST_HEAD(head);
++ struct nfs_page *req;
++ loff_t file_offset = 0;
++ u16 idx, i;
++ struct list_head **ds_page_list = NULL;
++ u16 *indices_used;
++ int num_indices_seen = 0;
++ const struct rpc_call_ops *call_ops;
++ struct rpc_clnt *clnt;
++ struct nfs_write_data **clone_list = NULL;
++ struct nfs_write_data *dsdata;
++ struct nfs4_pnfs_ds *ds;
+
-+ /* pre-alloc layout in case we can't merge after we call
-+ * the file system
++ dprintk("%s data %p sync %d\n", __func__, data, sync);
++
++ /* Alloc room for both in one go */
++ ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) *
++ (sizeof(u16) + sizeof(struct list_head *)),
++ GFP_KERNEL);
++ if (!ds_page_list)
++ goto mem_error;
++ indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1);
++ /*
++ * Sort pages based on which ds to send to.
++ * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT.
++ * Note we are assuming there is only a single lseg in play.
++ * When that is not true, we could first sort on lseg, then
++ * sort within each as we do here.
+ */
-+ lp = alloc_layout();
-+ if (!lp) {
-+ nfserr = nfserr_layouttrylater;
-+ goto out_unlock;
++ while (!list_empty(&data->pages)) {
++ req = nfs_list_entry(data->pages.next);
++ nfs_list_remove_request(req);
++ if (!req->wb_lseg ||
++ ((struct nfs4_filelayout_segment *)
++ FILELAYOUT_LSEG(req->wb_lseg))->commit_through_mds)
++ idx = NFS4_PNFS_MAX_MULTI_CNT;
++ else {
++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
++ idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset);
++ }
++ if (ds_page_list[idx]) {
++ /* Already seen this idx */
++ list_add(&req->wb_list, ds_page_list[idx]);
++ } else {
++ /* New idx not seen so far */
++ list_add_tail(&req->wb_list, &head);
++ indices_used[num_indices_seen++] = idx;
++ }
++ ds_page_list[idx] = &req->wb_list;
+ }
++ /* Once created, clone must be released via call_op */
++ clone_list = kzalloc(num_indices_seen *
++ sizeof(struct nfs_write_data *), GFP_KERNEL);
++ if (!clone_list)
++ goto mem_error;
++ for (i = 0; i < num_indices_seen - 1; i++) {
++ clone_list[i] = filelayout_clone_write_data(data);
++ if (!clone_list[i])
++ goto mem_error;
++ }
++ clone_list[i] = data;
++ /*
++ * Now send off the RPCs to each ds. Note that it is important
++ * that any RPC to the MDS be sent last (or at least after all
++ * clones have been made.)
++ */
++ for (i = 0; i < num_indices_seen; i++) {
++ dsdata = clone_list[i];
++ idx = indices_used[i];
++ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]);
++ if (idx == NFS4_PNFS_MAX_MULTI_CNT) {
++ call_ops = data->pdata.call_ops;;
++ clnt = NFS_CLIENT(dsdata->inode);
++ ds = NULL;
++ } else {
++ struct nfs_fh *fh;
+
-+ dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd "
-+ "iomode %u offset %llu length %llu\n",
-+ __func__, lgp->lg_seg.layout_type,
-+ exp_xdr_qbytes(xdr->end - xdr->p),
-+ lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length);
-+
-+ /* FIXME: need to eliminate the use of the state lock */
-+ nfs4_unlock_state();
-+ status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res);
-+ nfs4_lock_state();
++ call_ops = &filelayout_commit_call_ops;
++ req = nfs_list_entry(dsdata->pages.next);
++ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx);
++ if (!ds) {
++ /* Trigger retry of this chunk through MDS */
++ dsdata->task.tk_status = -EIO;
++ data->pdata.call_ops->rpc_release(dsdata);
++ continue;
++ }
++ clnt = ds->ds_clp->cl_rpcclient;
++ dsdata->fldata.ds_nfs_client = ds->ds_clp;
++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
++ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset);
++ if (fh)
++ dsdata->args.fh = fh;
++ }
++ dprintk("%s: Initiating commit: %llu USE DS:\n",
++ __func__, file_offset);
++ ifdebug(FACILITY)
++ print_ds(ds);
+
-+ dprintk("pNFS %s: post-export status %u "
-+ "iomode %u offset %llu length %llu\n",
-+ __func__, status, res.lg_seg.iomode,
-+ res.lg_seg.offset, res.lg_seg.length);
++ /* Send COMMIT to data server */
++ nfs_initiate_commit(dsdata, clnt, call_ops, sync);
++ }
++ kfree(clone_list);
++ kfree(ds_page_list);
++ data->pdata.pnfs_error = 0;
++ return PNFS_ATTEMPTED;
+
-+ /*
-+ * The allowable error codes for the layout_get pNFS export
-+ * operations vector function (from the file system) can be
-+ * expanded as needed to include other errors defined for
-+ * the RFC 5561 LAYOUTGET operation.
-+ */
-+ switch (status) {
-+ case 0:
-+ nfserr = NFS4_OK;
-+ break;
-+ case NFS4ERR_ACCESS:
-+ case NFS4ERR_BADIOMODE:
-+ /* No support for LAYOUTIOMODE4_RW layouts */
-+ case NFS4ERR_BADLAYOUT:
-+ /* No layout matching loga_minlength rules */
-+ case NFS4ERR_INVAL:
-+ case NFS4ERR_IO:
-+ case NFS4ERR_LAYOUTTRYLATER:
-+ case NFS4ERR_LAYOUTUNAVAILABLE:
-+ case NFS4ERR_LOCKED:
-+ case NFS4ERR_NOSPC:
-+ case NFS4ERR_RECALLCONFLICT:
-+ case NFS4ERR_SERVERFAULT:
-+ case NFS4ERR_TOOSMALL:
-+ /* Requested layout too big for loga_maxcount */
-+ case NFS4ERR_WRONG_TYPE:
-+ /* Not a regular file */
-+ nfserr = cpu_to_be32(status);
-+ goto out_freelayout;
-+ default:
-+ BUG();
-+ nfserr = nfserr_serverfault;
++ mem_error:
++ if (clone_list) {
++ for (i = 0; i < num_indices_seen - 1; i++) {
++ if (!clone_list[i])
++ break;
++ data->pdata.call_ops->rpc_release(clone_list[i]);
++ }
++ kfree(clone_list);
+ }
++ kfree(ds_page_list);
++ /* One of these will be empty, but doesn't hurt to do both */
++ nfs_mark_list_commit(&head);
++ nfs_mark_list_commit(&data->pages);
++ data->pdata.call_ops->rpc_release(data);
++ return PNFS_ATTEMPTED;
++}
+
-+ lgp->lg_seg = res.lg_seg;
-+ lgp->lg_roc = res.lg_return_on_close;
++/*
++ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
++ *
++ * return 1 : coalesce page
++ * return 0 : don't coalesce page
++ *
++ * By the time this is called, we know req->wb_lseg == prev->wb_lseg
++ */
++int
++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
++ struct nfs_page *req)
++{
++ u64 p_stripe, r_stripe;
++ u32 stripe_unit;
+
-+ /* SUCCESS!
-+ * Can the new layout be merged into an existing one?
-+ * If so, free unused layout struct
-+ */
-+ if (can_merge && merge_layout(fp, clp, &res.lg_seg))
-+ goto out_freelayout;
++ if (!req->wb_lseg)
++ return 1;
++ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
++ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
++ stripe_unit = FILELAYOUT_LSEG(req->wb_lseg)->stripe_unit;
+
-+ /* Can't merge, so let's initialize this new layout */
-+ init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg);
-+out_unlock:
-+ if (ls)
-+ put_layout_state(ls);
-+ if (fp)
-+ put_nfs4_file(fp);
-+ nfs4_unlock_state();
-+out:
-+ dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp,
-+ be32_to_cpu(nfserr));
-+ return nfserr;
-+out_freelayout:
-+ free_layout(lp);
-+ goto out_unlock;
++ do_div(p_stripe, stripe_unit);
++ do_div(r_stripe, stripe_unit);
++
++ return (p_stripe == r_stripe);
+}
+
-+static void
-+trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr)
++static struct pnfs_layoutdriver_type filelayout_type = {
++ .id = LAYOUT_NFSV4_1_FILES,
++ .name = "LAYOUT_NFSV4_1_FILES",
++ .owner = THIS_MODULE,
++ .flags = PNFS_USE_RPC_CODE,
++ .initialize_mountpoint = filelayout_initialize_mountpoint,
++ .uninitialize_mountpoint = filelayout_uninitialize_mountpoint,
++ .alloc_lseg = filelayout_alloc_lseg,
++ .free_lseg = filelayout_free_lseg,
++ .pg_test = filelayout_pg_test,
++ .read_pagelist = filelayout_read_pagelist,
++ .write_pagelist = filelayout_write_pagelist,
++ .commit = filelayout_commit,
++};
++
++static int __init nfs4filelayout_init(void)
+{
-+ u64 lo_start = lo->offset;
-+ u64 lo_end = end_offset(lo_start, lo->length);
-+ u64 lr_start = lr->offset;
-+ u64 lr_end = end_offset(lr_start, lr->length);
++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
++ __func__);
++ return pnfs_register_layoutdriver(&filelayout_type);
++}
+
-+ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__,
-+ lo->offset, lo->length, lr->offset, lr->length);
++static void __exit nfs4filelayout_exit(void)
++{
++ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
++ __func__);
++ pnfs_unregister_layoutdriver(&filelayout_type);
++}
+
-+ /* lr fully covers lo? */
-+ if (lr_start <= lo_start && lo_end <= lr_end) {
-+ lo->length = 0;
-+ goto out;
-+ }
++module_init(nfs4filelayout_init);
++module_exit(nfs4filelayout_exit);
+diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
+new file mode 100644
+index 0000000..f884b0c
+--- /dev/null
++++ b/fs/nfs/nfs4filelayout.h
+@@ -0,0 +1,100 @@
++/*
++ * NFSv4 file layout driver data structures.
++ *
++ * Copyright (c) 2002
++ * The Regents of the University of Michigan
++ * All Rights Reserved
++ *
++ * Dean Hildebrand <dhildebz at umich.edu>
++ *
++ * Permission is granted to use, copy, create derivative works, and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the University of Michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. If
++ * the above copyright notice or any other identification of the
++ * University of Michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ *
++ * This software is provided as is, without representation or warranty
++ * of any kind either express or implied, including without limitation
++ * the implied warranties of merchantability, fitness for a particular
++ * purpose, or noninfringement. The Regents of the University of
++ * Michigan shall not be liable for any damages, including special,
++ * indirect, incidental, or consequential damages, with respect to any
++ * claim arising out of or in connection with the use of the software,
++ * even if it has been or is hereafter advised of the possibility of
++ * such damages.
++ */
+
-+ /*
-+ * split not supported yet. retain layout segment.
-+ * remains must be returned by the client
-+ * on the final layout return.
-+ */
-+ if (lo_start < lr_start && lr_end < lo_end) {
-+ dprintk("%s: split not supported\n", __func__);
-+ goto out;
-+ }
++#ifndef FS_NFS_NFS4FILELAYOUT_H
++#define FS_NFS_NFS4FILELAYOUT_H
+
-+ if (lo_start < lr_start)
-+ lo_end = lr_start - 1;
-+ else /* lr_end < lo_end */
-+ lo_start = lr_end + 1;
++#include "pnfs.h"
+
-+ lo->offset = lo_start;
-+ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start;
-+out:
-+ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length);
-+}
++/*
++ * Field testing shows we need to support up to 4096 stripe indices.
++ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
++ * reasonable. This in turn means we support a maximum of 256
++ * RFC 5661 multipath_list4 structures.
++ */
++#define NFS4_PNFS_MAX_STRIPE_CNT 4096
++#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
+
-+static int
-+pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp,
-+ struct nfsd4_pnfs_layoutreturn *lrp)
++enum stripetype4 {
++ STRIPE_SPARSE = 1,
++ STRIPE_DENSE = 2
++};
++
++/* Individual ip address */
++struct nfs4_pnfs_ds {
++ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
++ u32 ds_ip_addr;
++ u32 ds_port;
++ struct nfs_client *ds_clp;
++ atomic_t ds_count;
++};
++
++struct nfs4_file_layout_dsaddr {
++ struct pnfs_deviceid_node deviceid;
++ u32 stripe_count;
++ u8 *stripe_indices;
++ u32 ds_num;
++ struct nfs4_pnfs_ds *ds_list[1];
++};
++
++struct nfs4_filelayout_segment {
++ struct pnfs_layout_segment generic_hdr;
++ u32 stripe_type;
++ u32 commit_through_mds;
++ u32 stripe_unit;
++ u32 first_stripe_index;
++ u64 pattern_offset;
++ struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
++ unsigned int num_fh;
++ struct nfs_fh **fh_array;
++};
++
++static inline struct nfs4_filelayout_segment *
++FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
-+ int layouts_found = 0;
-+ struct nfs4_layout *lp, *nextlp;
++ return container_of(lseg,
++ struct nfs4_filelayout_segment,
++ generic_hdr);
++}
+
-+ dprintk("%s: clp %p fp %p\n", __func__, clp, fp);
-+ spin_lock(&layout_lock);
-+ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) {
-+ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n",
-+ __func__, lp,
-+ lp->lo_client, clp,
-+ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type,
-+ lp->lo_seg.iomode, lrp->args.lr_seg.iomode);
-+ if (lp->lo_client != clp ||
-+ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type ||
-+ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode &&
-+ lrp->args.lr_seg.iomode != IOMODE_ANY) ||
-+ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg))
-+ continue;
-+ layouts_found++;
-+ trim_layout(&lp->lo_seg, &lrp->args.lr_seg);
-+ if (!lp->lo_seg.length) {
-+ lrp->lrs_present = 0;
-+ dequeue_layout(lp);
-+ destroy_layout(lp);
-+ }
-+ }
-+ spin_unlock(&layout_lock);
++extern struct nfs_fh *
++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset);
++
++extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
++extern void print_ds(struct nfs4_pnfs_ds *ds);
++extern void print_deviceid(struct nfs4_deviceid *dev_id);
++u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset);
++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
++ u32 ds_idx);
++extern struct nfs4_file_layout_dsaddr *
++nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
++struct nfs4_file_layout_dsaddr *
++get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
++
++#endif /* FS_NFS_NFS4FILELAYOUT_H */
+diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
+new file mode 100644
+index 0000000..1f0ab62
+--- /dev/null
++++ b/fs/nfs/nfs4filelayoutdev.c
+@@ -0,0 +1,620 @@
++/*
++ * Device operations for the pnfs nfs4 file layout driver.
++ *
++ * Copyright (c) 2002
++ * The Regents of the University of Michigan
++ * All Rights Reserved
++ *
++ * Dean Hildebrand <dhildebz at umich.edu>
++ * Garth Goodson <Garth.Goodson at netapp.com>
++ *
++ * Permission is granted to use, copy, create derivative works, and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the University of Michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. If
++ * the above copyright notice or any other identification of the
++ * University of Michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ *
++ * This software is provided as is, without representation or warranty
++ * of any kind either express or implied, including without limitation
++ * the implied warranties of merchantability, fitness for a particular
++ * purpose, or noninfringement. The Regents of the University of
++ * Michigan shall not be liable for any damages, including special,
++ * indirect, incidental, or consequential damages, with respect to any
++ * claim arising out of or in connection with the use of the software,
++ * even if it has been or is hereafter advised of the possibility of
++ * such damages.
++ */
+
-+ return layouts_found;
-+}
++#include <linux/nfs_fs.h>
++#include <linux/vmalloc.h>
+
-+static int
-+pnfs_return_client_layouts(struct nfs4_client *clp,
-+ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid)
-+{
-+ int layouts_found = 0;
-+ struct nfs4_layout *lp, *nextlp;
++#include "internal.h"
++#include "nfs4filelayout.h"
+
-+ spin_lock(&layout_lock);
-+ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) {
-+ if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type ||
-+ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode &&
-+ lrp->args.lr_seg.iomode != IOMODE_ANY))
-+ continue;
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
-+ if (lrp->args.lr_return_type == RETURN_FSID &&
-+ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid))
-+ continue;
++/*
++ * Data server cache
++ *
++ * Data servers can be mapped to different device ids.
++ * nfs4_pnfs_ds reference counting
++ * - set to 1 on allocation
++ * - incremented when a device id maps a data server already in the cache.
++ * - decremented when deviceid is removed from the cache.
++ */
++DEFINE_SPINLOCK(nfs4_ds_cache_lock);
++static LIST_HEAD(nfs4_data_server_cache);
+
-+ layouts_found++;
-+ dequeue_layout(lp);
-+ destroy_layout(lp);
++/* Debug routines */
++void
++print_ds(struct nfs4_pnfs_ds *ds)
++{
++ if (ds == NULL) {
++ printk("%s NULL device\n", __func__);
++ return;
+ }
-+ spin_unlock(&layout_lock);
-+
-+ return layouts_found;
++ printk(" ip_addr %x port %hu\n"
++ " ref count %d\n"
++ " client %p\n"
++ " cl_exchange_flags %x\n",
++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
++ atomic_read(&ds->ds_count), ds->ds_clp,
++ ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
+}
+
-+static int
-+recall_return_perfect_match(struct nfs4_layoutrecall *clr,
-+ struct nfsd4_pnfs_layoutreturn *lrp,
-+ struct nfs4_file *fp,
-+ struct svc_fh *current_fh)
++void
++print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
+{
-+ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode ||
-+ clr->cb.cbl_recall_type != lrp->args.lr_return_type)
-+ return 0;
-+
-+ return (clr->cb.cbl_recall_type == RETURN_FILE &&
-+ clr->clr_file == fp &&
-+ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset &&
-+ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) ||
-+
-+ (clr->cb.cbl_recall_type == RETURN_FSID &&
-+ same_fsid(&clr->cb.cbl_fsid, current_fh)) ||
++ int i;
+
-+ clr->cb.cbl_recall_type == RETURN_ALL;
++ ifdebug(FACILITY) {
++ printk("%s dsaddr->ds_num %d\n", __func__,
++ dsaddr->ds_num);
++ for (i = 0; i < dsaddr->ds_num; i++)
++ print_ds(dsaddr->ds_list[i]);
++ }
+}
+
-+static int
-+recall_return_partial_match(struct nfs4_layoutrecall *clr,
-+ struct nfsd4_pnfs_layoutreturn *lrp,
-+ struct nfs4_file *fp,
-+ struct svc_fh *current_fh)
++void print_deviceid(struct nfs4_deviceid *id)
+{
-+ /* iomode matching? */
-+ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode &&
-+ clr->cb.cbl_seg.iomode != IOMODE_ANY &&
-+ lrp->args.lr_seg.iomode != IOMODE_ANY)
-+ return 0;
-+
-+ if (clr->cb.cbl_recall_type == RETURN_ALL ||
-+ lrp->args.lr_return_type == RETURN_ALL)
-+ return 1;
-+
-+ /* fsid matches? */
-+ if (clr->cb.cbl_recall_type == RETURN_FSID ||
-+ lrp->args.lr_return_type == RETURN_FSID)
-+ return same_fsid(&clr->cb.cbl_fsid, current_fh);
++ u32 *p = (u32 *)id;
+
-+ /* file matches, range overlapping? */
-+ return clr->clr_file == fp &&
-+ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg);
++ dprintk("%s: device id= [%x%x%x%x]\n", __func__,
++ p[0], p[1], p[2], p[3]);
+}
+
-+int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh,
-+ struct nfsd4_pnfs_layoutreturn *lrp)
++/* nfs4_ds_cache_lock is held */
++static struct nfs4_pnfs_ds *
++_data_server_lookup_locked(u32 ip_addr, u32 port)
+{
-+ int status = 0;
-+ int layouts_found = 0;
-+ struct inode *ino = current_fh->fh_dentry->d_inode;
-+ struct nfs4_file *fp = NULL;
-+ struct nfs4_client *clp;
-+ struct nfs4_layoutrecall *clr, *nextclr;
-+ u64 ex_fsid = current_fh->fh_export->ex_fsid;
-+ void *recall_cookie = NULL;
-+
-+ dprintk("NFSD: %s\n", __func__);
++ struct nfs4_pnfs_ds *ds;
+
-+ nfs4_lock_state();
-+ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid);
-+ if (!clp)
-+ goto out;
++ dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
++ ntohl(ip_addr), ntohs(port));
+
-+ if (lrp->args.lr_return_type == RETURN_FILE) {
-+ fp = find_file(ino);
-+ if (!fp) {
-+ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for "
-+ "ino %p:%lu\n",
-+ __func__, ino, ino ? ino->i_ino : 0L);
-+ goto out;
++ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
++ if (ds->ds_ip_addr == ip_addr &&
++ ds->ds_port == port) {
++ return ds;
+ }
++ }
++ return NULL;
++}
+
-+ /* Check the stateid */
-+ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino);
-+ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid,
-+ NULL);
-+ if (status)
-+ goto out_put_file;
++/* Create an rpc to the data server defined in 'ds' */
++static int
++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
++{
++ struct nfs_server *tmp;
++ struct sockaddr_in sin;
++ struct rpc_clnt *mds_clnt = mds_srv->client;
++ struct nfs_client *clp = mds_srv->nfs_client;
++ struct sockaddr *mds_addr;
++ int err = 0;
+
-+ /* update layouts */
-+ layouts_found = pnfs_return_file_layouts(clp, fp, lrp);
-+ /* optimize for the all-empty case */
-+ if (list_empty(&fp->fi_layouts))
-+ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
-+ } else {
-+ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid);
-+ }
++ dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
++ mds_clnt->cl_auth->au_flavor);
+
-+ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d "
-+ "return_type %d fsid 0x%llx offset %llu length %llu: "
-+ "layouts_found %d\n",
-+ __func__, clp, fp, lrp->args.lr_seg.layout_type,
-+ lrp->args.lr_seg.iomode, lrp->args.lr_return_type,
-+ ex_fsid,
-+ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found);
++ sin.sin_family = AF_INET;
++ sin.sin_addr.s_addr = ds->ds_ip_addr;
++ sin.sin_port = ds->ds_port;
+
-+ /* update layoutrecalls
-+ * note: for RETURN_{FSID,ALL}, fp may be NULL
++ /*
++ * If this DS is also the MDS, use the MDS session only if the
++ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role.
+ */
-+ spin_lock(&layout_lock);
-+ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls,
-+ clr_perclnt) {
-+ if (clr->cb.cbl_seg.layout_type != lrp->args.lr_seg.layout_type)
-+ continue;
-+
-+ if (recall_return_perfect_match(clr, lrp, fp, current_fh))
-+ recall_cookie = layoutrecall_done(clr);
-+ else if (layouts_found &&
-+ recall_return_partial_match(clr, lrp, fp, current_fh))
-+ clr->clr_time = CURRENT_TIME;
++ mds_addr = (struct sockaddr *)&clp->cl_addr;
++ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) {
++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
++ printk(KERN_INFO
++ "ip:port %x:%hu is not a pNFS Data Server\n",
++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
++ err = -ENODEV;
++ } else {
++ atomic_inc(&clp->cl_count);
++ ds->ds_clp = clp;
++ dprintk("%s Using MDS Session for DS\n", __func__);
++ }
++ goto out;
+ }
-+ spin_unlock(&layout_lock);
-+
-+out_put_file:
-+ if (fp)
-+ put_nfs4_file(fp);
-+out:
-+ nfs4_unlock_state();
-+
-+ /* call exported filesystem layout_return (ignore return-code) */
-+ fs_layout_return(sb, ino, lrp, 0, recall_cookie);
-+
-+ dprintk("pNFS %s: exit status %d \n", __func__, status);
-+ return status;
-+}
+
-+/*
-+ * PNFS Metadata server export operations callback for get_state
-+ *
-+ * called by the cluster fs when it receives a get_state() from a data
-+ * server.
-+ * returns status, or pnfs_get_state* with pnfs_get_state->status set.
-+ *
-+ */
-+int
-+nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg)
-+{
-+ struct nfs4_stateid *stp;
-+ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */
-+ int status = -EINVAL;
-+ struct inode *ino;
-+ struct nfs4_delegation *dl;
-+ stateid_t *stid = (stateid_t *)&arg->stid;
++ /* Temporary server for nfs4_set_client */
++ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
++ if (!tmp)
++ goto out;
+
-+ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__,
-+ STATEID_VAL(stid), arg->ino);
++ /*
++ * Set a retrans, timeout interval, and authflavor equal to the MDS
++ * values. Use the MDS nfs_client cl_ipaddr field so as to use the
++ * same co_ownerid as the MDS.
++ */
++ err = nfs4_set_client(tmp,
++ mds_srv->nfs_client->cl_hostname,
++ (struct sockaddr *)&sin,
++ sizeof(struct sockaddr),
++ mds_srv->nfs_client->cl_ipaddr,
++ mds_clnt->cl_auth->au_flavor,
++ IPPROTO_TCP,
++ mds_clnt->cl_xprt->timeout,
++ 1 /* minorversion */);
++ if (err < 0)
++ goto out_free;
+
-+ nfs4_lock_state();
-+ stp = find_stateid(stid, flags);
-+ if (!stp) {
-+ ino = iget_locked(sb, arg->ino);
-+ if (!ino)
-+ goto out;
++ clp = tmp->nfs_client;
+
-+ if (ino->i_state & I_NEW) {
-+ iget_failed(ino);
-+ goto out;
-+ }
++ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */
++ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp);
++ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS;
+
-+ dl = find_delegation_stateid(ino, stid);
-+ if (dl)
-+ status = 0;
++ err = nfs4_recover_expired_lease(clp);
++ if (!err)
++ err = nfs4_check_client_ready(clp);
++ if (err)
++ goto out_put;
+
-+ iput(ino);
-+ } else {
-+ /* XXX ANDROS: marc removed nfs4_check_fh - how come? */
++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
++ printk(KERN_INFO "ip:port %x:%hu is not a pNFS Data Server\n",
++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
++ err = -ENODEV;
++ goto out_put;
++ }
++ /*
++ * Set DS lease equal to the MDS lease, renewal is scheduled in
++ * create_session
++ */
++ spin_lock(&mds_srv->nfs_client->cl_lock);
++ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
++ spin_unlock(&mds_srv->nfs_client->cl_lock);
++ clp->cl_last_renewal = jiffies;
+
-+ /* arg->devid is the Data server id, set by the cluster fs */
-+ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid);
-+ if (status)
-+ goto out;
++ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
++ ds->ds_clp = clp;
+
-+ arg->access = stp->st_access_bmap;
-+ *(clientid_t *)&arg->clid =
-+ stp->st_stateowner->so_client->cl_clientid;
-+ }
++ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__,
++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
++ clp->cl_rpcclient);
++out_free:
++ kfree(tmp);
+out:
-+ nfs4_unlock_state();
-+ return status;
++ dprintk("%s Returns %d\n", __func__, err);
++ return err;
++out_put:
++ nfs_put_client(clp);
++ goto out_free;
+}
+
-+static int
-+cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile,
-+ stateid_t *lsid)
++static void
++destroy_ds(struct nfs4_pnfs_ds *ds)
+{
-+ int found = 0;
-+ struct nfs4_layout *lp;
-+ struct nfs4_layout_state *ls;
-+
-+ spin_lock(&layout_lock);
-+ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) {
-+ if (lp->lo_file != lrfile)
-+ continue;
-+
-+ ls = find_get_layout_state(clp, lrfile);
-+ if (!ls) {
-+ /* This shouldn't happen as the file should have a
-+ * layout stateid if it has a layout.
-+ */
-+ printk(KERN_ERR "%s: file %p has no layout stateid\n",
-+ __func__, lrfile);
-+ WARN_ON(1);
-+ break;
-+ }
-+ update_stateid(&ls->ls_stateid);
-+ memcpy(lsid, &ls->ls_stateid, sizeof(stateid_t));
-+ put_layout_state_locked(ls);
-+ found = 1;
-+ break;
-+ }
-+ spin_unlock(&layout_lock);
++ dprintk("--> %s\n", __func__);
++ ifdebug(FACILITY)
++ print_ds(ds);
+
-+ return found;
++ if (ds->ds_clp)
++ nfs_put_client(ds->ds_clp);
++ kfree(ds);
+}
+
-+static int
-+cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid)
++static void
++nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
-+ int found = 0;
-+ struct nfs4_layout *lp;
++ struct nfs4_pnfs_ds *ds;
++ int i;
+
-+ /* note: minor version unused */
-+ spin_lock(&layout_lock);
-+ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt)
-+ if (lp->lo_file->fi_fsid.major == fsid->major) {
-+ found = 1;
-+ break;
++ print_deviceid(&dsaddr->deviceid.de_id);
++
++ for (i = 0; i < dsaddr->ds_num; i++) {
++ ds = dsaddr->ds_list[i];
++ if (ds != NULL) {
++ if (atomic_dec_and_lock(&ds->ds_count,
++ &nfs4_ds_cache_lock)) {
++ list_del_init(&ds->ds_node);
++ spin_unlock(&nfs4_ds_cache_lock);
++ destroy_ds(ds);
++ }
+ }
-+ spin_unlock(&layout_lock);
-+ return found;
++ }
++ kfree(dsaddr->stripe_indices);
++ kfree(dsaddr);
+}
+
-+static int
-+cl_has_any_layout(struct nfs4_client *clp)
++void
++nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
+{
-+ return !list_empty(&clp->cl_layouts);
++ struct nfs4_file_layout_dsaddr *dsaddr =
++ container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
++
++ nfs4_fl_free_deviceid(dsaddr);
+}
+
-+static int
-+cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl,
-+ struct nfs4_file *lrfile, stateid_t *lsid)
++static struct nfs4_pnfs_ds *
++nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
+{
-+ switch (cbl->cbl_recall_type) {
-+ case RETURN_FILE:
-+ return cl_has_file_layout(clp, lrfile, lsid);
-+ case RETURN_FSID:
-+ return cl_has_fsid_layout(clp, &cbl->cbl_fsid);
-+ default:
-+ return cl_has_any_layout(clp);
++ struct nfs4_pnfs_ds *tmp_ds, *ds;
++
++ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
++ if (!ds)
++ goto out;
++
++ spin_lock(&nfs4_ds_cache_lock);
++ tmp_ds = _data_server_lookup_locked(ip_addr, port);
++ if (tmp_ds == NULL) {
++ ds->ds_ip_addr = ip_addr;
++ ds->ds_port = port;
++ atomic_set(&ds->ds_count, 1);
++ INIT_LIST_HEAD(&ds->ds_node);
++ ds->ds_clp = NULL;
++ list_add(&ds->ds_node, &nfs4_data_server_cache);
++ dprintk("%s add new data server ip 0x%x\n", __func__,
++ ds->ds_ip_addr);
++ } else {
++ kfree(ds);
++ atomic_inc(&tmp_ds->ds_count);
++ dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
++ __func__, tmp_ds->ds_ip_addr,
++ atomic_read(&tmp_ds->ds_count));
++ ds = tmp_ds;
+ }
++ spin_unlock(&nfs4_ds_cache_lock);
++out:
++ return ds;
+}
+
+/*
-+ * Called without the layout_lock.
++ * Currently only support ipv4, and one multi-path address.
+ */
-+void
-+nomatching_layout(struct nfs4_layoutrecall *clr)
++static struct nfs4_pnfs_ds *
++decode_and_add_ds(__be32 **pp, struct inode *inode)
+{
-+ struct nfsd4_pnfs_layoutreturn lr = {
-+ .args.lr_return_type = clr->cb.cbl_recall_type,
-+ .args.lr_seg = clr->cb.cbl_seg,
-+ };
-+ struct inode *inode;
-+ void *recall_cookie;
++ struct nfs4_pnfs_ds *ds = NULL;
++ char *buf;
++ const char *ipend, *pstr;
++ u32 ip_addr, port;
++ int nlen, rlen, i;
++ int tmp[2];
++ __be32 *r_netid, *r_addr, *p = *pp;
+
-+ if (clr->clr_file) {
-+ inode = igrab(clr->clr_file->fi_inode);
-+ if (WARN_ON(!inode))
-+ return;
-+ } else {
-+ inode = NULL;
++ /* r_netid */
++ nlen = be32_to_cpup(p++);
++ r_netid = p;
++ p += XDR_QUADLEN(nlen);
++
++ /* r_addr */
++ rlen = be32_to_cpup(p++);
++ r_addr = p;
++ p += XDR_QUADLEN(rlen);
++ *pp = p;
++
++ /* Check that netid is "tcp" */
++ if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) {
++ dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
++ goto out_err;
+ }
+
-+ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__,
-+ clr->clr_client, clr->clr_file);
++ /* ipv6 length plus port is legal */
++ if (rlen > INET6_ADDRSTRLEN + 8) {
++ dprintk("%s Invalid address, length %d\n", __func__,
++ rlen);
++ goto out_err;
++ }
++ buf = kmalloc(rlen + 1, GFP_KERNEL);
++ buf[rlen] = '\0';
++ memcpy(buf, r_addr, rlen);
+
-+ if (clr->cb.cbl_recall_type == RETURN_FILE)
-+ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr);
-+ else
-+ pnfs_return_client_layouts(clr->clr_client, &lr,
-+ clr->cb.cbl_fsid.major);
++ /* replace the port dots with dashes for the in4_pton() delimiter */
++ for (i = 0; i < 2; i++) {
++ char *res = strrchr(buf, '.');
++ *res = '-';
++ }
+
-+ spin_lock(&layout_lock);
-+ recall_cookie = layoutrecall_done(clr);
-+ spin_unlock(&layout_lock);
++ /* Currently only support ipv4 address */
++ if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
++ dprintk("%s: Only ipv4 addresses supported\n", __func__);
++ goto out_free;
++ }
++
++ /* port */
++ pstr = ipend;
++ sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
++ port = htons((tmp[0] << 8) | (tmp[1]));
+
-+ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN,
-+ recall_cookie);
-+ iput(inode);
++ ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
++ dprintk("%s Decoded address and port %s\n", __func__, buf);
++out_free:
++ kfree(buf);
++out_err:
++ return ds;
+}
+
-+void pnfs_expire_client(struct nfs4_client *clp)
++/* Decode opaque device data and return the result */
++static struct nfs4_file_layout_dsaddr*
++decode_device(struct inode *ino, struct pnfs_device *pdev)
+{
-+ for (;;) {
-+ struct nfs4_layoutrecall *lrp = NULL;
++ int i, dummy;
++ u32 cnt, num;
++ u8 *indexp;
++ __be32 *p = (__be32 *)pdev->area, *indicesp;
++ struct nfs4_file_layout_dsaddr *dsaddr;
+
-+ spin_lock(&layout_lock);
-+ if (!list_empty(&clp->cl_layoutrecalls)) {
-+ lrp = list_entry(clp->cl_layoutrecalls.next,
-+ struct nfs4_layoutrecall, clr_perclnt);
-+ get_layoutrecall(lrp);
-+ }
-+ spin_unlock(&layout_lock);
-+ if (!lrp)
-+ break;
++ /* Get the stripe count (number of stripe index) */
++ cnt = be32_to_cpup(p++);
++ dprintk("%s stripe count %d\n", __func__, cnt);
++ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
++ printk(KERN_WARNING "%s: stripe count %d greater than "
++ "supported maximum %d\n", __func__,
++ cnt, NFS4_PNFS_MAX_STRIPE_CNT);
++ goto out_err;
++ }
+
-+ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file);
-+ BUG_ON(lrp->clr_client != clp);
-+ nomatching_layout(lrp);
-+ put_layoutrecall(lrp);
++ /* Check the multipath list count */
++ indicesp = p;
++ p += XDR_QUADLEN(cnt << 2);
++ num = be32_to_cpup(p++);
++ dprintk("%s ds_num %u\n", __func__, num);
++ if (num > NFS4_PNFS_MAX_MULTI_CNT) {
++ printk(KERN_WARNING "%s: multipath count %d greater than "
++ "supported maximum %d\n", __func__,
++ num, NFS4_PNFS_MAX_MULTI_CNT);
++ goto out_err;
+ }
++ dsaddr = kzalloc(sizeof(*dsaddr) +
++ (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
++ GFP_KERNEL);
++ if (!dsaddr)
++ goto out_err;
+
-+ for (;;) {
-+ struct nfs4_layout *lp = NULL;
-+ struct inode *inode = NULL;
-+ struct nfsd4_pnfs_layoutreturn lr;
-+ bool empty = false;
++ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
++ if (!dsaddr->stripe_indices)
++ goto out_err_free;
+
-+ spin_lock(&layout_lock);
-+ if (!list_empty(&clp->cl_layouts)) {
-+ lp = list_entry(clp->cl_layouts.next,
-+ struct nfs4_layout, lo_perclnt);
-+ inode = igrab(lp->lo_file->fi_inode);
-+ memset(&lr, 0, sizeof(lr));
-+ lr.args.lr_return_type = RETURN_FILE;
-+ lr.args.lr_seg = lp->lo_seg;
-+ empty = list_empty(&lp->lo_file->fi_layouts);
-+ BUG_ON(lp->lo_client != clp);
-+ dequeue_layout(lp);
-+ destroy_layout(lp); /* do not access lp after this */
-+ }
-+ spin_unlock(&layout_lock);
-+ if (!lp)
-+ break;
++ dsaddr->stripe_count = cnt;
++ dsaddr->ds_num = num;
+
-+ if (WARN_ON(!inode))
-+ break;
++ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+
-+ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino,
-+ lp, clp);
++ /* Go back an read stripe indices */
++ p = indicesp;
++ indexp = &dsaddr->stripe_indices[0];
++ for (i = 0; i < dsaddr->stripe_count; i++) {
++ *indexp = be32_to_cpup(p++);
++ if (*indexp >= num)
++ goto out_err_free;
++ indexp++;
++ }
++ /* Skip already read multipath list count */
++ p++;
+
-+ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE,
-+ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL);
-+ iput(inode);
++ for (i = 0; i < dsaddr->ds_num; i++) {
++ int j;
++
++ dummy = be32_to_cpup(p++); /* multipath count */
++ if (dummy > 1) {
++ printk(KERN_WARNING
++ "%s: Multipath count %d not supported, "
++ "skipping all greater than 1\n", __func__,
++ dummy);
++ }
++ for (j = 0; j < dummy; j++) {
++ if (j == 0) {
++ dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
++ if (dsaddr->ds_list[i] == NULL)
++ goto out_err_free;
++ } else {
++ u32 len;
++ /* skip extra multipath */
++ len = be32_to_cpup(p++);
++ p += XDR_QUADLEN(len);
++ len = be32_to_cpup(p++);
++ p += XDR_QUADLEN(len);
++ continue;
++ }
++ }
+ }
-+}
++ return dsaddr;
+
-+struct create_recall_list_arg {
-+ struct nfsd4_pnfs_cb_layout *cbl;
-+ struct nfs4_file *lrfile;
-+ struct list_head *todolist;
-+ unsigned todo_count;
-+};
++out_err_free:
++ nfs4_fl_free_deviceid(dsaddr);
++out_err:
++ dprintk("%s ERROR: returning NULL\n", __func__);
++ return NULL;
++}
+
+/*
-+ * look for matching layout for the given client
-+ * and add a pending layout recall to the todo list
-+ * if found any.
-+ * returns:
-+ * 0 if layouts found or negative error.
++ * Decode the opaque device specified in 'dev'
++ * and add it to the list of available devices.
++ * If the deviceid is already cached, nfs4_add_deviceid will return
++ * a pointer to the cached struct and throw away the new.
+ */
-+static int
-+lo_recall_per_client(struct nfs4_client *clp, void *p)
++static struct nfs4_file_layout_dsaddr*
++decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
+{
-+ stateid_t lsid;
-+ struct nfs4_layoutrecall *pending;
-+ struct create_recall_list_arg *arg = p;
++ struct nfs4_file_layout_dsaddr *dsaddr;
++ struct pnfs_deviceid_node *d;
+
-+ memset(&lsid, 0, sizeof(lsid));
-+ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid))
-+ return 0;
++ dsaddr = decode_device(inode, dev);
++ if (!dsaddr) {
++ printk(KERN_WARNING "%s: Could not decode or add device\n",
++ __func__);
++ return NULL;
++ }
+
-+ /* Matching put done by layoutreturn */
-+ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile);
-+ /* out of memory, drain todo queue */
-+ if (!pending)
-+ return -ENOMEM;
++ d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
++ &dsaddr->deviceid);
+
-+ *(stateid_t *)&pending->cb.cbl_sid = lsid;
-+ list_add(&pending->clr_perclnt, arg->todolist);
-+ arg->todo_count++;
-+ return 0;
++ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+}
+
-+/* Create a layoutrecall structure for each client based on the
-+ * original structure. */
-+int
-+create_layout_recall_list(struct list_head *todolist, unsigned *todo_len,
-+ struct nfsd4_pnfs_cb_layout *cbl,
-+ struct nfs4_file *lrfile)
++/*
++ * Retrieve the information for dev_id, add it to the list
++ * of available devices, and return it.
++ */
++struct nfs4_file_layout_dsaddr *
++get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
+{
-+ struct nfs4_client *clp;
-+ struct create_recall_list_arg arg = {
-+ .cbl = cbl,
-+ .lrfile = lrfile,
-+ .todolist = todolist,
-+ };
-+ int status = 0;
++ struct pnfs_device *pdev = NULL;
++ u32 max_resp_sz;
++ int max_pages;
++ struct page **pages = NULL;
++ struct nfs4_file_layout_dsaddr *dsaddr = NULL;
++ int rc, i;
++ struct nfs_server *server = NFS_SERVER(inode);
+
-+ dprintk("%s: -->\n", __func__);
++ /*
++ * Use the session max response size as the basis for setting
++ * GETDEVICEINFO's maxcount
++ */
++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
++ max_pages = max_resp_sz >> PAGE_SHIFT;
++ dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
++ __func__, inode, max_resp_sz, max_pages);
+
-+ /* If client given by fs, just do single client */
-+ if (cbl->cbl_seg.clientid) {
-+ clp = find_confirmed_client(
-+ (clientid_t *)&cbl->cbl_seg.clientid);
-+ if (!clp) {
-+ status = -ENOENT;
-+ dprintk("%s: clientid %llx not found\n", __func__,
-+ (unsigned long long)cbl->cbl_seg.clientid);
-+ goto out;
-+ }
++ pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
++ if (pdev == NULL)
++ return NULL;
++
++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
++ if (pages == NULL) {
++ kfree(pdev);
++ return NULL;
++ }
++ for (i = 0; i < max_pages; i++) {
++ pages[i] = alloc_page(GFP_KERNEL);
++ if (!pages[i])
++ goto out_free;
++ }
++
++ /* set pdev->area */
++ pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
++ if (!pdev->area)
++ goto out_free;
++
++ memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
++ pdev->layout_type = LAYOUT_NFSV4_1_FILES;
++ pdev->pages = pages;
++ pdev->pgbase = 0;
++ pdev->pglen = PAGE_SIZE * max_pages;
++ pdev->mincount = 0;
++
++ rc = nfs4_proc_getdeviceinfo(server, pdev);
++ dprintk("%s getdevice info returns %d\n", __func__, rc);
++ if (rc)
++ goto out_free;
+
-+ status = lo_recall_per_client(clp, &arg);
-+ } else {
-+ /* Check all clients for layout matches */
-+ status = filter_confirmed_clients(lo_recall_per_client, &arg);
-+ }
++ /*
++ * Found new device, need to decode it and then add it to the
++ * list of known devices for this mountpoint.
++ */
++ dsaddr = decode_and_add_device(inode, pdev);
++out_free:
++ if (pdev->area != NULL)
++ vunmap(pdev->area);
++ for (i = 0; i < max_pages; i++)
++ __free_page(pages[i]);
++ kfree(pages);
++ kfree(pdev);
++ dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
++ return dsaddr;
++}
+
-+out:
-+ *todo_len = arg.todo_count;
-+ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status);
-+ return status;
++struct nfs4_file_layout_dsaddr *
++nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
++{
++ struct pnfs_deviceid_node *d;
++
++ d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
++ return (d == NULL) ? NULL :
++ container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+}
+
+/*
-+ * Recall layouts asynchronously
-+ * Called with state lock.
++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
++ * Then: ((res + fsi) % dsaddr->stripe_count)
+ */
-+static int
-+spawn_layout_recall(struct super_block *sb, struct list_head *todolist,
-+ unsigned todo_len)
++static u32
++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
+{
-+ struct nfs4_layoutrecall *pending;
-+ struct nfs4_layoutrecall *parent = NULL;
-+ int status = 0;
-+
-+ dprintk("%s: -->\n", __func__);
-+
-+ if (todo_len > 1) {
-+ pending = list_entry(todolist->next, struct nfs4_layoutrecall,
-+ clr_perclnt);
-+
-+ parent = alloc_init_layoutrecall(&pending->cb, NULL,
-+ pending->clr_file);
-+ if (unlikely(!parent)) {
-+ /* We want forward progress. If parent cannot be
-+ * allocated take the first one as parent but don't
-+ * execute it. Caller must check for -EAGAIN, if so
-+ * When the partial recalls return,
-+ * nfsd_layout_recall_cb should be called again.
-+ */
-+ list_del_init(&pending->clr_perclnt);
-+ if (todo_len > 2) {
-+ parent = pending;
-+ } else {
-+ parent = NULL;
-+ put_layoutrecall(pending);
-+ }
-+ --todo_len;
-+ status = -ENOMEM;
-+ }
-+ }
-+
-+ while (!list_empty(todolist)) {
-+ pending = list_entry(todolist->next, struct nfs4_layoutrecall,
-+ clr_perclnt);
-+ list_del_init(&pending->clr_perclnt);
-+ dprintk("%s: clp %p cb_client %p fp %p\n", __func__,
-+ pending->clr_client,
-+ pending->clr_client->cl_cb_client,
-+ pending->clr_file);
-+ if (unlikely(!pending->clr_client->cl_cb_client)) {
-+ printk(KERN_INFO
-+ "%s: clientid %08x/%08x has no callback path\n",
-+ __func__,
-+ pending->clr_client->cl_clientid.cl_boot,
-+ pending->clr_client->cl_clientid.cl_id);
-+ put_layoutrecall(pending);
-+ continue;
-+ }
++ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
++ u64 tmp;
+
-+ pending->clr_time = CURRENT_TIME;
-+ pending->clr_sb = sb;
-+ if (parent) {
-+ /* If we created a parent its initial ref count is 1.
-+ * We will need to de-ref it eventually. So we just
-+ * don't increment on behalf of the last one.
-+ */
-+ if (todo_len != 1)
-+ get_layoutrecall(parent);
-+ }
-+ pending->parent = parent;
-+ get_layoutrecall(pending);
-+ /* Add to list so corresponding layoutreturn can find req */
-+ list_add(&pending->clr_perclnt,
-+ &pending->clr_client->cl_layoutrecalls);
++ tmp = offset - flseg->pattern_offset;
++ do_div(tmp, flseg->stripe_unit);
++ tmp += flseg->first_stripe_index;
++ return do_div(tmp, flseg->dsaddr->stripe_count);
++}
+
-+ nfsd4_cb_layout(pending);
-+ --todo_len;
-+ }
++u32
++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset)
++{
++ u32 j;
+
-+ return status;
++ j = _nfs4_fl_calc_j_index(lseg, offset);
++ return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
+}
+
-+/*
-+ * Spawn a thread to perform a recall layout
-+ *
-+ */
-+int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode,
-+ struct nfsd4_pnfs_cb_layout *cbl)
++struct nfs_fh *
++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset)
+{
-+ int status;
-+ struct nfs4_file *lrfile = NULL;
-+ struct list_head todolist;
-+ unsigned todo_len = 0;
++ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
++ u32 i;
+
-+ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl);
-+ BUG_ON(!cbl);
-+ BUG_ON(cbl->cbl_recall_type != RETURN_FILE &&
-+ cbl->cbl_recall_type != RETURN_FSID &&
-+ cbl->cbl_recall_type != RETURN_ALL);
-+ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode);
-+ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ &&
-+ cbl->cbl_seg.iomode != IOMODE_RW &&
-+ cbl->cbl_seg.iomode != IOMODE_ANY);
++ if (flseg->stripe_type == STRIPE_SPARSE) {
++ if (flseg->num_fh == 1)
++ i = 0;
++ else if (flseg->num_fh == 0)
++ return NULL;
++ else
++ i = nfs4_fl_calc_ds_index(lseg, offset);
++ } else
++ i = _nfs4_fl_calc_j_index(lseg, offset);
++ return flseg->fh_array[i];
++}
+
-+ if (nfsd_serv == NULL) {
-+ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n");
-+ return -ENOENT;
-+ }
++struct nfs4_pnfs_ds *
++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
++{
++ struct nfs4_file_layout_dsaddr *dsaddr;
+
-+ nfs4_lock_state();
-+ status = -ENOENT;
-+ if (inode) {
-+ lrfile = find_file(inode);
-+ if (!lrfile) {
-+ dprintk("NFSD nfsd_layout_recall_cb: "
-+ "nfs4_file not found\n");
-+ goto err;
-+ }
-+ if (cbl->cbl_recall_type == RETURN_FSID)
-+ cbl->cbl_fsid = lrfile->fi_fsid;
++ dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
++ if (dsaddr->ds_list[ds_idx] == NULL) {
++ printk(KERN_ERR "%s: No data server for device id!\n",
++ __func__);
++ return NULL;
+ }
+
-+ INIT_LIST_HEAD(&todolist);
-+
-+ /* If no cookie provided by FS, return a default one */
-+ if (!cbl->cbl_cookie)
-+ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
++ if (!dsaddr->ds_list[ds_idx]->ds_clp) {
++ int err;
+
-+ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile);
-+ if (list_empty(&todolist)) {
-+ status = -ENOENT;
-+ } else {
-+ /* process todolist even if create_layout_recall_list
-+ * returned an error */
-+ int status2 = spawn_layout_recall(sb, &todolist, todo_len);
-+ if (status2)
-+ status = status2;
++ err = nfs4_pnfs_ds_create(NFS_SERVER(lseg->layout->inode),
++ dsaddr->ds_list[ds_idx]);
++ if (err) {
++ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n",
++ __func__, err);
++ return NULL;
++ }
+ }
-+
-+err:
-+ nfs4_unlock_state();
-+ if (lrfile)
-+ put_nfs4_file(lrfile);
-+ return (todo_len && status) ? -EAGAIN : status;
++ return dsaddr->ds_list[ds_idx];
+}
+diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
+index 089da5b..cc642dd 100644
+--- a/fs/nfs/nfs4proc.c
++++ b/fs/nfs/nfs4proc.c
+@@ -55,6 +55,7 @@
+ #include "internal.h"
+ #include "iostat.h"
+ #include "callback.h"
++#include "pnfs.h"
+
+ #define NFSDBG_FACILITY NFSDBG_PROC
+
+@@ -67,7 +68,7 @@ struct nfs4_opendata;
+ static int _nfs4_proc_open(struct nfs4_opendata *data);
+ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
+ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
+-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *);
+ static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+@@ -125,11 +126,12 @@ const u32 nfs4_pathconf_bitmap[2] = {
+ 0
+ };
+
+-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
+ | FATTR4_WORD0_MAXREAD
+ | FATTR4_WORD0_MAXWRITE
+ | FATTR4_WORD0_LEASE_TIME,
+- 0
++ FATTR4_WORD1_FS_LAYOUT_TYPES,
++ FATTR4_WORD2_LAYOUT_BLKSIZE
+ };
+
+ const u32 nfs4_fs_locations_bitmap[2] = {
+@@ -562,6 +564,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
+ }
+
+ int nfs4_setup_sequence(const struct nfs_server *server,
++ struct nfs4_session *ds_session,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ int cache_reply,
+@@ -570,6 +573,8 @@ int nfs4_setup_sequence(const struct nfs_server *server,
+ struct nfs4_session *session = nfs4_get_session(server);
+ int ret = 0;
+
++ if (ds_session)
++ session = ds_session;
+ if (session == NULL) {
+ args->sa_session = NULL;
+ res->sr_session = NULL;
+@@ -599,7 +604,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
+
+ dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
+
+- if (nfs4_setup_sequence(data->seq_server, data->seq_args,
++ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args,
+ data->seq_res, data->cache_reply, task))
+ return;
+ rpc_call_start(task);
+@@ -1378,7 +1383,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
+ nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
+ }
+ data->timestamp = jiffies;
+- if (nfs4_setup_sequence(data->o_arg.server,
++ if (nfs4_setup_sequence(data->o_arg.server, NULL,
+ &data->o_arg.seq_args,
+ &data->o_res.seq_res, 1, task))
+ return;
+@@ -1553,9 +1558,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
+ return 0;
+ }
+
+-static int nfs4_recover_expired_lease(struct nfs_server *server)
++int nfs4_recover_expired_lease(struct nfs_client *clp)
+ {
+- struct nfs_client *clp = server->nfs_client;
+ unsigned int loop;
+ int ret;
+
+@@ -1571,6 +1575,7 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
+ }
+ return ret;
+ }
++EXPORT_SYMBOL(nfs4_recover_expired_lease);
+
+ /*
+ * OPEN_EXPIRED:
+@@ -1660,7 +1665,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
+ dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
+ goto out_err;
+ }
+- status = nfs4_recover_expired_lease(server);
++ status = nfs4_recover_expired_lease(server->nfs_client);
+ if (status != 0)
+ goto err_put_state_owner;
+ if (path->dentry->d_inode != NULL)
+@@ -1871,7 +1876,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
+ if (calldata->arg.fmode == 0)
+ break;
+ default:
+- if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
++ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
+ rpc_restart_call_prepare(task);
+ }
+ nfs_release_seqid(calldata->arg.seqid);
+@@ -1916,7 +1921,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
+
+ nfs_fattr_init(calldata->res.fattr);
+ calldata->timestamp = jiffies;
+- if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
++ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL,
+ &calldata->arg.seq_args, &calldata->res.seq_res,
+ 1, task))
+ return;
+@@ -1979,8 +1984,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
+ path_get(path);
+ calldata->path = *path;
+
+- msg.rpc_argp = &calldata->arg,
+- msg.rpc_resp = &calldata->res,
++ msg.rpc_argp = &calldata->arg;
++ msg.rpc_resp = &calldata->res;
+ task_setup_data.callback_data = calldata;
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+@@ -2337,6 +2342,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
+ struct nfs4_state *state = NULL;
+ int status;
+
++ if (pnfs_ld_layoutret_on_setattr(inode))
++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true);
+
-+struct create_device_notify_list_arg {
-+ struct list_head *todolist;
-+ struct nfsd4_pnfs_cb_dev_list *ndl;
-+};
-+
-+static int
-+create_device_notify_per_cl(struct nfs4_client *clp, void *p)
-+{
-+ struct nfs4_notify_device *cbnd;
-+ struct create_device_notify_list_arg *arg = p;
-+
-+ if (atomic_read(&clp->cl_deviceref) <= 0)
+ nfs_fattr_init(fattr);
+
+ /* Search for an existing open(O_WRITE) file */
+@@ -2664,7 +2672,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
+
+ if (!nfs4_sequence_done(task, &res->seq_res))
+ return 0;
+- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN)
+ return 0;
+ update_changeattr(dir, &res->cinfo);
+ nfs_post_op_update_inode(dir, res->dir_attr);
+@@ -3105,19 +3113,31 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+ {
+ struct nfs_server *server = NFS_SERVER(data->inode);
++ struct nfs_client *client = server->nfs_client;
+
+ dprintk("--> %s\n", __func__);
+
++#ifdef CONFIG_NFS_V4_1
++ if (data->pdata.pnfsflags & PNFS_NO_RPC)
+ return 0;
+
-+ cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL);
-+ if (!cbnd)
-+ return -ENOMEM;
-+
-+ cbnd->nd_list = arg->ndl;
-+ cbnd->nd_client = clp;
-+ list_add(&cbnd->nd_perclnt, arg->todolist);
-+ return 0;
-+}
++ /* Is this a DS session */
++ if (data->fldata.ds_nfs_client) {
++ dprintk("%s DS read\n", __func__);
++ client = data->fldata.ds_nfs_client;
++ }
++#endif /* CONFIG_NFS_V4_1 */
+
-+/* Create a list of clients to send device notifications. */
-+int
-+create_device_notify_list(struct list_head *todolist,
-+ struct nfsd4_pnfs_cb_dev_list *ndl)
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return -EAGAIN;
+
+- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
+- nfs_restart_rpc(task, server->nfs_client);
++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
++ nfs_restart_rpc(task, client);
+ return -EAGAIN;
+ }
+
+ nfs_invalidate_atime(data->inode);
+- if (task->tk_status > 0)
++ if (task->tk_status > 0 && client == server->nfs_client)
+ renew_lease(server, data->timestamp);
+ return 0;
+ }
+@@ -3128,20 +3148,56 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+ }
+
++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data)
+{
-+ int status;
-+ struct create_device_notify_list_arg arg = {
-+ .todolist = todolist,
-+ .ndl = ndl,
-+ };
-+
-+ nfs4_lock_state();
-+ status = filter_confirmed_clients(create_device_notify_per_cl, &arg);
-+ nfs4_unlock_state();
-+
-+ return status;
++#ifdef CONFIG_NFS_V4_1
++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
++ pnfs_need_layoutcommit(nfsi, data->args.context);
++#endif /* CONFIG_NFS_V4_1 */
+}
+
-+/*
-+ * For each client that a device, send a device notification.
-+ * XXX: Need to track which clients have which devices.
-+ */
-+int nfsd_device_notify_cb(struct super_block *sb,
-+ struct nfsd4_pnfs_cb_dev_list *ndl)
-+{
-+ struct nfs4_notify_device *cbnd;
-+ unsigned int notify_num = 0;
-+ int status2, status = 0;
-+ struct list_head todolist;
-+
-+ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list);
-+
-+ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len);
-+
-+ if (nfsd_serv == NULL)
-+ return -ENOENT;
-+
-+ INIT_LIST_HEAD(&todolist);
-+
-+ status = create_device_notify_list(&todolist, ndl);
+ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+ {
+ struct inode *inode = data->inode;
+-
++ struct nfs_server *server = NFS_SERVER(inode);
++ struct nfs_client *client = server->nfs_client;
+
-+ while (!list_empty(&todolist)) {
-+ cbnd = list_entry(todolist.next, struct nfs4_notify_device,
-+ nd_perclnt);
-+ list_del_init(&cbnd->nd_perclnt);
-+ status2 = nfsd4_cb_notify_device(cbnd);
-+ pnfs_clear_device_notify(cbnd->nd_client);
-+ if (status2) {
-+ kfree(cbnd);
-+ status = status2;
-+ }
-+ notify_num++;
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return -EAGAIN;
+
+- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
+- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
++#ifdef CONFIG_NFS_V4_1
++ /* restore original count after retry? */
++ if (data->pdata.orig_count) {
++ dprintk("%s: restoring original count %u\n", __func__,
++ data->pdata.orig_count);
++ data->args.count = data->pdata.orig_count;
+ }
+
-+ dprintk("NFSD %s: status %d clients %u\n",
-+ __func__, status, notify_num);
-+ return status;
-+}
-diff -up linux-2.6.35.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4pnfsdlm.c
---- linux-2.6.35.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-09-30 12:25:08.508288000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-09-30 12:25:08.510284000 -0400
-@@ -0,0 +1,461 @@
-+/******************************************************************************
-+ *
-+ * (c) 2007 Network Appliance, Inc. All Rights Reserved.
-+ * (c) 2009 NetApp. All Rights Reserved.
-+ *
-+ * NetApp provides this source code under the GPL v2 License.
-+ * The GPL v2 license is available at
-+ * http://opensource.org/licenses/gpl-license.php.
-+ *
-+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
-+ ******************************************************************************/
++ if (data->pdata.pnfsflags & PNFS_NO_RPC)
++ return 0;
+
-+#include <linux/nfs4.h>
-+#include <linux/nfsd/const.h>
-+#include <linux/nfsd/debug.h>
-+#include <linux/nfsd/nfs4pnfsdlm.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
-+#include <linux/sunrpc/clnt.h>
++ /* Is this a DS session */
++ if (data->fldata.ds_nfs_client) {
++ dprintk("%s DS write\n", __func__);
++ client = data->fldata.ds_nfs_client;
++ }
++#endif /* CONFIG_NFS_V4_1 */
+
-+#include "nfsfh.h"
-+#include "nfsd.h"
++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
++ nfs_restart_rpc(task, client);
+ return -EAGAIN;
+ }
+
-+#define NFSDDBG_FACILITY NFSDDBG_PROC
++ /*
++ * MDS write: renew lease
++ * DS write: update lastbyte written, mark for layout commit
++ */
+ if (task->tk_status >= 0) {
+- renew_lease(NFS_SERVER(inode), data->timestamp);
+- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
++ if (client == server->nfs_client) {
++ renew_lease(server, data->timestamp);
++ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
++ } else
++ pnfs4_update_write_done(NFS_I(inode), data);
+ }
+ return 0;
+ }
+@@ -3154,21 +3210,42 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
+ data->res.server = server;
+ data->timestamp = jiffies;
+
++#ifdef CONFIG_NFS_V4_1
++ /* writes to DS use pnfs vector */
++ if (data->fldata.ds_nfs_client) {
++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE];
++ return;
++ }
++#endif /* CONFIG_NFS_V4_1 */
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
+ }
+
+ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
+ {
+ struct inode *inode = data->inode;
+-
++ struct nfs_server *server = NFS_SERVER(data->inode);
++ struct nfs_client *client = server->nfs_client;
+
-+/* Just use a linked list. Do not expect more than 32 dlm_device_entries
-+ * the first implementation will just use one device per cluster file system
-+ */
++#ifdef CONFIG_NFS_V4_1
++ if (data->pdata.pnfsflags & PNFS_NO_RPC)
++ return 0;
+
-+static LIST_HEAD(dlm_device_list);
-+static DEFINE_SPINLOCK(dlm_device_list_lock);
++ /* Is this a DS session */
++ if (data->fldata.ds_nfs_client) {
++ dprintk("%s DS commit\n", __func__);
++ client = data->fldata.ds_nfs_client;
++ }
++#endif /* CONFIG_NFS_V4_1 */
+
-+struct dlm_device_entry {
-+ struct list_head dlm_dev_list;
-+ char disk_name[DISK_NAME_LEN];
-+ int num_ds;
-+ char ds_list[NFSD_DLM_DS_LIST_MAX];
-+};
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return -EAGAIN;
+
+- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
++ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) {
+ nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+ return -EAGAIN;
+ }
+- nfs_refresh_inode(inode, data->res.fattr);
++ if (client == server->nfs_client)
++ nfs_refresh_inode(inode, data->res.fattr);
+ return 0;
+ }
+
+@@ -3178,6 +3255,12 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
+
+ data->args.bitmask = server->cache_consistency_bitmask;
+ data->res.server = server;
++#if defined(CONFIG_NFS_V4_1)
++ if (data->fldata.ds_nfs_client) {
++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT];
++ return;
++ }
++#endif /* CONFIG_NFS_V4_1 */
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
+ }
+
+@@ -3475,9 +3558,10 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
+ }
+
+ static int
+-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp)
+ {
+- struct nfs_client *clp = server->nfs_client;
++ if (!clp)
++ clp = server->nfs_client;
+
+ if (task->tk_status >= 0)
+ return 0;
+@@ -3504,14 +3588,16 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+- dprintk("%s ERROR %d, Reset session\n", __func__,
+- task->tk_status);
++ dprintk("%s ERROR %d, Reset session. Exchangeid "
++ "flags 0x%x\n", __func__, task->tk_status,
++ clp->cl_exchange_flags);
+ nfs4_schedule_state_recovery(clp);
+ task->tk_status = 0;
+ return -EAGAIN;
+ #endif /* CONFIG_NFS_V4_1 */
+ case -NFS4ERR_DELAY:
+- nfs_inc_server_stats(server, NFSIOS_DELAY);
++ if (server)
++ nfs_inc_server_stats(server, NFSIOS_DELAY);
+ case -NFS4ERR_GRACE:
+ case -EKEYEXPIRED:
+ rpc_delay(task, NFS4_POLL_RETRY_MAX);
+@@ -3524,6 +3610,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
+ task->tk_status = nfs4_map_errors(task->tk_status);
+ return 0;
+ do_state_recovery:
++ if (is_ds_only_client(clp))
++ return 0;
+ rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+ nfs4_schedule_state_recovery(clp);
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+@@ -3657,8 +3745,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
+ renew_lease(data->res.server, data->timestamp);
+ break;
+ default:
+- if (nfs4_async_handle_error(task, data->res.server, NULL) ==
+- -EAGAIN) {
++ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL)
++ == -EAGAIN) {
+ nfs_restart_rpc(task, data->res.server->nfs_client);
+ return;
+ }
+@@ -3678,7 +3766,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
+
+ d_data = (struct nfs4_delegreturndata *)data;
+
+- if (nfs4_setup_sequence(d_data->res.server,
++ if (nfs4_setup_sequence(d_data->res.server, NULL,
+ &d_data->args.seq_args,
+ &d_data->res.seq_res, 1, task))
+ return;
+@@ -3913,7 +4001,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
+ case -NFS4ERR_EXPIRED:
+ break;
+ default:
+- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
++ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN)
+ nfs_restart_rpc(task,
+ calldata->server->nfs_client);
+ }
+@@ -3931,7 +4019,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
+ return;
+ }
+ calldata->timestamp = jiffies;
+- if (nfs4_setup_sequence(calldata->server,
++ if (nfs4_setup_sequence(calldata->server, NULL,
+ &calldata->arg.seq_args,
+ &calldata->res.seq_res, 1, task))
+ return;
+@@ -3973,8 +4061,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
+ return ERR_PTR(-ENOMEM);
+ }
+
+- msg.rpc_argp = &data->arg,
+- msg.rpc_resp = &data->res,
++ msg.rpc_argp = &data->arg;
++ msg.rpc_resp = &data->res;
+ task_setup_data.callback_data = data;
+ return rpc_run_task(&task_setup_data);
+ }
+@@ -4086,7 +4174,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
+ } else
+ data->arg.new_lock_owner = 0;
+ data->timestamp = jiffies;
+- if (nfs4_setup_sequence(data->server,
++ if (nfs4_setup_sequence(data->server, NULL,
+ &data->arg.seq_args,
+ &data->res.seq_res, 1, task))
+ return;
+@@ -4211,8 +4299,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
+ data->arg.reclaim = NFS_LOCK_RECLAIM;
+ task_setup_data.callback_ops = &nfs4_recover_lock_ops;
+ }
+- msg.rpc_argp = &data->arg,
+- msg.rpc_resp = &data->res,
++ msg.rpc_argp = &data->arg;
++ msg.rpc_resp = &data->res;
+ task_setup_data.callback_data = data;
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+@@ -4557,7 +4645,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
+ nfs4_verifier verifier;
+ struct nfs41_exchange_id_args args = {
+ .client = clp,
+- .flags = clp->cl_exchange_flags,
++ .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R,
+ };
+ struct nfs41_exchange_id_res res = {
+ .client = clp,
+@@ -5081,7 +5169,7 @@ int nfs4_init_session(struct nfs_server *server)
+ session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
+ session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
+
+- ret = nfs4_recover_expired_lease(server);
++ ret = nfs4_recover_expired_lease(server->nfs_client);
+ if (!ret)
+ ret = nfs4_check_client_ready(clp);
+ return ret;
+@@ -5333,6 +5421,412 @@ out:
+ dprintk("<-- %s status=%d\n", __func__, status);
+ return status;
+ }
+
-+static struct dlm_device_entry *
-+_nfsd4_find_pnfs_dlm_device(char *disk_name)
++static void
++nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
-+ struct dlm_device_entry *dlm_pdev;
++ struct nfs4_layoutget *lgp = calldata;
++ struct inode *ino = lgp->args.inode;
++ struct nfs_inode *nfsi = NFS_I(ino);
++ struct nfs_server *server = NFS_SERVER(ino);
++ struct pnfs_layout_segment *lseg;
+
-+ dprintk("--> %s disk name %s\n", __func__, disk_name);
-+ spin_lock(&dlm_device_list_lock);
-+ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) {
-+ dprintk("%s Look for dlm_pdev %s\n", __func__,
-+ dlm_pdev->disk_name);
-+ if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) {
-+ spin_unlock(&dlm_device_list_lock);
-+ return dlm_pdev;
-+ }
++ dprintk("--> %s\n", __func__);
++ spin_lock(&ino->i_lock);
++ lseg = pnfs_has_layout(nfsi->layout, &lgp->args.range);
++ if (likely(!lseg)) {
++ spin_unlock(&ino->i_lock);
++ dprintk("%s: no lseg found, proceeding\n", __func__);
++ if (!nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
++ &lgp->res.seq_res, 0, task))
++ rpc_call_start(task);
++ return;
+ }
-+ spin_unlock(&dlm_device_list_lock);
-+ return NULL;
-+}
-+
-+static struct dlm_device_entry *
-+nfsd4_find_pnfs_dlm_device(struct super_block *sb) {
-+ char dname[BDEVNAME_SIZE];
-+
-+ bdevname(sb->s_bdev, dname);
-+ return _nfsd4_find_pnfs_dlm_device(dname);
++ if (!lseg->valid) {
++ spin_unlock(&ino->i_lock);
++ dprintk("%s: invalid lseg found, waiting\n", __func__);
++ rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
++ return;
++ }
++ get_lseg(lseg);
++ *lgp->lsegpp = lseg;
++ spin_unlock(&ino->i_lock);
++ dprintk("%s: valid lseg found, no rpc required\n", __func__);
++ rpc_exit(task, NFS4_OK);
+}
+
-+ssize_t
-+nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
++static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
-+ char *pos = buf;
-+ ssize_t size = 0;
-+ struct dlm_device_entry *dlm_pdev;
-+ int ret = -EINVAL;
++ struct nfs4_layoutget *lgp = calldata;
++ struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+
-+ spin_lock(&dlm_device_list_lock);
-+ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list)
-+ {
-+ int advanced;
-+ advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list);
-+ if (advanced >= buflen - size)
-+ goto out;
-+ size += advanced;
-+ pos += advanced;
-+ }
-+ ret = size;
++ dprintk("--> %s\n", __func__);
+
-+out:
-+ spin_unlock(&dlm_device_list_lock);
-+ return ret;
++ if (!nfs4_sequence_done(task, &lgp->res.seq_res))
++ return;
++
++ switch (task->tk_status) {
++ case 0:
++ break;
++ case -NFS4ERR_LAYOUTTRYLATER:
++ case -NFS4ERR_RECALLCONFLICT:
++ task->tk_status = -NFS4ERR_DELAY;
++ /* Fall through */
++ default:
++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
++ rpc_restart_call_prepare(task);
++ return;
++ }
++ }
++ lgp->status = task->tk_status;
++ dprintk("<-- %s\n", __func__);
+}
+
-+bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
++static void nfs4_layoutget_release(void *calldata)
+{
-+ char *start = ds_list;
-+
-+ *num_ds = 0;
-+
-+ while (*start) {
-+ struct sockaddr_storage tempAddr;
-+ int ipLen = strcspn(start, ",");
++ struct nfs4_layoutget *lgp = calldata;
+
-+ if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
-+ return false;
-+ (*num_ds)++;
-+ start += ipLen + 1;
-+ }
-+ return true;
++ dprintk("--> %s\n", __func__);
++ put_layout_hdr(lgp->args.inode);
++ if (lgp->res.layout.buf != NULL)
++ free_page((unsigned long) lgp->res.layout.buf);
++ put_nfs_open_context(lgp->args.ctx);
++ kfree(calldata);
++ dprintk("<-- %s\n", __func__);
+}
+
-+/*
-+ * pnfs_dlm_device string format:
-+ * block-device-path:<ds1 ipv4 address>,<ds2 ipv4 address>
-+ *
-+ * Examples
-+ * /dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
-+ * two data servers for the dlm cluster file system mounted on /dev/sda.
-+ *
-+ * /dev/sda:192.168.1.96,192.168.1.100'
-+ * replaces the data server list for /dev/sda
-+ *
-+ * Only the deviceid == 1 is supported. Can add device id to
-+ * pnfs_dlm_device string when needed.
-+ *
-+ * Only the round robin each data server once stripe index is supported.
-+ */
-+int
-+nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
++static const struct rpc_call_ops nfs4_layoutget_call_ops = {
++ .rpc_call_prepare = nfs4_layoutget_prepare,
++ .rpc_call_done = nfs4_layoutget_done,
++ .rpc_release = nfs4_layoutget_release,
++};
+
++int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
+{
-+ struct dlm_device_entry *new, *found;
-+ char *bufp = pnfs_dlm_device;
-+ char *endp = bufp + strlen(bufp);
-+ int err = -ENOMEM;
++ struct nfs_server *server = NFS_SERVER(lgp->args.inode);
++ struct rpc_task *task;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
++ .rpc_argp = &lgp->args,
++ .rpc_resp = &lgp->res,
++ };
++ struct rpc_task_setup task_setup_data = {
++ .rpc_client = server->client,
++ .rpc_message = &msg,
++ .callback_ops = &nfs4_layoutget_call_ops,
++ .callback_data = lgp,
++ .flags = RPC_TASK_ASYNC,
++ };
++ int status = 0;
+
-+ dprintk("--> %s len %d\n", __func__, len);
++ dprintk("--> %s\n", __func__);
+
-+ new = kzalloc(sizeof(*new), GFP_KERNEL);
-+ if (!new)
-+ return err;
++ lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
++ if (lgp->res.layout.buf == NULL) {
++ nfs4_layoutget_release(lgp);
++ return -ENOMEM;
++ }
+
-+ err = -EINVAL;
-+ /* disk_name */
-+ /* FIXME: need to check for valid disk_name. search superblocks?
-+ * check for slash dev slash ?
-+ */
-+ len = strcspn(bufp, ":");
-+ if (len > DISK_NAME_LEN)
-+ goto out_free;
-+ memcpy(new->disk_name, bufp, len);
++ lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
++ task = rpc_run_task(&task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
++ status = nfs4_wait_for_completion_rpc_task(task);
++ if (status != 0)
++ goto out;
++ status = lgp->status;
++ if (status != 0)
++ goto out;
++ status = pnfs_layout_process(lgp);
++out:
++ rpc_put_task(task);
++ dprintk("<-- %s status=%d\n", __func__, status);
++ return status;
++}
+
-+ err = -EINVAL;
-+ bufp += len + 1;
-+ if (bufp >= endp)
-+ goto out_free;
++static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data)
++{
++ struct nfs4_layoutcommit_data *ldata =
++ (struct nfs4_layoutcommit_data *)data;
++ struct nfs_server *server = NFS_SERVER(ldata->args.inode);
+
-+ /* data server list */
-+ /* FIXME: need to check for comma separated valid ip format */
-+ len = strcspn(bufp, ":");
-+ if (len > NFSD_DLM_DS_LIST_MAX)
-+ goto out_free;
-+ memcpy(new->ds_list, bufp, len);
++ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args,
++ &ldata->res.seq_res, 1, task))
++ return;
++ rpc_call_start(task);
++}
+
++static void
++nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
++{
++ struct nfs4_layoutcommit_data *data =
++ (struct nfs4_layoutcommit_data *)calldata;
++ struct nfs_server *server = NFS_SERVER(data->args.inode);
+
-+ /* validate the ips */
-+ if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds)))
-+ goto out_free;
++ if (!nfs4_sequence_done(task, &data->res.seq_res))
++ return;
+
-+ dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__,
-+ new->disk_name, new->num_ds, new->ds_list);
++ if (RPC_ASSASSINATED(task))
++ return;
+
-+ found = _nfsd4_find_pnfs_dlm_device(new->disk_name);
-+ if (found) {
-+ /* FIXME: should compare found->ds_list with new->ds_list
-+ * and if it is different, kick off a CB_NOTIFY change
-+ * deviceid.
-+ */
-+ dprintk("%s pnfs_dlm_device %s:%s already in cache "
-+ " replace ds_list with new ds_list %s\n", __func__,
-+ found->disk_name, found->ds_list, new->ds_list);
-+ memset(found->ds_list, 0, DISK_NAME_LEN);
-+ memcpy(found->ds_list, new->ds_list, strlen(new->ds_list));
-+ found->num_ds = new->num_ds;
-+ kfree(new);
-+ } else {
-+ dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__,
-+ new->disk_name, new->ds_list);
-+ spin_lock(&dlm_device_list_lock);
-+ list_add(&new->dlm_dev_list, &dlm_device_list);
-+ spin_unlock(&dlm_device_list_lock);
-+ }
-+ dprintk("<-- %s Success\n", __func__);
-+ return 0;
++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
++ nfs_restart_rpc(task, server->nfs_client);
+
-+out_free:
-+ kfree(new);
-+ dprintk("<-- %s returns %d\n", __func__, err);
-+ return err;
++ data->status = task->tk_status;
+}
+
-+void nfsd4_pnfs_dlm_shutdown(void)
++static void nfs4_layoutcommit_release(void *lcdata)
+{
-+ struct dlm_device_entry *dlm_pdev, *next;
-+
-+ dprintk("--> %s\n", __func__);
++ struct nfs4_layoutcommit_data *data =
++ (struct nfs4_layoutcommit_data *)lcdata;
+
-+ spin_lock(&dlm_device_list_lock);
-+ list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list,
-+ dlm_dev_list) {
-+ list_del(&dlm_pdev->dlm_dev_list);
-+ kfree(dlm_pdev);
-+ }
-+ spin_unlock(&dlm_device_list_lock);
++ pnfs_cleanup_layoutcommit(lcdata);
++ /* Matched by get_layout in pnfs_layoutcommit_inode */
++ put_layout_hdr(data->args.inode);
++ put_rpccred(data->cred);
++ kfree(lcdata);
+}
+
-+static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb,
-+ u32 layout_type,
-+ struct nfsd4_pnfs_dev_iter_res *res)
-+{
-+ if (layout_type != LAYOUT_NFSV4_1_FILES) {
-+ printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
-+ "(type: %x)\n", __func__, layout_type);
-+ return -ENOTSUPP;
-+ }
++static const struct rpc_call_ops nfs4_layoutcommit_ops = {
++ .rpc_call_prepare = nfs4_layoutcommit_prepare,
++ .rpc_call_done = nfs4_layoutcommit_done,
++ .rpc_release = nfs4_layoutcommit_release,
++};
+
-+ res->gd_eof = 1;
-+ if (res->gd_cookie)
-+ return -ENOENT;
++/* Execute a layoutcommit to the server */
++int
++nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
++{
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
++ .rpc_argp = &data->args,
++ .rpc_resp = &data->res,
++ .rpc_cred = data->cred,
++ };
++ struct rpc_task_setup task_setup_data = {
++ .task = &data->task,
++ .rpc_client = NFS_CLIENT(data->args.inode),
++ .rpc_message = &msg,
++ .callback_ops = &nfs4_layoutcommit_ops,
++ .callback_data = data,
++ .flags = RPC_TASK_ASYNC,
++ };
++ struct rpc_task *task;
++ int status = 0;
+
-+ res->gd_cookie = 1;
-+ res->gd_verf = 1;
-+ res->gd_devid = 1;
-+ return 0;
-+}
++ dprintk("NFS: %4d initiating layoutcommit call. %llu@%llu lbw: %llu "
++ "type: %d issync %d\n",
++ data->task.tk_pid,
++ data->args.range.length,
++ data->args.range.offset,
++ data->args.lastbytewritten,
++ data->args.layout_type, issync);
+
-+static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
-+ struct exp_xdr_stream *xdr,
-+ u32 layout_type,
-+ const struct nfsd4_pnfs_deviceid *devid)
-+{
-+ int err, len, i = 0;
-+ struct pnfs_filelayout_device fdev;
-+ struct pnfs_filelayout_devaddr *daddr;
-+ struct dlm_device_entry *dlm_pdev;
-+ char *bufp;
++ data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
++ task = rpc_run_task(&task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
++ if (!issync)
++ goto out;
++ status = nfs4_wait_for_completion_rpc_task(task);
++ if (status != 0)
++ goto out;
++ status = data->status;
++out:
++ dprintk("%s: status %d\n", __func__, status);
++ rpc_put_task(task);
++ return 0;
++}
+
-+ err = -ENOTSUPP;
-+ if (layout_type != LAYOUT_NFSV4_1_FILES) {
-+ dprintk("%s: ERROR: layout type isn't 'file' "
-+ "(type: %x)\n", __func__, layout_type);
-+ return err;
-+ }
++static void
++nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
++{
++ struct nfs4_layoutreturn *lrp = calldata;
++ struct inode *ino = lrp->args.inode;
++ struct nfs_inode *nfsi = NFS_I(ino);
++ struct nfs_server *server = NFS_SERVER(ino);
+
-+ /* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
-+ * with a gdia_device_id != 1 is invalid.
-+ */
-+ err = -EINVAL;
-+ if (devid->devid != 1) {
-+ dprintk("%s: WARNING: didn't receive a deviceid of "
-+ "1 (got: 0x%llx)\n", __func__, devid->devid);
-+ return err;
++ dprintk("--> %s\n", __func__);
++ if ((lrp->args.return_type == RETURN_FILE) &&
++ pnfs_return_layout_barrier(nfsi, &lrp->args.range)) {
++ dprintk("%s: waiting on barrier\n", __func__);
++ rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
++ return;
+ }
-+
-+ /*
-+ * If the DS list has not been established, return -EINVAL
-+ */
-+ dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
-+ if (!dlm_pdev) {
-+ dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
-+ sb->s_bdev->bd_disk->disk_name);
-+ return err;
++ if (lrp->stateid) {
++ /* Forget the layout, without sending the return */
++ rpc_exit(task, 0);
++ return;
+ }
++ if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args,
++ &lrp->res.seq_res, 0, task))
++ return;
++ rpc_call_start(task);
++}
+
-+ dprintk("%s: Found disk %s with DS list |%s|\n",
-+ __func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
++static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
++{
++ struct nfs4_layoutreturn *lrp = calldata;
++ struct inode *ino = lrp->args.inode;
++ struct nfs_server *server = NFS_SERVER(ino);
+
-+ memset(&fdev, '\0', sizeof(fdev));
-+ fdev.fl_device_length = dlm_pdev->num_ds;
++ dprintk("--> %s\n", __func__);
+
-+ err = -ENOMEM;
-+ len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
-+ fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
-+ if (!fdev.fl_device_list) {
-+ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
-+ "buffer for %d DSes.\n", __func__, i);
-+ fdev.fl_device_length = 0;
-+ goto out;
-+ }
++ if (!nfs4_sequence_done(task, &lrp->res.seq_res))
++ return;
+
-+ /* Set a simple stripe indicie */
-+ fdev.fl_stripeindices_length = fdev.fl_device_length;
-+ fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
-+ fdev.fl_stripeindices_length, GFP_KERNEL);
++ if (RPC_ASSASSINATED(task))
++ return;
+
-+ if (!fdev.fl_stripeindices_list) {
-+ printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
-+ "list buffer for %d DSes.\n", __func__, i);
-+ goto out;
-+ }
-+ for (i = 0; i < fdev.fl_stripeindices_length; i++)
-+ fdev.fl_stripeindices_list[i] = i;
++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
++ nfs_restart_rpc(task, server->nfs_client);
+
-+ /* Transfer the data server list with a single multipath entry */
-+ bufp = dlm_pdev->ds_list;
-+ for (i = 0; i < fdev.fl_device_length; i++) {
-+ daddr = kmalloc(sizeof(*daddr), GFP_KERNEL);
-+ if (!daddr) {
-+ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
-+ "addr buffer.\n", __func__);
-+ goto out;
-+ }
++ dprintk("<-- %s\n", __func__);
++}
+
-+ daddr->r_netid.data = "tcp";
-+ daddr->r_netid.len = 3;
++static void nfs4_layoutreturn_release(void *calldata)
++{
++ struct nfs4_layoutreturn *lrp = calldata;
++ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+
-+ len = strcspn(bufp, ",");
-+ daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
-+ memcpy(daddr->r_addr.data, bufp, len);
-+ /*
-+ * append the port number. interpreted as two more bytes
-+ * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
-+ */
-+ memcpy(daddr->r_addr.data + len, ".8.1", 4);
-+ daddr->r_addr.len = len + 4;
++ dprintk("--> %s return_type %d lo %p\n", __func__,
++ lrp->args.return_type, lo);
+
-+ fdev.fl_device_list[i].fl_multipath_length = 1;
-+ fdev.fl_device_list[i].fl_multipath_list = daddr;
++ pnfs_layoutreturn_release(lrp);
++ kfree(calldata);
++ dprintk("<-- %s\n", __func__);
++}
+
-+ dprintk("%s: encoding DS |%s|\n", __func__, bufp);
++static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
++ .rpc_call_prepare = nfs4_layoutreturn_prepare,
++ .rpc_call_done = nfs4_layoutreturn_done,
++ .rpc_release = nfs4_layoutreturn_release,
++};
+
-+ bufp += len + 1;
-+ }
++int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
++{
++ struct inode *ino = lrp->args.inode;
++ struct nfs_server *server = NFS_SERVER(ino);
++ struct rpc_task *task;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
++ .rpc_argp = &lrp->args,
++ .rpc_resp = &lrp->res,
++ };
++ struct rpc_task_setup task_setup_data = {
++ .rpc_client = server->client,
++ .rpc_message = &msg,
++ .callback_ops = &nfs4_layoutreturn_call_ops,
++ .callback_data = lrp,
++ .flags = RPC_TASK_ASYNC,
++ };
++ int status = 0;
+
-+ /* have nfsd encode the device info */
-+ err = filelayout_encode_devinfo(xdr, &fdev);
++ dprintk("--> %s\n", __func__);
++ lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
++ task = rpc_run_task(&task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
++ if (!issync)
++ goto out;
++ status = nfs4_wait_for_completion_rpc_task(task);
++ if (status != 0)
++ goto out;
++ status = task->tk_status;
+out:
-+ for (i = 0; i < fdev.fl_device_length; i++)
-+ kfree(fdev.fl_device_list[i].fl_multipath_list);
-+ kfree(fdev.fl_device_list);
-+ kfree(fdev.fl_stripeindices_list);
-+ dprintk("<-- %s returns %d\n", __func__, err);
-+ return err;
-+}
-+
-+static int get_stripe_unit(int blocksize)
-+{
-+ if (blocksize >= NFSSVC_MAXBLKSIZE)
-+ return blocksize;
-+ return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++ dprintk("<-- %s\n", __func__);
++ rpc_put_task(task);
++ return status;
+}
+
+/*
-+ * Look up inode block device in pnfs_dlm_device list.
-+ * Hash on the inode->i_ino and number of data servers.
++ * Retrieve the list of Data Server devices from the MDS.
+ */
-+static int dlm_ino_hash(struct inode *ino)
++static int _nfs4_getdevicelist(struct nfs_server *server,
++ const struct nfs_fh *fh,
++ struct pnfs_devicelist *devlist)
+{
-+ struct dlm_device_entry *de;
-+ u32 hash_mask = 0;
++ struct nfs4_getdevicelist_args args = {
++ .fh = fh,
++ .layoutclass = server->pnfs_curr_ld->id,
++ };
++ struct nfs4_getdevicelist_res res = {
++ .devlist = devlist,
++ };
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
++ .rpc_argp = &args,
++ .rpc_resp = &res,
++ .rpc_cred = nfs4_get_machine_cred(server->nfs_client),
++ };
++ int status;
+
-+ /* If can't find the inode block device in the pnfs_dlm_deivce list
-+ * then don't hand out a layout
-+ */
-+ de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
-+ if (!de)
-+ return -1;
-+ hash_mask = de->num_ds - 1;
-+ return ino->i_ino & hash_mask;
++ dprintk("--> %s\n", __func__);
++ status = nfs4_call_sync(server, &msg, &args, &res, 0);
++ put_rpccred(msg.rpc_cred);
++ dprintk("<-- %s status=%d\n", __func__, status);
++ return status;
+}
+
-+static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
-+ struct exp_xdr_stream *xdr,
-+ const struct nfsd4_pnfs_layoutget_arg *args,
-+ struct nfsd4_pnfs_layoutget_res *res)
++int nfs4_proc_getdevicelist(struct nfs_server *server,
++ const struct nfs_fh *fh,
++ struct pnfs_devicelist *devlist)
+{
-+ struct pnfs_filelayout_layout *layout = NULL;
-+ struct knfsd_fh *fhp = NULL;
-+ int index;
-+ enum nfsstat4 rc = NFS4_OK;
++ struct nfs4_exception exception = { };
++ int err;
+
-+ dprintk("%s: LAYOUT_GET\n", __func__);
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_getdevicelist(server, fh, devlist),
++ &exception);
++ } while (exception.retry);
+
-+ /* DLM exported file systems only support layouts for READ */
-+ if (res->lg_seg.iomode == IOMODE_RW)
-+ return NFS4ERR_BADIOMODE;
++ dprintk("%s: err=%d, num_devs=%u\n", __func__,
++ err, devlist->num_devs);
++
++ return err;
++}
++EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
++
++static int
++_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
++{
++ struct nfs4_getdeviceinfo_args args = {
++ .pdev = pdev,
++ };
++ struct nfs4_getdeviceinfo_res res = {
++ .pdev = pdev,
++ };
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
++ .rpc_argp = &args,
++ .rpc_resp = &res,
++ .rpc_cred = nfs4_get_machine_cred(server->nfs_client),
++ };
++ int status;
++
++ dprintk("--> %s\n", __func__);
++ status = nfs4_call_sync(server, &msg, &args, &res, 0);
++ put_rpccred(msg.rpc_cred);
++ dprintk("<-- %s status=%d\n", __func__, status);
+
-+ index = dlm_ino_hash(inode);
-+ dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
-+ inode->i_ino);
-+ if (index < 0)
-+ return NFS4ERR_LAYOUTUNAVAILABLE;
++ return status;
++}
+
-+ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
-+ /* Always give out whole file layouts */
-+ res->lg_seg.offset = 0;
-+ res->lg_seg.length = NFS4_MAX_UINT64;
-+ /* Always give out READ ONLY layouts */
-+ res->lg_seg.iomode = IOMODE_READ;
++int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
++{
++ struct nfs4_exception exception = { };
++ int err;
+
-+ layout = kzalloc(sizeof(*layout), GFP_KERNEL);
-+ if (layout == NULL) {
-+ rc = NFS4ERR_LAYOUTTRYLATER;
-+ goto error;
-+ }
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_proc_getdeviceinfo(server, pdev),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
+
-+ /* Set file layout response args */
-+ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
-+ layout->lg_stripe_type = STRIPE_SPARSE;
-+ layout->lg_commit_through_mds = false;
-+ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
-+ layout->lg_fh_length = 1;
-+ layout->device_id.sbid = args->lg_sbid;
-+ layout->device_id.devid = 1; /*FSFTEMP*/
-+ layout->lg_first_stripe_index = index; /*FSFTEMP*/
-+ layout->lg_pattern_offset = 0;
+ #endif /* CONFIG_NFS_V4_1 */
+
+ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
+diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
+index 72b6c58..b57f41f 100644
+--- a/fs/nfs/nfs4renewd.c
++++ b/fs/nfs/nfs4renewd.c
+@@ -64,7 +64,7 @@ nfs4_renew_state(struct work_struct *work)
+ ops = clp->cl_mvops->state_renewal_ops;
+ dprintk("%s: start\n", __func__);
+ /* Are there any active superblocks? */
+- if (list_empty(&clp->cl_superblocks))
++ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp))
+ goto out;
+ spin_lock(&clp->cl_lock);
+ lease = clp->cl_lease_time;
+diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
+index 3e2f19b..3168d77 100644
+--- a/fs/nfs/nfs4state.c
++++ b/fs/nfs/nfs4state.c
+@@ -53,6 +53,7 @@
+ #include "callback.h"
+ #include "delegation.h"
+ #include "internal.h"
++#include "pnfs.h"
+
+ #define OPENOWNER_POOL_SIZE 8
+
+@@ -126,6 +127,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
+ int status;
+ struct nfs_fsinfo fsinfo;
+
++ if (is_ds_only_client(clp)) {
++ nfs4_schedule_state_renewal(clp);
++ return 0;
++ }
+
-+ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
-+ if (fhp == NULL) {
-+ rc = NFS4ERR_LAYOUTTRYLATER;
-+ goto error;
+ status = nfs4_proc_get_lease_time(clp, &fsinfo);
+ if (status == 0) {
+ /* Update lease time and schedule renewal */
+@@ -182,6 +188,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
+ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
+ {
+ int status;
++ u32 req_exchange_flags = clp->cl_exchange_flags;
+
+ nfs4_begin_drain_session(clp);
+ status = nfs4_proc_exchange_id(clp, cred);
+@@ -190,6 +197,16 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
+ status = nfs4_proc_create_session(clp);
+ if (status != 0)
+ goto out;
++ if (is_ds_only_session(req_exchange_flags)) {
++ clp->cl_exchange_flags &=
++ ~(EXCHGID4_FLAG_USE_PNFS_MDS | EXCHGID4_FLAG_USE_NON_PNFS);
++ if (!is_ds_only_session(clp->cl_exchange_flags)) {
++ nfs4_destroy_session(clp->cl_session);
++ clp->cl_session = NULL;
++ status = -ENOTSUPP;
++ goto out;
++ }
+ }
+ nfs41_setup_state_renewal(clp);
+ nfs_mark_client_ready(clp, NFS_CS_READY);
+ out:
+@@ -583,8 +600,24 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
+ if (!call_close) {
+ nfs4_put_open_state(state);
+ nfs4_put_state_owner(owner);
+- } else
++ } else {
++ u32 roc_iomode;
++ struct nfs_inode *nfsi = NFS_I(state->inode);
+
-+ memcpy(fhp, args->lg_fh, sizeof(*fhp));
-+ pnfs_fh_mark_ds(fhp);
-+ layout->lg_fh_list = fhp;
++ if (has_layout(nfsi) &&
++ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) {
++ struct pnfs_layout_range range = {
++ .iomode = roc_iomode,
++ .offset = 0,
++ .length = NFS4_MAX_UINT64,
++ };
+
-+ /* Call nfsd to encode layout */
-+ rc = filelayout_encode_layout(xdr, layout);
-+exit:
-+ kfree(layout);
-+ kfree(fhp);
-+ return rc;
++ pnfs_return_layout(state->inode, &range, NULL,
++ RETURN_FILE, wait);
++ }
++
+ nfs4_do_close(path, state, gfp_mask, wait);
++ }
+ }
+
+ void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
+@@ -1447,6 +1480,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
+ }
+ clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
++ pnfs_destroy_all_layouts(clp);
+ }
+
+ if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
+diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
+index 08ef912..30ce2ed 100644
+--- a/fs/nfs/nfs4xdr.c
++++ b/fs/nfs/nfs4xdr.c
+@@ -52,6 +52,7 @@
+ #include <linux/nfs_idmap.h>
+ #include "nfs4_fs.h"
+ #include "internal.h"
++#include "pnfs.h"
+
+ #define NFSDBG_FACILITY NFSDBG_XDR
+
+@@ -89,7 +90,7 @@ static int nfs4_stat_to_errno(int);
+ #define encode_getfh_maxsz (op_encode_hdr_maxsz)
+ #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \
+ ((3+NFS4_FHSIZE) >> 2))
+-#define nfs4_fattr_bitmap_maxsz 3
++#define nfs4_fattr_bitmap_maxsz 4
+ #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+ #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
+ #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
+@@ -111,7 +112,11 @@ static int nfs4_stat_to_errno(int);
+ #define encode_restorefh_maxsz (op_encode_hdr_maxsz)
+ #define decode_restorefh_maxsz (op_decode_hdr_maxsz)
+ #define encode_fsinfo_maxsz (encode_getattr_maxsz)
+-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11)
++/* The 5 accounts for the PNFS attributes, and assumes that at most three
++ * layout types will be returned.
++ */
++#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
++ nfs4_fattr_bitmap_maxsz + 8 + 5)
+ #define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
+ #define decode_renew_maxsz (op_decode_hdr_maxsz)
+ #define encode_setclientid_maxsz \
+@@ -310,6 +315,41 @@ static int nfs4_stat_to_errno(int);
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
+ #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
+ #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
++ encode_verifier_maxsz)
++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
++ 2 /* nfs_cookie4 gdlr_cookie */ + \
++ decode_verifier_maxsz \
++ /* verifier4 gdlr_verifier */ + \
++ 1 /* gdlr_deviceid_list count */ + \
++ XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
++ NFS4_DEVICEID4_SIZE) \
++ /* gdlr_deviceid_list */ + \
++ 1 /* bool gdlr_eof */)
++#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
++ XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
++#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
++ 1 /* layout type */ + \
++ 1 /* opaque devaddr4 length */ + \
++ /* devaddr4 payload is read into page */ \
++ 1 /* notification bitmap length */ + \
++ 1 /* notification bitmap */)
++#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
++ encode_stateid_maxsz)
++#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
++ decode_stateid_maxsz + \
++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
++#define encode_layoutcommit_maxsz (18 + \
++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \
++ op_encode_hdr_maxsz + \
++ encode_stateid_maxsz)
++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz)
++#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
++ encode_stateid_maxsz + \
++ 1 /* FIXME: opaque lrf_body always empty at
++ *the moment */)
++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
++ 1 + decode_stateid_maxsz)
+ #else /* CONFIG_NFS_V4_1 */
+ #define encode_sequence_maxsz 0
+ #define decode_sequence_maxsz 0
+@@ -699,6 +739,60 @@ static int nfs4_stat_to_errno(int);
+ #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_reclaim_complete_maxsz)
++#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz + \
++ encode_putfh_maxsz + \
++ encode_getdevicelist_maxsz)
++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_putfh_maxsz + \
++ decode_getdevicelist_maxsz)
++#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz +\
++ encode_getdeviceinfo_maxsz)
++#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_getdeviceinfo_maxsz)
++#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz + \
++ encode_putfh_maxsz + \
++ encode_layoutget_maxsz)
++#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_putfh_maxsz + \
++ decode_layoutget_maxsz)
++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz +\
++ encode_putfh_maxsz + \
++ encode_layoutcommit_maxsz + \
++ encode_getattr_maxsz)
++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_putfh_maxsz + \
++ decode_layoutcommit_maxsz + \
++ decode_getattr_maxsz)
++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz + \
++ encode_putfh_maxsz + \
++ encode_layoutreturn_maxsz)
++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_putfh_maxsz + \
++ decode_layoutreturn_maxsz)
++#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz +\
++ encode_putfh_maxsz + \
++ encode_write_maxsz)
++#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_putfh_maxsz + \
++ decode_write_maxsz)
++#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \
++ encode_putfh_maxsz + \
++ encode_commit_maxsz)
++#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \
++ decode_putfh_maxsz + \
++ decode_commit_maxsz)
+
+ const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_encode_hdr_maxsz +
+@@ -1003,6 +1097,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
+ hdr->replen += decode_getattr_maxsz;
+ }
+
++static void
++encode_getattr_three(struct xdr_stream *xdr,
++ uint32_t bm0, uint32_t bm1, uint32_t bm2,
++ struct compound_hdr *hdr)
++{
++ __be32 *p;
+
-+error:
-+ res->lg_seg.length = 0;
-+ goto exit;
++ p = reserve_space(xdr, 4);
++ *p = cpu_to_be32(OP_GETATTR);
++ if (bm2) {
++ p = reserve_space(xdr, 16);
++ *p++ = cpu_to_be32(3);
++ *p++ = cpu_to_be32(bm0);
++ *p++ = cpu_to_be32(bm1);
++ *p = cpu_to_be32(bm2);
++ } else if (bm1) {
++ p = reserve_space(xdr, 12);
++ *p++ = cpu_to_be32(2);
++ *p++ = cpu_to_be32(bm0);
++ *p = cpu_to_be32(bm1);
++ } else {
++ p = reserve_space(xdr, 8);
++ *p++ = cpu_to_be32(1);
++ *p = cpu_to_be32(bm0);
++ }
++ hdr->nops++;
++ hdr->replen += decode_getattr_maxsz;
+}
+
-+static int
-+nfsd4_pnfs_dlm_layouttype(struct super_block *sb)
+ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+ {
+ encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+@@ -1011,8 +1134,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
+
+ static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+ {
+- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
+- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
++ encode_getattr_three(xdr,
++ bitmask[0] & nfs4_fsinfo_bitmap[0],
++ bitmask[1] & nfs4_fsinfo_bitmap[1],
++ bitmask[2] & nfs4_fsinfo_bitmap[2],
++ hdr);
+ }
+
+ static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+@@ -1726,6 +1852,155 @@ static void encode_sequence(struct xdr_stream *xdr,
+ #endif /* CONFIG_NFS_V4_1 */
+ }
+
++#ifdef CONFIG_NFS_V4_1
++static void
++encode_getdevicelist(struct xdr_stream *xdr,
++ const struct nfs4_getdevicelist_args *args,
++ struct compound_hdr *hdr)
+{
-+ return LAYOUT_NFSV4_1_FILES;
-+}
-+
-+/* For use by DLM cluster file systems exported by pNFSD */
-+const struct pnfs_export_operations pnfs_dlm_export_ops = {
-+ .layout_type = nfsd4_pnfs_dlm_layouttype,
-+ .get_device_info = nfsd4_pnfs_dlm_getdevinfo,
-+ .get_device_iter = nfsd4_pnfs_dlm_getdeviter,
-+ .layout_get = nfsd4_pnfs_dlm_layoutget,
-+};
-+EXPORT_SYMBOL(pnfs_dlm_export_ops);
-diff -up linux-2.6.35.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4pnfsds.c
---- linux-2.6.35.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-09-30 12:25:08.513286000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfs4pnfsds.c 2010-09-30 12:25:08.515285000 -0400
-@@ -0,0 +1,620 @@
-+/*
-+* linux/fs/nfsd/nfs4pnfsds.c
-+*
-+* Copyright (c) 2005 The Regents of the University of Michigan.
-+* All rights reserved.
-+*
-+* Andy Adamson <andros at umich.edu>
-+*
-+* Redistribution and use in source and binary forms, with or without
-+* modification, are permitted provided that the following conditions
-+* are met:
-+*
-+* 1. Redistributions of source code must retain the above copyright
-+* notice, this list of conditions and the following disclaimer.
-+* 2. Redistributions in binary form must reproduce the above copyright
-+* notice, this list of conditions and the following disclaimer in the
-+* documentation and/or other materials provided with the distribution.
-+* 3. Neither the name of the University nor the names of its
-+* contributors may be used to endorse or promote products derived
-+* from this software without specific prior written permission.
-+*
-+* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*
-+*/
-+#if defined(CONFIG_PNFSD)
-+
-+#define NFSDDBG_FACILITY NFSDDBG_PNFS
-+
-+#include <linux/param.h>
-+#include <linux/sunrpc/svc.h>
-+#include <linux/sunrpc/debug.h>
-+#include <linux/nfs4.h>
-+#include <linux/exportfs.h>
-+#include <linux/sched.h>
-+
-+#include "nfsd.h"
-+#include "pnfsd.h"
-+#include "state.h"
-+
-+/*
-+ *******************
-+ * PNFS
-+ *******************
-+ */
-+/*
-+ * Hash tables for pNFS Data Server state
-+ *
-+ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using
-+ * this data server (DS).
-+ *
-+ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained
-+ * from any MDS.
-+ *
-+ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained
-+ * from any MDS.
-+ *
-+ */
-+/* Hash tables for clientid state */
-+#define CLIENT_HASH_BITS 4
-+#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
-+#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
-+
-+#define clientid_hashval(id) \
-+ ((id) & CLIENT_HASH_MASK)
-+
-+/* hash table for pnfs_ds_stateid */
-+#define STATEID_HASH_BITS 10
-+#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
-+#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1)
-+
-+#define stateid_hashval(owner_id, file_id) \
-+ (((owner_id) + (file_id)) & STATEID_HASH_MASK)
-+
-+static struct list_head mds_id_tbl;
-+static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE];
-+static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE];
-+
-+static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp);
-+static inline void put_ds_mdsid(struct pnfs_mds_id *mdp);
++ __be32 *p;
++ nfs4_verifier dummy = {
++ .data = "dummmmmy",
++ };
+
-+/* Mutex for data server state. Needs to be separate from
-+ * mds state mutex since a node can be both mds and ds */
-+static DEFINE_MUTEX(ds_mutex);
-+static struct thread_info *ds_mutex_owner;
++ p = reserve_space(xdr, 20);
++ *p++ = cpu_to_be32(OP_GETDEVICELIST);
++ *p++ = cpu_to_be32(args->layoutclass);
++ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
++ xdr_encode_hyper(p, 0ULL); /* cookie */
++ encode_nfs4_verifier(xdr, &dummy);
++ hdr->nops++;
++ hdr->replen += decode_getdevicelist_maxsz;
++}
+
+static void
-+ds_lock_state(void)
++encode_getdeviceinfo(struct xdr_stream *xdr,
++ const struct nfs4_getdeviceinfo_args *args,
++ struct compound_hdr *hdr)
+{
-+ mutex_lock(&ds_mutex);
-+ ds_mutex_owner = current_thread_info();
++ __be32 *p;
++
++ p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
++ *p++ = cpu_to_be32(OP_GETDEVICEINFO);
++ p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
++ NFS4_DEVICEID4_SIZE);
++ *p++ = cpu_to_be32(args->pdev->layout_type);
++ *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
++ *p++ = cpu_to_be32(0); /* bitmap length 0 */
++ hdr->nops++;
++ hdr->replen += decode_getdeviceinfo_maxsz;
+}
+
+static void
-+ds_unlock_state(void)
++encode_layoutget(struct xdr_stream *xdr,
++ const struct nfs4_layoutget_args *args,
++ struct compound_hdr *hdr)
+{
-+ BUG_ON(ds_mutex_owner != current_thread_info());
-+ ds_mutex_owner = NULL;
-+ mutex_unlock(&ds_mutex);
++ nfs4_stateid stateid;
++ __be32 *p;
++
++ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
++ *p++ = cpu_to_be32(OP_LAYOUTGET);
++ *p++ = cpu_to_be32(0); /* Signal layout available */
++ *p++ = cpu_to_be32(args->type);
++ *p++ = cpu_to_be32(args->range.iomode);
++ p = xdr_encode_hyper(p, args->range.offset);
++ p = xdr_encode_hyper(p, args->range.length);
++ p = xdr_encode_hyper(p, args->minlength);
++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
++ args->ctx->state);
++ p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
++ *p = cpu_to_be32(args->maxcount);
++
++ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
++ __func__,
++ args->type,
++ args->range.iomode,
++ (unsigned long)args->range.offset,
++ (unsigned long)args->range.length,
++ args->maxcount);
++ hdr->nops++;
++ hdr->replen += decode_layoutget_maxsz;
+}
+
+static int
-+cmp_clid(const clientid_t *cl1, const clientid_t *cl2)
++encode_layoutcommit(struct xdr_stream *xdr,
++ const struct nfs4_layoutcommit_args *args,
++ struct compound_hdr *hdr)
+{
-+ return (cl1->cl_boot == cl2->cl_boot) &&
-+ (cl1->cl_id == cl2->cl_id);
-+}
++ __be32 *p;
+
-+void
-+nfs4_pnfs_state_init(void)
-+{
-+ int i;
++ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__,
++ args->range.length, args->range.offset, args->lastbytewritten,
++ args->layout_type);
+
-+ for (i = 0; i < CLIENT_HASH_SIZE; i++)
-+ INIT_LIST_HEAD(&mds_clid_hashtbl[i]);
++ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE);
++ *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
++ p = xdr_encode_hyper(p, args->range.offset);
++ p = xdr_encode_hyper(p, args->range.length);
++ *p++ = cpu_to_be32(0); /* reclaim */
++ p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
++ *p++ = cpu_to_be32(1); /* newoffset = TRUE */
++ p = xdr_encode_hyper(p, args->lastbytewritten);
++ *p = cpu_to_be32(args->time_modify_changed != 0);
++ if (args->time_modify_changed) {
++ p = reserve_space(xdr, 12);
++ *p++ = cpu_to_be32(0);
++ *p++ = cpu_to_be32(args->time_modify.tv_sec);
++ *p = cpu_to_be32(args->time_modify.tv_nsec);
++ }
++
++ p = reserve_space(xdr, 4);
++ *p = cpu_to_be32(args->layout_type);
+
-+ for (i = 0; i < STATEID_HASH_SIZE; i++)
-+ INIT_LIST_HEAD(&ds_stid_hashtbl[i]);
++ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutcommit) {
++ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutcommit(
++ NFS_I(args->inode)->layout, xdr, args);
++ } else {
++ p = reserve_space(xdr, 4);
++ xdr_encode_opaque(p, NULL, 0);
++ }
+
-+ INIT_LIST_HEAD(&mds_id_tbl);
++ hdr->nops++;
++ hdr->replen += decode_layoutcommit_maxsz;
++ return 0;
+}
+
-+static struct pnfs_mds_id *
-+find_pnfs_mds_id(u32 mdsid)
++static void
++encode_layoutreturn(struct xdr_stream *xdr,
++ const struct nfs4_layoutreturn_args *args,
++ struct compound_hdr *hdr)
+{
-+ struct pnfs_mds_id *local = NULL;
++ nfs4_stateid stateid;
++ __be32 *p;
+
-+ dprintk("pNFSD: %s\n", __func__);
-+ list_for_each_entry(local, &mds_id_tbl, di_hash) {
-+ if (local->di_mdsid == mdsid)
-+ return local;
++ p = reserve_space(xdr, 20);
++ *p++ = cpu_to_be32(OP_LAYOUTRETURN);
++ *p++ = cpu_to_be32(args->reclaim);
++ *p++ = cpu_to_be32(args->layout_type);
++ *p++ = cpu_to_be32(args->range.iomode);
++ *p = cpu_to_be32(args->return_type);
++ if (args->return_type == RETURN_FILE) {
++ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
++ p = xdr_encode_hyper(p, args->range.offset);
++ p = xdr_encode_hyper(p, args->range.length);
++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
++ NULL);
++ p = xdr_encode_opaque_fixed(p, &stateid.data,
++ NFS4_STATEID_SIZE);
++ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
++ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
++ NFS_I(args->inode)->layout, xdr, args);
++ } else {
++ p = reserve_space(xdr, 4);
++ *p = cpu_to_be32(0);
++ }
+ }
-+ return NULL;
++ hdr->nops++;
++ hdr->replen += decode_layoutreturn_maxsz;
+}
++#endif /* CONFIG_NFS_V4_1 */
+
-+static struct pnfs_ds_clientid *
-+find_pnfs_ds_clientid(const clientid_t *clid)
+ /*
+ * END OF "GENERIC" ENCODE ROUTINES.
+ */
+@@ -2374,7 +2649,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
+ struct compound_hdr hdr = {
+ .nops = 0,
+ };
+- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
+
+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+ encode_compound_hdr(&xdr, req, &hdr);
+@@ -2513,7 +2788,7 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
+ };
+- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
+
+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+ encode_compound_hdr(&xdr, req, &hdr);
+@@ -2543,6 +2818,153 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
+ return 0;
+ }
+
++/*
++ * Encode GETDEVICELIST request
++ */
++static int
++nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p,
++ struct nfs4_getdevicelist_args *args)
+{
-+ struct pnfs_ds_clientid *local = NULL;
-+ unsigned int hashval;
-+
-+ dprintk("pNFSD: %s\n", __func__);
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
++ };
+
-+ hashval = clientid_hashval(clid->cl_id);
-+ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) {
-+ if (cmp_clid(&local->dc_mdsclid, clid))
-+ return local;
-+ }
-+ return NULL;
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, req, &hdr);
++ encode_sequence(&xdr, &args->seq_args, &hdr);
++ encode_putfh(&xdr, args->fh, &hdr);
++ encode_getdevicelist(&xdr, args, &hdr);
++ encode_nops(&hdr);
++ return 0;
+}
+
-+static struct pnfs_ds_stateid *
-+find_pnfs_ds_stateid(stateid_t *stid)
++/*
++ * Encode GETDEVICEINFO request
++ */
++static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
++ struct nfs4_getdeviceinfo_args *args)
+{
-+ struct pnfs_ds_stateid *local = NULL;
-+ u32 st_id = stid->si_stateownerid;
-+ u32 f_id = stid->si_fileid;
-+ unsigned int hashval;
-+
-+ dprintk("pNFSD: %s\n", __func__);
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
++ };
+
-+ hashval = stateid_hashval(st_id, f_id);
-+ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash)
-+ if ((local->ds_stid.si_stateownerid == st_id) &&
-+ (local->ds_stid.si_fileid == f_id) &&
-+ (local->ds_stid.si_boot == stid->si_boot)) {
-+ stateid_t *sid = &local->ds_stid;
-+ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n",
-+ __func__, local, local->ds_flags,
-+ STATEID_VAL(sid));
-+ return local;
-+ }
-+ return NULL;
-+}
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, req, &hdr);
++ encode_sequence(&xdr, &args->seq_args, &hdr);
++ encode_getdeviceinfo(&xdr, args, &hdr);
+
-+static void
-+release_ds_mdsid(struct kref *kref)
-+{
-+ struct pnfs_mds_id *mdp =
-+ container_of(kref, struct pnfs_mds_id, di_ref);
-+ dprintk("pNFSD: %s\n", __func__);
++ /* set up reply kvec. Subtract notification bitmap max size (2)
++ * so that notification bitmap is put in xdr_buf tail */
++ xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
++ args->pdev->pages, args->pdev->pgbase,
++ args->pdev->pglen);
+
-+ list_del(&mdp->di_hash);
-+ list_del(&mdp->di_mdsclid);
-+ kfree(mdp);
++ encode_nops(&hdr);
++ return 0;
+}
+
-+static void
-+release_ds_clientid(struct kref *kref)
++/*
++ * Encode LAYOUTGET request
++ */
++static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
++ struct nfs4_layoutget_args *args)
+{
-+ struct pnfs_ds_clientid *dcp =
-+ container_of(kref, struct pnfs_ds_clientid, dc_ref);
-+ struct pnfs_mds_id *mdp;
-+ dprintk("pNFSD: %s\n", __func__);
-+
-+ mdp = find_pnfs_mds_id(dcp->dc_mdsid);
-+ if (mdp)
-+ put_ds_mdsid(mdp);
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
++ };
+
-+ list_del(&dcp->dc_hash);
-+ list_del(&dcp->dc_stateid);
-+ list_del(&dcp->dc_permdsid);
-+ kfree(dcp);
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, req, &hdr);
++ encode_sequence(&xdr, &args->seq_args, &hdr);
++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
++ encode_layoutget(&xdr, args, &hdr);
++ encode_nops(&hdr);
++ return 0;
+}
+
-+static void
-+release_ds_stateid(struct kref *kref)
++/*
++ * Encode LAYOUTCOMMIT request
++ */
++static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p,
++ struct nfs4_layoutcommit_args *args)
+{
-+ struct pnfs_ds_stateid *dsp =
-+ container_of(kref, struct pnfs_ds_stateid, ds_ref);
-+ struct pnfs_ds_clientid *dcp;
-+ dprintk("pNFS %s: dsp %p\n", __func__, dsp);
-+
-+ dcp = find_pnfs_ds_clientid(&dsp->ds_mdsclid);
-+ if (dcp)
-+ put_ds_clientid(dcp);
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
++ };
+
-+ list_del(&dsp->ds_hash);
-+ list_del(&dsp->ds_perclid);
-+ kfree(dsp);
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, req, &hdr);
++ encode_sequence(&xdr, &args->seq_args, &hdr);
++ encode_putfh(&xdr, args->fh, &hdr);
++ encode_layoutcommit(&xdr, args, &hdr);
++ encode_getfattr(&xdr, args->bitmask, &hdr);
++ encode_nops(&hdr);
++ return 0;
+}
+
-+static inline void
-+put_ds_clientid(struct pnfs_ds_clientid *dcp)
++/*
++ * Encode LAYOUTRETURN request
++ */
++static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p,
++ struct nfs4_layoutreturn_args *args)
+{
-+ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
-+ atomic_read(&dcp->dc_ref.refcount));
-+ kref_put(&dcp->dc_ref, release_ds_clientid);
-+}
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
++ };
+
-+static inline void
-+get_ds_clientid(struct pnfs_ds_clientid *dcp)
-+{
-+ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
-+ atomic_read(&dcp->dc_ref.refcount));
-+ kref_get(&dcp->dc_ref);
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, req, &hdr);
++ encode_sequence(&xdr, &args->seq_args, &hdr);
++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
++ encode_layoutreturn(&xdr, args, &hdr);
++ encode_nops(&hdr);
++ return 0;
+}
+
-+static inline void
-+put_ds_mdsid(struct pnfs_mds_id *mdp)
++/*
++ * Encode a pNFS File Layout Data Server WRITE request
++ */
++static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p,
++ struct nfs_writeargs *args)
+{
-+ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
-+ atomic_read(&mdp->di_ref.refcount));
-+ kref_put(&mdp->di_ref, release_ds_mdsid);
-+}
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
++ };
+
-+static inline void
-+get_ds_mdsid(struct pnfs_mds_id *mdp)
-+{
-+ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
-+ atomic_read(&mdp->di_ref.refcount));
-+ kref_get(&mdp->di_ref);
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, req, &hdr);
++ encode_sequence(&xdr, &args->seq_args, &hdr);
++ encode_putfh(&xdr, args->fh, &hdr);
++ encode_write(&xdr, args, &hdr);
++ encode_nops(&hdr);
++ return 0;
+}
+
-+static inline void
-+put_ds_stateid(struct pnfs_ds_stateid *dsp)
++/*
++ * Encode a pNFS File Layout Data Server COMMIT request
++ */
++static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p,
++ struct nfs_writeargs *args)
+{
-+ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
-+ atomic_read(&dsp->ds_ref.refcount));
-+ kref_put(&dsp->ds_ref, release_ds_stateid);
-+}
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
++ };
+
-+static inline void
-+get_ds_stateid(struct pnfs_ds_stateid *dsp)
-+{
-+ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
-+ atomic_read(&dsp->ds_ref.refcount));
-+ kref_get(&dsp->ds_ref);
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, req, &hdr);
++ encode_sequence(&xdr, &args->seq_args, &hdr);
++ encode_putfh(&xdr, args->fh, &hdr);
++ encode_commit(&xdr, args, &hdr);
++ encode_nops(&hdr);
++ return 0;
+}
-+
-+void
-+nfs4_pnfs_state_shutdown(void)
+ #endif /* CONFIG_NFS_V4_1 */
+
+ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+@@ -2643,14 +3065,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+ goto out_overflow;
+ bmlen = be32_to_cpup(p);
+
+- bitmap[0] = bitmap[1] = 0;
++ bitmap[0] = bitmap[1] = bitmap[2] = 0;
+ p = xdr_inline_decode(xdr, (bmlen << 2));
+ if (unlikely(!p))
+ goto out_overflow;
+ if (bmlen > 0) {
+ bitmap[0] = be32_to_cpup(p++);
+- if (bmlen > 1)
+- bitmap[1] = be32_to_cpup(p);
++ if (bmlen > 1) {
++ bitmap[1] = be32_to_cpup(p++);
++ if (bmlen > 2)
++ bitmap[2] = be32_to_cpup(p);
++ }
+ }
+ return 0;
+ out_overflow:
+@@ -2679,8 +3104,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
+ decode_attr_bitmap(xdr, bitmask);
+ bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
+ } else
+- bitmask[0] = bitmask[1] = 0;
+- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]);
++ bitmask[0] = bitmask[1] = bitmask[2] = 0;
++ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
++ bitmask[0], bitmask[1], bitmask[2]);
+ return 0;
+ }
+
+@@ -3665,7 +4091,7 @@ out_overflow:
+ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
+ {
+ __be32 *savep;
+- uint32_t attrlen, bitmap[2] = {0};
++ uint32_t attrlen, bitmap[3] = {0};
+ int status;
+
+ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+@@ -3691,7 +4117,7 @@ xdr_error:
+ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
+ {
+ __be32 *savep;
+- uint32_t attrlen, bitmap[2] = {0};
++ uint32_t attrlen, bitmap[3] = {0};
+ int status;
+
+ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+@@ -3723,7 +4149,7 @@ xdr_error:
+ static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
+ {
+ __be32 *savep;
+- uint32_t attrlen, bitmap[2] = {0};
++ uint32_t attrlen, bitmap[3] = {0};
+ int status;
+
+ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+@@ -3749,7 +4175,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ {
+ __be32 *savep;
+ uint32_t attrlen,
+- bitmap[2] = {0},
++ bitmap[3] = {0},
+ type;
+ int status;
+ umode_t fmode = 0;
+@@ -3868,11 +4294,87 @@ xdr_error:
+ return status;
+ }
+
++/*
++ * Decode potentially multiple layout types. Currently we only support
++ * one layout driver per file system.
++ */
++static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
++ uint32_t *layouttype)
+{
-+ struct pnfs_ds_stateid *dsp;
-+ int i;
++ uint32_t *p;
++ int num;
+
-+ dprintk("pNFSD %s: -->\n", __func__);
++ p = xdr_inline_decode(xdr, 4);
++ if (unlikely(!p))
++ goto out_overflow;
++ num = be32_to_cpup(p);
+
-+ ds_lock_state();
-+ for (i = 0; i < STATEID_HASH_SIZE; i++) {
-+ while (!list_empty(&ds_stid_hashtbl[i])) {
-+ dsp = list_entry(ds_stid_hashtbl[i].next,
-+ struct pnfs_ds_stateid, ds_hash);
-+ put_ds_stateid(dsp);
-+ }
++ /* pNFS is not supported by the underlying file system */
++ if (num == 0) {
++ *layouttype = 0;
++ return 0;
+ }
-+ ds_unlock_state();
++ if (num > 1)
++ printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
++ "per filesystem not supported\n", __func__);
++
++ /* Decode and set first layout type, move xdr->p past unused types */
++ p = xdr_inline_decode(xdr, num * 4);
++ if (unlikely(!p))
++ goto out_overflow;
++ *layouttype = be32_to_cpup(p);
++ return 0;
++out_overflow:
++ print_overflow_msg(__func__, xdr);
++ return -EIO;
+}
+
-+static struct pnfs_mds_id *
-+alloc_init_mds_id(struct pnfs_get_state *gsp)
++/*
++ * The type of file system exported.
++ * Note we must ensure that layouttype is set in any non-error case.
++ */
++static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
++ uint32_t *layouttype)
+{
-+ struct pnfs_mds_id *mdp;
-+
-+ dprintk("pNFSD: %s\n", __func__);
++ int status = 0;
+
-+ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL);
-+ if (!mdp)
-+ return NULL;
-+ INIT_LIST_HEAD(&mdp->di_hash);
-+ INIT_LIST_HEAD(&mdp->di_mdsclid);
-+ list_add(&mdp->di_hash, &mds_id_tbl);
-+ mdp->di_mdsid = gsp->dsid;
-+ mdp->di_mdsboot = 0;
-+ kref_init(&mdp->di_ref);
-+ return mdp;
++ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
++ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
++ return -EIO;
++ if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
++ status = decode_first_pnfs_layout_type(xdr, layouttype);
++ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
++ } else
++ *layouttype = 0;
++ return status;
+}
+
-+static struct pnfs_ds_clientid *
-+alloc_init_ds_clientid(struct pnfs_get_state *gsp)
++/*
++ * The preferred block size for layout-directed I/O
++ */
++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
++ uint32_t *res)
+{
-+ struct pnfs_mds_id *mdp;
-+ struct pnfs_ds_clientid *dcp;
-+ clientid_t *clid = (clientid_t *)&gsp->clid;
-+ unsigned int hashval = clientid_hashval(clid->cl_id);
++ __be32 *p;
+
-+ dprintk("pNFSD: %s\n", __func__);
++ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
++ *res = 0;
++ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
++ p = xdr_inline_decode(xdr, 4);
++ if (unlikely(!p)) {
++ print_overflow_msg(__func__, xdr);
++ return -EIO;
++ }
++ *res = be32_to_cpup(p);
++ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
++ }
++ return 0;
++}
+
+ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
+ {
+ __be32 *savep;
+- uint32_t attrlen, bitmap[2];
++ uint32_t attrlen, bitmap[3];
+ int status;
+
+ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+@@ -3894,6 +4396,12 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
+ if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
+ goto xdr_error;
+ fsinfo->wtpref = fsinfo->wtmax;
++ status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
++ if (status)
++ goto xdr_error;
++ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
++ if (status)
++ goto xdr_error;
+
+ status = verify_attr_len(xdr, savep, attrlen);
+ xdr_error:
+@@ -4382,7 +4890,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
+ {
+ __be32 *savep;
+ uint32_t attrlen,
+- bitmap[2] = {0};
++ bitmap[3] = {0};
+ struct kvec *iov = req->rq_rcv_buf.head;
+ int status;
+
+@@ -4731,16 +5239,238 @@ out_overflow:
+ #endif /* CONFIG_NFS_V4_1 */
+ }
+
++#if defined(CONFIG_NFS_V4_1)
+ /*
+- * END OF "GENERIC" DECODE ROUTINES.
+- */
+-
+-/*
+- * Decode OPEN_DOWNGRADE response
++ * TODO: Need to handle the case when EOF != true.
+ */
+-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
++static int decode_getdevicelist(struct xdr_stream *xdr,
++ struct pnfs_devicelist *res)
+ {
+- struct xdr_stream xdr;
++ __be32 *p;
++ int status, i;
++ struct nfs_writeverf verftemp;
+
-+ mdp = find_pnfs_mds_id(gsp->dsid);
-+ if (!mdp) {
-+ mdp = alloc_init_mds_id(gsp);
-+ if (!mdp)
-+ return NULL;
-+ } else {
-+ get_ds_mdsid(mdp);
-+ }
++ status = decode_op_hdr(xdr, OP_GETDEVICELIST);
++ if (status)
++ return status;
+
-+ dcp = kmalloc(sizeof(*dcp), GFP_KERNEL);
-+ if (!dcp)
-+ return NULL;
++ p = xdr_inline_decode(xdr, 8 + 8 + 4);
++ if (unlikely(!p))
++ goto out_overflow;
+
-+ INIT_LIST_HEAD(&dcp->dc_hash);
-+ INIT_LIST_HEAD(&dcp->dc_stateid);
-+ INIT_LIST_HEAD(&dcp->dc_permdsid);
-+ list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]);
-+ list_add(&dcp->dc_permdsid, &mdp->di_mdsclid);
-+ dcp->dc_mdsclid = *clid;
-+ kref_init(&dcp->dc_ref);
-+ dcp->dc_mdsid = gsp->dsid;
-+ return dcp;
-+}
++ /* TODO: Skip cookie for now */
++ p += 2;
+
-+static struct pnfs_ds_stateid *
-+alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp)
-+{
-+ struct pnfs_ds_stateid *dsp;
-+ u32 st_id = stidp->si_stateownerid;
-+ u32 f_id = stidp->si_fileid;
-+ unsigned int hashval;
++ /* Read verifier */
++ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
+
-+ dprintk("pNFSD: %s\n", __func__);
++ res->num_devs = be32_to_cpup(p);
+
-+ dsp = kmalloc(sizeof(*dsp), GFP_KERNEL);
-+ if (!dsp)
-+ return dsp;
++ dprintk("%s: num_dev %d\n", __func__, res->num_devs);
+
-+ INIT_LIST_HEAD(&dsp->ds_hash);
-+ INIT_LIST_HEAD(&dsp->ds_perclid);
-+ memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t));
-+ fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle);
-+ dsp->ds_access = 0;
-+ dsp->ds_status = 0;
-+ dsp->ds_flags = 0L;
-+ kref_init(&dsp->ds_ref);
-+ set_bit(DS_STATEID_NEW, &dsp->ds_flags);
-+ clear_bit(DS_STATEID_VALID, &dsp->ds_flags);
-+ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
-+ init_waitqueue_head(&dsp->ds_waitq);
++ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM)
++ return -NFS4ERR_REP_TOO_BIG;
+
-+ hashval = stateid_hashval(st_id, f_id);
-+ list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]);
-+ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
-+ return dsp;
++ p = xdr_inline_decode(xdr,
++ res->num_devs * NFS4_DEVICEID4_SIZE + 4);
++ if (unlikely(!p))
++ goto out_overflow;
++ for (i = 0; i < res->num_devs; i++)
++ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
++ NFS4_DEVICEID4_SIZE);
++ res->eof = be32_to_cpup(p);
++ return 0;
++out_overflow:
++ print_overflow_msg(__func__, xdr);
++ return -EIO;
+}
+
-+static int
-+update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh,
-+ struct pnfs_get_state *gsp)
++static int decode_getdeviceinfo(struct xdr_stream *xdr,
++ struct pnfs_device *pdev)
+{
-+ struct pnfs_ds_clientid *dcp;
-+ int new = 0;
-+
-+ dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++ __be32 *p;
++ uint32_t len, type;
++ int status;
+
-+ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid);
-+ if (!dcp) {
-+ dcp = alloc_init_ds_clientid(gsp);
-+ if (!dcp)
-+ return 1;
-+ new = 1;
++ status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
++ if (status) {
++ if (status == -ETOOSMALL) {
++ p = xdr_inline_decode(xdr, 4);
++ if (unlikely(!p))
++ goto out_overflow;
++ pdev->mincount = be32_to_cpup(p);
++ dprintk("%s: Min count too small. mincnt = %u\n",
++ __func__, pdev->mincount);
++ }
++ return status;
+ }
-+ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) {
-+ list_add(&dsp->ds_perclid, &dcp->dc_stateid);
-+ if (!new)
-+ get_ds_clientid(dcp);
++
++ p = xdr_inline_decode(xdr, 8);
++ if (unlikely(!p))
++ goto out_overflow;
++ type = be32_to_cpup(p++);
++ if (type != pdev->layout_type) {
++ dprintk("%s: layout mismatch req: %u pdev: %u\n",
++ __func__, pdev->layout_type, type);
++ return -EINVAL;
+ }
++ /*
++ * Get the length of the opaque device_addr4. xdr_read_pages places
++ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
++ * and places the remaining xdr data in xdr_buf->tail
++ */
++ pdev->mincount = be32_to_cpup(p);
++ xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+
-+ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t));
-+ dsp->ds_access = gsp->access;
-+ dsp->ds_status = 0;
-+ dsp->ds_verifier[0] = gsp->verifier[0];
-+ dsp->ds_verifier[1] = gsp->verifier[1];
-+ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t));
-+ set_bit(DS_STATEID_VALID, &dsp->ds_flags);
-+ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
-+ clear_bit(DS_STATEID_NEW, &dsp->ds_flags);
++ /* Parse notification bitmap, verifying that it is zero. */
++ p = xdr_inline_decode(xdr, 4);
++ if (unlikely(!p))
++ goto out_overflow;
++ len = be32_to_cpup(p);
++ if (len) {
++ int i;
++
++ p = xdr_inline_decode(xdr, 4 * len);
++ if (unlikely(!p))
++ goto out_overflow;
++ for (i = 0; i < len; i++, p++) {
++ if (be32_to_cpup(p)) {
++ dprintk("%s: notifications not supported\n",
++ __func__);
++ return -EIO;
++ }
++ }
++ }
+ return 0;
++out_overflow:
++ print_overflow_msg(__func__, xdr);
++ return -EIO;
+}
+
-+int
-+nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs)
++static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
++ struct nfs4_layoutget_res *res)
+{
-+ stateid_t *stid = (stateid_t *)&gs->stid;
-+ struct pnfs_ds_stateid *dsp;
++ __be32 *p;
++ int status;
++ u32 layout_count;
+
-+ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__,
-+ STATEID_VAL(stid));
++ status = decode_op_hdr(xdr, OP_LAYOUTGET);
++ if (status)
++ return status;
++ p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
++ if (unlikely(!p))
++ goto out_overflow;
++ res->return_on_close = be32_to_cpup(p++);
++ p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
++ layout_count = be32_to_cpup(p);
++ if (!layout_count) {
++ dprintk("%s: server responded with empty layout array\n",
++ __func__);
++ return -EINVAL;
++ }
+
-+ ds_lock_state();
-+ dsp = find_pnfs_ds_stateid(stid);
-+ if (dsp)
-+ put_ds_stateid(dsp);
-+ ds_unlock_state();
++ p = xdr_inline_decode(xdr, 24);
++ if (unlikely(!p))
++ goto out_overflow;
++ p = xdr_decode_hyper(p, &res->range.offset);
++ p = xdr_decode_hyper(p, &res->range.length);
++ res->range.iomode = be32_to_cpup(p++);
++ res->type = be32_to_cpup(p++);
+
-+ dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++ status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
++ if (unlikely(status))
++ return status;
+
-+ if (dsp)
-+ return 0;
-+ return -ENOENT;
++ dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
++ __func__,
++ (unsigned long)res->range.offset,
++ (unsigned long)res->range.length,
++ res->range.iomode,
++ res->type,
++ res->layout.len);
++
++ /* nfs4_proc_layoutget allocated a single page */
++ if (res->layout.len > PAGE_SIZE)
++ return -ENOMEM;
++ memcpy(res->layout.buf, p, res->layout.len);
++
++ if (layout_count > 1) {
++ /* We only handle a length one array at the moment. Any
++ * further entries are just ignored. Note that this means
++ * the client may see a response that is less than the
++ * minimum it requested.
++ */
++ dprintk("%s: server responded with %d layouts, dropping tail\n",
++ __func__, layout_count);
++ }
++
++ return 0;
++out_overflow:
++ print_overflow_msg(__func__, xdr);
++ return -EIO;
+}
+
-+/* Retrieves and validates stateid.
-+ * If stateid exists and its fields match, return it.
-+ * If stateid exists but either the generation or
-+ * ownerids don't match, check with mds to see if it is valid.
-+ * If the stateid doesn't exist, the first thread creates a
-+ * invalid *marker* stateid, then checks to see if the
-+ * stateid exists on the mds. If so, it validates the *marker*
-+ * stateid and updates its fields. Subsequent threads that
-+ * find the *marker* stateid wait until it is valid or an error
-+ * occurs.
-+ * Called with ds_state_lock.
-+ */
-+static struct pnfs_ds_stateid *
-+nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
-+{
-+ struct inode *ino = cfh->fh_dentry->d_inode;
-+ struct super_block *sb;
-+ struct pnfs_ds_stateid *dsp = NULL;
-+ struct pnfs_get_state gs = {
-+ .access = 0,
-+ };
-+ int status = 0, waiter = 0;
++/*
++ * Decode the LAYOUTRETURN operation result.
++ *
++ * After the op header, one 32-bit word is read into res->lrs_present; when
++ * it is non-zero a stateid follows and is decoded into res->stateid.
++ * res->valid is set once that word has been read.
++ *
++ * Returns the decode_op_hdr() status if that fails, the decode_stateid()
++ * status when a stateid is present, 0 otherwise, or -EIO when the stream
++ * is too short to hold the 32-bit word.
++ */
++static int decode_layoutreturn(struct xdr_stream *xdr,
++ struct nfs4_layoutreturn_res *res)
++{
++ __be32 *p;
++ int status;
+
-+ dprintk("pNFSD: %s -->\n", __func__);
++ status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
++ if (status)
++ return status;
++ p = xdr_inline_decode(xdr, 4);
++ if (unlikely(!p))
++ goto out_overflow;
++ res->valid = true;
++ res->lrs_present = be32_to_cpup(p);
++ /* status is still 0 here, so 0 is returned when no stateid follows */
++ if (res->lrs_present)
++ status = decode_stateid(xdr, &res->stateid);
++ return status;
++out_overflow:
++ print_overflow_msg(__func__, xdr);
++ return -EIO;
++}
+
-+ dsp = find_pnfs_ds_stateid(stidp);
-+ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
-+ (stidp->si_generation == dsp->ds_stid.si_generation))
-+ goto out_noput;
++/*
++ * Decode the LAYOUTCOMMIT operation result.
++ *
++ * After the op header, one 32-bit word is read into res->sizechanged; when
++ * it is non-zero a 64-bit value follows and is stored in res->newsize.
++ * @req is not used by this routine.
++ *
++ * Returns the decode_op_hdr() status if that fails, 0 on success, or -EIO
++ * when the stream is too short for either field.
++ */
++static int decode_layoutcommit(struct xdr_stream *xdr,
++ struct rpc_rqst *req,
++ struct nfs4_layoutcommit_res *res)
++{
++ __be32 *p;
++ int status;
+
-+ sb = ino->i_sb;
-+ if (!sb || !sb->s_pnfs_op->get_state)
-+ goto out_noput;
++ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
++ if (status)
++ return status;
+
-+ /* Uninitialize current state if it exists yet it doesn't match.
-+ * If it is already invalid, another thread is checking state */
-+ if (dsp) {
-+ if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags))
-+ waiter = 1;
-+ } else {
-+ dsp = alloc_init_ds_stateid(cfh, stidp);
-+ if (!dsp)
-+ goto out_noput;
++ p = xdr_inline_decode(xdr, 4);
++ if (unlikely(!p))
++ goto out_overflow;
++ res->sizechanged = be32_to_cpup(p);
++
++ /* res->newsize is only written when the size-changed word is set */
++ if (res->sizechanged) {
++ p = xdr_inline_decode(xdr, 8);
++ if (unlikely(!p))
++ goto out_overflow;
++ xdr_decode_hyper(p, &res->newsize);
+ }
++ return 0;
++out_overflow:
++ print_overflow_msg(__func__, xdr);
++ return -EIO;
++}
++#endif /* CONFIG_NFS_V4_1 */
+
-+ dprintk("pNFSD: %s Starting loop\n", __func__);
-+ get_ds_stateid(dsp);
-+ while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
-+ ds_unlock_state();
++/*
++ * END OF "GENERIC" DECODE ROUTINES.
++ */
+
-+ /* Another thread is checking the state */
-+ if (waiter) {
-+ dprintk("pNFSD: %s waiting\n", __func__);
-+ wait_event_interruptible_timeout(dsp->ds_waitq,
-+ (test_bit(DS_STATEID_VALID, &dsp->ds_flags) ||
-+ test_bit(DS_STATEID_ERROR, &dsp->ds_flags)),
-+ msecs_to_jiffies(1024));
-+ dprintk("pNFSD: %s awake\n", __func__);
-+ ds_lock_state();
-+ if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
-+ goto out;
++/*
++ * Decode OPEN_DOWNGRADE response
++ */
++static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
++{
++ struct xdr_stream xdr;
+ struct compound_hdr hdr;
+ int status;
+
+@@ -5758,6 +6488,186 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
+ status = decode_reclaim_complete(&xdr, (void *)NULL);
+ return status;
+ }
+
-+ continue;
-+ }
++/*
++ * Decode GETDEVICELIST response
++ *
++ * Sets up an xdr_stream over rqstp->rq_rcv_buf starting at @p and decodes,
++ * in order, the compound header, SEQUENCE (into res->seq_res), PUTFH and
++ * GETDEVICELIST (into res->devlist).  Decoding stops at the first non-zero
++ * status, which is returned; 0 is returned when every step succeeds.
++ */
++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p,
++ struct nfs4_getdevicelist_res *res)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
+
-+ /* Validate stateid on mds */
-+ dprintk("pNFSD: %s Checking state on MDS\n", __func__);
-+ memcpy(&gs.stid, stidp, sizeof(stateid_t));
-+ status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs);
-+ dprintk("pNFSD: %s from MDS status %d\n", __func__, status);
-+ ds_lock_state();
-+ /* if !status and stateid is valid, update id and mark valid */
-+ if (status || update_ds_stateid(dsp, cfh, &gs)) {
-+ set_bit(DS_STATEID_ERROR, &dsp->ds_flags);
-+ /* remove invalid stateid from list */
-+ put_ds_stateid(dsp);
-+ wake_up(&dsp->ds_waitq);
-+ goto out;
-+ }
++ dprintk("decoding getdevicelist!\n");
+
-+ wake_up(&dsp->ds_waitq);
-+ }
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status != 0)
++ goto out;
++ status = decode_sequence(&xdr, &res->seq_res, rqstp);
++ if (status != 0)
++ goto out;
++ status = decode_putfh(&xdr);
++ if (status != 0)
++ goto out;
++ status = decode_getdevicelist(&xdr, res->devlist);
+out:
-+ if (dsp)
-+ put_ds_stateid(dsp);
-+out_noput:
-+ if (dsp)
-+ dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n",
-+ __func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid));
-+ /* If error, return null */
-+ if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
-+ dsp = NULL;
-+ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
-+ return dsp;
++ return status;
+}
+
-+int
-+nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid)
++/*
++ * Decode GETDEVINFO response
++ */
++static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
++ struct nfs4_getdeviceinfo_res *res)
+{
-+ struct pnfs_ds_stateid *dsp;
-+ int status = 0;
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
+
-+ dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__,
-+ STATEID_VAL(stateid));
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status != 0)
++ goto out;
++ status = decode_sequence(&xdr, &res->seq_res, rqstp);
++ if (status != 0)
++ goto out;
++ status = decode_getdeviceinfo(&xdr, res->pdev);
++out:
++ return status;
++}
+
-+ /* Must release state lock while verifying stateid on mds */
-+ nfs4_unlock_state();
-+ ds_lock_state();
-+ dsp = nfsv4_ds_get_state(cfh, stateid);
-+ if (dsp) {
-+ get_ds_stateid(dsp);
-+ dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__,
-+ STATEID_VAL(&dsp->ds_stid));
++/*
++ * Decode LAYOUTGET response
++ */
++static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
++ struct nfs4_layoutget_res *res)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
+
-+ dprintk("NFSD: %s: dsp %p fh_size %u:%u "
-+ "fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] "
-+ "gen %x:%x\n",
-+ __func__, dsp,
-+ cfh->fh_handle.fh_size, dsp->ds_fh.fh_size,
-+ ((unsigned *)&cfh->fh_handle.fh_base)[0],
-+ ((unsigned *)&cfh->fh_handle.fh_base)[1],
-+ ((unsigned *)&cfh->fh_handle.fh_base)[2],
-+ ((unsigned *)&cfh->fh_handle.fh_base)[3],
-+ ((unsigned *)&dsp->ds_fh.fh_base)[0],
-+ ((unsigned *)&dsp->ds_fh.fh_base)[1],
-+ ((unsigned *)&dsp->ds_fh.fh_base)[2],
-+ ((unsigned *)&dsp->ds_fh.fh_base)[3],
-+ stateid->si_generation, dsp->ds_stid.si_generation);
-+ }
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_sequence(&xdr, &res->seq_res, rqstp);
++ if (status)
++ goto out;
++ status = decode_putfh(&xdr);
++ if (status)
++ goto out;
++ status = decode_layoutget(&xdr, rqstp, res);
++out:
++ return status;
++}
+
-+ if (!dsp ||
-+ (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) ||
-+ (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base,
-+ dsp->ds_fh.fh_size) != 0) ||
-+ (stateid->si_generation > dsp->ds_stid.si_generation))
-+ status = nfserr_bad_stateid;
-+ else if (stateid->si_generation < dsp->ds_stid.si_generation)
-+ status = nfserr_old_stateid;
++/*
++ * Decode LAYOUTRETURN response
++ */
++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p,
++ struct nfs4_layoutreturn_res *res)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
+
-+ if (dsp)
-+ put_ds_stateid(dsp);
-+ ds_unlock_state();
-+ nfs4_lock_state();
-+ dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status));
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_sequence(&xdr, &res->seq_res, rqstp);
++ if (status)
++ goto out;
++ status = decode_putfh(&xdr);
++ if (status)
++ goto out;
++ status = decode_layoutreturn(&xdr, res);
++out:
++ return status;
++}
++
++/*
++ * Decode LAYOUTCOMMIT response
++ *
++ * Sets up an xdr_stream over rqstp->rq_rcv_buf starting at @p and decodes,
++ * in order, the compound header, SEQUENCE (into res->seq_res), PUTFH and
++ * LAYOUTCOMMIT.  The first non-zero status ends decoding and is returned.
++ * When all of those succeed the trailing attributes are decoded into
++ * res->fattr and 0 is returned.
++ */
++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p,
++ struct nfs4_layoutcommit_res *res)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
++
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_sequence(&xdr, &res->seq_res, rqstp);
++ if (status)
++ goto out;
++ status = decode_putfh(&xdr);
++ if (status)
++ goto out;
++ status = decode_layoutcommit(&xdr, rqstp, res);
++ if (status)
++ goto out;
++ /* The decode_getfattr() return value is discarded, so a failure to
++ * decode the attributes does not change the status returned below. */
++ decode_getfattr(&xdr, res->fattr, res->server,
++ !RPC_IS_ASYNC(rqstp->rq_task));
++out:
+ return status;
+}
+
-+void
-+nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p)
++/*
++ * Decode pNFS File Layout Data Server WRITE response
++ */
++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p,
++ struct nfs_writeres *res)
+{
-+ struct pnfs_ds_stateid *dsp = NULL;
-+
-+ dprintk("pNFSD: %s --> stid %p\n", __func__, stateid);
-+
-+ ds_lock_state();
-+ if (stateid != NULL) {
-+ dsp = find_pnfs_ds_stateid(stateid);
-+ if (dsp)
-+ get_ds_stateid(dsp);
-+ }
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
+
-+ /* XXX: Should we fetch the stateid or wait if some other
-+ * thread is currently retrieving the stateid ? */
-+ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
-+ *p++ = dsp->ds_verifier[0];
-+ *p++ = dsp->ds_verifier[1];
-+ put_ds_stateid(dsp);
-+ } else {
-+ /* must be on MDS */
-+ ds_unlock_state();
-+ sb->s_pnfs_op->get_verifier(sb, p);
-+ ds_lock_state();
-+ p += 2;
-+ }
-+ ds_unlock_state();
-+ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
-+ return;
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_sequence(&xdr, &res->seq_res, rqstp);
++ if (status)
++ goto out;
++ status = decode_putfh(&xdr);
++ if (status)
++ goto out;
++ status = decode_write(&xdr, res);
++ if (!status)
++ return res->count;
++out:
++ return status;
+}
+
-+#endif /* CONFIG_PNFSD */
-diff -up linux-2.6.35.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4proc.c
---- linux-2.6.35.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfs4proc.c 2010-09-30 12:25:08.521285000 -0400
-@@ -34,10 +34,14 @@
- */
- #include <linux/file.h>
- #include <linux/slab.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
-+#include <linux/nfsd4_spnfs.h>
-+#include <linux/nfsd4_block.h>
-
- #include "cache.h"
- #include "xdr4.h"
- #include "vfs.h"
-+#include "pnfsd.h"
-
- #define NFSDDBG_FACILITY NFSDDBG_PROC
-
-@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc
- * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
- */
- status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
-+#if defined(CONFIG_SPNFS)
-+ if (!status && spnfs_enabled()) {
-+ struct inode *inode = cstate->current_fh.fh_dentry->d_inode;
-+
-+ status = spnfs_open(inode, open);
-+ if (status) {
-+ dprintk(
-+ "nfsd: pNFS could not be enabled for inode: %lu\n",
-+ inode->i_ino);
-+ /*
-+ * XXX When there's a failure then need to indicate to
-+ * future ops that no pNFS is available. Should I save
-+ * the status in the inode? It's kind of a big hammer.
-+ * But there may be no stripes available?
-+ */
-+ }
-+ }
-+#endif /* CONFIG_SPNFS */
- out:
- if (open->op_stateowner) {
- nfs4_get_stateowner(open->op_stateowner);
-@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str
- &access->ac_supported);
- }
-
-+static void
-+nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf)
++/*
++ * Decode pNFS File Layout Data Server COMMIT response
++ */
++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p,
++ struct nfs_writeres *res)
+{
-+ u32 *p = (u32 *)verf->data;
-+
-+#if defined(CONFIG_PNFSD)
-+ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) {
-+ nfs4_ds_get_verifier(NULL, sb, p);
-+ return;
-+ }
-+#endif /* CONFIG_PNFSD */
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
+
-+ *p++ = nfssvc_boot.tv_sec;
-+ *p++ = nfssvc_boot.tv_usec;
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_sequence(&xdr, &res->seq_res, rqstp);
++ if (status)
++ goto out;
++ status = decode_putfh(&xdr);
++ if (status)
++ goto out;
++ status = decode_commit(&xdr, res);
++out:
++ return status;
+}
-+
- static __be32
- nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- struct nfsd4_commit *commit)
- {
- __be32 status;
-
-- u32 *p = (u32 *)commit->co_verf.data;
-- *p++ = nfssvc_boot.tv_sec;
-- *p++ = nfssvc_boot.tv_usec;
--
-+ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb,
-+ &commit->co_verf);
- status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
- commit->co_count);
- if (status == nfserr_symlink)
-@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru
- {
- stateid_t *stateid = &write->wr_stateid;
- struct file *filp = NULL;
-- u32 *p;
- __be32 status = nfs_ok;
- unsigned long cnt;
-
-@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru
+ #endif /* CONFIG_NFS_V4_1 */
- cnt = write->wr_buflen;
- write->wr_how_written = write->wr_stable_how;
-- p = (u32 *)write->wr_verifier.data;
-- *p++ = nfssvc_boot.tv_sec;
-- *p++ = nfssvc_boot.tv_usec;
+ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
+@@ -5936,6 +6846,13 @@ struct rpc_procinfo nfs4_procedures[] = {
+ PROC(SEQUENCE, enc_sequence, dec_sequence),
+ PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
+ PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
++ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
++ PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
++ PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
++ PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
++ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
++ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite),
++ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit),
+ #endif /* CONFIG_NFS_V4_1 */
+ };
-+ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb,
-+ &write->wr_verifier);
-+#if defined(CONFIG_SPNFS)
-+#if defined(CONFIG_SPNFS_BLOCK)
-+ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) {
-+ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode,
-+ RETURN_FILE, write->wr_offset, write->wr_buflen);
-+ if (!status) {
-+ status = nfsd_write(rqstp, &cstate->current_fh, filp,
-+ write->wr_offset, rqstp->rq_vec, write->wr_vlen,
-+ &cnt, &write->wr_how_written);
-+ }
-+ } else
-+#endif
-+
-+ if (spnfs_enabled()) {
-+ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode,
-+ write->wr_offset, write->wr_buflen, write->wr_vlen,
-+ rqstp);
-+ if (status == nfs_ok) {
-+ /* DMXXX: HACK to get filesize set */
-+ /* write one byte at offset+length-1 */
-+ struct kvec k[1];
-+ char zero = 0;
-+ unsigned long cnt = 1;
+diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
+new file mode 100644
+index 0000000..9addfe8
+--- /dev/null
++++ b/fs/nfs/objlayout/Kbuild
+@@ -0,0 +1,11 @@
++#
++# Makefile for the pNFS Objects Layout Driver kernel module
++#
++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o
++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
++
++#
++# Panasas pNFS Layout Driver kernel module
++#
++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o
++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o
+diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
+new file mode 100644
+index 0000000..e945ace
+--- /dev/null
++++ b/fs/nfs/objlayout/objio_osd.c
+@@ -0,0 +1,1060 @@
++/*
++ * objio_osd.c
++ *
++ * pNFS Objects layout implementation over open-osd initiator library
++ *
++ * Copyright (C) 2009 Panasas Inc.
++ * All rights reserved.
++ *
++ * Benny Halevy <bharrosh at panasas.com>
++ * Boaz Harrosh <bharrosh at panasas.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2
++ * See the file COPYING included with this distribution for more details.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the Panasas company nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
+
-+ k[0].iov_base = (void *)&zero;
-+ k[0].iov_len = 1;
-+ nfsd_write(rqstp, &cstate->current_fh, filp,
-+ write->wr_offset+write->wr_buflen-1, k, 1,
-+ &cnt, &write->wr_how_written);
-+ }
-+ } else /* we're not an MDS */
-+ status = nfsd_write(rqstp, &cstate->current_fh, filp,
-+ write->wr_offset, rqstp->rq_vec, write->wr_vlen,
-+ &cnt, &write->wr_how_written);
-+#else
- status = nfsd_write(rqstp, &cstate->current_fh, filp,
- write->wr_offset, rqstp->rq_vec, write->wr_vlen,
- &cnt, &write->wr_how_written);
-+#endif /* CONFIG_SPNFS */
++#include <linux/module.h>
++#include <scsi/scsi_device.h>
++#include <scsi/osd_attributes.h>
++#include <scsi/osd_initiator.h>
++#include <scsi/osd_sec.h>
++#include <scsi/osd_sense.h>
+
- if (filp)
- fput(filp);
-
-@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str
- return status == nfserr_same ? nfs_ok : status;
- }
-
-+#if defined(CONFIG_PNFSD)
++#include "objlayout.h"
+
-+static __be32
-+nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp,
-+ unsigned int layout_type)
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++
++#define _LLU(x) ((unsigned long long)x)
++
++enum { BIO_MAX_PAGES_KMALLOC =
++ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
++};
++
++/* A per mountpoint struct currently for device cache */
++struct objio_mount_type {
++ struct list_head dev_list;
++ spinlock_t dev_list_lock;
++};
++
++struct _dev_ent {
++ struct list_head list;
++ struct nfs4_deviceid d_id;
++ struct osd_dev *od;
++};
++
++static void _dev_list_remove_all(struct objio_mount_type *omt)
+{
-+ int status, type;
++ spin_lock(&omt->dev_list_lock);
+
-+ /* check to see if pNFS is supported. */
-+ status = nfserr_layoutunavailable;
-+ if (exp && exp->ex_pnfs == 0) {
-+ dprintk("%s: Underlying file system "
-+ "is not exported over pNFS\n", __func__);
-+ goto out;
-+ }
-+ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) {
-+ dprintk("%s: Underlying file system "
-+ "does not support pNFS\n", __func__);
-+ goto out;
-+ }
++ while (!list_empty(&omt->dev_list)) {
++ struct _dev_ent *de = list_entry(omt->dev_list.next,
++ struct _dev_ent, list);
+
-+ type = sb->s_pnfs_op->layout_type(sb);
++ list_del_init(&de->list);
++ osduld_put_device(de->od);
++ kfree(de);
++ }
+
-+ /* check to see if requested layout type is supported. */
-+ status = nfserr_unknown_layouttype;
-+ if (!type)
-+ dprintk("BUG: %s: layout_type 0 is reserved and must not be "
-+ "used by filesystem\n", __func__);
-+ else if (type != layout_type)
-+ dprintk("%s: requested layout type %d "
-+ "does not match supported type %d\n",
-+ __func__, layout_type, type);
-+ else
-+ status = nfs_ok;
-+out:
-+ return status;
++ spin_unlock(&omt->dev_list_lock);
+}
+
-+static __be32
-+nfsd4_getdevlist(struct svc_rqst *rqstp,
-+ struct nfsd4_compound_state *cstate,
-+ struct nfsd4_pnfs_getdevlist *gdlp)
++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt,
++ struct nfs4_deviceid *d_id)
+{
-+ struct super_block *sb;
-+ struct svc_fh *current_fh = &cstate->current_fh;
-+ int status;
++ struct list_head *le;
+
-+ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n",
-+ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices,
-+ gdlp->gd_cookie, gdlp->gd_verf);
++ list_for_each(le, &omt->dev_list) {
++ struct _dev_ent *de = list_entry(le, struct _dev_ent, list);
+
++ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id)))
++ return de->od;
++ }
+
-+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
-+ if (status)
-+ goto out;
++ return NULL;
++}
+
-+ status = nfserr_inval;
-+ sb = current_fh->fh_dentry->d_inode->i_sb;
-+ if (!sb)
-+ goto out;
++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt,
++ struct nfs4_deviceid *d_id)
++{
++ struct osd_dev *od;
+
-+ /* We must be able to encode at list one device */
-+ if (!gdlp->gd_maxdevices)
-+ goto out;
++ spin_lock(&omt->dev_list_lock);
++ od = ___dev_list_find(omt, d_id);
++ spin_unlock(&omt->dev_list_lock);
++ return od;
++}
+
-+ /* Ensure underlying file system supports pNFS and,
-+ * if so, the requested layout type
-+ */
-+ status = nfsd4_layout_verify(sb, current_fh->fh_export,
-+ gdlp->gd_layout_type);
-+ if (status)
-+ goto out;
++/*
++ * Add a (d_id, od) pair to the per-mount device list.
++ *
++ * The entry is allocated before dev_list_lock is taken.  Under the lock the
++ * list is searched for @d_id; if an entry already exists the new one is
++ * freed and nothing is added.
++ *
++ * Returns -ENOMEM when the entry cannot be allocated, 0 otherwise.
++ *
++ * NOTE(review): 0 is also returned when @d_id was already present, in which
++ * case @od is not stored anywhere by this function - confirm that callers
++ * do not rely on @od having been cached.
++ */
++static int _dev_list_add(struct objio_mount_type *omt,
++ struct nfs4_deviceid *d_id, struct osd_dev *od)
++{
++ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL);
+
-+ /* Do nothing if underlying file system does not support
-+ * getdevicelist */
-+ if (!sb->s_pnfs_op->get_device_iter) {
-+ status = nfserr_notsupp;
++ if (!de)
++ return -ENOMEM;
++
++ spin_lock(&omt->dev_list_lock);
++
++ /* Re-check under the lock: another entry for d_id may already exist */
++ if (___dev_list_find(omt, d_id)) {
++ kfree(de);
+ goto out;
+ }
+
-+ /* Set up arguments so device can be retrieved at encode time */
-+ gdlp->gd_fhp = &cstate->current_fh;
++ de->d_id = *d_id;
++ de->od = od;
++ list_add(&de->list, &omt->dev_list);
++
+out:
-+ return status;
++ spin_unlock(&omt->dev_list_lock);
++ return 0;
+}
+
-+static __be32
-+nfsd4_getdevinfo(struct svc_rqst *rqstp,
-+ struct nfsd4_compound_state *cstate,
-+ struct nfsd4_pnfs_getdevinfo *gdp)
++/*
++ * Striping parameters and looked-up devices for one layout, allocated and
++ * filled in by objio_alloc_lseg().
++ */
++struct objio_segment {
++ /* The layout this segment was built from */
++ struct pnfs_osd_layout *layout;
++
++ /* olo_map.odm_mirror_cnt + 1 */
++ unsigned mirrors_p1;
++ /* Copy of olo_map.odm_stripe_unit */
++ unsigned stripe_unit;
++ unsigned group_width; /* Data stripe_units without integrity comps */
++ /* olo_map.odm_group_depth, or -1 when odm_group_width is zero */
++ u64 group_depth;
++ /* 1 when odm_group_width is zero */
++ unsigned group_count;
++
++ /* Set to layout->olo_num_comps by objio_devices_lookup() */
++ unsigned num_comps;
++ /* variable length */
++ /* objio_alloc_lseg() allocates room for olo_num_comps entries */
++ struct osd_dev *ods[1];
++};
++
++struct objio_state;
++/* Type of the objio_state.done callback: takes the I/O state, returns ssize_t */
++typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
++
++/*
++ * State of one I/O, embedding the generic objlayout_io_state and ending in a
++ * flexible array with one element per device used.
++ * NOTE(review): the code that fills and consumes these fields is not in this
++ * part of the file; the field notes below restate only what is declared here.
++ */
++struct objio_state {
++ /* Generic layer */
++ struct objlayout_io_state ol_state;
++
++ /* Segment (striping parameters and devices) this I/O refers to */
++ struct objio_segment *objio_seg;
++
++ struct kref kref;
++ objio_done_fn done;
++ void *private;
++
++ unsigned long length;
++ unsigned numdevs; /* Actually used devs in this IO */
++ /* A per-device variable array of size numdevs */
++ struct _objio_per_comp {
++ struct bio *bio;
++ struct osd_request *or;
++ unsigned long length;
++ u64 offset;
++ unsigned dev;
++ } per_dev[];
++};
++
++/*
++ * Return the osd_dev for component @comp of the segment's layout.
++ *
++ * The per-mount device list is consulted first.  On a miss, send and wait
++ * for a get_device_info of the device, look it up with the osd_initiator
++ * library and add it to the per-mount list.
++ *
++ * Returns the device, or an ERR_PTR() carrying: the
++ * objlayout_get_deviceinfo() error; -EINVAL when the system id is longer
++ * than odi.systemid; -ENODEV when both osdname and systemid are empty; the
++ * osduld_info_lookup() error; or the _dev_list_add() error.
++ *
++ * NOTE(review): _dev_list_add() returns 0 without storing od when d_id is
++ * already listed; that case cannot be told apart here and od is still
++ * returned.
++ */
++static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay,
++ struct objio_segment *objio_seg, unsigned comp)
+{
-+ struct super_block *sb;
-+ int status;
-+ clientid_t clid;
++ struct pnfs_osd_layout *layout = objio_seg->layout;
++ struct pnfs_osd_deviceaddr *deviceaddr;
++ struct nfs4_deviceid *d_id;
++ struct osd_dev *od;
++ struct osd_dev_info odi;
++ struct objio_mount_type *omt = NFS_SERVER(pnfslay->inode)->pnfs_ld_data;
++ int err;
+
-+ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n",
-+ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid,
-+ gdp->gd_devid.devid, gdp->gd_maxcount);
++ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id;
+
-+ status = nfserr_inval;
-+ sb = find_sbid_id(gdp->gd_devid.sbid);
-+ dprintk("%s: sb %p\n", __func__, sb);
-+ if (!sb) {
-+ status = nfserr_noent;
++ od = _dev_list_find(omt, d_id);
++ if (od)
++ return od;
++
++ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr);
++ if (unlikely(err)) {
++ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err);
++ return ERR_PTR(err);
++ }
++
++ odi.systemid_len = deviceaddr->oda_systemid.len;
++ if (odi.systemid_len > sizeof(odi.systemid)) {
++ err = -EINVAL;
++ goto out;
++ } else if (odi.systemid_len)
++ memcpy(odi.systemid, deviceaddr->oda_systemid.data,
++ odi.systemid_len);
++ odi.osdname_len = deviceaddr->oda_osdname.len;
++ odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
++
++ if (!odi.osdname_len && !odi.systemid_len) {
++ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
++ __func__);
++ err = -ENODEV;
+ goto out;
+ }
+
-+ /* Ensure underlying file system supports pNFS and,
-+ * if so, the requested layout type
-+ */
-+ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type);
-+ if (status)
++ od = osduld_info_lookup(&odi);
++ if (unlikely(IS_ERR(od))) {
++ err = PTR_ERR(od);
++ dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
+ goto out;
++ }
+
-+ /* Set up arguments so device can be retrieved at encode time */
-+ gdp->gd_sb = sb;
++ err = _dev_list_add(omt, d_id, od);
++ if (unlikely(err)) {
++ /* od was not listed; the list (_dev_list_remove_all) is the only
++ * release path visible in this file, so put the device here and
++ * fall through to report the error. */
++ dprintk("%s: _dev_list_add => %d\n", __func__, err);
++ osduld_put_device(od);
++ }
+
-+ /* Update notifications */
-+ copy_clientid(&clid, cstate->session);
-+ pnfs_set_device_notify(&clid, gdp->gd_notify_types);
+out:
-+ return status;
++ dprintk("%s: return=%d\n", __func__, err);
++ objlayout_put_deviceinfo(deviceaddr);
++ return err ? ERR_PTR(err) : od;
+}
+
-+static __be32
-+nfsd4_layoutget(struct svc_rqst *rqstp,
-+ struct nfsd4_compound_state *cstate,
-+ struct nfsd4_pnfs_layoutget *lgp)
++static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
++ struct objio_segment *objio_seg)
+{
-+ int status;
-+ struct super_block *sb;
-+ struct svc_fh *current_fh = &cstate->current_fh;
-+
-+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
-+ if (status)
-+ goto out;
-+
-+ status = nfserr_inval;
-+ sb = current_fh->fh_dentry->d_inode->i_sb;
-+ if (!sb)
-+ goto out;
++ struct pnfs_osd_layout *layout = objio_seg->layout;
++ unsigned i, num_comps = layout->olo_num_comps;
++ int err;
+
-+ /* Ensure underlying file system supports pNFS and,
-+ * if so, the requested layout type
-+ */
-+ status = nfsd4_layout_verify(sb, current_fh->fh_export,
-+ lgp->lg_seg.layout_type);
-+ if (status)
-+ goto out;
++ /* lookup all devices */
++ for (i = 0; i < num_comps; i++) {
++ struct osd_dev *od;
+
-+ status = nfserr_badiomode;
-+ if (lgp->lg_seg.iomode != IOMODE_READ &&
-+ lgp->lg_seg.iomode != IOMODE_RW) {
-+ dprintk("pNFS %s: invalid iomode %d\n", __func__,
-+ lgp->lg_seg.iomode);
-+ goto out;
++ od = _device_lookup(pnfslay, objio_seg, i);
++ if (unlikely(IS_ERR(od))) {
++ err = PTR_ERR(od);
++ goto out;
++ }
++ objio_seg->ods[i] = od;
+ }
++ objio_seg->num_comps = num_comps;
++ err = 0;
+
-+ /* Set up arguments so layout can be retrieved at encode time */
-+ lgp->lg_fhp = current_fh;
-+ copy_clientid((clientid_t *)&lgp->lg_seg.clientid, cstate->session);
-+ status = nfs_ok;
+out:
-+ return status;
++ dprintk("%s: return=%d\n", __func__, err);
++ return err;
+}
+
-+static __be32
-+nfsd4_layoutcommit(struct svc_rqst *rqstp,
-+ struct nfsd4_compound_state *cstate,
-+ struct nfsd4_pnfs_layoutcommit *lcp)
++/*
++ * Check that the layout's data map is one this driver can handle.
++ *
++ * Returns 0 when the map is acceptable, otherwise:
++ *  -ENOTSUPP  the raid algorithm is not PNFS_OSD_RAID_0, the total stripe
++ *             length (stripe unit * group width) does not fit in 32 bits,
++ *             or the stripe unit is not a multiple of PAGE_SIZE;
++ *  -EINVAL    the mirror count is unusable as a divisor, or odm_num_comps
++ *             is not a multiple of (odm_mirror_cnt + 1).
++ */
++static int _verify_data_map(struct pnfs_osd_layout *layout)
+{
-+ int status;
-+ struct inode *ino = NULL;
-+ struct iattr ia;
-+ struct super_block *sb;
-+ struct svc_fh *current_fh = &cstate->current_fh;
++ struct pnfs_osd_data_map *data_map = &layout->olo_map;
++ u64 stripe_length;
++ u32 group_width;
+
-+ dprintk("NFSD: nfsd4_layoutcommit \n");
-+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
-+ if (status)
-+ goto out;
++/* FIXME: Only raid0 for now. if not go through MDS */
++ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
++ printk(KERN_ERR "Only RAID_0 for now\n");
++ return -ENOTSUPP;
++ }
++ /* odm_mirror_cnt + 1 is used as a divisor below; reject a count for
++ * which that sum wraps to zero instead of dividing by it. */
++ if (0 == (data_map->odm_mirror_cnt + 1)) {
++ printk(KERN_ERR "Data Map wrong, mirrors=%u\n",
++ data_map->odm_mirror_cnt);
++ return -EINVAL;
++ }
++ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
++ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
++ data_map->odm_num_comps, data_map->odm_mirror_cnt);
++ return -EINVAL;
++ }
+
-+ status = nfserr_inval;
-+ ino = current_fh->fh_dentry->d_inode;
-+ if (!ino)
-+ goto out;
++ /* A zero odm_group_width means one group spanning all data components */
++ if (data_map->odm_group_width)
++ group_width = data_map->odm_group_width;
++ else
++ group_width = data_map->odm_num_comps /
++ (data_map->odm_mirror_cnt + 1);
+
-+ status = nfserr_inval;
-+ sb = ino->i_sb;
-+ if (!sb)
-+ goto out;
++ stripe_length = (u64)data_map->odm_stripe_unit * group_width;
++ if (stripe_length >= (1ULL << 32)) {
++ printk(KERN_ERR "Total Stripe length(0x%llx)"
++ " >= 32bit is not supported\n", _LLU(stripe_length));
++ return -ENOTSUPP;
++ }
+
-+ /* Ensure underlying file system supports pNFS and,
-+ * if so, the requested layout type
-+ */
-+ status = nfsd4_layout_verify(sb, current_fh->fh_export,
-+ lcp->args.lc_seg.layout_type);
-+ if (status)
-+ goto out;
++ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
++ printk(KERN_ERR "Stripe Unit(0x%llx)"
++ " must be Multples of PAGE_SIZE(0x%lx)\n",
++ _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
++ return -ENOTSUPP;
++ }
+
-+ /* This will only extend the file length. Do a quick
-+ * check to see if there is any point in waiting for the update
-+ * locks.
-+ * TODO: Is this correct for all back ends?
-+ */
-+ dprintk("%s:new offset: %d new size: %llu old size: %lld\n",
-+ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1,
-+ ino->i_size);
++ return 0;
++}
+
-+ /* Set clientid from sessionid */
-+ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session);
-+ lcp->res.lc_size_chg = 0;
-+ if (sb->s_pnfs_op->layout_commit) {
-+ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res);
-+ dprintk("%s:layout_commit result %d\n", __func__, status);
-+ } else {
-+ fh_lock(current_fh);
-+ if ((lcp->args.lc_newoffset == 0) ||
-+ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) {
-+ status = 0;
-+ lcp->res.lc_size_chg = 0;
-+ fh_unlock(current_fh);
-+ goto out;
-+ }
++int objio_alloc_lseg(void **outp,
++ struct pnfs_layout_hdr *pnfslay,
++ struct pnfs_layout_segment *lseg,
++ struct pnfs_osd_layout *layout)
++{
++ struct objio_segment *objio_seg;
++ int err;
+
-+ /* Try our best to update the file size */
-+ dprintk("%s: Modifying file size\n", __func__);
-+ ia.ia_valid = ATTR_SIZE;
-+ ia.ia_size = lcp->args.lc_last_wr + 1;
-+ status = notify_change(current_fh->fh_dentry, &ia);
-+ fh_unlock(current_fh);
-+ dprintk("%s:notify_change result %d\n", __func__, status);
-+ }
++ err = _verify_data_map(layout);
++ if (unlikely(err))
++ return err;
+
-+ if (!status && lcp->res.lc_size_chg &&
-+ EX_ISSYNC(current_fh->fh_export)) {
-+ dprintk("%s: Synchronously writing inode size %llu\n",
-+ __func__, ino->i_size);
-+ write_inode_now(ino, 1);
-+ lcp->res.lc_newsize = i_size_read(ino);
++ objio_seg = kzalloc(sizeof(*objio_seg) +
++ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]),
++ GFP_KERNEL);
++ if (!objio_seg)
++ return -ENOMEM;
++
++ objio_seg->layout = layout;
++ err = objio_devices_lookup(pnfslay, objio_seg);
++ if (err)
++ goto free_seg;
++
++ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1;
++ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit;
++ if (layout->olo_map.odm_group_width) {
++ objio_seg->group_width = layout->olo_map.odm_group_width;
++ objio_seg->group_depth = layout->olo_map.odm_group_depth;
++ objio_seg->group_count = layout->olo_map.odm_num_comps /
++ objio_seg->mirrors_p1 /
++ objio_seg->group_width;
++ } else {
++ objio_seg->group_width = layout->olo_map.odm_num_comps /
++ objio_seg->mirrors_p1;
++ objio_seg->group_depth = -1;
++ objio_seg->group_count = 1;
+ }
-+out:
-+ return status;
++
++ *outp = objio_seg;
++ return 0;
++
++free_seg:
++ dprintk("%s: Error: return %d\n", __func__, err);
++ kfree(objio_seg);
++ *outp = NULL;
++ return err;
+}
+
-+static __be32
-+nfsd4_layoutreturn(struct svc_rqst *rqstp,
-+ struct nfsd4_compound_state *cstate,
-+ struct nfsd4_pnfs_layoutreturn *lrp)
++void objio_free_lseg(void *p)
+{
-+ int status;
-+ struct super_block *sb;
-+ struct svc_fh *current_fh = &cstate->current_fh;
++ struct objio_segment *objio_seg = p;
+
-+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
-+ if (status)
-+ goto out;
++ kfree(objio_seg);
++}
++
++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
++{
++ struct objio_segment *objio_seg = seg;
++ struct objio_state *ios;
++ const unsigned first_size = sizeof(*ios) +
++ objio_seg->num_comps * sizeof(ios->per_dev[0]);
++ const unsigned sec_size = objio_seg->num_comps *
++ sizeof(ios->ol_state.ioerrs[0]);
++
++ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps);
++ ios = kzalloc(first_size + sec_size, GFP_KERNEL);
++ if (unlikely(!ios))
++ return -ENOMEM;
++
++ ios->objio_seg = objio_seg;
++ ios->ol_state.ioerrs = ((void *)ios) + first_size;
++ ios->ol_state.num_comps = objio_seg->num_comps;
++
++ *outp = &ios->ol_state;
++ return 0;
++}
++
++void objio_free_io_state(struct objlayout_io_state *ol_state)
++{
++ struct objio_state *ios = container_of(ol_state, struct objio_state,
++ ol_state);
+
-+ status = nfserr_inval;
-+ sb = current_fh->fh_dentry->d_inode->i_sb;
-+ if (!sb)
-+ goto out;
++ kfree(ios);
++}
+
-+ /* Ensure underlying file system supports pNFS and,
-+ * if so, the requested layout type
-+ */
-+ status = nfsd4_layout_verify(sb, current_fh->fh_export,
-+ lrp->args.lr_seg.layout_type);
-+ if (status)
-+ goto out;
++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
++{
++ switch (oep) {
++ case OSD_ERR_PRI_NO_ERROR:
++ return (enum pnfs_osd_errno)0;
+
-+ status = nfserr_inval;
-+ if (lrp->args.lr_return_type != RETURN_FILE &&
-+ lrp->args.lr_return_type != RETURN_FSID &&
-+ lrp->args.lr_return_type != RETURN_ALL) {
-+ dprintk("pNFS %s: invalid return_type %d\n", __func__,
-+ lrp->args.lr_return_type);
-+ goto out;
-+ }
++ case OSD_ERR_PRI_CLEAR_PAGES:
++ BUG_ON(1);
++ return 0;
+
-+ status = nfserr_inval;
-+ if (lrp->args.lr_seg.iomode != IOMODE_READ &&
-+ lrp->args.lr_seg.iomode != IOMODE_RW &&
-+ lrp->args.lr_seg.iomode != IOMODE_ANY) {
-+ dprintk("pNFS %s: invalid iomode %d\n", __func__,
-+ lrp->args.lr_seg.iomode);
-+ goto out;
++ case OSD_ERR_PRI_RESOURCE:
++ return PNFS_OSD_ERR_RESOURCE;
++ case OSD_ERR_PRI_BAD_CRED:
++ return PNFS_OSD_ERR_BAD_CRED;
++ case OSD_ERR_PRI_NO_ACCESS:
++ return PNFS_OSD_ERR_NO_ACCESS;
++ case OSD_ERR_PRI_UNREACHABLE:
++ return PNFS_OSD_ERR_UNREACHABLE;
++ case OSD_ERR_PRI_NOT_FOUND:
++ return PNFS_OSD_ERR_NOT_FOUND;
++ case OSD_ERR_PRI_NO_SPACE:
++ return PNFS_OSD_ERR_NO_SPACE;
++ default:
++ WARN_ON(1);
++ /* fallthrough */
++ case OSD_ERR_PRI_EIO:
++ return PNFS_OSD_ERR_EIO;
+ }
-+
-+ /* Set clientid from sessionid */
-+ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session);
-+ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE);
-+ status = nfs4_pnfs_return_layout(sb, current_fh, lrp);
-+out:
-+ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n",
-+ __func__, status, lrp->args.lr_return_type, lrp->lrs_present);
-+ return status;
+}
-+#endif /* CONFIG_PNFSD */
+
- /*
- * NULL call.
- */
-@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[
- .op_flags = ALLOWED_WITHOUT_FH,
- .op_name = "OP_RECLAIM_COMPLETE",
- },
-+#if defined(CONFIG_PNFSD)
-+ [OP_GETDEVICELIST] = {
-+ .op_func = (nfsd4op_func)nfsd4_getdevlist,
-+ .op_name = "OP_GETDEVICELIST",
-+ },
-+ [OP_GETDEVICEINFO] = {
-+ .op_func = (nfsd4op_func)nfsd4_getdevinfo,
-+ .op_flags = ALLOWED_WITHOUT_FH,
-+ .op_name = "OP_GETDEVICEINFO",
-+ },
-+ [OP_LAYOUTGET] = {
-+ .op_func = (nfsd4op_func)nfsd4_layoutget,
-+ .op_name = "OP_LAYOUTGET",
-+ },
-+ [OP_LAYOUTCOMMIT] = {
-+ .op_func = (nfsd4op_func)nfsd4_layoutcommit,
-+ .op_name = "OP_LAYOUTCOMMIT",
-+ },
-+ [OP_LAYOUTRETURN] = {
-+ .op_func = (nfsd4op_func)nfsd4_layoutreturn,
-+ .op_name = "OP_LAYOUTRETURN",
-+ },
-+#endif /* CONFIG_PNFSD */
- };
-
- static const char *nfsd4_op_name(unsigned opnum)
-diff -up linux-2.6.35.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4state.c
---- linux-2.6.35.noarch/fs/nfsd/nfs4state.c.orig 2010-09-30 12:22:45.271045000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfs4state.c 2010-09-30 12:25:08.529287000 -0400
-@@ -42,6 +42,8 @@
- #include "xdr4.h"
- #include "vfs.h"
-
-+#include "pnfsd.h"
++static void _clear_bio(struct bio *bio)
++{
++ struct bio_vec *bv;
++ unsigned i;
+
- #define NFSDDBG_FACILITY NFSDDBG_PROC
-
- /* Globals */
-@@ -59,8 +61,6 @@ static u64 current_sessionid = 1;
- #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
-
- /* forward declarations */
--static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
--static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
- static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
- static void nfs4_set_recdir(char *recdir);
-
-@@ -68,6 +68,7 @@ static void nfs4_set_recdir(char *recdir
-
- /* Currently used for almost all code touching nfsv4 state: */
- static DEFINE_MUTEX(client_mutex);
-+struct task_struct *client_mutex_owner;
-
- /*
- * Currently used for the del_recall_lru and file hash table. In an
-@@ -85,11 +86,21 @@ void
- nfs4_lock_state(void)
- {
- mutex_lock(&client_mutex);
-+ client_mutex_owner = current;
-+}
++ __bio_for_each_segment(bv, bio, i, 0) {
++ unsigned this_count = bv->bv_len;
+
-+#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current)
++ if (likely(PAGE_SIZE == this_count))
++ clear_highpage(bv->bv_page);
++ else
++ zero_user(bv->bv_page, bv->bv_offset, this_count);
++ }
++}
+
-+void
-+nfs4_bug_on_unlocked_state(void)
++static int _io_check(struct objio_state *ios, bool is_write)
+{
-+ BUG_ON(client_mutex_owner != current);
- }
-
- void
- nfs4_unlock_state(void)
- {
-+ client_mutex_owner = NULL;
- mutex_unlock(&client_mutex);
- }
-
-@@ -108,7 +119,7 @@ opaque_hashval(const void *ptr, int nbyt
-
- static struct list_head del_recall_lru;
-
--static inline void
-+inline void
- put_nfs4_file(struct nfs4_file *fi)
- {
- if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
-@@ -119,7 +130,7 @@ put_nfs4_file(struct nfs4_file *fi)
- }
- }
-
--static inline void
-+inline void
- get_nfs4_file(struct nfs4_file *fi)
- {
- atomic_inc(&fi->fi_ref);
-@@ -179,10 +190,16 @@ static void nfs4_file_get_access(struct
-
- static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
- {
-- if (fp->fi_fds[oflag]) {
-- fput(fp->fi_fds[oflag]);
-- fp->fi_fds[oflag] = NULL;
-- }
-+ struct file *fd = fp->fi_fds[oflag];
++ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
++ int lin_ret = 0;
++ int i;
+
-+ if (!fd)
-+ return;
++ for (i = 0; i < ios->numdevs; i++) {
++ struct osd_sense_info osi;
++ struct osd_request *or = ios->per_dev[i].or;
++ int ret;
+
-+ fp->fi_fds[oflag] = NULL;
-+ BUG_ON_UNLOCKED_STATE();
-+ nfs4_unlock_state(); /* allow nested layout recall/return */
-+ fput(fd);
-+ nfs4_lock_state();
- }
-
- static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
-@@ -308,8 +325,8 @@ static DEFINE_SPINLOCK(client_lock);
- * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
- * used in reboot/reset lease grace period processing
- *
-- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
-- * setclientid_confirmed info.
-+ * conf_id_hashtbl[], and conf_str_hashtbl[] hold
-+ * confirmed setclientid_confirmed info.
- *
- * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed
- * setclientid info.
-@@ -334,6 +351,7 @@ static void unhash_generic_stateid(struc
- list_del(&stp->st_hash);
- list_del(&stp->st_perfile);
- list_del(&stp->st_perstateowner);
-+ release_pnfs_ds_dev_list(stp);
- }
-
- static void free_generic_stateid(struct nfs4_stateid *stp)
-@@ -856,6 +874,8 @@ expire_client(struct nfs4_client *clp)
- struct nfs4_delegation *dp;
- struct list_head reaplist;
-
-+ BUG_ON_UNLOCKED_STATE();
++ if (!or)
++ continue;
+
- INIT_LIST_HEAD(&reaplist);
- spin_lock(&recall_lock);
- while (!list_empty(&clp->cl_delegations)) {
-@@ -875,6 +895,7 @@ expire_client(struct nfs4_client *clp)
- sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
- release_openowner(sop);
- }
-+ pnfs_expire_client(clp);
- nfsd4_set_callback_client(clp, NULL);
- if (clp->cl_cb_conn.cb_xprt)
- svc_xprt_put(clp->cl_cb_conn.cb_xprt);
-@@ -887,6 +908,13 @@ expire_client(struct nfs4_client *clp)
- spin_unlock(&client_lock);
- }
-
-+void expire_client_lock(struct nfs4_client *clp)
-+{
-+ nfs4_lock_state();
-+ expire_client(clp);
-+ nfs4_unlock_state();
++ ret = osd_req_decode_sense(or, &osi);
++ if (likely(!ret))
++ continue;
++
++ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
++ /* start read offset passed endof file */
++ BUG_ON(is_write);
++ _clear_bio(ios->per_dev[i].bio);
++ dprintk("%s: start read offset passed end of file "
++ "offset=0x%llx, length=0x%lx\n", __func__,
++ _LLU(ios->per_dev[i].offset),
++ ios->per_dev[i].length);
++
++ continue; /* we recovered */
++ }
++ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev,
++ osd_pri_2_pnfs_err(osi.osd_err_pri),
++ ios->per_dev[i].offset,
++ ios->per_dev[i].length,
++ is_write);
++
++ if (osi.osd_err_pri >= oep) {
++ oep = osi.osd_err_pri;
++ lin_ret = ret;
++ }
++ }
++
++ return lin_ret;
+}
+
- static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
- {
- memcpy(target->cl_verifier.data, source->data,
-@@ -976,6 +1004,11 @@ static struct nfs4_client *create_client
- INIT_LIST_HEAD(&clp->cl_strhash);
- INIT_LIST_HEAD(&clp->cl_openowners);
- INIT_LIST_HEAD(&clp->cl_delegations);
-+#if defined(CONFIG_PNFSD)
-+ INIT_LIST_HEAD(&clp->cl_layouts);
-+ INIT_LIST_HEAD(&clp->cl_layoutrecalls);
-+ atomic_set(&clp->cl_deviceref, 0);
-+#endif /* CONFIG_PNFSD */
- INIT_LIST_HEAD(&clp->cl_sessions);
- INIT_LIST_HEAD(&clp->cl_lru);
- clp->cl_time = get_seconds();
-@@ -1025,7 +1058,7 @@ move_to_confirmed(struct nfs4_client *cl
- renew_client(clp);
- }
-
--static struct nfs4_client *
-+struct nfs4_client *
- find_confirmed_client(clientid_t *clid)
- {
- struct nfs4_client *clp;
-@@ -1095,6 +1128,24 @@ find_unconfirmed_client_by_str(const cha
- return NULL;
- }
-
-+int
-+filter_confirmed_clients(int (* func)(struct nfs4_client *, void *),
-+ void *arg)
++/*
++ * Common IO state helpers.
++ */
++static void _io_free(struct objio_state *ios)
+{
-+ struct nfs4_client *clp, *next;
-+ int i, status = 0;
++ unsigned i;
+
-+ for (i = 0; i < CLIENT_HASH_SIZE; i++)
-+ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i],
-+ cl_strhash) {
-+ status = func(clp, arg);
-+ if (status)
-+ break;
++ for (i = 0; i < ios->numdevs; i++) {
++ struct _objio_per_comp *per_dev = &ios->per_dev[i];
++
++ if (per_dev->or) {
++ osd_end_request(per_dev->or);
++ per_dev->or = NULL;
+ }
+
-+ return status;
++ if (per_dev->bio) {
++ bio_put(per_dev->bio);
++ per_dev->bio = NULL;
++ }
++ }
+}
+
- static void
- gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
- {
-@@ -1227,8 +1278,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co
- static void
- nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
- {
-- /* pNFS is not supported */
-+#if defined(CONFIG_PNFSD)
-+ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS |
-+ EXCHGID4_FLAG_USE_PNFS_DS;
-+#else /* CONFIG_PNFSD */
- new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
-+#endif /* CONFIG_PNFSD */
-
- /* Referrals are supported, Migration is not. */
- new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
-@@ -1418,6 +1473,13 @@ nfsd4_create_session(struct svc_rqst *rq
- struct nfsd4_clid_slot *cs_slot = NULL;
- int status = 0;
-
-+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
-+ /* XXX hack to get local ip address */
-+ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local,
-+ sizeof(pnfsd_lexp_addr));
-+ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen;
-+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev)
++{
++ unsigned min_dev = ios->objio_seg->layout->olo_comps_index;
++ unsigned max_dev = min_dev + ios->ol_state.num_comps;
+
- nfs4_lock_state();
- unconf = find_unconfirmed_client(&cr_ses->clientid);
- conf = find_confirmed_client(&cr_ses->clientid);
-@@ -1457,25 +1519,26 @@ nfsd4_create_session(struct svc_rqst *rq
- cs_slot->sl_seqid++; /* from 0 to 1 */
- move_to_confirmed(unconf);
-
-- if (cr_ses->flags & SESSION4_BACK_CHAN) {
-- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
-- svc_xprt_get(rqstp->rq_xprt);
-- rpc_copy_addr(
-- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
-- sa);
-- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-- unconf->cl_cb_conn.cb_minorversion =
-- cstate->minorversion;
-- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
-- unconf->cl_cb_seq_nr = 1;
-- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
-- }
-+ if (is_ds_only_session(unconf->cl_exchange_flags))
-+ cr_ses->flags &= ~SESSION4_BACK_CHAN;
++ BUG_ON(dev < min_dev || max_dev <= dev);
++ return ios->objio_seg->ods[dev - min_dev];
++}
+
- conf = unconf;
- } else {
- status = nfserr_stale_clientid;
- goto out;
- }
-
-+ if (cr_ses->flags & SESSION4_BACK_CHAN) {
-+ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
-+ svc_xprt_get(rqstp->rq_xprt);
-+ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa);
-+ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
-+ conf->cl_cb_conn.cb_minorversion = cstate->minorversion;
-+ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
-+ conf->cl_cb_seq_nr = 1;
-+ nfsd4_probe_callback(conf, &conf->cl_cb_conn);
-+ }
++struct _striping_info {
++ u64 obj_offset;
++ u64 group_length;
++ u64 total_group_length;
++ u64 Major;
++ unsigned dev;
++ unsigned unit_off;
++};
+
- /*
- * We do not support RDMA or persistent sessions
- */
-@@ -1863,7 +1926,7 @@ out:
-
- /* OPEN Share state helper functions */
- static inline struct nfs4_file *
--alloc_init_file(struct inode *ino)
-+alloc_init_file(struct inode *ino, struct svc_fh *current_fh)
- {
- struct nfs4_file *fp;
- unsigned int hashval = file_hashval(ino);
-@@ -1879,6 +1942,16 @@ alloc_init_file(struct inode *ino)
- fp->fi_had_conflict = false;
- memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
- memset(fp->fi_access, 0, sizeof(fp->fi_access));
-+#if defined(CONFIG_PNFSD)
-+ INIT_LIST_HEAD(&fp->fi_layouts);
-+ INIT_LIST_HEAD(&fp->fi_layout_states);
-+ fp->fi_fsid.major = current_fh->fh_export->ex_fsid;
-+ fp->fi_fsid.minor = 0;
-+ fp->fi_fhlen = current_fh->fh_handle.fh_size;
-+ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval));
-+ memcpy(fp->fi_fhval, ¤t_fh->fh_handle.fh_base,
-+ fp->fi_fhlen);
-+#endif /* CONFIG_PNFSD */
- spin_lock(&recall_lock);
- list_add(&fp->fi_hash, &file_hashtbl[hashval]);
- spin_unlock(&recall_lock);
-@@ -1887,7 +1960,7 @@ alloc_init_file(struct inode *ino)
- return NULL;
- }
-
--static void
-+void
- nfsd4_free_slab(struct kmem_cache **slab)
- {
- if (*slab == NULL)
-@@ -1903,6 +1976,7 @@ nfsd4_free_slabs(void)
- nfsd4_free_slab(&file_slab);
- nfsd4_free_slab(&stateid_slab);
- nfsd4_free_slab(&deleg_slab);
-+ nfsd4_free_pnfs_slabs();
- }
-
- static int
-@@ -1924,6 +1998,8 @@ nfsd4_init_slabs(void)
- sizeof(struct nfs4_delegation), 0, 0, NULL);
- if (deleg_slab == NULL)
- goto out_nomem;
-+ if (nfsd4_init_pnfs_slabs())
-+ goto out_nomem;
- return 0;
- out_nomem:
- nfsd4_free_slabs();
-@@ -1997,6 +2073,9 @@ init_stateid(struct nfs4_stateid *stp, s
- INIT_LIST_HEAD(&stp->st_perstateowner);
- INIT_LIST_HEAD(&stp->st_lockowners);
- INIT_LIST_HEAD(&stp->st_perfile);
-+#if defined(CONFIG_PNFSD)
-+ INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
-+#endif /* CONFIG_PNFSD */
- list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
- list_add(&stp->st_perstateowner, &sop->so_stateids);
- list_add(&stp->st_perfile, &fp->fi_stateids);
-@@ -2038,6 +2117,7 @@ find_openstateowner_str(unsigned int has
- {
- struct nfs4_stateowner *so = NULL;
-
-+ BUG_ON_UNLOCKED_STATE();
- list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
- if (same_owner_str(so, &open->op_owner, &open->op_clientid))
- return so;
-@@ -2046,7 +2126,7 @@ find_openstateowner_str(unsigned int has
- }
-
- /* search file_hashtbl[] for file */
--static struct nfs4_file *
-+struct nfs4_file *
- find_file(struct inode *ino)
- {
- unsigned int hashval = file_hashval(ino);
-@@ -2064,6 +2144,18 @@ find_file(struct inode *ino)
- return NULL;
- }
-
-+struct nfs4_file *
-+find_alloc_file(struct inode *ino, struct svc_fh *current_fh)
++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
++ struct _striping_info *si)
+{
-+ struct nfs4_file *fp;
++ u32 stripe_unit = ios->objio_seg->stripe_unit;
++ u32 group_width = ios->objio_seg->group_width;
++ u64 group_depth = ios->objio_seg->group_depth;
++ u32 U = stripe_unit * group_width;
+
-+ fp = find_file(ino);
-+ if (fp)
-+ return fp;
++ u64 T = U * group_depth;
++ u64 S = T * ios->objio_seg->group_count;
++ u64 M = div64_u64(file_offset, S);
++
++ /*
++ G = (L - (M * S)) / T
++ H = (L - (M * S)) % T
++ */
++ u64 LmodU = file_offset - M * S;
++ u32 G = div64_u64(LmodU, T);
++ u64 H = LmodU - G * T;
++
++ u32 N = div_u64(H, U);
++
++ div_u64_rem(file_offset, stripe_unit, &si->unit_off);
++ si->obj_offset = si->unit_off + (N * stripe_unit) +
++ (M * group_depth * stripe_unit);
++
++ /* "H - (N * U)" is just "H % U" so it's bound to u32 */
++ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
++ si->dev *= ios->objio_seg->mirrors_p1;
+
-+ return alloc_init_file(ino, current_fh);
++ si->group_length = T - H;
++ si->total_group_length = T;
++ si->Major = M;
+}
+
- static inline int access_valid(u32 x, u32 minorversion)
- {
- if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
-@@ -2592,7 +2684,7 @@ nfsd4_process_open2(struct svc_rqst *rqs
- if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
- goto out;
- status = nfserr_resource;
-- fp = alloc_init_file(ino);
-+ fp = alloc_init_file(ino, current_fh);
- if (fp == NULL)
- goto out;
- }
-@@ -2813,7 +2905,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct
- return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
- }
-
--static int
-+int
- STALE_STATEID(stateid_t *stateid)
- {
- if (stateid->si_boot == boot_time)
-@@ -2823,6 +2915,16 @@ STALE_STATEID(stateid_t *stateid)
- return 1;
- }
-
-+__be32
-+nfs4_check_stateid(stateid_t *stateid)
++static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
++ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len)
+{
-+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
-+ return nfserr_bad_stateid;
-+ if (STALE_STATEID(stateid))
-+ return nfserr_stale_stateid;
-+ return 0;
-+}
++ unsigned pg = *cur_pg;
++ struct request_queue *q =
++ osd_request_queue(_io_od(ios, per_dev->dev));
+
- static inline int
- access_permit_read(unsigned long access_bmap)
- {
-@@ -2934,6 +3036,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_
- if (grace_disallows_io(ino))
- return nfserr_grace;
-
-+#if defined(CONFIG_PNFSD)
-+ if (pnfs_fh_is_ds(¤t_fh->fh_handle)) {
-+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
-+ status = nfserr_bad_stateid;
-+ else
-+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
-+ {
-+ dprintk("%s Don't check DS stateid\n", __func__);
-+ return 0;
++ per_dev->length += cur_len;
++
++ if (per_dev->bio == NULL) {
++ unsigned stripes = ios->ol_state.num_comps /
++ ios->objio_seg->mirrors_p1;
++ unsigned pages_in_stripe = stripes *
++ (ios->objio_seg->stripe_unit / PAGE_SIZE);
++ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
++ stripes;
++
++ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
++ if (unlikely(!per_dev->bio)) {
++ dprintk("Faild to allocate BIO size=%u\n", bio_size);
++ return -ENOMEM;
+ }
-+#else /* CONFIG_GFS2_FS_LOCKING_DLM */
-+ status = nfs4_preprocess_pnfs_ds_stateid(current_fh,
-+ stateid);
-+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
-+ goto out;
+ }
-+#endif /* CONFIG_PNFSD */
+
- if (nfsd4_has_session(cstate))
- flags |= HAS_SESSION;
-
-@@ -3015,13 +3135,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co
- *stpp = NULL;
- *sopp = NULL;
-
-- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
-- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n");
-- return nfserr_bad_stateid;
-- }
--
-- if (STALE_STATEID(stateid))
-- return nfserr_stale_stateid;
-+ status = nfs4_check_stateid(stateid);
-+ if (status)
-+ return status;
-
- if (nfsd4_has_session(cstate))
- flags |= HAS_SESSION;
-@@ -3295,11 +3411,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp
- if (nfsd4_has_session(cstate))
- flags |= HAS_SESSION;
- nfs4_lock_state();
-- status = nfserr_bad_stateid;
-- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
-- goto out;
-- status = nfserr_stale_stateid;
-- if (STALE_STATEID(stateid))
-+ status = nfs4_check_stateid(stateid);
-+ if (status)
- goto out;
- status = nfserr_bad_stateid;
- if (!is_delegation_stateid(stateid))
-@@ -3328,26 +3441,6 @@ out:
- #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS)
- #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1)
-
--static inline u64
--end_offset(u64 start, u64 len)
--{
-- u64 end;
--
-- end = start + len;
-- return end >= start ? end: NFS4_MAX_UINT64;
--}
--
--/* last octet in a range */
--static inline u64
--last_byte_offset(u64 start, u64 len)
--{
-- u64 end;
--
-- BUG_ON(!len);
-- end = start + len;
-- return end > start ? end - 1: NFS4_MAX_UINT64;
--}
--
- #define lockownerid_hashval(id) \
- ((id) & LOCK_HASH_MASK)
-
-@@ -3364,7 +3457,7 @@ static struct list_head lock_ownerid_has
- static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
- static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
-
--static struct nfs4_stateid *
-+struct nfs4_stateid *
- find_stateid(stateid_t *stid, int flags)
- {
- struct nfs4_stateid *local;
-@@ -3393,7 +3486,7 @@ find_stateid(stateid_t *stid, int flags)
- return NULL;
- }
-
--static struct nfs4_delegation *
-+struct nfs4_delegation *
- find_delegation_stateid(struct inode *ino, stateid_t *stid)
- {
- struct nfs4_file *fp;
-@@ -3524,6 +3617,9 @@ alloc_init_lock_stateid(struct nfs4_stat
- INIT_LIST_HEAD(&stp->st_perfile);
- INIT_LIST_HEAD(&stp->st_perstateowner);
- INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
-+#if defined(CONFIG_PNFSD)
-+ INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
-+#endif /* CONFIG_PNFSD */
- list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
- list_add(&stp->st_perfile, &fp->fi_stateids);
- list_add(&stp->st_perstateowner, &sop->so_stateids);
-@@ -4100,6 +4196,9 @@ nfs4_state_init(void)
- INIT_LIST_HEAD(&client_lru);
- INIT_LIST_HEAD(&del_recall_lru);
- reclaim_str_hashtbl_size = 0;
-+#if defined(CONFIG_PNFSD)
-+ nfs4_pnfs_state_init();
-+#endif /* CONFIG_PNFSD */
- return 0;
- }
-
-@@ -4204,6 +4303,7 @@ __nfs4_state_shutdown(void)
- }
-
- nfsd4_shutdown_recdir();
-+ nfs4_pnfs_state_shutdown();
- }
-
- void
-diff -up linux-2.6.35.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4xdr.c
---- linux-2.6.35.noarch/fs/nfsd/nfs4xdr.c.orig 2010-09-30 12:22:45.277048000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfs4xdr.c 2010-09-30 12:25:08.536289000 -0400
-@@ -47,9 +47,14 @@
- #include <linux/nfsd_idmap.h>
- #include <linux/nfs4_acl.h>
- #include <linux/sunrpc/svcauth_gss.h>
-+#include <linux/exportfs.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
-+#include <linux/nfsd4_spnfs.h>
-+#include <linux/nfsd4_block.h>
-
- #include "xdr4.h"
- #include "vfs.h"
-+#include "pnfsd.h"
-
- #define NFSDDBG_FACILITY NFSDDBG_XDR
-
-@@ -1244,6 +1249,138 @@ static __be32 nfsd4_decode_reclaim_compl
- DECODE_TAIL;
- }
-
-+#if defined(CONFIG_PNFSD)
-+static __be32
-+nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp,
-+ struct nfsd4_pnfs_getdevlist *gdevl)
++ while (cur_len > 0) {
++ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
++ unsigned added_len;
++
++ BUG_ON(ios->ol_state.nr_pages <= pg);
++ cur_len -= pglen;
++
++ added_len = bio_add_pc_page(q, per_dev->bio,
++ ios->ol_state.pages[pg], pglen, pgbase);
++ if (unlikely(pglen != added_len))
++ return -ENOMEM;
++ pgbase = 0;
++ ++pg;
++ }
++ BUG_ON(cur_len);
++
++ *cur_pg = pg;
++ return 0;
++}
++
++static int _prepare_one_group(struct objio_state *ios, u64 length,
++ struct _striping_info *si, unsigned first_comp,
++ unsigned *last_pg)
++{
++ unsigned stripe_unit = ios->objio_seg->stripe_unit;
++ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1;
++ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1;
++ unsigned dev = si->dev;
++ unsigned first_dev = dev - (dev % devs_in_group);
++ unsigned comp = first_comp + (dev - first_dev);
++ unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
++ unsigned cur_pg = *last_pg;
++ int ret = 0;
++
++ while (length) {
++ struct _objio_per_comp *per_dev = &ios->per_dev[comp];
++ unsigned cur_len, page_off = 0;
++
++ if (!per_dev->length) {
++ per_dev->dev = dev;
++ if (dev < si->dev) {
++ per_dev->offset = si->obj_offset + stripe_unit -
++ si->unit_off;
++ cur_len = stripe_unit;
++ } else if (dev == si->dev) {
++ per_dev->offset = si->obj_offset;
++ cur_len = stripe_unit - si->unit_off;
++ page_off = si->unit_off & ~PAGE_MASK;
++ BUG_ON(page_off &&
++ (page_off != ios->ol_state.pgbase));
++ } else { /* dev > si->dev */
++ per_dev->offset = si->obj_offset - si->unit_off;
++ cur_len = stripe_unit;
++ }
++
++ if (max_comp < comp)
++ max_comp = comp;
++
++ dev += mirrors_p1;
++ dev = (dev % devs_in_group) + first_dev;
++ } else {
++ cur_len = stripe_unit;
++ }
++ if (cur_len >= length)
++ cur_len = length;
++
++ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
++ cur_len);
++ if (unlikely(ret))
++ goto out;
++
++ comp += mirrors_p1;
++ comp = (comp % devs_in_group) + first_comp;
++
++ length -= cur_len;
++ ios->length += cur_len;
++ }
++out:
++ ios->numdevs = max_comp + mirrors_p1;
++ *last_pg = cur_pg;
++ return ret;
++}
++
++static int _io_rw_pagelist(struct objio_state *ios)
+{
-+ DECODE_HEAD;
++ u64 length = ios->ol_state.count;
++ struct _striping_info si;
++ unsigned devs_in_group = ios->objio_seg->group_width *
++ ios->objio_seg->mirrors_p1;
++ unsigned first_comp = 0;
++ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps;
++ unsigned last_pg = 0;
++ int ret = 0;
+
-+ READ_BUF(16 + sizeof(nfs4_verifier));
-+ READ32(gdevl->gd_layout_type);
-+ READ32(gdevl->gd_maxdevices);
-+ READ64(gdevl->gd_cookie);
-+ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
++ _calc_stripe_info(ios, ios->ol_state.offset, &si);
++ while (length) {
++ if (length < si.group_length)
++ si.group_length = length;
+
-+ DECODE_TAIL;
++ ret = _prepare_one_group(ios, si.group_length, &si, first_comp,
++ &last_pg);
++ if (unlikely(ret))
++ goto out;
++
++ length -= si.group_length;
++
++ si.group_length = si.total_group_length;
++ si.unit_off = 0;
++ ++si.Major;
++ si.obj_offset = si.Major * ios->objio_seg->stripe_unit *
++ ios->objio_seg->group_depth;
++
++ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
++ si.dev %= num_comps;
++
++ first_comp += devs_in_group;
++ first_comp %= num_comps;
++ }
++
++out:
++ if (!ios->length)
++ return ret;
++
++ return 0;
+}
+
-+static __be32
-+nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp,
-+ struct nfsd4_pnfs_getdevinfo *gdev)
++static ssize_t _sync_done(struct objio_state *ios)
+{
-+ u32 num;
-+ DECODE_HEAD;
++ struct completion *waiting = ios->private;
+
-+ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid));
-+ READ64(gdev->gd_devid.sbid);
-+ READ64(gdev->gd_devid.devid);
-+ READ32(gdev->gd_layout_type);
-+ READ32(gdev->gd_maxcount);
-+ READ32(num);
-+ if (num) {
-+ READ_BUF(4);
-+ READ32(gdev->gd_notify_types);
-+ } else {
-+ gdev->gd_notify_types = 0;
++ complete(waiting);
++ return 0;
++}
++
++static void _last_io(struct kref *kref)
++{
++ struct objio_state *ios = container_of(kref, struct objio_state, kref);
++
++ ios->done(ios);
++}
++
++static void _done_io(struct osd_request *or, void *p)
++{
++ struct objio_state *ios = p;
++
++ kref_put(&ios->kref, _last_io);
++}
++
++static ssize_t _io_exec(struct objio_state *ios)
++{
++ DECLARE_COMPLETION_ONSTACK(wait);
++ ssize_t status = 0; /* sync status */
++ unsigned i;
++ objio_done_fn saved_done_fn = ios->done;
++ bool sync = ios->ol_state.sync;
++
++ if (sync) {
++ ios->done = _sync_done;
++ ios->private = &wait;
+ }
+
-+ DECODE_TAIL;
++ kref_init(&ios->kref);
++
++ for (i = 0; i < ios->numdevs; i++) {
++ struct osd_request *or = ios->per_dev[i].or;
++
++ if (!or)
++ continue;
++
++ kref_get(&ios->kref);
++ osd_execute_request_async(or, _done_io, ios);
++ }
++
++ kref_put(&ios->kref, _last_io);
++
++ if (sync) {
++ wait_for_completion(&wait);
++ status = saved_done_fn(ios);
++ }
++
++ return status;
+}
+
-+static __be32
-+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
-+ struct nfsd4_pnfs_layoutget *lgp)
++/*
++ * read
++ */
++static ssize_t _read_done(struct objio_state *ios)
+{
-+ DECODE_HEAD;
++ ssize_t status;
++ int ret = _io_check(ios, false);
+
-+ READ_BUF(36);
-+ READ32(lgp->lg_signal);
-+ READ32(lgp->lg_seg.layout_type);
-+ READ32(lgp->lg_seg.iomode);
-+ READ64(lgp->lg_seg.offset);
-+ READ64(lgp->lg_seg.length);
-+ READ64(lgp->lg_minlength);
-+ nfsd4_decode_stateid(argp, &lgp->lg_sid);
-+ READ_BUF(4);
-+ READ32(lgp->lg_maxcount);
++ _io_free(ios);
+
-+ DECODE_TAIL;
++ if (likely(!ret))
++ status = ios->length;
++ else
++ status = ret;
++
++ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
++ return status;
+}
+
-+static __be32
-+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
-+ struct nfsd4_pnfs_layoutcommit *lcp)
++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
-+ DECODE_HEAD;
-+ u32 timechange;
++ struct osd_request *or = NULL;
++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
++ unsigned dev = per_dev->dev;
++ struct pnfs_osd_object_cred *cred =
++ &ios->objio_seg->layout->olo_comps[dev];
++ struct osd_obj_id obj = {
++ .partition = cred->oc_object_id.oid_partition_id,
++ .id = cred->oc_object_id.oid_object_id,
++ };
++ int ret;
+
-+ READ_BUF(20);
-+ READ64(lcp->args.lc_seg.offset);
-+ READ64(lcp->args.lc_seg.length);
-+ READ32(lcp->args.lc_reclaim);
-+ nfsd4_decode_stateid(argp, &lcp->lc_sid);
-+ READ_BUF(4);
-+ READ32(lcp->args.lc_newoffset);
-+ if (lcp->args.lc_newoffset) {
-+ READ_BUF(8);
-+ READ64(lcp->args.lc_last_wr);
-+ } else
-+ lcp->args.lc_last_wr = 0;
-+ READ_BUF(4);
-+ READ32(timechange);
-+ if (timechange) {
-+ READ_BUF(12);
-+ READ64(lcp->args.lc_mtime.seconds);
-+ READ32(lcp->args.lc_mtime.nseconds);
-+ } else {
-+ lcp->args.lc_mtime.seconds = 0;
-+ lcp->args.lc_mtime.nseconds = 0;
++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
++ if (unlikely(!or)) {
++ ret = -ENOMEM;
++ goto err;
+ }
-+ READ_BUF(8);
-+ READ32(lcp->args.lc_seg.layout_type);
-+ /* XXX: saving XDR'ed layout update. Since we don't have the
-+ * current_fh yet, and therefore no export_ops, we can't call
-+ * the layout specific decode routines. File and pVFS2
-+ * do not use the layout update....
-+ */
-+ READ32(lcp->args.lc_up_len);
-+ if (lcp->args.lc_up_len > 0) {
-+ READ_BUF(lcp->args.lc_up_len);
-+ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len);
++ per_dev->or = or;
++
++ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
++
++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
++ if (ret) {
++ dprintk("%s: Faild to osd_finalize_request() => %d\n",
++ __func__, ret);
++ goto err;
+ }
+
-+ DECODE_TAIL;
++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
++ per_dev->length);
++
++err:
++ return ret;
++}
++
++static ssize_t _read_exec(struct objio_state *ios)
++{
++ unsigned i;
++ int ret;
++
++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
++ if (!ios->per_dev[i].length)
++ continue;
++ ret = _read_mirrors(ios, i);
++ if (unlikely(ret))
++ goto err;
++ }
++
++ ios->done = _read_done;
++ return _io_exec(ios); /* In sync mode exec returns the io status */
++
++err:
++ _io_free(ios);
++ return ret;
+}
+
-+static __be32
-+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
-+ struct nfsd4_pnfs_layoutreturn *lrp)
++ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
+{
-+ DECODE_HEAD;
++ struct objio_state *ios = container_of(ol_state, struct objio_state,
++ ol_state);
++ int ret;
+
-+ READ_BUF(16);
-+ READ32(lrp->args.lr_reclaim);
-+ READ32(lrp->args.lr_seg.layout_type);
-+ READ32(lrp->args.lr_seg.iomode);
-+ READ32(lrp->args.lr_return_type);
-+ if (lrp->args.lr_return_type == RETURN_FILE) {
-+ READ_BUF(16);
-+ READ64(lrp->args.lr_seg.offset);
-+ READ64(lrp->args.lr_seg.length);
-+ nfsd4_decode_stateid(argp, &lrp->lr_sid);
-+ READ_BUF(4);
-+ READ32(lrp->args.lrf_body_len);
-+ if (lrp->args.lrf_body_len > 0) {
-+ READ_BUF(lrp->args.lrf_body_len);
-+ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len);
-+ }
-+ }
++ ret = _io_rw_pagelist(ios);
++ if (unlikely(ret))
++ return ret;
+
-+ DECODE_TAIL;
++ return _read_exec(ios);
+}
-+#endif /* CONFIG_PNFSD */
-+
- static __be32
- nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
- {
-@@ -1345,11 +1482,19 @@ static nfsd4_dec nfsd41_dec_ops[] = {
- [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
- [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
-+#if defined(CONFIG_PNFSD)
-+ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo,
-+ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist,
-+ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
-+ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
-+ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
-+#else /* CONFIG_PNFSD */
- [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
-+#endif /* CONFIG_PNFSD */
- [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
- [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
-@@ -2150,6 +2295,36 @@ out_acl:
- }
- WRITE64(stat.ino);
- }
-+#if defined(CONFIG_PNFSD)
-+ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
-+ struct super_block *sb = dentry->d_inode->i_sb;
-+ int type = 0;
+
-+ /* Query the filesystem for supported pNFS layout types.
-+ * Currently, we only support one layout type per file system.
-+ * The export_ops->layout_type() returns the pnfs_layouttype4.
-+ */
-+ buflen -= 4;
-+ if (buflen < 0) /* length */
-+ goto out_resource;
++/*
++ * write
++ */
++static ssize_t _write_done(struct objio_state *ios)
++{
++ ssize_t status;
++ int ret = _io_check(ios, true);
+
-+ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type)
-+ type = sb->s_pnfs_op->layout_type(sb);
-+ if (type) {
-+ if ((buflen -= 4) < 0) /* type */
-+ goto out_resource;
-+ WRITE32(1); /* length */
-+ WRITE32(type); /* type */
-+ } else
-+ WRITE32(0); /* length */
-+ }
++ _io_free(ios);
+
-+ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
-+ if ((buflen -= 4) < 0)
-+ goto out_resource;
-+ WRITE32(stat.blksize);
++ if (likely(!ret)) {
++ /* FIXME: should be based on the OSD's persistence model
++ * See OSD2r05 Section 4.13 Data persistence model */
++ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC;
++ status = ios->length;
++ } else {
++ status = ret;
+ }
-+#endif /* CONFIG_PNFSD */
- if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- WRITE32(3);
- WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
-@@ -2380,6 +2555,10 @@ nfsd4_encode_commit(struct nfsd4_compoun
- if (!nfserr) {
- RESERVE_SPACE(8);
- WRITEMEM(commit->co_verf.data, 8);
-+ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n",
-+ ((u32 *)(&commit->co_verf.data))[0],
-+ ((u32 *)(&commit->co_verf.data))[1]);
+
- ADJUST_ARGS();
- }
- return nfserr;
-@@ -2634,6 +2813,13 @@ nfsd4_encode_read(struct nfsd4_compoundr
- }
- read->rd_vlen = v;
-
-+#if defined(CONFIG_SPNFS)
-+ if (spnfs_enabled())
-+ nfserr = spnfs_read(read->rd_fhp->fh_dentry->d_inode,
-+ read->rd_offset, &maxcount, read->rd_vlen,
-+ resp->rqstp);
-+ else /* we're not an MDS */
-+#endif /* CONFIG_SPNFS */
- nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
- read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
- &maxcount);
-@@ -2940,6 +3126,9 @@ nfsd4_encode_write(struct nfsd4_compound
- WRITE32(write->wr_bytes_written);
- WRITE32(write->wr_how_written);
- WRITEMEM(write->wr_verifier.data, 8);
-+ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n",
-+ ((u32 *)(&write->wr_verifier.data))[0],
-+ ((u32 *)(&write->wr_verifier.data))[1]);
- ADJUST_ARGS();
- }
- return nfserr;
-@@ -3083,6 +3272,343 @@ nfsd4_encode_sequence(struct nfsd4_compo
- return 0;
- }
-
-+#if defined(CONFIG_PNFSD)
++ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
++ return status;
++}
+
-+/* Uses the export interface to iterate through the available devices
-+ * and encodes them on the response stream.
-+ */
-+static __be32
-+nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp,
-+ struct nfsd4_pnfs_getdevlist *gdevl,
-+ unsigned int *dev_count)
++static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
-+ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb;
-+ __be32 nfserr;
-+ int status;
-+ __be32 *p;
-+ struct nfsd4_pnfs_dev_iter_res res = {
-+ .gd_cookie = gdevl->gd_cookie,
-+ .gd_verf = gdevl->gd_verf,
-+ .gd_eof = 0
-+ };
-+ u64 sbid;
++ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
++ unsigned dev = ios->per_dev[cur_comp].dev;
++ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1;
++ int ret;
+
-+ dprintk("%s: Begin\n", __func__);
++ for (; cur_comp < last_comp; ++cur_comp, ++dev) {
++ struct osd_request *or = NULL;
++ struct pnfs_osd_object_cred *cred =
++ &ios->objio_seg->layout->olo_comps[dev];
++ struct osd_obj_id obj = {
++ .partition = cred->oc_object_id.oid_partition_id,
++ .id = cred->oc_object_id.oid_object_id,
++ };
++ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
++ struct bio *bio;
+
-+ sbid = find_create_sbid(sb);
-+ *dev_count = 0;
-+ do {
-+ status = sb->s_pnfs_op->get_device_iter(sb,
-+ gdevl->gd_layout_type,
-+ &res);
-+ if (status) {
-+ if (status == -ENOENT) {
-+ res.gd_eof = 1;
-+ /* return success */
-+ break;
++ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
++ if (unlikely(!or)) {
++ ret = -ENOMEM;
++ goto err;
++ }
++ per_dev->or = or;
++
++ if (per_dev != master_dev) {
++ bio = bio_kmalloc(GFP_KERNEL,
++ master_dev->bio->bi_max_vecs);
++ if (unlikely(!bio)) {
++ dprintk("Faild to allocate BIO size=%u\n",
++ master_dev->bio->bi_max_vecs);
++ ret = -ENOMEM;
++ goto err;
+ }
-+ nfserr = nfserrno(status);
-+ goto out_err;
++
++ __bio_clone(bio, master_dev->bio);
++ bio->bi_bdev = NULL;
++ bio->bi_next = NULL;
++ per_dev->bio = bio;
++ per_dev->dev = dev;
++ per_dev->length = master_dev->length;
++ per_dev->offset = master_dev->offset;
++ } else {
++ bio = master_dev->bio;
++ /* FIXME: bio_set_dir() */
++ bio->bi_rw |= REQ_WRITE;
+ }
+
-+ /* Encode device id and layout type */
-+ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid));
-+ WRITE64((__be64)sbid);
-+ WRITE64(res.gd_devid); /* devid minor */
-+ ADJUST_ARGS();
-+ (*dev_count)++;
-+ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof);
-+ gdevl->gd_cookie = res.gd_cookie;
-+ gdevl->gd_verf = res.gd_verf;
-+ gdevl->gd_eof = res.gd_eof;
-+ nfserr = nfs_ok;
-+out_err:
-+ dprintk("%s: Encoded %u devices\n", __func__, *dev_count);
-+ return nfserr;
++ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
++
++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
++ if (ret) {
++ dprintk("%s: Faild to osd_finalize_request() => %d\n",
++ __func__, ret);
++ goto err;
++ }
++
++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
++ per_dev->length);
++ }
++
++err:
++ return ret;
+}
+
-+/* Encodes the response of get device list.
-+*/
-+static __be32
-+nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr,
-+ struct nfsd4_pnfs_getdevlist *gdevl)
++static ssize_t _write_exec(struct objio_state *ios)
+{
-+ unsigned int dev_count = 0, lead_count;
-+ u32 *p_in = resp->p;
-+ __be32 *p;
++ unsigned i;
++ int ret;
+
-+ dprintk("%s: err %d\n", __func__, nfserr);
-+ if (nfserr)
-+ return nfserr;
++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
++ if (!ios->per_dev[i].length)
++ continue;
++ ret = _write_mirrors(ios, i);
++ if (unlikely(ret))
++ goto err;
++ }
+
-+ /* Ensure we have room for cookie, verifier, and devlist len,
-+ * which we will backfill in after we encode as many devices as possible
-+ */
-+ lead_count = 8 + sizeof(nfs4_verifier) + 4;
-+ RESERVE_SPACE(lead_count);
-+ /* skip past these values */
-+ p += XDR_QUADLEN(lead_count);
-+ ADJUST_ARGS();
++ ios->done = _write_done;
++ return _io_exec(ios); /* In sync mode exec returns the io->status */
+
-+ /* Iterate over as many device ids as possible on the xdr stream */
-+ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count);
-+ if (nfserr)
-+ goto out_err;
++err:
++ _io_free(ios);
++ return ret;
++}
+
-+ /* Backfill in cookie, verf and number of devices encoded */
-+ p = p_in;
-+ WRITE64(gdevl->gd_cookie);
-+ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
-+ WRITE32(dev_count);
++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
++{
++ struct objio_state *ios = container_of(ol_state, struct objio_state,
++ ol_state);
++ int ret;
+
-+ /* Skip over devices */
-+ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid));
-+ ADJUST_ARGS();
++ /* TODO: ios->stable = stable; */
++ ret = _io_rw_pagelist(ios);
++ if (unlikely(ret))
++ return ret;
+
-+ /* are we at the end of devices? */
-+ RESERVE_SPACE(4);
-+ WRITE32(gdevl->gd_eof);
-+ ADJUST_ARGS();
++ return _write_exec(ios);
++}
+
-+ dprintk("%s: done.\n", __func__);
++/*
++ * Policy Operations
++ */
+
-+ nfserr = nfs_ok;
-+out:
-+ return nfserr;
-+out_err:
-+ p = p_in;
-+ ADJUST_ARGS();
-+ goto out;
++/*
++ * Get the max [rw]size
++ */
++static ssize_t
++objlayout_get_blocksize(void)
++{
++ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE;
++
++ return sz;
+}
+
-+/* For a given device id, have the file system retrieve and encode the
-+ * associated device. For file layout, the encoding function is
-+ * passed down to the file system. The file system then has the option
-+ * of using this encoding function or one of its own.
++/*
++ * Don't gather across stripes, but rather gather (coalesce) up to
++ * the stripe size.
+ *
-+ * Note: the file system must return the XDR size of struct device_addr4
-+ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the
-+ * gdir_mincount calculation.
++ * FIXME: change interface to use merge_align, merge_count
+ */
-+static __be32
-+nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
-+ struct nfsd4_pnfs_getdevinfo *gdev)
-+{
-+ struct super_block *sb;
-+ int maxcount = 0, type_notify_len = 12;
-+ __be32 *p, *p_save = NULL, *p_in = resp->p;
-+ struct exp_xdr_stream xdr;
++static struct pnfs_layoutdriver_type objlayout_type = {
++ .id = LAYOUT_OSD2_OBJECTS,
++ .name = "LAYOUT_OSD2_OBJECTS",
++ .flags = PNFS_LAYOUTRET_ON_SETATTR,
+
-+ dprintk("%s: err %d\n", __func__, nfserr);
-+ if (nfserr)
-+ return nfserr;
++ .initialize_mountpoint = objlayout_initialize_mountpoint,
++ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint,
+
-+ sb = gdev->gd_sb;
++ .alloc_layout_hdr = objlayout_alloc_layout_hdr,
++ .free_layout_hdr = objlayout_free_layout_hdr,
+
-+ if (gdev->gd_maxcount != 0) {
-+ /* FIXME: this will be bound by the session max response */
-+ maxcount = svc_max_payload(resp->rqstp);
-+ if (maxcount > gdev->gd_maxcount)
-+ maxcount = gdev->gd_maxcount;
++ .alloc_lseg = objlayout_alloc_lseg,
++ .free_lseg = objlayout_free_lseg,
+
-+ /* Ensure have room for type and notify field */
-+ maxcount -= type_notify_len;
-+ if (maxcount < 0) {
-+ nfserr = -ETOOSMALL;
-+ goto toosmall;
-+ }
-+ }
++ .get_blocksize = objlayout_get_blocksize,
+
-+ RESERVE_SPACE(4);
-+ WRITE32(gdev->gd_layout_type);
-+ ADJUST_ARGS();
++ .read_pagelist = objlayout_read_pagelist,
++ .write_pagelist = objlayout_write_pagelist,
++ .commit = objlayout_commit,
+
-+ /* If maxcount is 0 then just update notifications */
-+ if (gdev->gd_maxcount == 0)
-+ goto handle_notifications;
++ .encode_layoutcommit = objlayout_encode_layoutcommit,
++ .encode_layoutreturn = objlayout_encode_layoutreturn,
++};
+
-+ xdr.p = p_save = resp->p;
-+ xdr.end = resp->end;
-+ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3))
-+ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3);
++void *objio_init_mt(void)
++{
++ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL);
+
-+ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type,
-+ &gdev->gd_devid);
-+ if (nfserr)
-+ goto err;
++ if (!omt)
++ return ERR_PTR(-ENOMEM);
+
-+ /* The file system should never write 0 bytes without
-+ * returning an error
-+ */
-+ BUG_ON(xdr.p == p_save);
-+ BUG_ON(xdr.p > xdr.end);
++ INIT_LIST_HEAD(&omt->dev_list);
++ spin_lock_init(&omt->dev_list_lock);
++ return omt;
++}
+
-+ /* Update the xdr stream with the number of bytes encoded
-+ * by the file system.
-+ */
-+ p = xdr.p;
-+ ADJUST_ARGS();
++void objio_fini_mt(void *mountid)
++{
++ _dev_list_remove_all(mountid);
++ kfree(mountid);
++}
+
-+handle_notifications:
-+ /* Encode supported device notifications */
-+ RESERVE_SPACE(4);
-+ if (sb->s_pnfs_op->set_device_notify) {
-+ struct pnfs_devnotify_arg dn_args;
++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
++MODULE_AUTHOR("Benny Halevy <bhalevy at panasas.com>");
++MODULE_LICENSE("GPL");
+
-+ dn_args.dn_layout_type = gdev->gd_layout_type;
-+ dn_args.dn_devid = gdev->gd_devid;
-+ dn_args.dn_notify_types = gdev->gd_notify_types;
-+ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args);
-+ if (nfserr)
-+ goto err;
-+ WRITE32(dn_args.dn_notify_types);
-+ } else {
-+ WRITE32(0);
-+ }
-+ ADJUST_ARGS();
++static int __init
++objlayout_init(void)
++{
++ int ret = pnfs_register_layoutdriver(&objlayout_type);
+
-+out:
-+ return nfserrno(nfserr);
-+toosmall:
-+ dprintk("%s: maxcount too small\n", __func__);
-+ RESERVE_SPACE(4);
-+ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len);
-+ ADJUST_ARGS();
-+ goto out;
-+err:
-+ /* Rewind to the beginning */
-+ p = p_in;
-+ ADJUST_ARGS();
-+ if (nfserr == -ETOOSMALL)
-+ goto toosmall;
-+ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr);
-+ goto out;
++ if (ret)
++ printk(KERN_INFO
++ "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
++ __func__, ret);
++ else
++ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
++ __func__);
++ return ret;
++}
++
++static void __exit
++objlayout_exit(void)
++{
++ pnfs_unregister_layoutdriver(&objlayout_type);
++ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
++ __func__);
+}
+
-+static __be32
-+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp,
-+ __be32 nfserr,
-+ struct nfsd4_pnfs_layoutget *lgp)
-+{
-+ int maxcount, leadcount;
-+ struct super_block *sb;
-+ struct exp_xdr_stream xdr;
-+ __be32 *p, *p_save, *p_start = resp->p;
++module_init(objlayout_init);
++module_exit(objlayout_exit);
+diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
+new file mode 100644
+index 0000000..b647577
+--- /dev/null
++++ b/fs/nfs/objlayout/objlayout.c
+@@ -0,0 +1,773 @@
++/*
++ * objlayout.c
++ *
++ * pNFS layout driver for Panasas OSDs
++ *
++ * Copyright (C) 2007-2009 Panasas Inc.
++ * All rights reserved.
++ *
++ * Benny Halevy <bhalevy at panasas.com>
++ * Boaz Harrosh <bharrosh at panasas.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2
++ * See the file COPYING included with this distribution for more details.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the Panasas company nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <scsi/osd_initiator.h>
++#include "objlayout.h"
+
-+ dprintk("%s: err %d\n", __func__, nfserr);
-+ if (nfserr)
-+ return nfserr;
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
-+ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb;
-+ maxcount = PAGE_SIZE;
-+ if (maxcount > lgp->lg_maxcount)
-+ maxcount = lgp->lg_maxcount;
++struct pnfs_client_operations *pnfs_client_ops;
+
-+ /* Check for space on xdr stream */
-+ leadcount = 36 + sizeof(stateid_opaque_t);
-+ RESERVE_SPACE(leadcount);
-+ /* encode layout metadata after file system encodes layout */
-+ p += XDR_QUADLEN(leadcount);
-+ ADJUST_ARGS();
++/*
++ * Create an objlayout layout structure for the given inode and return it.
++ */
++struct pnfs_layout_hdr *
++objlayout_alloc_layout_hdr(struct inode *inode)
++{
++ struct objlayout *objlay;
+
-+ /* Ensure have room for ret_on_close, off, len, iomode, type */
-+ maxcount -= leadcount;
-+ if (maxcount < 0) {
-+ printk(KERN_ERR "%s: buffer too small\n", __func__);
-+ nfserr = nfserr_toosmall;
-+ goto err;
++ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL);
++ if (objlay) {
++ spin_lock_init(&objlay->lock);
++ INIT_LIST_HEAD(&objlay->err_list);
+ }
++ dprintk("%s: Return %p\n", __func__, objlay);
++ return &objlay->pnfs_layout;
++}
+
-+ /* Set xdr info so file system can encode layout */
-+ xdr.p = p_save = resp->p;
-+ xdr.end = resp->end;
-+ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3))
-+ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3);
++/*
++ * Free an objlayout layout structure
++ */
++void
++objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
++{
++ struct objlayout *objlay = OBJLAYOUT(lo);
+
-+ /* Retrieve, encode, and merge layout; process stateid */
-+ nfserr = nfs4_pnfs_get_layout(lgp, &xdr);
-+ if (nfserr)
-+ goto err;
++ dprintk("%s: objlay %p\n", __func__, objlay);
+
-+ /* Ensure file system returned enough bytes for the client
-+ * to access.
-+ */
-+ if (lgp->lg_seg.length < lgp->lg_minlength) {
-+ nfserr = nfserr_badlayout;
-+ goto err;
-+ }
++ WARN_ON(!list_empty(&objlay->err_list));
++ kfree(objlay);
++}
+
-+ /* The file system should never write 0 bytes without
-+ * returning an error
-+ */
-+ BUG_ON(xdr.p == p_save);
++/*
++ * Unmarshall layout and store it in pnfslay.
++ */
++struct pnfs_layout_segment *
++objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
++ struct nfs4_layoutget_res *lgr)
++{
++ int status;
++ void *layout = lgr->layout.buf;
++ struct objlayout_segment *objlseg;
++ struct pnfs_osd_layout *pnfs_osd_layout;
+
-+ /* Rewind to beginning and encode attrs */
-+ resp->p = p_start;
-+ RESERVE_SPACE(4);
-+ WRITE32(lgp->lg_roc); /* return on close */
-+ ADJUST_ARGS();
-+ nfsd4_encode_stateid(resp, &lgp->lg_sid);
-+ RESERVE_SPACE(28);
-+ /* Note: response logr_layout array count, always one for now */
-+ WRITE32(1);
-+ WRITE64(lgp->lg_seg.offset);
-+ WRITE64(lgp->lg_seg.length);
-+ WRITE32(lgp->lg_seg.iomode);
-+ WRITE32(lgp->lg_seg.layout_type);
++ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout);
+
-+ /* Update the xdr stream with the number of bytes written
-+ * by the file system
-+ */
-+ p = xdr.p;
-+ ADJUST_ARGS();
++ BUG_ON(!layout);
+
-+ return nfs_ok;
-+err:
-+ resp->p = p_start;
-+ return nfserr;
-+}
++ status = -ENOMEM;
++ objlseg = kzalloc(sizeof(*objlseg) +
++ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL);
++ if (!objlseg)
++ goto err;
+
-+static __be32
-+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
-+ struct nfsd4_pnfs_layoutcommit *lcp)
-+{
-+ __be32 *p;
++ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout;
++ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout);
+
-+ if (nfserr)
-+ goto out;
++ status = objio_alloc_lseg(&objlseg->internal, pnfslay, &objlseg->lseg,
++ pnfs_osd_layout);
++ if (status)
++ goto err;
+
-+ RESERVE_SPACE(4);
-+ WRITE32(lcp->res.lc_size_chg);
-+ ADJUST_ARGS();
-+ if (lcp->res.lc_size_chg) {
-+ RESERVE_SPACE(8);
-+ WRITE64(lcp->res.lc_newsize);
-+ ADJUST_ARGS();
-+ }
-+out:
-+ return nfserr;
++ dprintk("%s: Return %p\n", __func__, &objlseg->lseg);
++ return &objlseg->lseg;
++
++ err:
++ kfree(objlseg);
++ return ERR_PTR(status);
+}
+
-+static __be32
-+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
-+ struct nfsd4_pnfs_layoutreturn *lrp)
++/*
++ * Free a layout segment
++ */
++void
++objlayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
-+ __be32 *p;
++ struct objlayout_segment *objlseg;
+
-+ if (nfserr)
-+ goto out;
++ dprintk("%s: freeing layout segment %p\n", __func__, lseg);
+
-+ RESERVE_SPACE(4);
-+ WRITE32(lrp->lrs_present != 0); /* got stateid? */
-+ ADJUST_ARGS();
-+ if (lrp->lrs_present)
-+ nfsd4_encode_stateid(resp, &lrp->lr_sid);
-+out:
-+ return nfserr;
-+}
-+#endif /* CONFIG_PNFSD */
++ if (unlikely(!lseg))
++ return;
+
- static __be32
- nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
- {
-@@ -3143,11 +3669,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
- [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
- [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
-+#if defined(CONFIG_PNFSD)
-+ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo,
-+ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist,
-+ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
-+ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
-+ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
-+#else /* CONFIG_PNFSD */
- [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
-+#endif /* CONFIG_PNFSD */
- [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
- [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
-diff -up linux-2.6.35.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.35.noarch/fs/nfsd/nfsctl.c
---- linux-2.6.35.noarch/fs/nfsd/nfsctl.c.orig 2010-09-30 12:22:45.283046000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfsctl.c 2010-09-30 12:25:08.542285000 -0400
-@@ -13,10 +13,15 @@
- #include <linux/nfsd/syscall.h>
- #include <linux/lockd/lockd.h>
- #include <linux/sunrpc/clnt.h>
-+#include <linux/nfsd/nfs4pnfsdlm.h>
-
- #include "nfsd.h"
- #include "cache.h"
-
-+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
-+#include <linux/nfsd4_spnfs.h>
-+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
++ objlseg = container_of(lseg, struct objlayout_segment, lseg);
++ objio_free_lseg(objlseg->internal);
++ kfree(objlseg);
++}
+
- /*
- * We have a single directory with 9 nodes in it.
- */
-@@ -49,6 +54,9 @@ enum {
- NFSD_Gracetime,
- NFSD_RecoveryDir,
- #endif
-+#ifdef CONFIG_PNFSD
-+ NFSD_pnfs_dlm_device,
-+#endif
- };
-
- /*
-@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct fi
- static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
- static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
- #endif
-+#ifdef CONFIG_PNFSD
-+static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size);
-+#endif
-
- static ssize_t (*write_op[])(struct file *, char *, size_t) = {
- [NFSD_Svc] = write_svc,
-@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file
- [NFSD_Gracetime] = write_gracetime,
- [NFSD_RecoveryDir] = write_recoverydir,
- #endif
-+#ifdef CONFIG_PNFSD
-+ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device,
-+#endif
- };
-
- static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
-@@ -1347,6 +1361,68 @@ static ssize_t write_recoverydir(struct
-
- #endif
-
-+#ifdef CONFIG_PNFSD
++/*
++ * I/O Operations
++ */
++static inline u64
++end_offset(u64 start, u64 len)
++{
++ u64 end;
+
-+static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf,
-+ size_t size)
++ end = start + len;
++ return end >= start ? end : NFS4_MAX_UINT64;
++}
++
++/* last octet in a range */
++static inline u64
++last_byte_offset(u64 start, u64 len)
+{
-+ char *mesg = buf;
-+ char *pnfs_dlm_device;
-+ int max_size = NFSD_PNFS_DLM_DEVICE_MAX;
-+ int len, ret = 0;
++ u64 end;
+
-+ if (size > 0) {
-+ ret = -EINVAL;
-+ if (size > max_size || buf[size-1] != '\n')
-+ return ret;
-+ buf[size-1] = 0;
++ BUG_ON(!len);
++ end = start + len;
++ return end > start ? end - 1 : NFS4_MAX_UINT64;
++}
+
-+ pnfs_dlm_device = mesg;
-+ len = qword_get(&mesg, pnfs_dlm_device, size);
-+ if (len <= 0)
-+ return ret;
++static struct objlayout_io_state *
++objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
++ struct page **pages,
++ unsigned pgbase,
++ unsigned nr_pages,
++ loff_t offset,
++ size_t count,
++ struct pnfs_layout_segment *lseg,
++ void *rpcdata)
++{
++ struct objlayout_segment *objlseg =
++ container_of(lseg, struct objlayout_segment, lseg);
++ struct objlayout_io_state *state;
++ u64 lseg_end_offset;
++ size_t size_nr_pages;
+
-+ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len);
-+ } else
-+ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT);
++ dprintk("%s: allocating io_state\n", __func__);
++ if (objio_alloc_io_state(objlseg->internal, &state))
++ return NULL;
+
-+ return ret <= 0 ? ret : strlen(buf);
++ BUG_ON(offset < lseg->range.offset);
++ lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length);
++ BUG_ON(offset >= lseg_end_offset);
++ if (offset + count > lseg_end_offset) {
++ count = lseg->range.length - (offset - lseg->range.offset);
++ dprintk("%s: truncated count %Zd\n", __func__, count);
++ }
++
++ if (pgbase > PAGE_SIZE) {
++ unsigned n = pgbase >> PAGE_SHIFT;
++
++ pgbase &= ~PAGE_MASK;
++ pages += n;
++ nr_pages -= n;
++ }
++
++ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
++ BUG_ON(nr_pages < size_nr_pages);
++ if (nr_pages > size_nr_pages)
++ nr_pages = size_nr_pages;
++
++ INIT_LIST_HEAD(&state->err_list);
++ state->objlseg = objlseg;
++ state->rpcdata = rpcdata;
++ state->pages = pages;
++ state->pgbase = pgbase;
++ state->nr_pages = nr_pages;
++ state->offset = offset;
++ state->count = count;
++ state->sync = 0;
++
++ return state;
+}
+
-+/**
-+ * write_pnfs_dlm_device - Set or report the current pNFS data server list
-+ *
-+ * Input:
-+ * buf: ignored
-+ * size: zero
-+ *
-+ * OR
-+ *
-+ * Input:
-+ * buf: C string containing a block device name,
-+ * a colon, and then a comma separated
-+ * list of pNFS data server IPv4 addresses
-+ * size: non-zero length of C string in @buf
-+ * Output:
-+ * On success: passed-in buffer filled with '\n'-terminated C
-+ * string containing a block device name, a colon, and
-+ * then a comma separated list of pNFS
-+ * data server IPv4 addresses.
-+ * return code is the size in bytes of the string
-+ * On error: return code is a negative errno value
-+ */
-+static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size)
++static void
++objlayout_free_io_state(struct objlayout_io_state *state)
+{
-+ ssize_t rv;
++ dprintk("%s: freeing io_state\n", __func__);
++ if (unlikely(!state))
++ return;
+
-+ mutex_lock(&nfsd_mutex);
-+ rv = __write_pnfs_dlm_device(file, buf, size);
-+ mutex_unlock(&nfsd_mutex);
-+ return rv;
++ objio_free_io_state(state);
+}
+
-+#endif /* CONFIG_PNFSD */
-+
- /*----------------------------------------------------------------------------*/
- /*
- * populating the filesystem.
-@@ -1381,6 +1457,10 @@ static int nfsd_fill_super(struct super_
- [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
- [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
- #endif
-+#ifdef CONFIG_PNFSD
-+ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops,
-+ S_IWUSR|S_IRUSR},
-+#endif
- /* last one */ {""}
- };
- return simple_fill_super(sb, 0x6e667364, nfsd_files);
-@@ -1419,6 +1499,9 @@ static int create_proc_exports_entry(voi
- }
- #endif
-
-+#if defined(CONFIG_SPNFS_BLOCK)
-+int nfsd_bl_init(void);
-+#endif
- static int __init init_nfsd(void)
- {
- int retval;
-@@ -1441,6 +1524,15 @@ static int __init init_nfsd(void)
- retval = create_proc_exports_entry();
- if (retval)
- goto out_free_idmap;
-+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
-+ retval = spnfs_init_proc();
-+ if (retval != 0)
-+ goto out_free_idmap;
-+#if defined(CONFIG_SPNFS_BLOCK)
-+ nfsd_bl_init();
-+#endif /* CONFIG_SPNFS_BLOCK */
-+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
++/*
++ * I/O done common code
++ */
++static void
++objlayout_iodone(struct objlayout_io_state *state)
++{
++ dprintk("%s: state %p status\n", __func__, state);
+
- retval = register_filesystem(&nfsd_fs_type);
- if (retval)
- goto out_free_all;
-@@ -1463,7 +1555,22 @@ out_free_stat:
-
- static void __exit exit_nfsd(void)
- {
-+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
-+ remove_proc_entry("fs/nfs/spnfs/recall", NULL);
-+ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL);
-+ remove_proc_entry("fs/nfs/spnfs/getfh", NULL);
-+ remove_proc_entry("fs/nfs/spnfs/config", NULL);
-+ remove_proc_entry("fs/nfs/spnfs/ctl", NULL);
-+ remove_proc_entry("fs/nfs/spnfs", NULL);
-+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
++ if (likely(state->status >= 0)) {
++ objlayout_free_io_state(state);
++ } else {
++ struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.layout);
+
-+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
-+ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL);
-+ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL);
-+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */
++ spin_lock(&objlay->lock);
++ objlay->delta_space_valid = OBJ_DSU_INVALID;
++ list_add(&objlay->err_list, &state->err_list);
++ spin_unlock(&objlay->lock);
++ }
++}
+
- nfsd_export_shutdown();
-+ nfsd4_pnfs_dlm_shutdown();
- nfsd_reply_cache_shutdown();
- remove_proc_entry("fs/nfs/exports", NULL);
- remove_proc_entry("fs/nfs", NULL);
-diff -up linux-2.6.35.noarch/fs/nfsd/nfsd.h.orig linux-2.6.35.noarch/fs/nfsd/nfsd.h
---- linux-2.6.35.noarch/fs/nfsd/nfsd.h.orig 2010-09-30 12:22:45.288046000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfsd.h 2010-09-30 12:25:08.547288000 -0400
-@@ -286,11 +286,17 @@ extern time_t nfsd4_grace;
- #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
- NFSD4_SUPPORTED_ATTRS_WORD0
-
-+#if defined(CONFIG_PNFSD)
-+#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
-+ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES)
-+#else /* CONFIG_PNFSD */
- #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
- NFSD4_SUPPORTED_ATTRS_WORD1
-+#endif /* CONFIG_PNFSD */
-
- #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
-- (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
-+ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \
-+ FATTR4_WORD2_LAYOUT_BLKSIZE)
-
- static inline u32 nfsd_suppattrs0(u32 minorversion)
- {
-diff -up linux-2.6.35.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.35.noarch/fs/nfsd/nfsfh.c
---- linux-2.6.35.noarch/fs/nfsd/nfsfh.c.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfsfh.c 2010-09-30 12:25:08.553285000 -0400
-@@ -10,6 +10,7 @@
- #include <linux/exportfs.h>
-
- #include <linux/sunrpc/svcauth_gss.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
- #include "nfsd.h"
- #include "vfs.h"
- #include "auth.h"
-@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s
- static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
- {
- struct knfsd_fh *fh = &fhp->fh_handle;
-+ int fsid_type;
- struct fid *fid = NULL, sfid;
- struct svc_export *exp;
- struct dentry *dentry;
-@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct
- return error;
- if (fh->fh_auth_type != 0)
- return error;
-- len = key_len(fh->fh_fsid_type) / 4;
-+ fsid_type = pnfs_fh_fsid_type(fh);
-+ len = key_len(fsid_type) / 4;
- if (len == 0)
- return error;
- if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
-@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct
- data_left -= len;
- if (data_left < 0)
- return error;
-- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth);
-+ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth);
- fid = (struct fid *)(fh->fh_auth + len);
- } else {
- __u32 tfh[2];
-diff -up linux-2.6.35.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.35.noarch/fs/nfsd/nfsfh.h
---- linux-2.6.35.noarch/fs/nfsd/nfsfh.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfsfh.h 2010-09-30 12:25:08.558286000 -0400
-@@ -14,6 +14,7 @@ enum nfsd_fsid {
- FSID_UUID8,
- FSID_UUID16,
- FSID_UUID16_INUM,
-+ FSID_MAX
- };
-
- enum fsid_source {
-@@ -205,4 +206,42 @@ fh_unlock(struct svc_fh *fhp)
- }
- }
-
-+#if defined(CONFIG_PNFSD)
++/*
++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
++ *
++ * The @index component IO failed (error returned from target). Register
++ * the error for later reporting at layout-return.
++ */
++void
++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
++ int osd_error, u64 offset, u64 length, bool is_write)
++{
++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
++
++ BUG_ON(index >= state->num_comps);
++ if (osd_error) {
++ struct pnfs_osd_layout *layout =
++ (typeof(layout))state->objlseg->pnfs_osd_layout;
++
++ ioerr->oer_component = layout->olo_comps[index].oc_object_id;
++ ioerr->oer_comp_offset = offset;
++ ioerr->oer_comp_length = length;
++ ioerr->oer_iswrite = is_write;
++ ioerr->oer_errno = osd_error;
++
++ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
++ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
++ __func__, index, ioerr->oer_errno,
++ ioerr->oer_iswrite,
++ _DEVID_LO(&ioerr->oer_component.oid_device_id),
++ _DEVID_HI(&ioerr->oer_component.oid_device_id),
++ ioerr->oer_component.oid_partition_id,
++ ioerr->oer_component.oid_object_id,
++ ioerr->oer_comp_offset,
++ ioerr->oer_comp_length);
++ } else {
++ /* User need not call if no error is reported */
++ ioerr->oer_errno = 0;
++ }
++}
++
++static void _rpc_commit_complete(struct work_struct *work)
++{
++ struct rpc_task *task;
++ struct nfs_write_data *wdata;
++
++ dprintk("%s enter\n", __func__);
++ task = container_of(work, struct rpc_task, u.tk_work);
++ wdata = container_of(task, struct nfs_write_data, task);
++
++ pnfs_commit_done(wdata);
++}
+
+/*
-+ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied
-+ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how
-+ * to handle a given stateid.
++ * Commit data remotely on OSDs
+ */
-+static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
++enum pnfs_try_status
++objlayout_commit(struct nfs_write_data *wdata, int how)
+{
-+ return fh->fh_fsid_type >= FSID_MAX;
++ int status = PNFS_ATTEMPTED;
++
++ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete);
++ schedule_work(&wdata->task.u.tk_work);
++ dprintk("%s: Return %d\n", __func__, status);
++ return status;
+}
+
-+static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh)
++/* Work function, queued with schedule_work(), that calls pnfs_read_done().
++ * Completion is deferred this way because the osd completion is called
++ * with interrupts off from the block layer.
++ */
++static void _rpc_read_complete(struct work_struct *work)
+{
-+ BUG_ON(fh->fh_version != 1);
-+ BUG_ON(pnfs_fh_is_ds(fh));
-+ fh->fh_fsid_type += FSID_MAX;
++ struct rpc_task *task;
++ struct nfs_read_data *rdata;
++
++ dprintk("%s enter\n", __func__);
++ task = container_of(work, struct rpc_task, u.tk_work);
++ rdata = container_of(task, struct nfs_read_data, task);
++
++ pnfs_read_done(rdata);
+}
+
-+#else /* CONFIG_PNFSD */
++void
++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
++{
++ int eof = state->eof;
++ struct nfs_read_data *rdata;
+
-+static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
++ state->status = status;
++ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
++ rdata = state->rpcdata;
++ rdata->task.tk_status = status;
++ if (status >= 0) {
++ rdata->res.count = status;
++ rdata->res.eof = eof;
++ }
++ objlayout_iodone(state);
++ /* must not use state after this point */
++
++ if (sync)
++ pnfs_read_done(rdata);
++ else {
++ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
++ schedule_work(&rdata->task.u.tk_work);
++ }
++}
++
++/*
++ * Perform sync or async reads.
++ */
++enum pnfs_try_status
++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages)
+{
-+ return 0;
++ loff_t offset = rdata->args.offset;
++ size_t count = rdata->args.count;
++ struct objlayout_io_state *state;
++ ssize_t status = 0;
++ loff_t eof;
++
++ dprintk("%s: Begin inode %p offset %llu count %d\n",
++ __func__, rdata->inode, offset, (int)count);
++
++ eof = i_size_read(rdata->inode);
++ if (unlikely(offset + count > eof)) {
++ if (offset >= eof) {
++ status = 0;
++ rdata->res.count = 0;
++ rdata->res.eof = 1;
++ goto out;
++ }
++ count = eof - offset;
++ }
++
++ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
++ rdata->args.pages, rdata->args.pgbase,
++ nr_pages, offset, count,
++ rdata->pdata.lseg, rdata);
++ if (unlikely(!state)) {
++ status = -ENOMEM;
++ goto out;
++ }
++
++ state->eof = state->offset + state->count >= eof;
++
++ status = objio_read_pagelist(state);
++ out:
++ dprintk("%s: Return status %Zd\n", __func__, status);
++ rdata->pdata.pnfs_error = status;
++ return PNFS_ATTEMPTED;
+}
+
-+#endif /* CONFIG_PNFSD */
++/* Work function, queued with schedule_work(), that calls
++ * pnfs_writeback_done(). Completion is deferred this way because the osd
++ * completion is called with interrupts off from the block layer.
++ */
++static void _rpc_write_complete(struct work_struct *work)
++{
++ struct rpc_task *task;
++ struct nfs_write_data *wdata;
+
-+/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). */
-+static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh)
++ dprintk("%s enter\n", __func__);
++ task = container_of(work, struct rpc_task, u.tk_work);
++ wdata = container_of(task, struct nfs_write_data, task);
++
++ pnfs_writeback_done(wdata);
++}
++
++void
++objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
++ bool sync)
+{
-+ int fsid_type = fh->fh_fsid_type;
++ struct nfs_write_data *wdata;
+
-+ if (pnfs_fh_is_ds(fh))
-+ return fsid_type - FSID_MAX;
-+ return fsid_type;
++ dprintk("%s: Begin\n", __func__);
++ wdata = state->rpcdata;
++ state->status = status;
++ wdata->task.tk_status = status;
++ if (status >= 0) {
++ wdata->res.count = status;
++ wdata->verf.committed = state->committed;
++ dprintk("%s: Return status %d committed %d\n",
++ __func__, wdata->task.tk_status,
++ wdata->verf.committed);
++ } else
++ dprintk("%s: Return status %d\n",
++ __func__, wdata->task.tk_status);
++ objlayout_iodone(state);
++ /* must not use state after this point */
++
++ if (sync)
++ pnfs_writeback_done(wdata);
++ else {
++ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
++ schedule_work(&wdata->task.u.tk_work);
++ }
+}
+
- #endif /* _LINUX_NFSD_FH_INT_H */
-diff -up linux-2.6.35.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.35.noarch/fs/nfsd/nfssvc.c
---- linux-2.6.35.noarch/fs/nfsd/nfssvc.c.orig 2010-09-30 12:22:45.298047000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/nfssvc.c 2010-09-30 12:25:08.577287000 -0400
-@@ -115,7 +115,7 @@ struct svc_program nfsd_program = {
-
- };
-
--u32 nfsd_supported_minorversion;
-+u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION;
-
- int nfsd_vers(int vers, enum vers_op change)
- {
-diff -up linux-2.6.35.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.35.noarch/fs/nfsd/pnfsd.h
---- linux-2.6.35.noarch/fs/nfsd/pnfsd.h.orig 2010-09-30 12:25:08.580286000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/pnfsd.h 2010-09-30 12:25:08.581291000 -0400
-@@ -0,0 +1,143 @@
+/*
-+ * Copyright (c) 2005 The Regents of the University of Michigan.
-+ * All rights reserved.
-+ *
-+ * Andy Adamson <andros at umich.edu>
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ * 2. Redistributions in binary form must reproduce the above copyright
-+ * notice, this list of conditions and the following disclaimer in the
-+ * documentation and/or other materials provided with the distribution.
-+ * 3. Neither the name of the University nor the names of its
-+ * contributors may be used to endorse or promote products derived
-+ * from this software without specific prior written permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
++ * Perform sync or async writes.
+ */
++enum pnfs_try_status
++objlayout_write_pagelist(struct nfs_write_data *wdata,
++ unsigned nr_pages,
++ int how)
++{
++ struct objlayout_io_state *state;
++ ssize_t status;
+
-+#ifndef LINUX_NFSD_PNFSD_H
-+#define LINUX_NFSD_PNFSD_H
++ dprintk("%s: Begin inode %p offset %llu count %u\n",
++ __func__, wdata->inode, wdata->args.offset, wdata->args.count);
++
++ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
++ wdata->args.pages,
++ wdata->args.pgbase,
++ nr_pages,
++ wdata->args.offset,
++ wdata->args.count,
++ wdata->pdata.lseg, wdata);
++ if (unlikely(!state)) {
++ status = -ENOMEM;
++ goto out;
++ }
+
-+#include <linux/list.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
++ state->sync = how & FLUSH_SYNC;
+
-+#include "state.h"
-+#include "xdr4.h"
++ status = objio_write_pagelist(state, how & FLUSH_STABLE);
++ out:
++ dprintk("%s: Return status %Zd\n", __func__, status);
++ wdata->pdata.pnfs_error = status;
++ return PNFS_ATTEMPTED;
++}
+
-+/* outstanding layout stateid */
-+struct nfs4_layout_state {
-+ struct list_head ls_perfile;
-+ struct list_head ls_layouts; /* list of nfs4_layouts */
-+ struct kref ls_ref;
-+ struct nfs4_client *ls_client;
-+ struct nfs4_file *ls_file;
-+ stateid_t ls_stateid;
-+};
++void
++objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
++ struct xdr_stream *xdr,
++ const struct nfs4_layoutcommit_args *args)
++{
++ struct objlayout *objlay = OBJLAYOUT(pnfslay);
++ struct pnfs_osd_layoutupdate lou;
++ __be32 *start;
+
-+/* outstanding layout */
-+struct nfs4_layout {
-+ struct list_head lo_perfile; /* hash by f_id */
-+ struct list_head lo_perclnt; /* hash by clientid */
-+ struct list_head lo_perstate;
-+ struct nfs4_file *lo_file; /* backpointer */
-+ struct nfs4_client *lo_client;
-+ struct nfs4_layout_state *lo_state;
-+ struct nfsd4_layout_seg lo_seg;
-+};
++ dprintk("%s: Begin\n", __func__);
+
-+struct pnfs_inval_state {
-+ struct knfsd_fh mdsfh; /* needed only by invalidate all */
-+ stateid_t stid;
-+ clientid_t clid;
-+ u32 status;
-+};
++ spin_lock(&objlay->lock);
++ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
++ lou.dsu_delta = objlay->delta_space_used;
++ objlay->delta_space_used = 0;
++ objlay->delta_space_valid = OBJ_DSU_INIT;
++ lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
++ spin_unlock(&objlay->lock);
+
-+/* pNFS Data Server state */
-+#define DS_STATEID_VALID 0
-+#define DS_STATEID_ERROR 1
-+#define DS_STATEID_NEW 2
++ start = xdr_reserve_space(xdr, 4);
+
-+struct pnfs_ds_stateid {
-+ struct list_head ds_hash; /* ds_stateid hash entry */
-+ struct list_head ds_perclid; /* per client hash entry */
-+ stateid_t ds_stid;
-+ struct knfsd_fh ds_fh;
-+ unsigned long ds_access;
-+ u32 ds_status; /* from MDS */
-+ u32 ds_verifier[2]; /* from MDS */
-+ wait_queue_head_t ds_waitq;
-+ unsigned long ds_flags;
-+ struct kref ds_ref;
-+ clientid_t ds_mdsclid;
-+};
++ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
+
-+struct pnfs_ds_clientid {
-+ struct list_head dc_hash; /* mds_clid_hashtbl entry */
-+ struct list_head dc_stateid; /* ds_stateid head */
-+ struct list_head dc_permdsid; /* per mdsid hash entry */
-+ clientid_t dc_mdsclid;
-+ struct kref dc_ref;
-+ uint32_t dc_mdsid;
-+};
++ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+
-+struct pnfs_mds_id {
-+ struct list_head di_hash; /* mds_nodeid list entry */
-+ struct list_head di_mdsclid; /* mds_clientid head */
-+ uint32_t di_mdsid;
-+ time_t di_mdsboot; /* mds boot time */
-+ struct kref di_ref;
-+};
++ dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
++ lou.dsu_delta, lou.olu_ioerr_flag);
++}
+
-+/* notify device request (from exported filesystem) */
-+struct nfs4_notify_device {
-+ struct nfsd4_pnfs_cb_dev_list *nd_list;
-+ struct nfs4_client *nd_client;
-+ struct list_head nd_perclnt;
++static int
++err_prio(u32 oer_errno)
++{
++ switch (oer_errno) {
++ case 0:
++ return 0;
+
-+ void *nd_args; /* nfsd internal */
-+};
++ case PNFS_OSD_ERR_RESOURCE:
++ return OSD_ERR_PRI_RESOURCE;
++ case PNFS_OSD_ERR_BAD_CRED:
++ return OSD_ERR_PRI_BAD_CRED;
++ case PNFS_OSD_ERR_NO_ACCESS:
++ return OSD_ERR_PRI_NO_ACCESS;
++ case PNFS_OSD_ERR_UNREACHABLE:
++ return OSD_ERR_PRI_UNREACHABLE;
++ case PNFS_OSD_ERR_NOT_FOUND:
++ return OSD_ERR_PRI_NOT_FOUND;
++ case PNFS_OSD_ERR_NO_SPACE:
++ return OSD_ERR_PRI_NO_SPACE;
++ default:
++ WARN_ON(1);
++ /* fallthrough */
++ case PNFS_OSD_ERR_EIO:
++ return OSD_ERR_PRI_EIO;
++ }
++}
+
-+u64 find_create_sbid(struct super_block *);
-+struct super_block *find_sbid_id(u64);
-+__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *);
-+int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *,
-+ struct nfsd4_pnfs_layoutreturn *);
-+int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *);
-+int nfs4_pnfs_cb_change_state(struct pnfs_get_state *);
-+void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
-+int put_layoutrecall(struct nfs4_layoutrecall *);
-+void nomatching_layout(struct nfs4_layoutrecall *);
-+void *layoutrecall_done(struct nfs4_layoutrecall *);
-+int nfsd4_cb_layout(struct nfs4_layoutrecall *);
-+int nfsd_layout_recall_cb(struct super_block *, struct inode *,
-+ struct nfsd4_pnfs_cb_layout *);
-+int nfsd_device_notify_cb(struct super_block *,
-+ struct nfsd4_pnfs_cb_dev_list *);
-+int nfsd4_cb_notify_device(struct nfs4_notify_device *);
-+void pnfs_set_device_notify(clientid_t *, unsigned int types);
-+void pnfs_clear_device_notify(struct nfs4_client *);
++static void
++merge_ioerr(struct pnfs_osd_ioerr *dest_err,
++ const struct pnfs_osd_ioerr *src_err)
++{
++ u64 dest_end, src_end;
+
-+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
-+extern struct sockaddr pnfsd_lexp_addr;
-+extern size_t pnfs_lexp_addr_len;
++ if (!dest_err->oer_errno) {
++ *dest_err = *src_err;
++ /* accumulated device must be blank */
++ memset(&dest_err->oer_component.oid_device_id, 0,
++ sizeof(dest_err->oer_component.oid_device_id));
+
-+extern void pnfsd_lexp_init(struct inode *);
-+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
++ return;
++ }
+
-+#endif /* LINUX_NFSD_PNFSD_H */
-diff -up linux-2.6.35.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.35.noarch/fs/nfsd/pnfsd_lexp.c
---- linux-2.6.35.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-09-30 12:25:08.584288000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/pnfsd_lexp.c 2010-09-30 12:25:08.586286000 -0400
-@@ -0,0 +1,225 @@
-+/*
-+ * linux/fs/nfsd/pnfs_lexp.c
-+ *
-+ * pNFS export of local filesystems.
-+ *
-+ * Export local file systems over the files layout type.
-+ * The MDS (metadata server) functions also as a single DS (data server).
-+ * This is mostly useful for development and debugging purposes.
-+ *
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 2 of the License, or
-+ * (at your option) any later version.
-+ *
-+ * Copyright (C) 2008 Benny Halevy, <bhalevy at panasas.com>
-+ *
-+ * Initial implementation was based on the pnfs-gfs2 patches done
-+ * by David M. Richter <richterd at citi.umich.edu>
-+ */
++ if (dest_err->oer_component.oid_partition_id !=
++ src_err->oer_component.oid_partition_id)
++ dest_err->oer_component.oid_partition_id = 0;
+
-+#include <linux/sunrpc/svc_xprt.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
++ if (dest_err->oer_component.oid_object_id !=
++ src_err->oer_component.oid_object_id)
++ dest_err->oer_component.oid_object_id = 0;
+
-+#include "pnfsd.h"
++ if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
++ dest_err->oer_comp_offset = src_err->oer_comp_offset;
+
-+#define NFSDDBG_FACILITY NFSDDBG_PNFS
++ dest_end = end_offset(dest_err->oer_comp_offset,
++ dest_err->oer_comp_length);
++ src_end = end_offset(src_err->oer_comp_offset,
++ src_err->oer_comp_length);
++ if (dest_end < src_end)
++ dest_end = src_end;
+
-+struct sockaddr pnfsd_lexp_addr;
-+size_t pnfs_lexp_addr_len;
++ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
+
-+static int
-+pnfsd_lexp_layout_type(struct super_block *sb)
-+{
-+ int ret = LAYOUT_NFSV4_1_FILES;
-+ dprintk("<-- %s: return %d\n", __func__, ret);
-+ return ret;
++ if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
++ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
++ dest_err->oer_errno = src_err->oer_errno;
++ } else if (src_err->oer_iswrite) {
++ dest_err->oer_iswrite = true;
++ dest_err->oer_errno = src_err->oer_errno;
++ }
+}
+
-+static int
-+pnfsd_lexp_get_device_iter(struct super_block *sb,
-+ u32 layout_type,
-+ struct nfsd4_pnfs_dev_iter_res *res)
++static void
++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr)
+{
-+ dprintk("--> %s: sb=%p\n", __func__, sb);
++ struct objlayout_io_state *state, *tmp;
++ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
+
-+ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
++ unsigned i;
+
-+ res->gd_eof = 1;
-+ if (res->gd_cookie)
-+ return -ENOENT;
-+ res->gd_cookie = 1;
-+ res->gd_verf = 1;
-+ res->gd_devid = 1;
++ for (i = 0; i < state->num_comps; i++) {
++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
++
++ if (!ioerr->oer_errno)
++ continue;
++
++ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
++ "dev(%llx:%llx) par=0x%llx obj=0x%llx "
++ "offset=0x%llx length=0x%llx\n",
++ __func__, i, ioerr->oer_errno,
++ ioerr->oer_iswrite,
++ _DEVID_LO(&ioerr->oer_component.oid_device_id),
++ _DEVID_HI(&ioerr->oer_component.oid_device_id),
++ ioerr->oer_component.oid_partition_id,
++ ioerr->oer_component.oid_object_id,
++ ioerr->oer_comp_offset,
++ ioerr->oer_comp_length);
++
++ merge_ioerr(&accumulated_err, ioerr);
++ }
++ list_del(&state->err_list);
++ objlayout_free_io_state(state);
++ }
+
-+ dprintk("<-- %s: return 0\n", __func__);
-+ return 0;
++ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err));
+}
+
-+static int
-+pnfsd_lexp_get_device_info(struct super_block *sb,
-+ struct exp_xdr_stream *xdr,
-+ u32 layout_type,
-+ const struct nfsd4_pnfs_deviceid *devid)
++void
++objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
++ struct xdr_stream *xdr,
++ const struct nfs4_layoutreturn_args *args)
+{
-+ int err;
-+ struct pnfs_filelayout_device fdev;
-+ struct pnfs_filelayout_multipath fl_devices[1];
-+ u32 fl_stripe_indices[1] = { 0 };
-+ struct pnfs_filelayout_devaddr daddr;
-+ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */
-+ char daddr_buf[8*4 + 2*3 + 10];
++ struct objlayout *objlay = OBJLAYOUT(pnfslay);
++ struct objlayout_io_state *state, *tmp;
++ __be32 *start, *uninitialized_var(last_xdr);
+
-+ dprintk("--> %s: sb=%p\n", __func__, sb);
++ dprintk("%s: Begin\n", __func__);
++ start = xdr_reserve_space(xdr, 4);
++ BUG_ON(!start);
+
-+ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
++ spin_lock(&objlay->lock);
+
-+ memset(&fdev, '\0', sizeof(fdev));
++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
++ unsigned i;
++ int res = 0;
+
-+ if (devid->devid != 1) {
-+ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 "
-+ "(got: 0x%llx)\n", __func__, devid->devid);
-+ err = -EINVAL;
-+ goto out;
-+ }
++ for (i = 0; i < state->num_comps && !res; i++) {
++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
-+ /* count the number of comma-delimited DS IPs */
-+ fdev.fl_device_length = 1;
-+ fdev.fl_device_list = fl_devices;
++ if (!ioerr->oer_errno)
++ continue;
+
-+ fdev.fl_stripeindices_length = fdev.fl_device_length;
-+ fdev.fl_stripeindices_list = fl_stripe_indices;
++ dprintk("%s: err[%d]: errno=%d is_write=%d "
++ "dev(%llx:%llx) par=0x%llx obj=0x%llx "
++ "offset=0x%llx length=0x%llx\n",
++ __func__, i, ioerr->oer_errno,
++ ioerr->oer_iswrite,
++ _DEVID_LO(&ioerr->oer_component.oid_device_id),
++ _DEVID_HI(&ioerr->oer_component.oid_device_id),
++ ioerr->oer_component.oid_partition_id,
++ ioerr->oer_component.oid_object_id,
++ ioerr->oer_comp_offset,
++ ioerr->oer_comp_length);
+
-+ daddr.r_addr.data = daddr_buf;
-+ daddr.r_addr.len = sizeof(daddr_buf);
-+ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr);
-+ if (err < 0)
-+ goto out;
-+ daddr.r_addr.len = err;
-+ switch (pnfsd_lexp_addr.sa_family) {
-+ case AF_INET:
-+ daddr.r_netid.data = "tcp";
-+ daddr.r_netid.len = 3;
-+ break;
-+ case AF_INET6:
-+ daddr.r_netid.data = "tcp6";
-+ daddr.r_netid.len = 4;
-+ break;
-+ default:
-+ BUG();
++ last_xdr = xdr->p;
++ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]);
++ }
++ if (unlikely(res)) {
++ /* no space for even one error descriptor */
++ BUG_ON(last_xdr == start + 1);
++
++ /* we've encountered a situation with lots and lots of
++ * errors and no space to encode them all. Use the last
++ * available slot to report the union of all the
++ * remaining errors.
++ */
++ xdr_rewind_stream(xdr, last_xdr -
++ pnfs_osd_ioerr_xdr_sz() / 4);
++ encode_accumulated_error(objlay, xdr);
++ goto loop_done;
++ }
++ list_del(&state->err_list);
++ objlayout_free_io_state(state);
+ }
-+ fdev.fl_device_list[0].fl_multipath_length = 1;
-+ fdev.fl_device_list[0].fl_multipath_list = &daddr;
++loop_done:
++ spin_unlock(&objlay->lock);
+
-+ /* have nfsd encode the device info */
-+ err = filelayout_encode_devinfo(xdr, &fdev);
-+out:
-+ dprintk("<-- %s: return %d\n", __func__, err);
-+ return err;
++ *start = cpu_to_be32((xdr->p - start - 1) * 4);
++ dprintk("%s: Return\n", __func__);
+}
+
-+static int get_stripe_unit(int blocksize)
-+{
-+ if (blocksize < NFSSVC_MAXBLKSIZE)
-+ blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
-+ dprintk("%s: return %d\n", __func__, blocksize);
-+ return blocksize;
-+}
++struct objlayout_deviceinfo {
++ struct page *page;
++ struct pnfs_osd_deviceaddr da; /* This must be last */
++};
+
-+static enum nfsstat4
-+pnfsd_lexp_layout_get(struct inode *inode,
-+ struct exp_xdr_stream *xdr,
-+ const struct nfsd4_pnfs_layoutget_arg *arg,
-+ struct nfsd4_pnfs_layoutget_res *res)
++/* Initialize and call nfs4_proc_getdeviceinfo(), then decode and return a
++ * "struct pnfs_osd_deviceaddr *" through @deviceaddr. Eventually
++ * objlayout_put_deviceinfo() should be called to release it.
++ */
++int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
++ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr)
+{
-+ enum nfsstat4 rc = NFS4_OK;
-+ struct pnfs_filelayout_layout *layout = NULL;
-+ struct knfsd_fh *fhp = NULL;
++ struct objlayout_deviceinfo *odi;
++ struct pnfs_device pd;
++ struct super_block *sb;
++ struct page *page;
++ size_t sz;
++ u32 *p;
++ int err;
+
-+ dprintk("--> %s: inode=%p\n", __func__, inode);
++ page = alloc_page(GFP_KERNEL);
++ if (!page)
++ return -ENOMEM;
+
-+ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
-+ res->lg_seg.offset = 0;
-+ res->lg_seg.length = NFS4_MAX_UINT64;
++ pd.area = page_address(page);
+
-+ layout = kzalloc(sizeof(*layout), GFP_KERNEL);
-+ if (layout == NULL) {
-+ rc = -ENOMEM;
-+ goto error;
-+ }
++ memcpy(&pd.dev_id, d_id, sizeof(*d_id));
++ pd.layout_type = LAYOUT_OSD2_OBJECTS;
++ pd.pages = &page;
++ pd.pgbase = 0;
++ pd.pglen = PAGE_SIZE;
++ pd.mincount = 0;
+
-+ /* Set file layout response args */
-+ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
-+ layout->lg_stripe_type = STRIPE_SPARSE;
-+ layout->lg_commit_through_mds = true;
-+ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
-+ layout->lg_fh_length = 1;
-+ layout->device_id.sbid = arg->lg_sbid;
-+ layout->device_id.devid = 1; /*FSFTEMP*/
-+ layout->lg_first_stripe_index = 0; /*FSFTEMP*/
-+ layout->lg_pattern_offset = 0;
++ sb = pnfslay->inode->i_sb;
++ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->inode), &pd);
++ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
++ if (err)
++ goto err_out;
+
-+ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
-+ if (fhp == NULL) {
-+ rc = -ENOMEM;
-+ goto error;
++ p = pd.area;
++ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p);
++ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL);
++ if (!odi) {
++ err = -ENOMEM;
++ goto err_out;
+ }
++ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
++ odi->page = page;
++ *deviceaddr = &odi->da;
++ return 0;
+
-+ memcpy(fhp, arg->lg_fh, sizeof(*fhp));
-+ pnfs_fh_mark_ds(fhp);
-+ layout->lg_fh_list = fhp;
-+
-+ /* Call nfsd to encode layout */
-+ rc = filelayout_encode_layout(xdr, layout);
-+exit:
-+ kfree(layout);
-+ kfree(fhp);
-+ dprintk("<-- %s: return %d\n", __func__, rc);
-+ return rc;
-+
-+error:
-+ res->lg_seg.length = 0;
-+ goto exit;
++err_out:
++ __free_page(page);
++ return err;
+}
+
-+static int
-+pnfsd_lexp_layout_commit(struct inode *inode,
-+ const struct nfsd4_pnfs_layoutcommit_arg *args,
-+ struct nfsd4_pnfs_layoutcommit_res *res)
++void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
+{
-+ dprintk("%s: (unimplemented)\n", __func__);
++ struct objlayout_deviceinfo *odi = container_of(deviceaddr,
++ struct objlayout_deviceinfo,
++ da);
+
-+ return 0;
++ __free_page(odi->page);
++ kfree(odi);
+}
+
-+static int
-+pnfsd_lexp_layout_return(struct inode *inode,
-+ const struct nfsd4_pnfs_layoutreturn_arg *args)
++/*
++ * Initialize a mountpoint by retrieving the list of
++ * available devices for it.
++ * Returns 0 and stores the driver's per-mount data in
++ * server->pnfs_ld_data, or the error code from objio_init_mt() on failure.
++ */
++int
++objlayout_initialize_mountpoint(struct nfs_server *server,
++ const struct nfs_fh *mntfh)
+{
-+ dprintk("%s: (unimplemented)\n", __func__);
-+
-+ return 0;
-+}
++ void *data;
+
-+static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh,
-+ struct pnfs_get_state *p)
-+{
-+ return 0; /* just use the current stateid */
-+}
++ data = objio_init_mt();
++ if (IS_ERR(data)) {
++ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n",
++ __func__, PTR_ERR(data));
++ return PTR_ERR(data);
++ }
++ server->pnfs_ld_data = data;
+
-+static struct pnfs_export_operations pnfsd_lexp_ops = {
-+ .layout_type = pnfsd_lexp_layout_type,
-+ .get_device_info = pnfsd_lexp_get_device_info,
-+ .get_device_iter = pnfsd_lexp_get_device_iter,
-+ .layout_get = pnfsd_lexp_layout_get,
-+ .layout_commit = pnfsd_lexp_layout_commit,
-+ .layout_return = pnfsd_lexp_layout_return,
-+ .get_state = pnfsd_lexp_get_state,
-+};
++ dprintk("%s: Return data=%p\n", __func__, data);
++ return 0;
++}
+
-+void
-+pnfsd_lexp_init(struct inode *inode)
++/*
++ * Uninitialize a mountpoint
++ */
++int
++objlayout_uninitialize_mountpoint(struct nfs_server *server)
+{
-+ dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops);
-+ inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops;
++ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data);
++ objio_fini_mt(server->pnfs_ld_data);
++ return 0;
+}
-diff -up linux-2.6.35.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.35.noarch/fs/nfsd/spnfs_com.c
---- linux-2.6.35.noarch/fs/nfsd/spnfs_com.c.orig 2010-09-30 12:25:08.589286000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/spnfs_com.c 2010-09-30 12:25:08.590292000 -0400
-@@ -0,0 +1,535 @@
+diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
+new file mode 100644
+index 0000000..cad24a4
+--- /dev/null
++++ b/fs/nfs/objlayout/objlayout.h
+@@ -0,0 +1,206 @@
+/*
-+ * fs/nfsd/spnfs_com.c
++ * objlayout.h
+ *
-+ * Communcation layer between spNFS kernel and userspace
-+ * Based heavily on idmap.c
++ * Data types and function declarations for interfacing with the
++ * pNFS standard object layout driver.
+ *
-+ */
-+
-+/*
-+ * Copyright (c) 2002 The Regents of the University of Michigan.
++ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
-+ * Marius Aamodt Eriksen <marius at umich.edu>
++ * Benny Halevy <bhalevy at panasas.com>
++ * Boaz Harrosh <bharrosh at panasas.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2
++ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
@@ -13488,7 +11530,7 @@ diff -up linux-2.6.35.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.35.noarch/fs/nfs
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
-+ * 3. Neither the name of the University nor the names of its
++ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
@@ -13504,2025 +11546,1922 @@ diff -up linux-2.6.35.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.35.noarch/fs/nfs
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
-+#include <linux/namei.h>
-+#include <linux/mount.h>
-+#include <linux/path.h>
-+#include <linux/sunrpc/clnt.h>
-+#include <linux/sunrpc/rpc_pipe_fs.h>
-+#include <linux/nfsd/debug.h>
+
-+#include <linux/nfsd4_spnfs.h>
++#ifndef _OBJLAYOUT_H
++#define _OBJLAYOUT_H
+
-+#define NFSDDBG_FACILITY NFSDDBG_PROC
++#include <linux/nfs_fs.h>
++#include <linux/pnfs_osd_xdr.h>
++#include "../pnfs.h"
+
-+static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-+ char __user *, size_t);
-+static ssize_t spnfs_pipe_downcall(struct file *, const char __user *,
-+ size_t);
-+static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *);
++/*
++ * in-core layout segment
++ */
++struct objlayout_segment {
++ struct pnfs_layout_segment lseg;
++ void *internal; /* for provider internal use */
++ u8 pnfs_osd_layout[];
++};
+
-+static struct rpc_pipe_ops spnfs_upcall_ops = {
-+ .upcall = spnfs_pipe_upcall,
-+ .downcall = spnfs_pipe_downcall,
-+ .destroy_msg = spnfs_pipe_destroy_msg,
++/*
++ * per-inode layout
++ */
++struct objlayout {
++ struct pnfs_layout_hdr pnfs_layout;
++
++ /* for layout_commit */
++ enum osd_delta_space_valid_enum {
++ OBJ_DSU_INIT = 0,
++ OBJ_DSU_VALID,
++ OBJ_DSU_INVALID,
++ } delta_space_valid;
++ s64 delta_space_used; /* consumed by write ops */
++
++ /* for layout_return */
++ spinlock_t lock;
++ struct list_head err_list;
+};
+
-+/* evil global variable */
-+struct spnfs *global_spnfs;
-+struct spnfs_config *spnfs_config;
-+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
-+int spnfs_use_layoutsegments;
-+uint64_t layoutsegment_size;
-+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++static inline struct objlayout *
++OBJLAYOUT(struct pnfs_layout_hdr *lo)
++{
++ return container_of(lo, struct objlayout, pnfs_layout);
++}
+
+/*
-+ * Used by spnfs_enabled()
-+ * Tracks if the subsystem has been initialized at some point. It doesn't
-+ * matter if it's not currently initialized.
++ * per-I/O operation state
++ * embedded in objects provider io_state data structure
+ */
-+static int spnfs_enabled_at_some_point;
++struct objlayout_io_state {
++ struct objlayout_segment *objlseg;
+
-+/* call this to start the ball rolling */
-+/* code it like we're going to avoid the global variable in the future */
-+int
-+nfsd_spnfs_new(void)
++ struct page **pages;
++ unsigned pgbase;
++ unsigned nr_pages;
++ unsigned long count;
++ loff_t offset;
++ bool sync;
++
++ void *rpcdata;
++ int status; /* res */
++ int eof; /* res */
++ int committed; /* res */
++
++ /* Error reporting (layout_return) */
++ struct list_head err_list;
++ unsigned num_comps;
++ /* Pointer to array of error descriptors of size num_comps.
++ * It should contain as many entries as devices in the osd_layout
++ * that participate in the I/O. It is up to the io_engine to allocate
++ * needed space and set num_comps.
++ */
++ struct pnfs_osd_ioerr *ioerrs;
++};
++
++/*
++ * Raid engine I/O API
++ */
++extern void *objio_init_mt(void);
++extern void objio_fini_mt(void *mt);
++
++extern int objio_alloc_lseg(void **outp,
++ struct pnfs_layout_hdr *pnfslay,
++ struct pnfs_layout_segment *lseg,
++ struct pnfs_osd_layout *layout);
++extern void objio_free_lseg(void *p);
++
++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp);
++extern void objio_free_io_state(struct objlayout_io_state *state);
++
++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
++ bool stable);
++
++/*
++ * callback API
++ */
++extern void objlayout_io_set_result(struct objlayout_io_state *state,
++ unsigned index, int osd_error,
++ u64 offset, u64 length, bool is_write);
++
++static inline void
++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+{
-+ struct spnfs *spnfs = NULL;
-+ struct path path;
-+ struct nameidata nd;
-+ int rc;
++ struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.layout);
+
-+ if (global_spnfs != NULL)
-+ return -EEXIST;
++ /* If one of the I/Os errored out and the delta_space_used was
++ * invalid we render the complete report as invalid. The protocol
++ * mandates that the DSU be accurate or not reported.
++ */
++ spin_lock(&objlay->lock);
++ if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
++ objlay->delta_space_valid = OBJ_DSU_VALID;
++ objlay->delta_space_used += space_used;
++ }
++ spin_unlock(&objlay->lock);
++}
+
-+ path.mnt = rpc_get_mount();
-+ if (IS_ERR(path.mnt))
-+ return PTR_ERR(path.mnt);
++extern void objlayout_read_done(struct objlayout_io_state *state,
++ ssize_t status, bool sync);
++extern void objlayout_write_done(struct objlayout_io_state *state,
++ ssize_t status, bool sync);
++
++extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
++ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr);
++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
++
++/*
++ * exported generic objects function vectors
++ */
++
++extern int objlayout_initialize_mountpoint(
++ struct nfs_server *,
++ const struct nfs_fh *);
++extern int objlayout_uninitialize_mountpoint(struct nfs_server *);
++
++extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *);
++extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
++
++extern struct pnfs_layout_segment *objlayout_alloc_lseg(
++ struct pnfs_layout_hdr *,
++ struct nfs4_layoutget_res *);
++extern void objlayout_free_lseg(struct pnfs_layout_segment *);
++
++extern enum pnfs_try_status objlayout_read_pagelist(
++ struct nfs_read_data *,
++ unsigned nr_pages);
++
++extern enum pnfs_try_status objlayout_write_pagelist(
++ struct nfs_write_data *,
++ unsigned nr_pages,
++ int how);
++
++extern enum pnfs_try_status objlayout_commit(
++ struct nfs_write_data *,
++ int how);
++
++extern void objlayout_encode_layoutcommit(
++ struct pnfs_layout_hdr *,
++ struct xdr_stream *,
++ const struct nfs4_layoutcommit_args *);
++
++extern void objlayout_encode_layoutreturn(
++ struct pnfs_layout_hdr *,
++ struct xdr_stream *,
++ const struct nfs4_layoutreturn_args *);
++
++#endif /* _OBJLAYOUT_H */
+diff --git a/fs/nfs/objlayout/panfs_shim.c b/fs/nfs/objlayout/panfs_shim.c
+new file mode 100644
+index 0000000..4d31856
+--- /dev/null
++++ b/fs/nfs/objlayout/panfs_shim.c
+@@ -0,0 +1,702 @@
++/*
++ * panfs_shim.c
++ *
++ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack
++ *
++ * Copyright (C) 2007-2009 Panasas Inc.
++ * All rights reserved.
++ *
++ * Benny Halevy <bhalevy at panasas.com>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the Panasas company nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ * See the file COPYING included with this distribution for more details.
++ *
++ */
+
-+ /* FIXME: do not abuse rpc_pipefs/nfs */
-+ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
-+ if (rc)
-+ goto err;
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <asm/byteorder.h>
+
-+ spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL);
-+ if (spnfs == NULL){
-+ rc = -ENOMEM;
-+ goto err;
-+ }
++#include "objlayout.h"
++#include "panfs_shim.h"
+
-+ spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs,
-+ &spnfs_upcall_ops, 0);
-+ if (IS_ERR(spnfs->spnfs_dentry)) {
-+ rc = -EPIPE;
-+ goto err;
-+ }
++#include <linux/panfs_shim_api.h>
+
-+ mutex_init(&spnfs->spnfs_lock);
-+ mutex_init(&spnfs->spnfs_plock);
-+ init_waitqueue_head(&spnfs->spnfs_wq);
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
-+ global_spnfs = spnfs;
-+ spnfs_enabled_at_some_point = 1;
++struct panfs_export_operations *panfs_export_ops;
+
-+ return 0;
-+err:
-+ rpc_put_mount();
-+ kfree(spnfs);
-+ return rc;
++void *
++objio_init_mt(void)
++{
++ return panfs_export_ops == NULL ? ERR_PTR(-EAGAIN) : NULL;
+}
+
-+/* again, code it like we're going to remove the global variable */
-+void
-+nfsd_spnfs_delete(void)
++void objio_fini_mt(void *mountid)
+{
-+ struct spnfs *spnfs = global_spnfs;
++}
+
-+ if (!spnfs)
-+ return;
-+ rpc_unlink(spnfs->spnfs_dentry);
-+ rpc_put_mount();
-+ global_spnfs = NULL;
-+ kfree(spnfs);
++static int
++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout,
++ struct pnfs_osd_data_map *lo_map,
++ pan_agg_layout_hdr_t *hdr)
++{
++ if (lo_map->odm_mirror_cnt) {
++ hdr->type = PAN_AGG_RAID1;
++ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1;
++ } else if (layout->olo_num_comps > 1) {
++ hdr->type = PAN_AGG_RAID0;
++ hdr->hdr.raid0.num_comps = layout->olo_num_comps;
++ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit;
++ } else
++ hdr->type = PAN_AGG_SIMPLE;
++ return 0;
+}
+
-+/* RPC pipefs upcall/downcall routines */
-+/* looks like this code is invoked by the rpc_pipe code */
-+/* to handle upcalls on things we've queued elsewhere */
-+/* See nfs_idmap_id for an exmaple of enqueueing */
-+static ssize_t
-+spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-+ char __user *dst, size_t buflen)
++static int
++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout,
++ struct pnfs_osd_data_map *lo_map,
++ pan_agg_layout_hdr_t *hdr)
+{
-+ char *data = (char *)msg->data + msg->copied;
-+ ssize_t mlen = msg->len - msg->copied;
-+ ssize_t left;
++ if (lo_map->odm_mirror_cnt)
++ goto err;
+
-+ if (mlen > buflen)
-+ mlen = buflen;
++ if (lo_map->odm_group_width || lo_map->odm_group_depth) {
++ if (!lo_map->odm_group_width || !lo_map->odm_group_depth)
++ goto err;
+
-+ left = copy_to_user(dst, data, mlen);
-+ if (left < 0) {
-+ msg->errno = left;
-+ return left;
++ hdr->type = PAN_AGG_GRP_RAID5_LEFT;
++ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps;
++ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps)
++ goto err;
++ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit;
++ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width;
++ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth;
++ /* this is a guess, panasas server is not supposed to
++ hand out layout otherwise */
++ hdr->hdr.grp_raid5_left.group_layout_policy =
++ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN;
++ } else {
++ hdr->type = PAN_AGG_RAID5_LEFT;
++ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps;
++ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps)
++ goto err;
++ hdr->hdr.raid5_left.stripe_unit2 =
++ hdr->hdr.raid5_left.stripe_unit1 =
++ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit;
+ }
-+ mlen -= left;
-+ msg->copied += mlen;
-+ msg->errno = 0;
-+ return mlen;
++
++ return 0;
++err:
++ return -EINVAL;
+}
+
-+static ssize_t
-+spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
++/*
++ * Convert a pnfs_osd data map into Panasas aggregation layout header
++ */
++static int
++panfs_shim_conv_pnfs_osd_data_map(
++ struct pnfs_osd_layout *layout,
++ pan_agg_layout_hdr_t *hdr)
+{
-+ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
-+ struct spnfs *spnfs = (struct spnfs *)rpci->private;
-+ struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im;
-+ int ret;
-+
-+ if (mlen != sizeof(struct spnfs_msg))
-+ return -ENOSPC;
++ int status = -EINVAL;
++ struct pnfs_osd_data_map *lo_map = &layout->olo_map;
+
-+ im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+ if (im_in == NULL)
-+ return -ENOMEM;
++ if (!layout->olo_num_comps) {
++ dprintk("%s: !!layout.n_comps(%u)\n", __func__,
++ layout->olo_num_comps);
++ goto err;
++ }
+
-+ if (copy_from_user(im_in, src, mlen) != 0)
-+ return -EFAULT;
++ switch (lo_map->odm_raid_algorithm) {
++ case PNFS_OSD_RAID_0:
++ if (layout->olo_num_comps != lo_map->odm_num_comps ||
++ layout->olo_comps_index) {
++ dprintk("%s: !!PNFS_OSD_RAID_0 "
++ "layout.n_comps(%u) map.n_comps(%u) "
++ "comps_index(%u)\n", __func__,
++ layout->olo_num_comps,
++ lo_map->odm_num_comps,
++ layout->olo_comps_index);
++ goto err;
++ }
++ status = panfs_shim_conv_raid01(layout, lo_map, hdr);
++ break;
+
-+ mutex_lock(&spnfs->spnfs_plock);
++ case PNFS_OSD_RAID_5:
++ if (!lo_map->odm_group_width) {
++ if (layout->olo_num_comps != lo_map->odm_num_comps ||
++ layout->olo_comps_index) {
++ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width "
++ "layout.n_comps(%u)!=map.n_comps(%u) "
++ "|| comps_index(%u)\n", __func__,
++ layout->olo_num_comps,
++ lo_map->odm_num_comps,
++ layout->olo_comps_index);
++ goto err;
++ }
++ } else if ((layout->olo_num_comps != lo_map->odm_num_comps &&
++ layout->olo_num_comps > lo_map->odm_group_width) ||
++ (layout->olo_comps_index % lo_map->odm_group_width)){
++ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) "
++ "layout.n_comps(%u) map.n_comps(%u) "
++ "comps_index(%u)\n", __func__,
++ lo_map->odm_group_width,
++ layout->olo_num_comps,
++ lo_map->odm_num_comps,
++ layout->olo_comps_index);
++ goto err;
++ }
++ status = panfs_shim_conv_raid5(layout, lo_map, hdr);
++ break;
+
-+ ret = mlen;
-+ im->im_status = im_in->im_status;
-+ /* If we got an error, terminate now, and wake up pending upcalls */
-+ if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) {
-+ wake_up(&spnfs->spnfs_wq);
-+ goto out;
++ case PNFS_OSD_RAID_4:
++ case PNFS_OSD_RAID_PQ:
++ default:
++ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__,
++ lo_map->odm_raid_algorithm);
++ goto err;
+ }
+
-+ ret = -EINVAL;
-+ /* Did we match the current upcall? */
-+ /* DMXXX: do not understand the comment above, from original code */
-+ /* DMXXX: when do we _not_ match the current upcall? */
-+ /* DMXXX: anyway, let's to a simplistic check */
-+ if (im_in->im_type == im->im_type) {
-+ /* copy the response into the spnfs struct */
-+ memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res));
-+ ret = mlen;
-+ } else
-+ dprintk("spnfs: downcall type != upcall type\n");
++ return 0;
++
++err:
++ return status;
++}
++
++/*
++ * Convert pnfs_osd layout into Panasas map and caps type
++ */
++int
++objio_alloc_lseg(void **outp,
++ struct pnfs_layout_hdr *pnfslay,
++ struct pnfs_layout_segment *lseg,
++ struct pnfs_osd_layout *layout)
++{
++ int i, total_comps;
++ int status;
++ struct pnfs_osd_object_cred *lo_comp;
++ pan_size_t alloc_sz, local_sz;
++ pan_sm_map_cap_t *mcs = NULL;
++ u8 *buf;
++ pan_agg_comp_obj_t *pan_comp;
++ pan_sm_sec_t *pan_sec;
++
++ status = -EINVAL;
++ if (layout->olo_num_comps < layout->olo_map.odm_group_width) {
++ total_comps = layout->olo_comps_index + layout->olo_num_comps;
++ } else {
++ /* allocate full map, otherwise SAM gets confused */
++ total_comps = layout->olo_map.odm_num_comps;
++ }
++ alloc_sz = total_comps *
++ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t));
++ for (i = 0; i < layout->olo_num_comps; i++) {
++ void *p = layout->olo_comps[i].oc_cap.cred;
++ if (panfs_export_ops->sm_sec_t_get_size_otw(
++ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL))
++ goto err;
++ alloc_sz += local_sz;
++ }
+
++ status = -ENOMEM;
++ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL);
++ if (!mcs)
++ goto err;
++ buf = (u8 *)&mcs[1];
+
-+ wake_up(&spnfs->spnfs_wq);
-+/* DMXXX handle rval processing */
-+out:
-+ mutex_unlock(&spnfs->spnfs_plock);
-+ kfree(im_in);
-+ return ret;
-+}
++ mcs->offset = lseg->range.offset;
++ mcs->length = lseg->range.length;
++#if 0
++ /* FIXME: for now */
++ mcs->expiration_time.ts_sec = 0;
++ mcs->expiration_time.ts_nsec = 0;
++#endif
++ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL;
++ status = panfs_shim_conv_pnfs_osd_data_map(layout,
++ &mcs->full_map.layout_hdr);
++ if (status)
++ goto err;
+
-+static void
-+spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg)
-+{
-+ struct spnfs_msg *im = msg->data;
-+ struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im);
++ mcs->full_map.components.size = total_comps;
++ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf;
++ buf += total_comps * sizeof(pan_agg_comp_obj_t);
+
-+ if (msg->errno >= 0)
-+ return;
-+ mutex_lock(&spnfs->spnfs_plock);
-+ im->im_status = SPNFS_STATUS_FAIL; /* DMXXX */
-+ wake_up(&spnfs->spnfs_wq);
-+ mutex_unlock(&spnfs->spnfs_plock);
-+}
++ mcs->secs.size = total_comps;
++ mcs->secs.data = (pan_sm_sec_t *)buf;
++ buf += total_comps * sizeof(pan_sm_sec_t);
+
-+/* generic upcall. called by functions in spnfs_ops.c */
-+int
-+spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg,
-+ union spnfs_msg_res *res)
-+{
-+ struct rpc_pipe_msg msg;
-+ struct spnfs_msg *im;
-+ DECLARE_WAITQUEUE(wq, current);
-+ int ret = -EIO;
-+ int rval;
++ lo_comp = layout->olo_comps;
++ pan_comp = mcs->full_map.components.data + layout->olo_comps_index;
++ pan_sec = mcs->secs.data + layout->olo_comps_index;
++ for (i = 0; i < layout->olo_num_comps; i++) {
++ void *p;
++ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id;
++ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id;
++ u64 dev_id = __be64_to_cpup(
++ (__be64 *)oc_obj_id->oid_device_id.data + 1);
+
-+ im = &spnfs->spnfs_im;
++ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n",
++ __func__, i,
++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data),
++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1),
++ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id);
+
-+ mutex_lock(&spnfs->spnfs_lock);
-+ mutex_lock(&spnfs->spnfs_plock);
++ if (i == 0) {
++ /* make up mgr_id to calm sam down */
++ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0,
++ &obj_id->dev_id);
++ obj_id->grp_id = oc_obj_id->oid_partition_id;
++ obj_id->obj_id = oc_obj_id->oid_object_id;
++ }
+
-+ memset(im, 0, sizeof(*im));
-+ memcpy(im, upmsg, sizeof(*upmsg));
++ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) {
++ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n",
++ __func__, i, (u64)obj_id->grp_id,
++ lo_comp->oc_object_id.oid_partition_id);
++ status = -EINVAL;
++ goto err;
++ }
+
-+ memset(&msg, 0, sizeof(msg));
-+ msg.data = im;
-+ msg.len = sizeof(*im);
++ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) {
++ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n",
++ __func__, i, obj_id->obj_id,
++ lo_comp->oc_object_id.oid_object_id);
++ status = -EINVAL;
++ goto err;
++ }
+
-+ add_wait_queue(&spnfs->spnfs_wq, &wq);
-+ rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg);
-+ if (rval < 0) {
-+ remove_wait_queue(&spnfs->spnfs_wq, &wq);
-+ goto out;
-+ }
++ pan_comp->dev_id = dev_id;
++ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) {
++ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n",
++ __func__, i, obj_id->dev_id);
++ status = -EINVAL;
++ goto err;
++ }
++ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) {
++ dprintk("%s: degraded maps not supported yet\n",
++ __func__);
++ status = -ENOTSUPP;
++ goto err;
++ }
++ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL;
++ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) {
++ dprintk("%s: cap key security not supported yet\n",
++ __func__);
++ status = -ENOTSUPP;
++ goto err;
++ }
+
-+ set_current_state(TASK_UNINTERRUPTIBLE);
-+ mutex_unlock(&spnfs->spnfs_plock);
-+ schedule();
-+ current->state = TASK_RUNNING;
-+ remove_wait_queue(&spnfs->spnfs_wq, &wq);
-+ mutex_lock(&spnfs->spnfs_plock);
++ p = lo_comp->oc_cap.cred;
++ panfs_export_ops->sm_sec_t_unmarshall(
++ (pan_sm_sec_otw_t *)&p,
++ pan_sec,
++ buf,
++ alloc_sz,
++ NULL,
++ &local_sz);
++ buf += local_sz;
++ alloc_sz -= local_sz;
+
-+ if (im->im_status & SPNFS_STATUS_SUCCESS) {
-+ /* copy our result from the upcall */
-+ memcpy(res, &im->im_res, sizeof(*res));
-+ ret = 0;
++ lo_comp++;
++ pan_comp++;
++ pan_sec++;
+ }
+
-+out:
-+ memset(im, 0, sizeof(*im));
-+ mutex_unlock(&spnfs->spnfs_plock);
-+ mutex_unlock(&spnfs->spnfs_lock);
-+ return(ret);
++ *outp = mcs;
++ dprintk("%s:Return mcs=%p\n", __func__, mcs);
++ return 0;
++
++err:
++ objio_free_lseg(mcs);
++ dprintk("%s:Error %d\n", __func__, status);
++ return status;
+}
+
+/*
-+ * This is used to determine if the spnfsd daemon has been started at
-+ * least once since the system came up. This is used to by the export
-+ * mechanism to decide if spnfs is in use.
-+ *
-+ * Returns non-zero if the spnfsd has initialized the communication pipe
-+ * at least once.
++ * Free a Panasas map and caps type
+ */
-+int spnfs_enabled(void)
++void
++objio_free_lseg(void *p)
+{
-+ return spnfs_enabled_at_some_point;
++ kfree(p);
+}
+
-+#ifdef CONFIG_PROC_FS
-+
+/*
-+ * procfs virtual files for user/kernel space communication:
-+ *
-+ * ctl - currently just an on/off switch...can be expanded
-+ * getfh - fd to fh conversion
-+ * recall - recall a layout from the command line, for example:
-+ * echo <path> > /proc/fs/spnfs/recall
-+ * config - configuration info, e.g., stripe size, num ds, etc.
++ * I/O routines
+ */
-+
-+/*-------------- start ctl -------------------------*/
-+static ssize_t ctl_write(struct file *file, const char __user *buf,
-+ size_t count, loff_t *offset)
-+{
-+ int cmd, rc;
-+
-+ if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int)))
-+ return -EFAULT;
-+ if (cmd) {
-+ rc = nfsd_spnfs_new();
-+ if (rc != 0)
-+ return rc;
-+ } else
-+ nfsd_spnfs_delete();
-+
-+ return count;
-+}
-+
-+static const struct file_operations ctl_ops = {
-+ .write = ctl_write,
-+};
-+/*-------------- end ctl ---------------------------*/
-+
-+/*-------------- start config -------------------------*/
-+static ssize_t config_write(struct file *file, const char __user *buf,
-+ size_t count, loff_t *offset)
++int
++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
+{
-+ static struct spnfs_config cfg;
++ struct panfs_shim_io_state *p;
+
-+ if (copy_from_user(&cfg, buf, count))
-+ return -EFAULT;
++ dprintk("%s: allocating io_state\n", __func__);
++ p = kzalloc(sizeof(*p), GFP_KERNEL);
++ if (!p)
++ return -ENOMEM;
+
-+ spnfs_config = &cfg;
++ *outp = &p->ol_state;
+ return 0;
+}
+
-+static const struct file_operations config_ops = {
-+ .write = config_write,
-+};
-+/*-------------- end config ---------------------------*/
-+
-+/*-------------- start getfh -----------------------*/
-+static int getfh_open(struct inode *inode, struct file *file)
++/*
++ * Free an I/O state
++ */
++void
++objio_free_io_state(struct objlayout_io_state *ol_state)
+{
-+ file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
-+ if (file->private_data == NULL)
-+ return -ENOMEM;
++ struct panfs_shim_io_state *state = container_of(ol_state,
++ struct panfs_shim_io_state, ol_state);
++ int i;
+
-+ return 0;
++ dprintk("%s: freeing io_state\n", __func__);
++ for (i = 0; i < state->ol_state.nr_pages; i++)
++ kunmap(state->ol_state.pages[i]);
++
++ if (state->ucreds)
++ panfs_export_ops->ucreds_put(state->ucreds);
++ kfree(state->sg_list);
++ kfree(state);
+}
+
-+static ssize_t getfh_read(struct file *file, char __user *buf, size_t count,
-+ loff_t *offset)
++static int
++panfs_shim_pages_to_sg(
++ struct panfs_shim_io_state *state,
++ struct page **pages,
++ unsigned int pgbase,
++ unsigned nr_pages,
++ size_t count)
+{
-+ if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh)))
-+ return -EFAULT;
++ unsigned i, n;
++ pan_sg_entry_t *sg;
+
-+ return count;
-+}
++ dprintk("%s pgbase %u nr_pages %u count %d "
++ "pg0 %p flags 0x%x index %llu\n",
++ __func__, pgbase, nr_pages, (int)count, pages[0],
++ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index);
+
-+static ssize_t getfh_write(struct file *file, const char __user *buf,
-+ size_t count, loff_t *offset)
-+{
-+ int fd;
++ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL);
++ if (sg == NULL)
++ return -ENOMEM;
+
-+ if (copy_from_user((int *)&fd, (int *)buf, sizeof(int)))
-+ return -EFAULT;
-+ if (spnfs_getfh(fd, file->private_data) != 0)
-+ return -EIO;
++ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n",
++ __func__, sg, pages, pgbase, nr_pages);
+
-+ return count;
-+}
++ for (i = 0; i < nr_pages; i++) {
++ sg[i].buffer = (char *)kmap(pages[i]) + pgbase;
++ n = PAGE_SIZE - pgbase;
++ pgbase = 0;
++ if (n > count)
++ n = count;
++ sg[i].chunk_size = n;
++ count -= n;
++ if (likely(count)) {
++ sg[i].next = &sg[i+1];
++ } else {
++ /* we're done */
++ sg[i].next = NULL;
++ break;
++ }
++ }
++ BUG_ON(count);
+
-+static int getfh_release(struct inode *inode, struct file *file)
-+{
-+ kfree(file->private_data);
++ state->sg_list = sg;
+ return 0;
+}
+
-+static const struct file_operations getfh_ops = {
-+ .open = getfh_open,
-+ .read = getfh_read,
-+ .write = getfh_write,
-+ .release = getfh_release,
-+};
-+/*-------------- end getfh ------------------------*/
-+
-+
-+/*-------------- start recall layout --------------*/
-+static ssize_t recall_write(struct file *file, const char __user *buf,
-+ size_t count, loff_t *offset)
++/*
++ * Callback function for async reads
++ */
++static void
++panfs_shim_read_done(
++ void *arg1,
++ void *arg2,
++ pan_sam_read_res_t *res_p,
++ pan_status_t rc)
+{
-+ char input[128];
-+ char *path, *str, *p;
-+ int rc;
-+ u64 off = 0, len = 0;
++ struct panfs_shim_io_state *state = arg1;
++ ssize_t status;
+
-+ if (count > 128)
-+ return -EINVAL;
++ dprintk("%s: Begin\n", __func__);
++ if (!res_p)
++ res_p = &state->u.read.res;
++ if (rc == PAN_SUCCESS)
++ rc = res_p->result;
++ if (rc == PAN_SUCCESS) {
++ status = res_p->length;
++ WARN_ON(status < 0);
++ } else {
++ status = -panfs_export_ops->convert_rc(rc);
++ dprintk("%s: pan_sam_read rc %d: status %Zd\n",
++ __func__, rc, status);
++ }
++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc);
++ objlayout_read_done(&state->ol_state, status, true);
++}
+
-+ if (copy_from_user(input, buf, count))
-+ return -EFAULT;
++ssize_t
++objio_read_pagelist(struct objlayout_io_state *ol_state)
++{
++ struct panfs_shim_io_state *state = container_of(ol_state,
++ struct panfs_shim_io_state, ol_state);
++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal;
++ ssize_t status = 0;
++ pan_status_t rc = PAN_SUCCESS;
+
-+ /* assumes newline-terminated path */
-+ p = memchr(input, '\n', count);
-+ if (p == NULL)
-+ return -EINVAL;
-+ *p = '\0';
++ dprintk("%s: Begin\n", __func__);
+
-+ /*
-+ * Scan for path and, optionally, an offset and length
-+ * of a layout segment to be recalled; if there are two
-+ * fields, they're assumed to be path and offset.
-+ */
-+ p = input;
-+ path = strsep(&p, " ");
-+ if (path == NULL)
-+ return -EINVAL;
++ status = panfs_shim_pages_to_sg(state, ol_state->pages,
++ ol_state->pgbase, ol_state->nr_pages,
++ ol_state->count);
++ if (unlikely(status))
++ goto err;
+
-+ str = strsep(&p, " ");
-+ if (str != NULL) {
-+ rc = strict_strtoull(str, 10, &off);
-+ if (rc != 0)
-+ return -EINVAL;
++ state->obj_sec.min_security = 0;
++ state->obj_sec.map_ccaps = mcs;
+
-+ str = strsep(&p, " ");
-+ if (str != NULL) {
-+ rc = strict_strtoull(str, 10, &len);
-+ if (rc != 0)
-+ return -EINVAL;
-+ }
++ rc = panfs_export_ops->ucreds_get(&state->ucreds);
++ if (unlikely(rc)) {
++ status = -EACCES;
++ goto err;
+ }
+
-+ rc = spnfs_test_layoutrecall(path, off, len);
-+ if (rc != 0)
-+ return rc;
-+
-+ return count;
++ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id;
++ state->u.read.args.offset = ol_state->offset;
++ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP,
++ &state->u.read.args,
++ &state->obj_sec,
++ state->sg_list,
++ state->ucreds,
++ ol_state->sync ?
++ NULL : panfs_shim_read_done,
++ state, NULL,
++ &state->u.read.res);
++ if (rc != PAN_ERR_IN_PROGRESS)
++ panfs_shim_read_done(state, NULL, &state->u.read.res, rc);
++ err:
++ dprintk("%s: Return %Zd\n", __func__, status);
++ return status;
+}
+
-+static const struct file_operations recall_ops = {
-+ .write = recall_write,
-+};
-+/*-------------- end recall layout --------------*/
-+
-+
-+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
-+/*-------------- start layoutseg -------------------------*/
-+static ssize_t layoutseg_write(struct file *file, const char __user *buf,
-+ size_t count, loff_t *offset)
++/*
++ * Callback function for async writes
++ */
++static void
++panfs_shim_write_done(
++ void *arg1,
++ void *arg2,
++ pan_sam_write_res_t *res_p,
++ pan_status_t rc)
+{
-+ char cmd[3];
++ struct panfs_shim_io_state *state = arg1;
++ ssize_t status;
+
-+ if (copy_from_user(cmd, buf, 1))
-+ return -EFAULT;
-+ if (cmd[0] == '0')
-+ spnfs_use_layoutsegments = 0;
-+ else
-+ spnfs_use_layoutsegments = 1;
++ dprintk("%s: Begin\n", __func__);
++ if (!res_p)
++ res_p = &state->u.write.res;
++ if (rc == PAN_SUCCESS)
++ rc = res_p->result;
++ if (rc == PAN_SUCCESS) {
++/* state->ol_state.committed = NFS_FILE_SYNC;*/
++ state->ol_state.committed = NFS_UNSTABLE;
++ status = res_p->length;
++ WARN_ON(status < 0);
+
-+ return count;
++ objlayout_add_delta_space_used(&state->ol_state,
++ res_p->delta_capacity_used);
++ } else {
++ status = -panfs_export_ops->convert_rc(rc);
++ dprintk("%s: pan_sam_write rc %u: status %Zd\n",
++ __func__, rc, status);
++ }
++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc);
++ objlayout_write_done(&state->ol_state, status, true);
+}
+
-+static const struct file_operations layoutseg_ops = {
-+ .write = layoutseg_write,
-+};
-+/*-------------- end layoutseg ---------------------------*/
-+
-+/*-------------- start layoutsegsize -------------------------*/
-+static ssize_t layoutsegsize_write(struct file *file, const char __user *buf,
-+ size_t count, loff_t *offset)
++ssize_t
++objio_write_pagelist(struct objlayout_io_state *ol_state,
++ bool stable /* unused, PanOSD writes are stable */)
+{
-+ char cmd[50];
++ struct panfs_shim_io_state *state = container_of(ol_state,
++ struct panfs_shim_io_state, ol_state);
++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal;
++ ssize_t status = 0;
++ pan_status_t rc = PAN_SUCCESS;
+
-+ if (copy_from_user(cmd, buf, 49))
-+ return -EFAULT;
-+ layoutsegment_size = simple_strtoull(cmd, NULL, 10);
++ dprintk("%s: Begin\n", __func__);
+
-+ return count;
++ status = panfs_shim_pages_to_sg(state, ol_state->pages,
++ ol_state->pgbase, ol_state->nr_pages,
++ ol_state->count);
++ if (unlikely(status))
++ goto err;
++
++ state->obj_sec.min_security = 0;
++ state->obj_sec.map_ccaps = mcs;
++
++ rc = panfs_export_ops->ucreds_get(&state->ucreds);
++ if (unlikely(rc)) {
++ status = -EACCES;
++ goto err;
++ }
++
++ state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id;
++ state->u.write.args.offset = ol_state->offset;
++ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE,
++ &state->u.write.args,
++ &state->obj_sec,
++ state->sg_list,
++ state->ucreds,
++ ol_state->sync ?
++ NULL : panfs_shim_write_done,
++ state,
++ NULL,
++ &state->u.write.res);
++ if (rc != PAN_ERR_IN_PROGRESS)
++ panfs_shim_write_done(state, NULL, &state->u.write.res, rc);
++ err:
++ dprintk("%s: Return %Zd\n", __func__, status);
++ return status;
+}
+
-+static const struct file_operations layoutsegsize_ops = {
-+ .write = layoutsegsize_write,
-+};
-+/*-------------- end layoutsegsize ---------------------------*/
-+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
-+
+int
-+spnfs_init_proc(void)
++panfs_shim_register(struct panfs_export_operations *ops)
+{
-+ struct proc_dir_entry *entry;
-+
-+ entry = proc_mkdir("fs/spnfs", NULL);
-+ if (!entry)
-+ return -ENOMEM;
-+
-+ entry = create_proc_entry("fs/spnfs/ctl", 0, NULL);
-+ if (!entry)
-+ return -ENOMEM;
-+ entry->proc_fops = &ctl_ops;
-+
-+ entry = create_proc_entry("fs/spnfs/config", 0, NULL);
-+ if (!entry)
-+ return -ENOMEM;
-+ entry->proc_fops = &config_ops;
++ if (panfs_export_ops) {
++ printk(KERN_INFO
++ "%s: panfs already registered (panfs ops %p)\n",
++ __func__, panfs_export_ops);
++ return -EINVAL;
++ }
+
-+ entry = create_proc_entry("fs/spnfs/getfh", 0, NULL);
-+ if (!entry)
-+ return -ENOMEM;
-+ entry->proc_fops = &getfh_ops;
++ printk(KERN_INFO "%s: registering panfs ops %p\n",
++ __func__, ops);
+
-+ entry = create_proc_entry("fs/spnfs/recall", 0, NULL);
-+ if (!entry)
-+ return -ENOMEM;
-+ entry->proc_fops = &recall_ops;
++ panfs_export_ops = ops;
++ return 0;
++}
++EXPORT_SYMBOL(panfs_shim_register);
+
-+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
-+ entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL);
-+ if (!entry)
-+ return -ENOMEM;
-+ entry->proc_fops = &layoutseg_ops;
++int
++panfs_shim_unregister(void)
++{
++ if (!panfs_export_ops) {
++ printk(KERN_INFO "%s: panfs is not registered\n", __func__);
++ return -EINVAL;
++ }
+
-+ entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL);
-+ if (!entry)
-+ return -ENOMEM;
-+ entry->proc_fops = &layoutsegsize_ops;
-+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++ printk(KERN_INFO "%s: unregistering panfs ops %p\n",
++ __func__, panfs_export_ops);
+
++ panfs_export_ops = NULL;
+ return 0;
+}
-+#endif /* CONFIG_PROC_FS */
-diff -up linux-2.6.35.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.35.noarch/fs/nfsd/spnfs_ops.c
---- linux-2.6.35.noarch/fs/nfsd/spnfs_ops.c.orig 2010-09-30 12:25:08.593289000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/spnfs_ops.c 2010-09-30 12:25:08.595287000 -0400
-@@ -0,0 +1,878 @@
++EXPORT_SYMBOL(panfs_shim_unregister);
++
+/*
-+ * fs/nfsd/spnfs_ops.c
-+ *
-+ * Communcation layer between spNFS kernel and userspace
-+ *
++ * Policy Operations
+ */
-+/******************************************************************************
-+
-+(c) 2007 Network Appliance, Inc. All Rights Reserved.
-+
-+Network Appliance provides this source code under the GPL v2 License.
-+The GPL v2 license is available at
-+http://opensource.org/licenses/gpl-license.php.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
-+******************************************************************************/
-+
-+#include <linux/sched.h>
-+#include <linux/file.h>
-+#include <linux/namei.h>
-+#include <linux/nfs_fs.h>
-+#include <linux/nfsd4_spnfs.h>
-+#include <linux/nfsd/debug.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+#include <linux/nfsd/nfs4layoutxdr.h>
-+
-+#include "pnfsd.h"
-+
-+/* comment out CONFIG_SPNFS_TEST for non-test behaviour */
-+/* #define CONFIG_SPNFS_TEST 1 */
-+
-+#define NFSDDBG_FACILITY NFSDDBG_PNFS
++#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024)
++#define PANLAYOUT_DEF_STRIPE_WIDTH 9
++#define PANLAYOUT_MAX_STRIPE_WIDTH 11
++#define PANLAYOUT_MAX_GATHER_STRIPES 8
+
+/*
-+ * The functions that are called from elsewhere in the kernel
-+ * to perform tasks in userspace
-+ *
++ * Get the max [rw]size
+ */
-+
-+#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
-+extern int spnfs_use_layoutsegments;
-+extern uint64_t layoutsegment_size;
-+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
-+extern struct spnfs *global_spnfs;
-+
-+int
-+spnfs_layout_type(struct super_block *sb)
++static ssize_t
++panlayout_get_blocksize(void)
+{
-+ return LAYOUT_NFSV4_1_FILES;
++ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) *
++ PANLAYOUT_DEF_STRIPE_UNIT *
++ PANLAYOUT_MAX_GATHER_STRIPES;
++ dprintk("%s: Return %Zd\n", __func__, sz);
++ return sz;
+}
+
-+enum nfsstat4
-+spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr,
-+ const struct nfsd4_pnfs_layoutget_arg *lg_arg,
-+ struct nfsd4_pnfs_layoutget_res *lg_res)
-+{
-+ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
-+ struct spnfs_msg *im = NULL;
-+ union spnfs_msg_res *res = NULL;
-+ struct pnfs_filelayout_layout *flp = NULL;
-+ int status, i;
-+ enum nfsstat4 nfserr;
-+
-+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+ if (im == NULL) {
-+ nfserr = NFS4ERR_LAYOUTTRYLATER;
-+ goto layoutget_cleanup;
-+ }
-+
-+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
-+ if (res == NULL) {
-+ nfserr = NFS4ERR_LAYOUTTRYLATER;
-+ goto layoutget_cleanup;
-+ }
-+
-+ im->im_type = SPNFS_TYPE_LAYOUTGET;
-+ im->im_args.layoutget_args.inode = inode->i_ino;
-+ im->im_args.layoutget_args.generation = inode->i_generation;
++/*
++ * Don't gather across stripes, but rather gather (coalesce) up to
++ * the stripe size.
++ *
++ * FIXME: change interface to use merge_align, merge_count
++ */
++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS)
+
-+ /* call function to queue the msg for upcall */
-+ if (spnfs_upcall(spnfs, im, res) != 0) {
-+ dprintk("failed spnfs upcall: layoutget\n");
-+ nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
-+ goto layoutget_cleanup;
-+ }
-+ status = res->layoutget_res.status;
-+ if (status != 0) {
-+ /* FIXME? until user mode is fixed, translate system error */
-+ switch (status) {
-+ case -E2BIG:
-+ case -ETOOSMALL:
-+ nfserr = NFS4ERR_TOOSMALL;
-+ break;
-+ case -ENOMEM:
-+ case -EAGAIN:
-+ case -EINTR:
-+ nfserr = NFS4ERR_LAYOUTTRYLATER;
-+ break;
-+ case -ENOENT:
-+ nfserr = NFS4ERR_BADLAYOUT;
-+ break;
-+ default:
-+ nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
-+ }
-+ dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n",
-+ status, nfserr);
-+ goto layoutget_cleanup;
-+ }
++static struct pnfs_layoutdriver_type panlayout_type = {
++ .id = PNFS_LAYOUT_PANOSD,
++ .name = "PNFS_LAYOUT_PANOSD",
++ .flags = PNFS_LAYOUTRET_ON_SETATTR,
+
-+ lg_res->lg_return_on_close = 0;
-+#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
-+ /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */
-+ /* the amount requested by the client. */
-+ if (spnfs_use_layoutsegments) {
-+ if (layoutsegment_size != 0)
-+ lg_res->lg_seg.length = layoutsegment_size;
-+ } else
-+ lg_res->lg_seg.length = NFS4_MAX_UINT64;
-+#else
-+ lg_res->lg_seg.length = NFS4_MAX_UINT64;
-+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++ .initialize_mountpoint = objlayout_initialize_mountpoint,
++ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint,
+
-+ flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL);
-+ if (flp == NULL) {
-+ nfserr = NFS4ERR_LAYOUTTRYLATER;
-+ goto layoutget_cleanup;
-+ }
-+ flp->device_id.sbid = lg_arg->lg_sbid;
-+ flp->device_id.devid = res->layoutget_res.devid;
-+ flp->lg_layout_type = 1; /* XXX */
-+ flp->lg_stripe_type = res->layoutget_res.stripe_type;
-+ flp->lg_commit_through_mds = 0;
-+ flp->lg_stripe_unit = res->layoutget_res.stripe_size;
-+ flp->lg_first_stripe_index = 0;
-+ flp->lg_pattern_offset = 0;
-+ flp->lg_fh_length = res->layoutget_res.stripe_count;
++ .alloc_layout_hdr = objlayout_alloc_layout_hdr,
++ .free_layout_hdr = objlayout_free_layout_hdr,
+
-+ flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh),
-+ GFP_KERNEL);
-+ if (flp->lg_fh_list == NULL) {
-+ nfserr = NFS4ERR_LAYOUTTRYLATER;
-+ goto layoutget_cleanup;
-+ }
-+ /*
-+ * FIX: Doing an extra copy here. Should group res.flist's fh_len
-+ * and fh_val into a knfsd_fh structure.
-+ */
-+ for (i = 0; i < flp->lg_fh_length; i++) {
-+ flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len;
-+ memcpy(&flp->lg_fh_list[i].fh_base,
-+ res->layoutget_res.flist[i].fh_val,
-+ res->layoutget_res.flist[i].fh_len);
-+ }
++ .alloc_lseg = objlayout_alloc_lseg,
++ .free_lseg = objlayout_free_lseg,
+
-+ /* encode the layoutget body */
-+ nfserr = filelayout_encode_layout(xdr, flp);
++ .get_blocksize = panlayout_get_blocksize,
+
-+layoutget_cleanup:
-+ if (flp) {
-+ if (flp->lg_fh_list)
-+ kfree(flp->lg_fh_list);
-+ kfree(flp);
-+ }
-+ kfree(im);
-+ kfree(res);
++ .read_pagelist = objlayout_read_pagelist,
++ .write_pagelist = objlayout_write_pagelist,
++ .commit = objlayout_commit,
+
-+ return nfserr;
-+}
++ .encode_layoutcommit = objlayout_encode_layoutcommit,
++ .encode_layoutreturn = objlayout_encode_layoutreturn,
++};
+
-+int
-+spnfs_layoutcommit(void)
++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs");
++MODULE_AUTHOR("Benny Halevy <bhalevy at panasas.com>");
++MODULE_LICENSE("GPL");
++
++static int __init
++panlayout_init(void)
+{
-+ return 0;
++ int ret = pnfs_register_layoutdriver(&panlayout_type);
++
++ if (ret)
++ printk(KERN_INFO
++ "%s: Registering Panasas OSD pNFS Layout Driver failed: error=%d\n",
++ __func__, ret);
++ else
++ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n",
++ __func__);
++ return ret;
+}
+
-+int
-+spnfs_layoutreturn(struct inode *inode,
-+ const struct nfsd4_pnfs_layoutreturn_arg *args)
++static void __exit
++panlayout_exit(void)
+{
-+ return 0;
++ pnfs_unregister_layoutdriver(&panlayout_type);
++ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n",
++ __func__);
+}
+
-+int
-+spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
-+{
-+ struct super_block *sb;
-+ struct nfsd4_pnfs_cb_layout lr;
++module_init(panlayout_init);
++module_exit(panlayout_exit);
+diff --git a/fs/nfs/objlayout/panfs_shim.h b/fs/nfs/objlayout/panfs_shim.h
+new file mode 100644
+index 0000000..18ef6db
+--- /dev/null
++++ b/fs/nfs/objlayout/panfs_shim.h
+@@ -0,0 +1,482 @@
++/*
++ * panfs_shim.h
++ *
++ * Data types and external function declarations for interfacing with
++ * panfs (Panasas DirectFlow) I/O stack
++ *
++ * Copyright (C) 2007 Panasas Inc.
++ * All rights reserved.
++ *
++ * Benny Halevy <bhalevy at panasas.com>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the Panasas company nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ * See the file COPYING included with this distribution for more details.
++ *
++ */
+
-+ switch (type) {
-+ case RETURN_FILE:
-+ sb = inode->i_sb;
-+ dprintk("%s: recalling layout for ino = %lu\n",
-+ __func__, inode->i_ino);
-+ break;
-+ case RETURN_FSID:
-+ sb = inode->i_sb;
-+ dprintk("%s: recalling layout for fsid x (unimplemented)\n",
-+ __func__);
-+ return 0;
-+ case RETURN_ALL:
-+ /* XXX figure out how to get a sb since there's no inode ptr */
-+ dprintk("%s: recalling all layouts (unimplemented)\n",
-+ __func__);
-+ return 0;
-+ default:
-+ return -EINVAL;
-+ }
++#ifndef _PANLAYOUT_PANFS_SHIM_H
++#define _PANLAYOUT_PANFS_SHIM_H
+
-+ lr.cbl_recall_type = type;
-+ lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES;
-+ lr.cbl_seg.clientid = 0;
-+ lr.cbl_seg.offset = offset;
-+ lr.cbl_seg.length = len;
-+ lr.cbl_seg.iomode = IOMODE_ANY;
-+ lr.cbl_layoutchanged = 0;
++typedef s8 pan_int8_t;
++typedef u8 pan_uint8_t;
++typedef s16 pan_int16_t;
++typedef u16 pan_uint16_t;
++typedef s32 pan_int32_t;
++typedef u32 pan_uint32_t;
++typedef s64 pan_int64_t;
++typedef u64 pan_uint64_t;
+
-+ nfsd_layout_recall_cb(sb, inode, &lr);
++/*
++ * from pan_base_types.h
++ */
++typedef pan_uint64_t pan_rpc_none_t;
++typedef pan_uint32_t pan_rpc_arrdim_t;
++typedef pan_uint32_t pan_status_t;
++typedef pan_uint8_t pan_otw_t;
++typedef pan_uint8_t pan_pad_t;
+
-+ return 0;
-+}
++typedef pan_uint32_t pan_timespec_sec_t;
++typedef pan_uint32_t pan_timespec_nsec_t;
+
++typedef struct pan_timespec_s pan_timespec_t;
++struct pan_timespec_s {
++ pan_timespec_sec_t ts_sec;
++ pan_timespec_nsec_t ts_nsec;
++};
+
-+int
-+spnfs_test_layoutrecall(char *path, u64 offset, u64 len)
-+{
-+ struct nameidata nd;
-+ struct inode *inode;
-+ int type, rc;
++/*
++ * from pan_std_types.h
++ */
++typedef pan_uint32_t pan_size_t;
++typedef int pan_bool_t;
+
-+ dprintk("%s: path=%s, offset=%llu, len=%llu\n",
-+ __func__, path, offset, len);
++/*
++ * from pan_common_error.h
++ */
++#define PAN_SUCCESS ((pan_status_t)0)
++#define PAN_ERR_IN_PROGRESS ((pan_status_t)55)
+
-+ if (strcmp(path, "all") == 0) {
-+ inode = NULL;
-+ type = RETURN_ALL;
-+ } else {
-+ rc = path_lookup(path, 0, &nd);
-+ if (rc != 0)
-+ return -ENOENT;
++/*
++ * from pan_sg.h
++ */
++typedef struct pan_sg_entry_s pan_sg_entry_t;
++struct pan_sg_entry_s {
++ void *buffer; /* pointer to memory */
++ pan_uint32_t chunk_size; /* size of each chunk (bytes) */
++ pan_sg_entry_t *next;
++};
+
-+ /*
-+ * XXX todo: add a RETURN_FSID scenario here...maybe if
-+ * inode is a dir...
-+ */
++/*
++ * from pan_storage.h
++ */
++typedef pan_uint64_t pan_stor_dev_id_t;
++typedef pan_uint32_t pan_stor_obj_grp_id_t;
++typedef pan_uint64_t pan_stor_obj_uniq_t;
++typedef pan_uint32_t pan_stor_action_t;
++typedef pan_uint8_t pan_stor_cap_key_t[20];
+
-+ inode = nd.path.dentry->d_inode;
-+ type = RETURN_FILE;
-+ }
++typedef pan_uint8_t pan_stor_key_type_t;
++typedef pan_uint64_t pan_stor_len_t;
++typedef pan_int64_t pan_stor_delta_len_t;
++typedef pan_uint64_t pan_stor_offset_t;
++typedef pan_uint16_t pan_stor_op_t;
+
-+ if (len == 0)
-+ len = NFS4_MAX_UINT64;
++typedef pan_uint16_t pan_stor_sec_level_t;
+
-+ rc = spnfs_layoutrecall(inode, type, offset, len);
++struct pan_stor_obj_id_s {
++ pan_stor_dev_id_t dev_id;
++ pan_stor_obj_uniq_t obj_id;
++ pan_stor_obj_grp_id_t grp_id;
++};
+
-+ if (type != RETURN_ALL)
-+ path_put(&nd.path);
-+ return rc;
-+}
++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t;
+
-+int
-+spnfs_getdeviceiter(struct super_block *sb,
-+ u32 layout_type,
-+ struct nfsd4_pnfs_dev_iter_res *gd_res)
-+{
-+ struct spnfs *spnfs = global_spnfs; /* XXX keep up the pretence */
-+ struct spnfs_msg *im = NULL;
-+ union spnfs_msg_res *res = NULL;
-+ int status = 0;
++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U)
++#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U)
++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U)
++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U)
++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U)
++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U)
++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U)
++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U)
++
++/*
++ * from pan_aggregation_map.h
++ */
++typedef pan_uint8_t pan_agg_type_t;
++typedef pan_uint64_t pan_agg_map_version_t;
++typedef pan_uint8_t pan_agg_obj_state_t;
++typedef pan_uint8_t pan_agg_comp_state_t;
++typedef pan_uint8_t pan_agg_comp_flag_t;
++
++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00)
++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01)
++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02)
++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03)
++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04)
++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05)
++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06)
++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07)
++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00)
++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01)
++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02)
++#define PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03)
++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00)
++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01)
++#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02)
++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04)
+
-+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+ if (im == NULL) {
-+ status = -ENOMEM;
-+ goto getdeviceiter_out;
-+ }
++struct pan_aggregation_map_s {
++ pan_agg_map_version_t version;
++ pan_agg_obj_state_t avail_state;
++ pan_stor_obj_id_t obj_id;
++};
+
-+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
-+ if (res == NULL) {
-+ status = -ENOMEM;
-+ goto getdeviceiter_out;
-+ }
++typedef struct pan_aggregation_map_s pan_aggregation_map_t;
+
-+ im->im_type = SPNFS_TYPE_GETDEVICEITER;
-+ im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie;
-+ im->im_args.getdeviceiter_args.verf = gd_res->gd_verf;
++struct pan_agg_comp_obj_s {
++ pan_stor_dev_id_t dev_id;
++ pan_agg_comp_state_t avail_state;
++ pan_agg_comp_flag_t comp_flags;
++};
+
-+ /* call function to queue the msg for upcall */
-+ status = spnfs_upcall(spnfs, im, res);
-+ if (status != 0) {
-+ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
-+ status = -EIO;
-+ goto getdeviceiter_out;
-+ }
-+ status = res->getdeviceiter_res.status;
++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t;
+
-+ if (res->getdeviceiter_res.eof)
-+ gd_res->gd_eof = 1;
-+ else {
-+ gd_res->gd_devid = res->getdeviceiter_res.devid;
-+ gd_res->gd_cookie = res->getdeviceiter_res.cookie;
-+ gd_res->gd_verf = res->getdeviceiter_res.verf;
-+ gd_res->gd_eof = 0;
-+ }
++struct pan_agg_simple_header_s {
++ pan_uint8_t unused;
++};
+
-+getdeviceiter_out:
-+ kfree(im);
-+ kfree(res);
++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t;
+
-+ return status;
-+}
++struct pan_agg_raid1_header_s {
++ pan_uint16_t num_comps;
++};
+
-+#ifdef CONFIG_SPNFS_TEST
-+/*
-+ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the
-+ * 1024 encoded stripe indices.
-+ *
-+ * Skip the devaddr4 length and encode the indicies count (1024) in the
-+ * rq_res.head and set the rq_res.head length.
-+ *
-+ * Set the rq_res page_len to 4096 (for the 1024 stripe indices).
-+ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the
-+ * rq_res head to hold the rest of the getdeviceinfo return.
-+ *
-+ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and
-+ * rq_respages[rq_resused] contains the rq_res.pages.
-+ */
-+static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info,
-+ const struct pnfs_filelayout_device *fdev)
-+{
-+ struct nfsd4_compoundres *resp = info->resp;
-+ struct svc_rqst *rqstp = resp->rqstp;
-+ struct xdr_buf *xb = &resp->rqstp->rq_res;
-+ __be32 *p;
++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t;
+
-+ p = nfsd4_xdr_reserve_space(resp, 8);
-+ p++; /* Fill in length later */
-+ *p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */
-+ resp->p = p;
++struct pan_agg_raid0_header_s {
++ pan_uint16_t num_comps;
++ pan_uint32_t stripe_unit;
++};
+
-+ xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base;
-+ xb->pages = &rqstp->rq_respages[rqstp->rq_resused];
-+ xb->page_base = 0;
-+ xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */
-+ xb->tail[0].iov_base = resp->p;
-+ resp->end = xb->head[0].iov_base + PAGE_SIZE;
-+ xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p;
-+ return 0;
-+}
-+/*
-+ * Return a stripeindices of length 1024 to test
-+ * the pNFS client multipage getdeviceinfo implementation.
-+ *
-+ * Encode a page of stripe indices.
-+ */
-+static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev,
-+ struct spnfs_device *dev,
-+ struct pnfs_devinfo_arg *info)
-+{
-+ struct svc_rqst *rqstp = info->xdr.resp->rqstp;
-+ __be32 *p;
-+ int i, j = 0;
++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t;
+
-+ p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]);
-+ fldev->fl_stripeindices_length = 1024;
-+ /* round-robin the data servers device index into the stripe indicie */
-+ for (i = 0; i < 1024; i++) {
-+ *p++ = cpu_to_be32(j);
-+ if (j < dev->dscount - 1)
-+ j++;
-+ else
-+ j = 0;
-+ }
-+ fldev->fl_stripeindices_list = NULL;
-+}
-+#endif /* CONFIG_SPNFS_TEST */
++struct pan_agg_raid5_left_header_s {
++ pan_uint16_t num_comps;
++ pan_uint32_t stripe_unit0;
++ pan_uint32_t stripe_unit1;
++ pan_uint32_t stripe_unit2;
++};
+
-+int
-+spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
-+ u32 layout_type,
-+ const struct nfsd4_pnfs_deviceid *devid)
-+{
-+ struct spnfs *spnfs = global_spnfs;
-+ struct spnfs_msg *im = NULL;
-+ union spnfs_msg_res *res = NULL;
-+ struct spnfs_device *dev;
-+ struct pnfs_filelayout_device *fldev = NULL;
-+ struct pnfs_filelayout_multipath *mp = NULL;
-+ struct pnfs_filelayout_devaddr *fldap = NULL;
-+ int status = 0, i, len;
++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t;
+
-+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+ if (im == NULL) {
-+ status = -ENOMEM;
-+ goto getdeviceinfo_out;
-+ }
++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t;
+
-+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
-+ if (res == NULL) {
-+ status = -ENOMEM;
-+ goto getdeviceinfo_out;
-+ }
++struct pan_agg_grp_raid5_left_header_s {
++ pan_uint16_t num_comps;
++ pan_uint32_t stripe_unit;
++ pan_uint16_t rg_width;
++ pan_uint16_t rg_depth;
++ pan_uint8_t group_layout_policy;
++};
+
-+ im->im_type = SPNFS_TYPE_GETDEVICEINFO;
-+ /* XXX FIX: figure out what to do about fsid */
-+ im->im_args.getdeviceinfo_args.devid = devid->devid;
++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00)
++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01)
+
-+ /* call function to queue the msg for upcall */
-+ status = spnfs_upcall(spnfs, im, res);
-+ if (status != 0) {
-+ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
-+ status = -EIO;
-+ goto getdeviceinfo_out;
-+ }
-+ status = res->getdeviceinfo_res.status;
-+ if (status != 0)
-+ goto getdeviceinfo_out;
++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00)
++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01)
++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02)
++#define PAN_AGG_RAID0 ((pan_agg_type_t) 0x03)
++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04)
++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06)
++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01)
++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06)
+
-+ dev = &res->getdeviceinfo_res.devinfo;
++struct pan_agg_layout_hdr_s {
++ pan_agg_type_t type;
++ pan_pad_t pad[3];
++ union {
++ pan_uint64_t null;
++ pan_agg_simple_header_t simple;
++ pan_agg_raid1_header_t raid1;
++ pan_agg_raid0_header_t raid0;
++ pan_agg_raid5_left_header_t raid5_left;
++ pan_agg_grp_raid5_left_header_t grp_raid5_left;
++ } hdr;
++};
+
-+ /* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */
-+ fldev = kzalloc(sizeof(struct pnfs_filelayout_device), GFP_KERNEL);
-+ if (fldev == NULL) {
-+ status = -ENOMEM;
-+ goto getdeviceinfo_out;
-+ }
++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t;
+
-+ /*
-+ * Stripe count is the same as data server count for our purposes
-+ */
-+ fldev->fl_stripeindices_length = dev->dscount;
-+ fldev->fl_device_length = dev->dscount;
++struct pan_agg_comp_obj_a_s {
++ pan_rpc_arrdim_t size;
++ pan_agg_comp_obj_t *data;
++};
++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a;
+
-+ /* Set stripe indices */
-+#ifdef CONFIG_SPNFS_TEST
-+ spnfs_set_test_indices(fldev, dev, info);
-+ fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr;
-+#else /* CONFIG_SPNFS_TEST */
-+ fldev->fl_stripeindices_list =
-+ kmalloc(fldev->fl_stripeindices_length * sizeof(u32),
-+ GFP_KERNEL);
-+ if (fldev->fl_stripeindices_list == NULL) {
-+ status = -ENOMEM;
-+ goto getdeviceinfo_out;
-+ }
-+ for (i = 0; i < fldev->fl_stripeindices_length; i++)
-+ fldev->fl_stripeindices_list[i] = i;
-+#endif /* CONFIG_SPNFS_TEST */
++struct pan_agg_full_map_s {
++ pan_aggregation_map_t map_hdr;
++ pan_agg_layout_hdr_t layout_hdr;
++ pan_agg_comp_obj_a components;
++};
+
-+ /*
-+ * Set the device's data server addresses No multipath for spnfs,
-+ * so mp length is always 1.
-+ *
-+ */
-+ fldev->fl_device_list =
-+ kmalloc(fldev->fl_device_length *
-+ sizeof(struct pnfs_filelayout_multipath),
-+ GFP_KERNEL);
-+ if (fldev->fl_device_list == NULL) {
-+ status = -ENOMEM;
-+ goto getdeviceinfo_out;
-+ }
-+ for (i = 0; i < fldev->fl_device_length; i++) {
-+ mp = &fldev->fl_device_list[i];
-+ mp->fl_multipath_length = 1;
-+ mp->fl_multipath_list =
-+ kmalloc(sizeof(struct pnfs_filelayout_devaddr),
-+ GFP_KERNEL);
-+ if (mp->fl_multipath_list == NULL) {
-+ status = -ENOMEM;
-+ goto getdeviceinfo_out;
-+ }
-+ fldap = mp->fl_multipath_list;
++typedef struct pan_agg_full_map_s pan_agg_full_map_t;
+
-+ /*
-+ * Copy the netid into the device address, for example: "tcp"
-+ */
-+ len = strlen(dev->dslist[i].netid);
-+ fldap->r_netid.data = kmalloc(len, GFP_KERNEL);
-+ if (fldap->r_netid.data == NULL) {
-+ status = -ENOMEM;
-+ goto getdeviceinfo_out;
-+ }
-+ memcpy(fldap->r_netid.data, dev->dslist[i].netid, len);
-+ fldap->r_netid.len = len;
++/*
++ * from pan_obsd_rpc_types.h
++ */
++typedef pan_uint8_t pan_obsd_security_key_a[16];
+
-+ /*
-+ * Copy the network address into the device address,
-+ * for example: "10.35.9.16.08.01"
-+ */
-+ len = strlen(dev->dslist[i].addr);
-+ fldap->r_addr.data = kmalloc(len, GFP_KERNEL);
-+ if (fldap->r_addr.data == NULL) {
-+ status = -ENOMEM;
-+ goto getdeviceinfo_out;
-+ }
-+ memcpy(fldap->r_addr.data, dev->dslist[i].addr, len);
-+ fldap->r_addr.len = len;
-+ }
++typedef pan_uint8_t pan_obsd_capability_key_a[20];
+
-+ /* encode the device data */
-+ status = filelayout_encode_devinfo(xdr, fldev);
++typedef pan_uint8_t pan_obsd_key_holder_id_t;
+
-+getdeviceinfo_out:
-+ if (fldev) {
-+ kfree(fldev->fl_stripeindices_list);
-+ if (fldev->fl_device_list) {
-+ for (i = 0; i < fldev->fl_device_length; i++) {
-+ fldap =
-+ fldev->fl_device_list[i].fl_multipath_list;
-+ kfree(fldap->r_netid.data);
-+ kfree(fldap->r_addr.data);
-+ kfree(fldap);
-+ }
-+ kfree(fldev->fl_device_list);
-+ }
-+ kfree(fldev);
-+ }
++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01)
++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02)
+
-+ kfree(im);
-+ kfree(res);
++struct pan_obsd_key_holder_s {
++ pan_obsd_key_holder_id_t select;
++ pan_pad_t pad[3];
++ union {
++ pan_obsd_security_key_a basis_key;
++ pan_obsd_capability_key_a cap_key;
++ } key;
++};
+
-+ return status;
-+}
++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t;
+
-+int
-+spnfs_setattr(void)
-+{
-+ return 0;
-+}
++/*
++ * from pan_sm_sec.h
++ */
++typedef pan_uint8_t pan_sm_sec_type_t;
++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t;
+
-+int
-+spnfs_open(struct inode *inode, struct nfsd4_open *open)
-+{
-+ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
-+ struct spnfs_msg *im = NULL;
-+ union spnfs_msg_res *res = NULL;
-+ int status = 0;
++struct pan_obsd_capability_generic_otw_t_s {
++ pan_rpc_arrdim_t size;
++ pan_uint8_t *data;
++};
++typedef struct pan_obsd_capability_generic_otw_t_s
++ pan_obsd_capability_generic_otw_t;
+
-+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+ if (im == NULL) {
-+ status = -ENOMEM;
-+ goto open_out;
-+ }
++struct pan_sm_sec_obsd_s {
++ pan_obsd_key_holder_t key;
++ pan_obsd_capability_generic_otw_t cap_otw;
++ pan_sm_sec_otw_allo_mode_t allo_mode;
++};
+
-+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
-+ if (res == NULL) {
-+ status = -ENOMEM;
-+ goto open_out;
-+ }
++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t;
+
-+ im->im_type = SPNFS_TYPE_OPEN;
-+ im->im_args.open_args.inode = inode->i_ino;
-+ im->im_args.open_args.generation = inode->i_generation;
-+ im->im_args.open_args.create = open->op_create;
-+ im->im_args.open_args.createmode = open->op_createmode;
-+ im->im_args.open_args.truncate = open->op_truncate;
++struct pan_sm_sec_s {
++ pan_sm_sec_type_t type;
++ pan_pad_t pad[3];
++ union {
++ pan_rpc_none_t none;
++ pan_sm_sec_obsd_t obsd;
++ } variant;
++};
+
-+ /* call function to queue the msg for upcall */
-+ status = spnfs_upcall(spnfs, im, res);
-+ if (status != 0) {
-+ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
-+ status = -EIO;
-+ goto open_out;
-+ }
-+ status = res->open_res.status;
++typedef struct pan_sm_sec_s pan_sm_sec_t;
+
-+open_out:
-+ kfree(im);
-+ kfree(res);
++struct pan_sm_sec_a_s {
++ pan_rpc_arrdim_t size;
++ pan_sm_sec_t *data;
++};
++typedef struct pan_sm_sec_a_s pan_sm_sec_a;
++typedef pan_otw_t *pan_sm_sec_otw_t;
+
-+ return status;
-+}
++/*
++ * from pan_sm_types.h
++ */
++typedef pan_uint64_t pan_sm_cap_handle_t;
+
-+int
-+spnfs_create(void)
-+{
-+ return 0;
-+}
++struct pan_sm_map_cap_s {
++ pan_agg_full_map_t full_map;
++ pan_stor_offset_t offset;
++ pan_stor_len_t length;
++ pan_sm_sec_a secs;
++ pan_sm_cap_handle_t handle;
++ pan_timespec_t expiration_time;
++ pan_stor_action_t action_mask;
++ pan_uint32_t flags;
++};
++
++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t;
+
+/*
-+ * Invokes the spnfsd with the inode number of the object to remove.
-+ * The file has already been removed on the MDS, so all the spnsfd
-+ * daemon does is remove the stripes.
-+ * Returns 0 on success otherwise error code
++ * from pan_sm_ops.h
+ */
-+int
-+spnfs_remove(unsigned long ino, unsigned long generation)
-+{
-+ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
-+ struct spnfs_msg *im = NULL;
-+ union spnfs_msg_res *res = NULL;
-+ int status = 0;
++typedef pan_rpc_none_t pan_sm_cache_ptr_t;
+
-+ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
-+ if (im == NULL) {
-+ status = -ENOMEM;
-+ goto remove_out;
-+ }
++/*
++ * from pan_sam_api.h
++ */
++typedef pan_uint32_t pan_sam_access_flags_t;
+
-+ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
-+ if (res == NULL) {
-+ status = -ENOMEM;
-+ goto remove_out;
-+ }
++typedef struct pan_sam_dev_error_s pan_sam_dev_error_t;
++struct pan_sam_dev_error_s {
++ pan_stor_dev_id_t dev_id;
++ pan_stor_op_t stor_op;
++ pan_status_t error;
++};
+
-+ im->im_type = SPNFS_TYPE_REMOVE;
-+ im->im_args.remove_args.inode = ino;
-+ im->im_args.remove_args.generation = generation;
++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t;
++struct pan_sam_ext_status_s {
++ pan_uint32_t available;
++ pan_uint32_t size;
++ pan_sam_dev_error_t *errors;
++};
+
-+ /* call function to queue the msg for upcall */
-+ status = spnfs_upcall(spnfs, im, res);
-+ if (status != 0) {
-+ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
-+ status = -EIO;
-+ goto remove_out;
-+ }
-+ status = res->remove_res.status;
++enum pan_sam_rpc_sec_sel_e {
++ PAN_SAM_RPC_SEC_DEFAULT,
++ PAN_SAM_RPC_SEC_ATLEAST,
++ PAN_SAM_RPC_SEC_EXACTLY
++};
++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t;
+
-+remove_out:
-+ kfree(im);
-+ kfree(res);
++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t;
++struct pan_sam_obj_sec_s {
++ pan_stor_sec_level_t min_security;
++ pan_sm_map_cap_t *map_ccaps;
++};
+
-+ return status;
-+}
++typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t;
++struct pan_sam_rpc_sec_s {
++ pan_sam_rpc_sec_sel_t selector;
++};
+
-+static int
-+read_one(struct inode *inode, loff_t offset, size_t len, char *buf,
-+ struct file **filp)
-+{
-+ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
-+ size_t iolen;
-+ int completed = 0, ds, err;
++typedef struct pan_sam_read_args_s pan_sam_read_args_t;
++struct pan_sam_read_args_s {
++ pan_stor_obj_id_t obj_id;
++ pan_sm_cache_ptr_t obj_ent;
++ void *return_attr;
++ void *checksum;
++ pan_stor_offset_t offset;
++ pan_uint16_t sm_options;
++ void *callout;
++ void *callout_arg;
++};
++
++typedef struct pan_sam_read_res_s pan_sam_read_res_t;
++struct pan_sam_read_res_s {
++ pan_status_t result;
++ pan_sam_ext_status_t ext_status;
++ pan_stor_len_t length;
++ void *attr;
++ void *checksum;
++};
++
++typedef void (*pan_sam_read_cb_t)(
++ void *user_arg1,
++ void *user_arg2,
++ pan_sam_read_res_t *res_p,
++ pan_status_t status);
+
-+ while (len > 0) {
-+ tmp = offset;
-+ soff = do_div(tmp, spnfs_config->stripe_size);
-+ snum = tmp;
-+ ds = do_div(tmp, spnfs_config->num_ds);
-+ if (spnfs_config->dense_striping == 0)
-+ soffset = offset;
-+ else {
-+ tmp = snum;
-+ do_div(tmp, spnfs_config->num_ds);
-+ soffset = tmp * spnfs_config->stripe_size + soff;
-+ }
-+ if (len < spnfs_config->stripe_size - soff)
-+ iolen = len;
-+ else
-+ iolen = spnfs_config->stripe_size - soff;
++#define PAN_SAM_ACCESS_NONE 0x0000
++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020
+
-+ pos = soffset;
-+ err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos);
-+ if (err < 0)
-+ return -EIO;
-+ if (err == 0)
-+ break;
-+ filp[ds]->f_pos = pos;
-+ iolen = err;
-+ completed += iolen;
-+ len -= iolen;
-+ offset += iolen;
-+ bufoffset += iolen;
-+ }
++typedef struct pan_sam_write_args_s pan_sam_write_args_t;
++struct pan_sam_write_args_s {
++ pan_stor_obj_id_t obj_id;
++ pan_sm_cache_ptr_t obj_ent;
++ pan_stor_offset_t offset;
++ void *attr;
++ void *return_attr;
++};
+
-+ return completed;
-+}
++typedef struct pan_sam_write_res_s pan_sam_write_res_t;
++struct pan_sam_write_res_s {
++ pan_status_t result;
++ pan_sam_ext_status_t ext_status;
++ pan_stor_len_t length;
++ pan_stor_delta_len_t delta_capacity_used;
++ pan_bool_t parity_dirty;
++ void *attr;
++};
+
-+static __be32
-+read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
-+ struct svc_rqst *rqstp)
-+{
-+ int i, vnum, err, bytecount = 0;
-+ char path[128];
-+ struct file *filp[SPNFS_MAX_DATA_SERVERS];
-+ size_t iolen;
-+ __be32 status = nfs_ok;
++typedef void (*pan_sam_write_cb_t)(
++ void *user_arg1,
++ void *user_arg2,
++ pan_sam_write_res_t *res_p,
++ pan_status_t status);
+
-+ /*
-+ * XXX We should just be doing this at open time, but it gets
-+ * kind of messy storing this info in nfsd's state structures
-+ * and piggybacking its path through the various state handling
-+ * functions. Revisit this.
-+ */
-+ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
-+ for (i = 0; i < spnfs_config->num_ds; i++) {
-+ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i],
-+ inode->i_ino, inode->i_generation);
-+ filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0);
-+ if (filp[i] == NULL) {
-+ status = nfserr_io;
-+ goto read_out;
-+ }
-+ get_file(filp[i]);
-+ }
++/*
++ * from pan_mgr_types.h
++ */
++#define PAN_MGR_ID_TYPE_SHIFT 56
++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL)
++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL)
+
-+ for (vnum = 0 ; vnum < vlen ; vnum++) {
-+ iolen = rqstp->rq_vec[vnum].iov_len;
-+ err = read_one(inode, offset + bytecount, iolen,
-+ (char *)rqstp->rq_vec[vnum].iov_base, filp);
-+ if (err < 0) {
-+ status = nfserr_io;
-+ goto read_out;
-+ }
-+ if (err < iolen) {
-+ bytecount += err;
-+ goto read_out;
-+ }
-+ bytecount += rqstp->rq_vec[vnum].iov_len;
-+ }
++typedef pan_uint16_t pan_mgr_type_t;
++typedef pan_uint64_t pan_mgr_id_t;
+
-+read_out:
-+ *lenp = bytecount;
-+ for (i = 0; i < spnfs_config->num_ds; i++) {
-+ if (filp[i]) {
-+ filp_close(filp[i], current->files);
-+ fput(filp[i]);
-+ }
-+ }
-+ return status;
-+}
++#define PAN_MGR_SM ((pan_mgr_type_t) 2U)
++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U)
+
-+__be32
-+spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
-+ struct svc_rqst *rqstp)
-+{
-+ if (spnfs_config)
-+ return read(inode, offset, lenp, vlen, rqstp);
-+ else {
-+ printk(KERN_ERR "Please upgrade to latest spnfsd\n");
-+ return nfserr_notsupp;
-+ }
++/*
++ * from pan_mgr_types_c.h
++ *
++ * pan_mgr_id_construct_artificial() builds a pan_mgr_id_t and stores it
++ * through _mgr_id_p_:
++ *   - _mgr_type_ is shifted left by PAN_MGR_ID_TYPE_SHIFT and masked with
++ *     PAN_MGR_ID_TYPE_MASK;
++ *   - _mgr_uniq_ is masked with PAN_MGR_ID_UNIQ_MASK;
++ *   - the two parts are OR'ed together.
++ *
++ * NOTE(review): the macro expands to a bare { ... } block rather than
++ * do { ... } while (0), so "if (c) pan_mgr_id_construct_artificial(...); else"
++ * does not parse. Use it only as a standalone statement.
++ */
++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \
++ pan_mgr_id_t _id1, _id2; \
++\
++ _id1 = (_mgr_type_); \
++ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \
++ _id1 &= PAN_MGR_ID_TYPE_MASK; \
++ _id2 = (_mgr_uniq_); \
++ _id2 &= PAN_MGR_ID_UNIQ_MASK; \
++ _id1 |= _id2; \
++ *(_mgr_id_p_) = _id1; \
+}
+
-+static int
-+write_one(struct inode *inode, loff_t offset, size_t len, char *buf,
-+ struct file **filp)
-+{
-+ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
-+ size_t iolen;
-+ int completed = 0, ds, err;
-+
-+ while (len > 0) {
-+ tmp = offset;
-+ soff = do_div(tmp, spnfs_config->stripe_size);
-+ snum = tmp;
-+ ds = do_div(tmp, spnfs_config->num_ds);
-+ if (spnfs_config->dense_striping == 0)
-+ soffset = offset;
-+ else {
-+ tmp = snum;
-+ do_div(tmp, spnfs_config->num_ds);
-+ soffset = tmp * spnfs_config->stripe_size + soff;
-+ }
-+ if (len < spnfs_config->stripe_size - soff)
-+ iolen = len;
-+ else
-+ iolen = spnfs_config->stripe_size - soff;
++/*
++ * from pan_storage_c.h
++ *
++ * pan_stor_is_device_id_an_obsd_id() is true when the type field of
++ * _device_id_ (the bits selected by PAN_MGR_ID_TYPE_MASK, shifted down by
++ * PAN_MGR_ID_TYPE_SHIFT) equals PAN_MGR_OBSD.
++ */
++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \
++ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \
++ == PAN_MGR_OBSD)
+
-+ pos = soffset;
-+ err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos);
-+ if (err < 0)
-+ return -EIO;
-+ filp[ds]->f_pos = pos;
-+ iolen = err;
-+ completed += iolen;
-+ len -= iolen;
-+ offset += iolen;
-+ bufoffset += iolen;
-+ }
++/*
++ * pnfs_shim internal definitions
++ */
+
-+ return completed;
-+}
++struct panfs_shim_io_state {
++ struct objlayout_io_state ol_state;
+
-+static __be32
-+write(struct inode *inode, loff_t offset, size_t len, int vlen,
-+ struct svc_rqst *rqstp)
-+{
-+ int i, vnum, err, bytecount = 0;
-+ char path[128];
-+ struct file *filp[SPNFS_MAX_DATA_SERVERS];
-+ size_t iolen;
-+ __be32 status = nfs_ok;
++ pan_sg_entry_t *sg_list;
++ pan_sam_obj_sec_t obj_sec;
++ void *ucreds;
++ union {
++ struct {
++ pan_sam_read_args_t args;
++ pan_sam_read_res_t res;
++ } read;
++ struct {
++ pan_sam_write_args_t args;
++ pan_sam_write_res_t res;
++ } write;
++ } u;
++};
+
-+ /*
-+ * XXX We should just be doing this at open time, but it gets
-+ * kind of messy storing this info in nfsd's state structures
-+ * and piggybacking its path through the various state handling
-+ * functions. Revisit this.
-+ */
-+ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
-+ for (i = 0; i < spnfs_config->num_ds; i++) {
-+ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i],
-+ inode->i_ino, inode->i_generation);
-+ filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0);
-+ if (filp[i] == NULL) {
-+ status = nfserr_io;
-+ goto write_out;
-+ }
-+ get_file(filp[i]);
-+ }
++#endif /* _PANLAYOUT_PANFS_SHIM_H */
+diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+new file mode 100644
+index 0000000..d05c6be
+--- /dev/null
++++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+@@ -0,0 +1,435 @@
++/*
++ * pnfs_osd_xdr_cli.c
++ *
++ * Object-Based pNFS Layout XDR layer
++ *
++ * Copyright (C) 2007-2009 Panasas Inc.
++ * All rights reserved.
++ *
++ * Benny Halevy <bhalevy at panasas.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2
++ * See the file COPYING included with this distribution for more details.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the Panasas company nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
+
-+ for (vnum = 0; vnum < vlen; vnum++) {
-+ iolen = rqstp->rq_vec[vnum].iov_len;
-+ err = write_one(inode, offset + bytecount, iolen,
-+ (char *)rqstp->rq_vec[vnum].iov_base, filp);
-+ if (err != iolen) {
-+ dprintk("spnfs_write: err=%d expected %Zd\n", err, len);
-+ status = nfserr_io;
-+ goto write_out;
-+ }
-+ bytecount += rqstp->rq_vec[vnum].iov_len;
-+ }
++#include <linux/pnfs_osd_xdr.h>
+
-+write_out:
-+ for (i = 0; i < spnfs_config->num_ds; i++) {
-+ if (filp[i]) {
-+ filp_close(filp[i], current->files);
-+ fput(filp[i]);
-+ }
-+ }
++#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
-+ return status;
++/*
++ * The following implementation is based on these Internet Drafts:
++ *
++ * draft-ietf-nfsv4-minorversion-21
++ * draft-ietf-nfsv4-pnfs-obj-12
++ */
++
++/*
++ * struct pnfs_osd_objid {
++ * struct pnfs_deviceid oid_device_id;
++ * u64 oid_partition_id;
++ * u64 oid_object_id;
++ * };
++ *
++ * Decode one object id into @objid, in the field order shown above:
++ * sizeof(oid_device_id.data) bytes of device id, then the partition id
++ * and the object id as 64-bit values. Returns @p after decoding.
++ *
++ * NOTE(review): COPYMEM and READ64 are defined outside this file section;
++ * presumably they read from, and advance, the local named "p" -- confirm
++ * before renaming the parameter or reordering the statements.
++ */
++static inline u32 *
++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid)
++{
++ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data));
++ READ64(objid->oid_partition_id);
++ READ64(objid->oid_object_id);
++ return p;
+}
+
-+__be32
-+spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen,
-+ struct svc_rqst *rqstp)
++static inline u32 *
++pnfs_osd_xdr_decode_opaque_cred(u32 *p,
++ struct pnfs_osd_opaque_cred *opaque_cred)
+{
-+ if (spnfs_config)
-+ return write(inode, offset, len, vlen, rqstp);
-+ else {
-+ printk(KERN_ERR "Please upgrade to latest spnfsd\n");
-+ return nfserr_notsupp;
-+ }
++ READ32(opaque_cred->cred_len);
++ COPYMEM(opaque_cred->cred, opaque_cred->cred_len);
++ return p;
+}
+
-+int
-+spnfs_commit(void)
++/*
++ * struct pnfs_osd_object_cred {
++ * struct pnfs_osd_objid oc_object_id;
++ * u32 oc_osd_version;
++ * u32 oc_cap_key_sec;
++ * struct pnfs_osd_opaque_cred oc_cap_key
++ * struct pnfs_osd_opaque_cred oc_cap;
++ * };
++ */
++static inline u32 *
++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp,
++ u8 **credp)
+{
-+ return 0;
++ u8 *cred;
++
++ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id);
++ READ32(comp->oc_osd_version);
++ READ32(comp->oc_cap_key_sec);
++
++ cred = *credp;
++ comp->oc_cap_key.cred = cred;
++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key);
++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len));
++ comp->oc_cap.cred = cred;
++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap);
++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len));
++ *credp = cred;
++
++ return p;
+}
+
+/*
-+ * Return the state for this object.
-+ * At this time simply return 0 to indicate success and use the existing state
++ * struct pnfs_osd_data_map {
++ * u32 odm_num_comps;
++ * u64 odm_stripe_unit;
++ * u32 odm_group_width;
++ * u32 odm_group_depth;
++ * u32 odm_mirror_cnt;
++ * u32 odm_raid_algorithm;
++ * };
+ */
-+int
-+spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg)
++static inline u32 *
++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map)
+{
-+ return 0;
++ READ32(data_map->odm_num_comps);
++ READ64(data_map->odm_stripe_unit);
++ READ32(data_map->odm_group_width);
++ READ32(data_map->odm_group_depth);
++ READ32(data_map->odm_mirror_cnt);
++ READ32(data_map->odm_raid_algorithm);
++ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
++ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
++ __func__,
++ data_map->odm_num_comps,
++ (unsigned long long)data_map->odm_stripe_unit,
++ data_map->odm_group_width,
++ data_map->odm_group_depth,
++ data_map->odm_mirror_cnt,
++ data_map->odm_raid_algorithm);
++ return p;
++}
++
++/*
++ * Decode an object layout from the XDR words at @p into @layout.
++ *
++ * The in-core representation is a single buffer: the component array is
++ * placed immediately after *layout, and the credential bytes are packed
++ * immediately after the component array. No size check is done here, so
++ * the caller must pass a buffer big enough for all of it.
++ *
++ * Returns @layout.
++ */
++struct pnfs_osd_layout *
++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p)
++{
++	u32 *xdr_begin = p;	/* only used for the size debug print */
++	struct pnfs_osd_object_cred *cur;
++	u8 *cred_pool;
++	int idx;
++
++	p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map);
++	READ32(layout->olo_comps_index);
++	READ32(layout->olo_num_comps);
++
++	/* components follow the header, credentials follow the components */
++	layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1);
++	cur = layout->olo_comps;
++	cred_pool = (u8 *)(cur + layout->olo_num_comps);
++
++	dprintk("%s: comps_index=%u num_comps=%u\n",
++		__func__, layout->olo_comps_index, layout->olo_num_comps);
++
++	for (idx = 0; idx < layout->olo_num_comps; idx++, cur++) {
++		p = pnfs_osd_xdr_decode_object_cred(p, cur, &cred_pool);
++		dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx "
++			"key_len=%u cap_len=%u\n",
++			__func__, idx,
++			_DEVID_LO(&cur->oc_object_id.oid_device_id),
++			_DEVID_HI(&cur->oc_object_id.oid_device_id),
++			cur->oc_object_id.oid_partition_id,
++			cur->oc_object_id.oid_object_id,
++			cur->oc_cap_key.cred_len, cur->oc_cap.cred_len);
++	}
++
++	dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__,
++		(char *)p - (char *)xdr_begin, cred_pool,
++		(char *)cred_pool - (char *)layout);
++	return layout;
+}
+
+/*
-+ * Return the filehandle for the specified file descriptor
++ * Get Device Information Decoding
++ *
++ * Note: since Device Information is currently done synchronously, most
++ * of the actual fields are left inside the rpc buffer and are only
++ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
++ * should not be freed while the returned information is in use.
+ */
-+int
-+spnfs_getfh(int fd, struct nfs_fh *fh)
++
++u32 *__xdr_read_calc_nfs4_string(
++ u32 *p, struct nfs4_string *str, u8 **freespace)
+{
-+ struct file *file;
++ u32 len;
++ char *data;
++ bool need_copy;
+
-+ file = fget(fd);
-+ if (file == NULL)
-+ return -EIO;
++ READ32(len);
++ data = (char *)p;
+
-+ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh));
-+ fput(file);
-+ return 0;
++ if (data[len]) { /* Not null terminated we'll need extra space */
++ data = *freespace;
++ *freespace += len + 1;
++ need_copy = true;
++ } else {
++ need_copy = false;
++ }
++
++ if (str) {
++ str->len = len;
++ str->data = data;
++ if (need_copy) {
++ memcpy(data, p, len);
++ data[len] = 0;
++ }
++ }
++
++ p += XDR_QUADLEN(len);
++ return p;
+}
-diff -up linux-2.6.35.noarch/fs/nfsd/state.h.orig linux-2.6.35.noarch/fs/nfsd/state.h
---- linux-2.6.35.noarch/fs/nfsd/state.h.orig 2010-09-30 12:22:45.302049000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/state.h 2010-09-30 12:25:08.600287000 -0400
-@@ -241,6 +241,12 @@ struct nfs4_client {
- u32 cl_cb_seq_nr;
- struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
- /* wait here for slots */
-+#if defined(CONFIG_PNFSD)
-+ struct list_head cl_layouts; /* outstanding layouts */
-+ struct list_head cl_layoutrecalls; /* outstanding layoutrecall
-+ callbacks */
-+ atomic_t cl_deviceref; /* Num outstanding devs */
-+#endif /* CONFIG_PNFSD */
- };
-
- static inline void
-@@ -357,6 +363,14 @@ struct nfs4_file {
- u32 fi_id; /* used with stateowner->so_id
- * for stateid_hashtbl hash */
- bool fi_had_conflict;
-+#if defined(CONFIG_PNFSD)
-+ struct list_head fi_layouts;
-+ struct list_head fi_layout_states;
-+ /* used by layoutget / layoutrecall */
-+ struct nfs4_fsid fi_fsid;
-+ u32 fi_fhlen;
-+ u8 fi_fhval[NFS4_FHSIZE];
-+#endif /* CONFIG_PNFSD */
- };
-
- /* XXX: for first cut may fall back on returning file that doesn't work
-@@ -385,6 +399,15 @@ static inline struct file *find_any_file
- return f->fi_fds[O_RDONLY];
- }
-
-+#if defined(CONFIG_PNFSD)
-+/* pNFS Metadata server state */
+
-+struct pnfs_ds_dev_entry {
-+ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */
-+ u32 dd_dsid;
-+};
-+#endif /* CONFIG_PNFSD */
++u32 *__xdr_read_calc_u8_opaque(
++ u32 *p, struct nfs4_string *str)
++{
++ u32 len;
++
++ READ32(len);
++
++ if (str) {
++ str->len = len;
++ str->data = (char *)p;
++ }
+
- /*
- * nfs4_stateid can either be an open stateid or (eventually) a lock stateid
- *
-@@ -407,6 +430,9 @@ struct nfs4_stateid {
- struct list_head st_perfile;
- struct list_head st_perstateowner;
- struct list_head st_lockowners;
-+#if defined(CONFIG_PNFSD)
-+ struct list_head st_pnfs_ds_id;
-+#endif /* CONFIG_PNFSD */
- struct nfs4_stateowner * st_stateowner;
- struct nfs4_file * st_file;
- stateid_t st_stateid;
-@@ -457,6 +483,34 @@ extern void nfsd4_recdir_purge_old(void)
- extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
- extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
- extern void release_session_client(struct nfsd4_session *);
-+extern void nfsd4_free_slab(struct kmem_cache **);
-+extern struct nfs4_file *find_file(struct inode *);
-+extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *);
-+extern void put_nfs4_file(struct nfs4_file *);
-+extern void get_nfs4_file(struct nfs4_file *);
-+extern struct nfs4_client *find_confirmed_client(clientid_t *);
-+extern struct nfs4_stateid *find_stateid(stateid_t *, int flags);
-+extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *);
-+extern __be32 nfs4_check_stateid(stateid_t *);
-+extern void expire_client_lock(struct nfs4_client *);
-+extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *);
++ p += XDR_QUADLEN(len);
++ return p;
++}
+
-+#if defined(CONFIG_PNFSD)
-+extern int nfsd4_init_pnfs_slabs(void);
-+extern void nfsd4_free_pnfs_slabs(void);
-+extern void pnfs_expire_client(struct nfs4_client *);
-+extern void release_pnfs_ds_dev_list(struct nfs4_stateid *);
-+extern void nfs4_pnfs_state_init(void);
-+extern void nfs4_pnfs_state_shutdown(void);
-+extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
-+extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *);
-+#else /* CONFIG_PNFSD */
-+static inline void nfsd4_free_pnfs_slabs(void) {}
-+static inline int nfsd4_init_pnfs_slabs(void) { return 0; }
-+static inline void pnfs_expire_client(struct nfs4_client *clp) {}
-+static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {}
-+static inline void nfs4_pnfs_state_shutdown(void) {}
-+#endif /* CONFIG_PNFSD */
-
- static inline void
- nfs4_put_stateowner(struct nfs4_stateowner *so)
-@@ -470,4 +524,24 @@ nfs4_get_stateowner(struct nfs4_stateown
- kref_get(&so->so_ref);
- }
-
-+static inline u64
-+end_offset(u64 start, u64 len)
++/*
++ * struct pnfs_osd_targetid {
++ * u32 oti_type;
++ * struct nfs4_string oti_scsi_device_id;
++ * };
++ */
++u32 *__xdr_read_calc_targetid(
++ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace)
+{
-+ u64 end;
++ u32 oti_type;
+
-+ end = start + len;
-+ return end >= start ? end : NFS4_MAX_UINT64;
++ READ32(oti_type);
++ if (targetid)
++ targetid->oti_type = oti_type;
++
++ switch (oti_type) {
++ case OBJ_TARGET_SCSI_NAME:
++ case OBJ_TARGET_SCSI_DEVICE_ID:
++ p = __xdr_read_calc_u8_opaque(p,
++ targetid ? &targetid->oti_scsi_device_id : NULL);
++ }
++
++ return p;
+}
+
-+/* last octet in a range */
-+static inline u64
-+last_byte_offset(u64 start, u64 len)
++/*
++ * struct pnfs_osd_net_addr {
++ * struct nfs4_string r_netid;
++ * struct nfs4_string r_addr;
++ * };
++ */
++u32 *__xdr_read_calc_net_addr(
++ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace)
+{
-+ u64 end;
+
-+ BUG_ON(!len);
-+ end = start + len;
-+ return end > start ? end - 1 : NFS4_MAX_UINT64;
++ p = __xdr_read_calc_nfs4_string(p,
++ netaddr ? &netaddr->r_netid : NULL,
++ freespace);
++
++ p = __xdr_read_calc_nfs4_string(p,
++ netaddr ? &netaddr->r_addr : NULL,
++ freespace);
++
++ return p;
+}
+
- #endif /* NFSD4_STATE_H */
-diff -up linux-2.6.35.noarch/fs/nfsd/vfs.c.orig linux-2.6.35.noarch/fs/nfsd/vfs.c
---- linux-2.6.35.noarch/fs/nfsd/vfs.c.orig 2010-09-30 12:22:45.308046000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/vfs.c 2010-09-30 12:25:08.607287000 -0400
-@@ -37,7 +37,12 @@
- #ifdef CONFIG_NFSD_V4
- #include <linux/nfs4_acl.h>
- #include <linux/nfsd_idmap.h>
-+#include <linux/security.h>
-+#include <linux/nfsd4_spnfs.h>
- #endif /* CONFIG_NFSD_V4 */
-+#if defined(CONFIG_SPNFS_BLOCK)
-+#include <linux/nfsd4_block.h>
-+#endif
-
- #include "nfsd.h"
- #include "vfs.h"
-@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str
- NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
- if (err)
- goto out;
-+#if defined(CONFIG_SPNFS_BLOCK)
-+ if (pnfs_block_enabled(inode, 0)) {
-+ err = bl_layoutrecall(inode, RETURN_FILE,
-+ iap->ia_size, inode->i_size - iap->ia_size);
-+ }
-+#endif /* CONFIG_SPNFS_BLOCK */
- }
-
- /*
-@@ -1716,6 +1727,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru
- struct inode *fdir, *tdir;
- __be32 err;
- int host_err;
-+#ifdef CONFIG_SPNFS
-+ unsigned long ino = 0;
-+ unsigned long generation = 0;
-+ unsigned int nlink = 0;
-+#endif /* CONFIG_SPNFS */
-
- err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
- if (err)
-@@ -1779,7 +1795,26 @@ nfsd_rename(struct svc_rqst *rqstp, stru
- if (host_err)
- goto out_dput_new;
-
-+#ifdef CONFIG_SPNFS
-+ /*
-+ * if the target is a preexisting regular file, remember the
-+ * inode number and generation so we can delete the stripes;
-+ * save the link count as well so that the stripes only get
-+ * get deleted when the last link is deleted
-+ */
-+ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) {
-+ ino = ndentry->d_inode->i_ino;
-+ generation = ndentry->d_inode->i_generation;
-+ nlink = ndentry->d_inode->i_nlink;
-+ }
-+#endif /* CONFIG_SPNFS */
++/*
++ * struct pnfs_osd_targetaddr {
++ * u32 ota_available;
++ * struct pnfs_osd_net_addr ota_netaddr;
++ * };
++ */
++u32 *__xdr_read_calc_targetaddr(
++ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace)
++{
++ u32 ota_available;
+
- host_err = vfs_rename(fdir, odentry, tdir, ndentry);
-+#ifdef CONFIG_SPNFS
-+ if (spnfs_enabled() && (!host_err && ino && nlink == 1))
-+ spnfs_remove(ino, generation);
-+#endif /* CONFIG_SPNFS */
++ READ32(ota_available);
++ if (targetaddr)
++ targetaddr->ota_available = ota_available;
+
- if (!host_err) {
- host_err = commit_metadata(tfhp);
- if (!host_err)
-@@ -1820,6 +1855,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru
- struct inode *dirp;
- __be32 err;
- int host_err;
-+#if defined(CONFIG_SPNFS)
-+ unsigned long ino;
-+ unsigned long generation;
-+ unsigned int nlink;
-+#endif /* defined(CONFIG_SPNFS) */
-
- err = nfserr_acces;
- if (!flen || isdotent(fname, flen))
-@@ -1843,6 +1883,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru
- goto out;
- }
-
-+#if defined(CONFIG_SPNFS)
-+ /*
-+ * Remember the inode number to communicate to the spnfsd
-+ * for removal of stripes; save the link count as well so that
-+ * the stripes only get get deleted when the last link is deleted
-+ */
-+ ino = rdentry->d_inode->i_ino;
-+ generation = rdentry->d_inode->i_generation;
-+ nlink = rdentry->d_inode->i_nlink;
-+#endif /* defined(CONFIG_SPNFS) */
++ if (ota_available) {
++ p = __xdr_read_calc_net_addr(p,
++ targetaddr ? &targetaddr->ota_netaddr : NULL,
++ freespace);
++ }
++
++ return p;
++}
+
- if (!type)
- type = rdentry->d_inode->i_mode & S_IFMT;
-
-@@ -1867,6 +1918,29 @@ nfsd_unlink(struct svc_rqst *rqstp, stru
- if (!host_err)
- host_err = commit_metadata(fhp);
-
-+#if defined(CONFIG_SPNFS)
-+ /*
-+ * spnfs: notify spnfsd of removal to destroy stripes
-+ */
+/*
-+ sb = current_fh->fh_dentry->d_inode->i_sb;
-+ if (sb->s_export_op->spnfs_remove) {
-+*/
-+ dprintk("%s check if spnfs_enabled\n", __FUNCTION__);
-+ if (spnfs_enabled() && nlink == 1) {
-+ BUG_ON(ino == 0);
-+ dprintk("%s calling spnfs_remove inumber=%ld\n",
-+ __FUNCTION__, ino);
-+ if (spnfs_remove(ino, generation) == 0) {
-+ dprintk("%s spnfs_remove success\n", __FUNCTION__);
-+ } else {
-+ /* XXX How do we make this atomic? */
-+ printk(KERN_WARNING "nfsd: pNFS could not "
-+ "remove stripes for inode: %ld\n", ino);
-+ }
++ * struct pnfs_osd_deviceaddr {
++ * struct pnfs_osd_targetid oda_targetid;
++ * struct pnfs_osd_targetaddr oda_targetaddr;
++ * u8 oda_lun[8];
++ * struct nfs4_string oda_systemid;
++ * struct pnfs_osd_object_cred oda_root_obj_cred;
++ * struct nfs4_string oda_osdname;
++ * };
++ */
++u32 *__xdr_read_calc_deviceaddr(
++ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace)
++{
++ p = __xdr_read_calc_targetid(p,
++ deviceaddr ? &deviceaddr->oda_targetid : NULL,
++ freespace);
++
++ p = __xdr_read_calc_targetaddr(p,
++ deviceaddr ? &deviceaddr->oda_targetaddr : NULL,
++ freespace);
++
++ if (deviceaddr)
++ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun));
++ else
++ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun));
++
++ p = __xdr_read_calc_u8_opaque(p,
++ deviceaddr ? &deviceaddr->oda_systemid : NULL);
++
++ if (deviceaddr) {
++ p = pnfs_osd_xdr_decode_object_cred(p,
++ &deviceaddr->oda_root_obj_cred, freespace);
++ } else {
++ *freespace += pnfs_osd_object_cred_incore_sz(p);
++ p += pnfs_osd_object_cred_xdr_sz(p);
+ }
-+#endif /* defined(CONFIG_SPNFS) */
+
- mnt_drop_write(fhp->fh_export->ex_path.mnt);
- out_nfserr:
- err = nfserrno(host_err);
-diff -up linux-2.6.35.noarch/fs/nfsd/xdr4.h.orig linux-2.6.35.noarch/fs/nfsd/xdr4.h
---- linux-2.6.35.noarch/fs/nfsd/xdr4.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/nfsd/xdr4.h 2010-09-30 12:25:08.612291000 -0400
-@@ -37,6 +37,8 @@
- #ifndef _LINUX_NFSD_XDR4_H
- #define _LINUX_NFSD_XDR4_H
-
-+#include <linux/nfsd/nfsd4_pnfs.h>
++ p = __xdr_read_calc_u8_opaque(p,
++ deviceaddr ? &deviceaddr->oda_osdname : NULL);
+
- #include "state.h"
- #include "nfsd.h"
-
-@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete {
- u32 rca_one_fs;
- };
-
-+struct nfsd4_pnfs_getdevinfo {
-+ struct nfsd4_pnfs_deviceid gd_devid; /* request */
-+ u32 gd_layout_type; /* request */
-+ u32 gd_maxcount; /* request */
-+ u32 gd_notify_types;/* request */
-+ struct super_block *gd_sb;
-+};
++ return p;
++}
+
-+struct nfsd4_pnfs_getdevlist {
-+ u32 gd_layout_type; /* request */
-+ u32 gd_maxdevices; /* request */
-+ u64 gd_cookie; /* request - response */
-+ u64 gd_verf; /* request - response */
-+ struct svc_fh *gd_fhp; /* response */
-+ u32 gd_eof; /* response */
-+};
++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p)
++{
++ u8 *null_freespace = NULL;
++ size_t sz;
+
-+struct nfsd4_pnfs_layoutget {
-+ u64 lg_minlength; /* request */
-+ u32 lg_signal; /* request */
-+ u32 lg_maxcount; /* request */
-+ struct svc_fh *lg_fhp; /* request */
-+ stateid_t lg_sid; /* request/response */
-+ struct nfsd4_layout_seg lg_seg; /* request/response */
-+ u32 lg_roc; /* response */
-+};
++ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace);
++ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace;
+
-+struct nfsd4_pnfs_layoutcommit {
-+ struct nfsd4_pnfs_layoutcommit_arg args;
-+ stateid_t lc_sid; /* request */
-+ struct nfsd4_pnfs_layoutcommit_res res;
-+};
++ return sz;
++}
+
-+enum layoutreturn_flags {
-+ LR_FLAG_INTERN = 1 << 0, /* internal return */
-+ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */
-+};
++void pnfs_osd_xdr_decode_deviceaddr(
++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p)
++{
++ u8 *freespace = (u8 *)(deviceaddr + 1);
+
-+struct nfsd4_pnfs_layoutreturn {
-+ struct nfsd4_pnfs_layoutreturn_arg args;
-+ u32 lr_flags;
-+ stateid_t lr_sid; /* request/resopnse */
-+ u32 lrs_present; /* response */
-+};
++ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace);
++}
+
- struct nfsd4_op {
- int opnum;
- __be32 status;
-@@ -426,6 +473,13 @@ struct nfsd4_op {
- struct nfsd4_destroy_session destroy_session;
- struct nfsd4_sequence sequence;
- struct nfsd4_reclaim_complete reclaim_complete;
-+#if defined(CONFIG_PNFSD)
-+ struct nfsd4_pnfs_getdevlist pnfs_getdevlist;
-+ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo;
-+ struct nfsd4_pnfs_layoutget pnfs_layoutget;
-+ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit;
-+ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn;
-+#endif /* CONFIG_PNFSD */
- } u;
- struct nfs4_replay * replay;
- };
-diff -up linux-2.6.35.noarch/fs/nfs/file.c.orig linux-2.6.35.noarch/fs/nfs/file.c
---- linux-2.6.35.noarch/fs/nfs/file.c.orig 2010-09-30 12:22:45.126043000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/file.c 2010-09-30 12:25:08.280277000 -0400
-@@ -36,6 +36,7 @@
- #include "internal.h"
- #include "iostat.h"
- #include "fscache.h"
-+#include "pnfs.h"
-
- #define NFSDBG_FACILITY NFSDBG_FILE
-
-@@ -380,12 +381,16 @@ static int nfs_write_begin(struct file *
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- struct page *page;
- int once_thru = 0;
-+ struct pnfs_layout_segment *lseg;
-
- dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
- file->f_path.dentry->d_parent->d_name.name,
- file->f_path.dentry->d_name.name,
- mapping->host->i_ino, len, (long long) pos);
-
-+ lseg = pnfs_update_layout(mapping->host,
-+ nfs_file_open_context(file),
-+ pos, len, IOMODE_RW);
- start:
- /*
- * Prevent starvation issues if someone is doing a consistency
-@@ -394,17 +399,22 @@ start:
- ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
- nfs_wait_bit_killable, TASK_KILLABLE);
- if (ret)
-- return ret;
-+ goto out;
-
- page = grab_cache_page_write_begin(mapping, index, flags);
-- if (!page)
-- return -ENOMEM;
-+ if (!page) {
-+ ret = -ENOMEM;
-+ goto out;
-+ }
- *pagep = page;
-
-- ret = nfs_flush_incompatible(file, page);
-+ ret = nfs_flush_incompatible(file, page, lseg);
- if (ret) {
- unlock_page(page);
- page_cache_release(page);
-+ *pagep = NULL;
-+ *fsdata = NULL;
-+ goto out;
- } else if (!once_thru &&
- nfs_want_read_modify_write(file, page, pos, len)) {
- once_thru = 1;
-@@ -413,6 +423,12 @@ start:
- if (!ret)
- goto start;
- }
-+ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata);
-+ out:
-+ if (ret) {
-+ put_lseg(lseg);
-+ *fsdata = NULL;
-+ }
- return ret;
- }
-
-@@ -422,6 +438,7 @@ static int nfs_write_end(struct file *fi
- {
- unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
- int status;
-+ struct pnfs_layout_segment *lseg;
-
- dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
- file->f_path.dentry->d_parent->d_name.name,
-@@ -448,10 +465,17 @@ static int nfs_write_end(struct file *fi
- zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
- }
-
-- status = nfs_updatepage(file, page, offset, copied);
-+ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata);
-+ status = pnfs_write_end(file, page, pos, len, copied, lseg);
-+ if (status)
-+ goto out;
-+ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata);
-
-+ out:
- unlock_page(page);
- page_cache_release(page);
-+ pnfs_write_end_cleanup(file, fsdata);
-+ put_lseg(lseg);
-
- if (status < 0)
- return status;
-@@ -562,6 +586,8 @@ static int nfs_vm_page_mkwrite(struct vm
- /* make sure the cache has finished storing the page */
- nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
-
-+ /* XXX Do we want to call pnfs_update_layout here? */
++/*
++ * struct pnfs_osd_layoutupdate {
++ * u32 dsu_valid;
++ * s64 dsu_delta;
++ * u32 olu_ioerr_flag;
++ * };
++ */
++int
++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
++ struct pnfs_osd_layoutupdate *lou)
++{
++ __be32 *p = xdr_reserve_space(xdr, 16);
++
++ if (!p)
++ return -E2BIG;
++
++ *p++ = cpu_to_be32(lou->dsu_valid);
++ if (lou->dsu_valid)
++ p = xdr_encode_hyper(p, lou->dsu_delta);
++ *p++ = cpu_to_be32(lou->olu_ioerr_flag);
++ return 0;
++}
++
++/*
++ * struct pnfs_osd_objid {
++ *	struct pnfs_deviceid	oid_device_id;
++ *	u64			oid_partition_id;
++ *	u64			oid_object_id;
++ * };
++ *
++ * Append @object_id to @xdr in the field order shown above, using a
++ * single 32-byte reservation.
++ *
++ * Returns 0 on success, or -E2BIG when @xdr has no room left.
++ */
++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr,
++					    struct pnfs_osd_objid *object_id)
++{
++	__be32 *p = xdr_reserve_space(xdr, 32);
++
++	if (p == NULL)
++		return -E2BIG;
++
++	p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
++				    sizeof(object_id->oid_device_id.data));
++	p = xdr_encode_hyper(p, object_id->oid_partition_id);
++	/* last field: the advanced pointer is not needed any more */
++	xdr_encode_hyper(p, object_id->oid_object_id);
++
++	return 0;
++}
++
++/*
++ * struct pnfs_osd_ioerr {
++ * struct pnfs_osd_objid oer_component;
++ * u64 oer_comp_offset;
++ * u64 oer_comp_length;
++ * u32 oer_iswrite;
++ * u32 oer_errno;
++ * };
++ */
++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
++ struct pnfs_osd_ioerr *ioerr)
++{
++ __be32 *p;
++ int ret;
+
- lock_page(page);
- mapping = page->mapping;
- if (mapping != dentry->d_inode->i_mapping)
-@@ -572,11 +598,11 @@ static int nfs_vm_page_mkwrite(struct vm
- if (pagelen == 0)
- goto out_unlock;
-
-- ret = nfs_flush_incompatible(filp, page);
-+ ret = nfs_flush_incompatible(filp, page, NULL);
- if (ret != 0)
- goto out_unlock;
++ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component);
++ if (ret)
++ return ret;
++
++ p = xdr_reserve_space(xdr, 24);
++ if (!p)
++ return -E2BIG;
++
++ p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
++ p = xdr_encode_hyper(p, ioerr->oer_comp_length);
++ *p++ = cpu_to_be32(ioerr->oer_iswrite);
++ *p = cpu_to_be32(ioerr->oer_errno);
++
++ return 0;
++}
+diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
+index 9194902..96e375e 100644
+--- a/fs/nfs/pagelist.c
++++ b/fs/nfs/pagelist.c
+@@ -20,6 +20,7 @@
+ #include <linux/nfs_mount.h>
-- ret = nfs_updatepage(filp, page, 0, pagelen);
-+ ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL);
- out_unlock:
- if (!ret)
- return VM_FAULT_LOCKED;
-diff -up linux-2.6.35.noarch/fs/nfs/inode.c.orig linux-2.6.35.noarch/fs/nfs/inode.c
---- linux-2.6.35.noarch/fs/nfs/inode.c.orig 2010-09-30 12:22:45.132041000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/inode.c 2010-09-30 12:25:08.286278000 -0400
-@@ -48,6 +48,7 @@
#include "internal.h"
- #include "fscache.h"
- #include "dns_resolve.h"
+#include "pnfs.h"
- #define NFSDBG_FACILITY NFSDBG_VFS
-
-@@ -648,6 +649,7 @@ struct nfs_open_context *get_nfs_open_co
- atomic_inc(&ctx->lock_context.count);
- return ctx;
- }
-+EXPORT_SYMBOL(get_nfs_open_context);
+ static struct kmem_cache *nfs_page_cachep;
- static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
+@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p)
+ struct nfs_page *
+ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
+ struct page *page,
+- unsigned int offset, unsigned int count)
++ unsigned int offset, unsigned int count,
++ struct pnfs_layout_segment *lseg)
{
-@@ -1000,6 +1002,7 @@ void nfs_fattr_init(struct nfs_fattr *fa
- fattr->time_start = jiffies;
- fattr->gencount = nfs_inc_attr_generation_counter();
- }
-+EXPORT_SYMBOL(nfs_fattr_init);
+ struct nfs_page *req;
- struct nfs_fattr *nfs_alloc_fattr(void)
- {
-@@ -1209,6 +1212,14 @@ static int nfs_update_inode(struct inode
- server->fsid = fattr->fsid;
+@@ -81,6 +83,9 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
+ req->wb_context = get_nfs_open_context(ctx);
+ req->wb_lock_context = nfs_get_lock_context(ctx);
+ kref_init(&req->wb_kref);
++ req->wb_lseg = lseg;
++ if (lseg)
++ get_lseg(lseg);
+ return req;
+ }
- /*
-+ * file needs layout commit, server attributes may be stale
-+ */
-+ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) {
-+ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n",
-+ __func__, inode->i_sb->s_id, inode->i_ino);
-+ return 0;
+@@ -156,9 +161,12 @@ void nfs_clear_request(struct nfs_page *req)
+ put_nfs_open_context(ctx);
+ req->wb_context = NULL;
+ }
++ if (req->wb_lseg != NULL) {
++ put_lseg(req->wb_lseg);
++ req->wb_lseg = NULL;
+ }
-+ /*
- * Update the read time so we don't revalidate too often.
- */
- nfsi->read_cache_jiffies = fattr->time_start;
-@@ -1407,11 +1418,12 @@ static int nfs_update_inode(struct inode
+ }
+
+-
+ /**
+ * nfs_release_request - Release the count on an NFS read/write request
+ * @req: request to release
+@@ -237,7 +245,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
+ * Return 'true' if this is the case, else return 'false'.
*/
- void nfs4_evict_inode(struct inode *inode)
+ static int nfs_can_coalesce_requests(struct nfs_page *prev,
+- struct nfs_page *req)
++ struct nfs_page *req,
++ struct nfs_pageio_descriptor *pgio)
{
-+ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true);
- truncate_inode_pages(&inode->i_data, 0);
- end_writeback(inode);
-+ pnfs_destroy_layout(NFS_I(inode));
- /* If we are holding a delegation, return it! */
- nfs_inode_return_delegation_noreclaim(inode);
-- /* First call standard NFS clear_inode() code */
- nfs_clear_inode(inode);
- }
- #endif
-@@ -1446,6 +1458,8 @@ static inline void nfs4_init_once(struct
- nfsi->delegation = NULL;
- nfsi->delegation_state = 0;
- init_rwsem(&nfsi->rwsem);
-+ rpc_init_wait_queue(&nfsi->lo_rpcwaitq, "pNFS Layout");
-+ nfsi->layout = NULL;
- #endif
+ if (req->wb_context->cred != prev->wb_context->cred)
+ return 0;
+@@ -251,6 +260,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
+ return 0;
+ if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+ return 0;
++ if (req->wb_lseg != prev->wb_lseg)
++ return 0;
++#ifdef CONFIG_NFS_V4_1
++ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
++ return 0;
++#endif /* CONFIG_NFS_V4_1 */
+ return 1;
}
-diff -up linux-2.6.35.noarch/fs/nfs/internal.h.orig linux-2.6.35.noarch/fs/nfs/internal.h
---- linux-2.6.35.noarch/fs/nfs/internal.h.orig 2010-09-30 12:22:45.136044000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/internal.h 2010-09-30 12:25:08.291287000 -0400
-@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv
- struct nfs_fattr *);
- extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
- extern int nfs4_check_client_ready(struct nfs_client *clp);
-+extern int nfs_sockaddr_cmp(const struct sockaddr *sa1,
-+ const struct sockaddr *sa2);
-+extern int nfs4_set_client(struct nfs_server *server,
-+ const char *hostname,
-+ const struct sockaddr *addr,
-+ const size_t addrlen,
-+ const char *ip_addr,
-+ rpc_authflavor_t authflavour,
-+ int proto, const struct rpc_timeout *timeparms,
-+ u32 minorversion);
- #ifdef CONFIG_PROC_FS
- extern int __init nfs_fs_proc_init(void);
- extern void nfs_fs_proc_exit(void);
-@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead
- extern struct rpc_procinfo nfs4_procedures[];
- #endif
-
-+extern int nfs4_recover_expired_lease(struct nfs_client *clp);
-+
- /* proc.c */
- void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
-
-@@ -249,10 +261,31 @@ extern int nfs4_get_rootfh(struct nfs_se
- #endif
-
- /* read.c */
-+extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops);
-+extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops);
- extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
-
- /* write.c */
-+extern int nfs_initiate_write(struct nfs_write_data *data,
-+ struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops,
-+ int how);
-+extern int pnfs_initiate_write(struct nfs_write_data *data,
-+ struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops,
-+ int how);
-+extern int nfs_initiate_commit(struct nfs_write_data *data,
-+ struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops,
-+ int how);
-+extern int pnfs_initiate_commit(struct nfs_write_data *data,
-+ struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops,
-+ int how, int pnfs);
- extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
-+extern void nfs_mark_list_commit(struct list_head *head);
- #ifdef CONFIG_MIGRATION
- extern int nfs_migrate_page(struct address_space *,
- struct page *, struct page *);
-diff -up linux-2.6.35.noarch/fs/nfs/Kconfig.orig linux-2.6.35.noarch/fs/nfs/Kconfig
---- linux-2.6.35.noarch/fs/nfs/Kconfig.orig 2010-09-30 12:22:45.078042000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/Kconfig 2010-09-30 12:25:08.198277000 -0400
-@@ -76,10 +76,42 @@ config NFS_V4
-
- config NFS_V4_1
- bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
-- depends on NFS_V4 && EXPERIMENTAL
-+ depends on NFS_FS && NFS_V4 && EXPERIMENTAL
-+ select PNFS_FILE_LAYOUT
- help
- This option enables support for minor version 1 of the NFSv4 protocol
-- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
-+ (RFC 5661) in the kernel's NFS client.
-+
-+ If unsure, say N.
-+
-+config PNFS_FILE_LAYOUT
-+ tristate
-+
-+config PNFS_OBJLAYOUT
-+ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
-+ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
-+ help
-+ Say M here if you want your pNFS client to support the Objects Layout Driver.
-+ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
-+ upper level driver (SCSI_OSD_ULD).
-+
-+ If unsure, say N.
-+
-+config PNFS_PANLAYOUT
-+ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
-+ depends on PNFS_OBJLAYOUT
-+ help
-+ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver.
-+
-+ If unsure, say N.
-+
-+config PNFS_BLOCK
-+ tristate "Provide a pNFS block client (EXPERIMENTAL)"
-+ depends on NFS_FS && NFS_V4_1
-+ select MD
-+ select BLK_DEV_DM
-+ help
-+ Say M or y here if you want your pNfs client to support the block protocol
-
- If unsure, say N.
-
-diff -up linux-2.6.35.noarch/fs/nfs/Makefile.orig linux-2.6.35.noarch/fs/nfs/Makefile
---- linux-2.6.35.noarch/fs/nfs/Makefile.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/Makefile 2010-09-30 12:25:08.203278000 -0400
-@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x
- delegation.o idmap.o \
- callback.o callback_xdr.o callback_proc.o \
- nfs4namespace.o
-+nfs-$(CONFIG_NFS_V4_1) += pnfs.o
- nfs-$(CONFIG_SYSCTL) += sysctl.o
- nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
-+
-+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
-+nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
-+
-+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
-+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
-diff -up linux-2.6.35.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.35.noarch/fs/nfs/nfs4filelayout.c
---- linux-2.6.35.noarch/fs/nfs/nfs4filelayout.c.orig 2010-09-30 12:25:08.300279000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/nfs4filelayout.c 2010-09-30 12:25:08.302278000 -0400
-@@ -0,0 +1,701 @@
+@@ -283,7 +298,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
+ if (newlen > desc->pg_bsize)
+ return 0;
+ prev = nfs_list_entry(desc->pg_list.prev);
+- if (!nfs_can_coalesce_requests(prev, req))
++ if (!nfs_can_coalesce_requests(prev, req, desc))
+ return 0;
+ } else
+ desc->pg_base = req->wb_pgbase;
+@@ -372,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
+ * @idx_start: lower bound of page->index to scan
+ * @npages: idx_start + npages sets the upper bound to scan.
+ * @tag: tag to scan for
++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver
+ *
+ * Moves elements from one of the inode request lists.
+ * If the number of requests is set to 0, the entire address_space
+@@ -381,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
+ */
+ int nfs_scan_list(struct nfs_inode *nfsi,
+ struct list_head *dst, pgoff_t idx_start,
+- unsigned int npages, int tag)
++ unsigned int npages, int tag, int *use_pnfs)
+ {
+ struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
+ struct nfs_page *req;
+@@ -412,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi,
+ radix_tree_tag_clear(&nfsi->nfs_page_tree,
+ req->wb_index, tag);
+ nfs_list_add_request(req, dst);
++ if (req->wb_lseg)
++ *use_pnfs = 1;
+ res++;
+ if (res == INT_MAX)
+ goto out;
+diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
+new file mode 100644
+index 0000000..dfdf661
+--- /dev/null
++++ b/fs/nfs/pnfs.c
+@@ -0,0 +1,1723 @@
+/*
-+ * Module for the pnfs nfs4 file layout driver.
-+ * Defines all I/O and Policy interface operations, plus code
-+ * to register itself with the pNFS client.
++ * pNFS functions to call and manage layout drivers.
+ *
-+ * Copyright (c) 2002
++ * Copyright (c) 2002 [year of first publication]
+ * The Regents of the University of Michigan
+ * All Rights Reserved
+ *
@@ -15549,11550 +13488,13587 @@ diff -up linux-2.6.35.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.35.noarch/fs
+ */
+
+#include <linux/nfs_fs.h>
-+
+#include "internal.h"
-+#include "nfs4filelayout.h"
++#include "pnfs.h"
++#include "iostat.h"
+
-+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++#define NFSDBG_FACILITY NFSDBG_PNFS
+
-+MODULE_LICENSE("GPL");
-+MODULE_AUTHOR("Dean Hildebrand <dhildebz at umich.edu>");
-+MODULE_DESCRIPTION("The NFSv4 file layout driver");
++/* Locking:
++ *
++ * pnfs_spinlock:
++ * protects pnfs_modules_tbl.
++ * It is taken in find_pnfs_driver(), pnfs_register_layoutdriver() and
++ * pnfs_unregister_layoutdriver() around every lookup in, insertion
++ * into and removal from that list.
++ */
++static DEFINE_SPINLOCK(pnfs_spinlock);
+
-+int
-+filelayout_initialize_mountpoint(struct nfs_server *nfss,
-+ const struct nfs_fh *mntfh)
++/*
++ * pnfs_modules_tbl holds all pnfs modules
++ *
++ * Entries are struct pnfs_layoutdriver_type, linked through their
++ * pnfs_tblid member and looked up by their id field.
++ */
++static LIST_HEAD(pnfs_modules_tbl);
++
++/* Return the registered pnfs layout driver module matching given id,
++ * or NULL when no entry of pnfs_modules_tbl carries that id.
++ *
++ * Does no locking of its own: both callers in this file hold
++ * pnfs_spinlock around the call.
++ */
++static struct pnfs_layoutdriver_type *
++find_pnfs_driver_locked(u32 id)
+{
-+ int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
-+ nfs4_fl_free_deviceid_callback);
-+ if (status) {
-+ printk(KERN_WARNING "%s: deviceid cache could not be "
-+ "initialized\n", __func__);
-+ return status;
++ struct pnfs_layoutdriver_type *local;
++
++ list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
++ if (local->id == id)
++ goto out;
++ local = NULL;
++out:
++ dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
++ return local;
++}
++
++/*
++ * Locked lookup: search pnfs_modules_tbl for the layout driver registered
++ * under @id while holding pnfs_spinlock.  Returns the driver, or NULL if
++ * none is registered.  No module reference is taken here.
++ */
++static struct pnfs_layoutdriver_type *
++find_pnfs_driver(u32 id)
++{
++	struct pnfs_layoutdriver_type *match;
++
++	spin_lock(&pnfs_spinlock);
++	match = find_pnfs_driver_locked(id);
++	spin_unlock(&pnfs_spinlock);
++
++	return match;
++}
++
++/* Set cred to indicate we require a layoutcommit.
++ * If we don't even have a layout, we don't need to commit it.
++ *
++ * Under the inode's i_lock: when a layout exists and
++ * NFS_LAYOUT_NEED_LCOMMIT is not yet set, a reference to the open owner's
++ * credential (ctx->state->owner->so_cred) is stored in layout->cred, the
++ * flag is set and nfsi->change_attr is incremented.  In every other case
++ * nothing is modified.
++ */
++void
++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx)
++{
++ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx);
++ spin_lock(&nfsi->vfs_inode.i_lock);
++ if (has_layout(nfsi) &&
++ !test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state)) {
++ nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred);
++ __set_bit(NFS_LAYOUT_NEED_LCOMMIT,
++ &nfsi->layout->state);
++ nfsi->change_attr++;
++ spin_unlock(&nfsi->vfs_inode.i_lock);
++ dprintk("%s: Set layoutcommit\n", __func__);
++ return;
+ }
-+ dprintk("%s: deviceid cache has been initialized successfully\n",
-+ __func__);
-+ return 0;
++ spin_unlock(&nfsi->vfs_inode.i_lock);
+}
+
-+/* Uninitialize a mountpoint by destroying its device list */
-+int
-+filelayout_uninitialize_mountpoint(struct nfs_server *nfss)
++/* Update last_write_offset for layoutcommit.
++ * TODO: We should only use committed extents, but the current nfs
++ * implementation does not calculate the written range in nfs_commit_done.
++ * We therefore update this field in writeback_done.
++ *
++ * Under i_lock, widens layout->write_begin_pos / layout->write_end_pos so
++ * that they cover the written range [offset, offset + extent - 1].
++ *
++ * nfsi->layout is dereferenced without a NULL check.
++ * NOTE(review): presumably callers only get here while a layout is
++ * attached to the inode -- confirm.
++ */
++void
++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent)
+{
-+ dprintk("--> %s\n", __func__);
++ loff_t end_pos;
+
-+ if (nfss->nfs_client->cl_devid_cache)
-+ pnfs_put_deviceid_cache(nfss->nfs_client);
-+ return 0;
++ spin_lock(&nfsi->vfs_inode.i_lock);
++ if (offset < nfsi->layout->write_begin_pos)
++ nfsi->layout->write_begin_pos = offset;
++ end_pos = offset + extent - 1; /* I'm being inclusive */
++ if (end_pos > nfsi->layout->write_end_pos)
++ nfsi->layout->write_end_pos = end_pos;
++ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n",
++ __func__,
++ (unsigned long) extent,
++ (unsigned long) offset ,
++ (unsigned long) nfsi->layout->write_begin_pos,
++ (unsigned long) nfsi->layout->write_end_pos);
++ spin_unlock(&nfsi->vfs_inode.i_lock);
+}
+
-+/* This function is used by the layout driver to calculate the
-+ * offset of the file on the dserver based on whether the
-+ * layout type is STRIPE_DENSE or STRIPE_SPARSE
++/*
++ * Detach the layout driver currently bound to @nfss, if any: run the
++ * driver's uninitialize_mountpoint() hook, drop the module reference on
++ * its owner, and clear nfss->pnfs_curr_ld.
++ */
++void
++unset_pnfs_layoutdriver(struct nfs_server *nfss)
++{
++	if (nfss->pnfs_curr_ld == NULL)
++		return;		/* nothing bound; the field is already NULL */
++
++	nfss->pnfs_curr_ld->uninitialize_mountpoint(nfss);
++	module_put(nfss->pnfs_curr_ld->owner);
++	nfss->pnfs_curr_ld = NULL;
++}
++
++/*
++ * Try to set the server's pnfs module to the pnfs layout type specified by id.
++ * Currently only one pNFS layout driver per filesystem is supported.
++ *
++ * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
++ *
++ * A driver is only bound for an NFSv4 minor version 1 client whose
++ * exchange flags include USE_NON_PNFS or USE_PNFS_MDS.  If no driver is
++ * registered for @id, request_module() is tried once before giving up.
++ * On success a reference on the driver's owning module is held and
++ * initialize_mountpoint() has returned 0; on every failure path
++ * server->pnfs_curr_ld is left NULL and no module reference is kept.
+ */
-+static loff_t
-+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
++void
++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
++ u32 id)
+{
-+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
++ struct pnfs_layoutdriver_type *ld_type = NULL;
+
-+ switch (flseg->stripe_type) {
-+ case STRIPE_SPARSE:
-+ return offset;
++ if (id == 0)
++ goto out_no_driver;
++ if ((server->nfs_client->rpc_ops->version != 4) ||
++ (server->nfs_client->cl_minorversion != 1))
++ goto out_no_driver;
++ if (!(server->nfs_client->cl_exchange_flags &
++ (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
++ printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
++ id, server->nfs_client->cl_exchange_flags);
++ goto out_no_driver;
++ }
++ ld_type = find_pnfs_driver(id);
++ if (!ld_type) {
++ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
++ ld_type = find_pnfs_driver(id);
++ if (!ld_type) {
++ dprintk("%s: No pNFS module found for %u.\n",
++ __func__, id);
++ goto out_no_driver;
++ }
++ }
++ if (!try_module_get(ld_type->owner)) {
++ dprintk("%s: Could not grab reference on module\n", __func__);
++ goto out_no_driver;
++ }
++ server->pnfs_curr_ld = ld_type;
++ if (ld_type->initialize_mountpoint(server, mntfh)) {
++ printk(KERN_ERR
++ "%s: Error initializing mount point for layout driver %u.\n",
++ __func__, id);
++ module_put(ld_type->owner);
++ goto out_no_driver;
++ }
++ dprintk("%s: pNFS module for %u set\n", __func__, id);
++ return;
+
-+ case STRIPE_DENSE:
-+ {
-+ u32 stripe_width;
-+ u64 tmp, off;
-+ u32 unit = flseg->stripe_unit;
++out_no_driver:
++ dprintk("%s: Using NFSv4 I/O\n", __func__);
++ server->pnfs_curr_ld = NULL;
++}
+
-+ stripe_width = unit * flseg->dsaddr->stripe_count;
-+ tmp = off = offset - flseg->pattern_offset;
-+ do_div(tmp, stripe_width);
-+ return tmp * unit + do_div(off, unit);
++/*
++ * Add @ld_type to pnfs_modules_tbl so that set_pnfs_layoutdriver() can
++ * find it by id.
++ *
++ * The driver must use a non-zero id and provide alloc_lseg, free_lseg,
++ * read_pagelist, write_pagelist and commit.  Registration is refused if
++ * another driver with the same id is already in the table.
++ *
++ * Returns 0 on success and -EINVAL on any of the failures above.
++ */
++int
++pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
++{
++ int status = -EINVAL;
++ struct pnfs_layoutdriver_type *tmp;
++
++ if (ld_type->id == 0) {
++ printk(KERN_ERR "%s id 0 is reserved\n", __func__);
++ return status;
+ }
-+ default:
-+ BUG();
++ if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
++ printk(KERN_ERR "%s Layout driver must provide "
++ "alloc_lseg and free_lseg.\n", __func__);
++ return status;
++ }
++
++ if (!ld_type->read_pagelist || !ld_type->write_pagelist ||
++ !ld_type->commit) {
++ printk(KERN_ERR "%s Layout driver must provide "
++ "read_pagelist, write_pagelist, and commit.\n",
++ __func__);
++ return status;
++ }
++
++ /* lookup and insertion happen under one lock hold, so two drivers
++ * racing to register the same id cannot both succeed */
++ spin_lock(&pnfs_spinlock);
++ tmp = find_pnfs_driver_locked(ld_type->id);
++ if (!tmp) {
++ list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
++ status = 0;
++ dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
++ ld_type->name);
++ } else {
++ /* the id is unsigned: print it with %u like every other message */
++ printk(KERN_ERR "%s Module with id %u already loaded!\n",
++ __func__, ld_type->id);
+ }
++ spin_unlock(&pnfs_spinlock);
++
++ return status;
++}
++EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
++
++/* Remove @ld_type from pnfs_modules_tbl under pnfs_spinlock.
++ * list_del() is applied to its pnfs_tblid link unconditionally, so the
++ * driver must currently be on the list.
++ */
++void
++pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
++{
++ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
++ spin_lock(&pnfs_spinlock);
++ list_del(&ld_type->pnfs_tblid);
++ spin_unlock(&pnfs_spinlock);
++}
++EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
++
++/*
++ * pNFS client layout cache
++ */
++
++/*
++ * Allocate the layout header for @ino.  A layout driver that supplies
++ * alloc_layout_hdr() performs the allocation itself; otherwise a zeroed
++ * generic struct pnfs_layout_hdr is allocated with GFP_KERNEL (kzalloc()
++ * may return NULL).
++ */
++static struct pnfs_layout_hdr *
++pnfs_alloc_layout_hdr(struct inode *ino)
++{
++	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
++
++	if (ld->alloc_layout_hdr)
++		return ld->alloc_layout_hdr(ino);
++
++	return kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
++}
++
++/*
++ * Free a layout header obtained from pnfs_alloc_layout_hdr().
++ *
++ * The choice mirrors the allocator: if the driver supplies
++ * alloc_layout_hdr, the header goes back through the driver's
++ * free_layout_hdr(); otherwise it was kzalloc()ed and is kfree()d.
++ *
++ * NOTE(review): a driver that sets alloc_layout_hdr but leaves
++ * free_layout_hdr NULL would be called through a NULL pointer here --
++ * confirm that drivers always provide the pair.
++ */
++static void
++pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
++{
++	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->inode)->pnfs_curr_ld;
++
++	/* plain statements: a void function must not "return <expr>;" */
++	if (ld->alloc_layout_hdr)
++		ld->free_layout_hdr(lo);
++	else
++		kfree(lo);
++}
+
-+ /* We should never get here... just to stop the gcc warning */
-+ return 0;
++/* Take one more reference on @lo.  The caller must hold
++ * lo->inode->i_lock (asserted); the counter is incremented
++ * non-atomically.
++ */
++static void
++get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
++{
++ assert_spin_locked(&lo->inode->i_lock);
++ lo->refcount++;
+}
+
-+/*
-+ * Call ops for the async read/write cases
-+ * In the case of dense layouts, the offset needs to be reset to its
-+ * original value.
-+ */
-+static void filelayout_read_call_done(struct rpc_task *task, void *data)
++/* Drop one reference on @lo with lo->inode->i_lock held (asserted).
++ * BUGs if the count is already zero.  When the count reaches zero the
++ * header must no longer be linked on a layouts list (BUG otherwise); the
++ * inode's layout pointer is cleared and the header is freed.
++ */
++static void
++put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
-+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
++ assert_spin_locked(&lo->inode->i_lock);
++ BUG_ON(lo->refcount == 0);
+
-+ if (rdata->fldata.orig_offset) {
-+ dprintk("%s new off %llu orig offset %llu\n", __func__,
-+ rdata->args.offset, rdata->fldata.orig_offset);
-+ rdata->args.offset = rdata->fldata.orig_offset;
++ lo->refcount--;
++ if (!lo->refcount) {
++ dprintk("%s: freeing layout cache %p\n", __func__, lo);
++ BUG_ON(!list_empty(&lo->layouts));
++ NFS_I(lo->inode)->layout = NULL;
++ pnfs_free_layout_hdr(lo);
+ }
-+
-+ /* Note this may cause RPC to be resent */
-+ rdata->pdata.call_ops->rpc_call_done(task, data);
+}
+
-+static void filelayout_read_release(void *data)
++/* Drop one reference on the layout header currently attached to @inode,
++ * taking and releasing i_lock around put_layout_hdr_locked().
++ * NFS_I(inode)->layout is passed on unchecked, so it must be non-NULL.
++ */
++void
++put_layout_hdr(struct inode *inode)
+{
-+ struct nfs_read_data *rdata = (struct nfs_read_data *)data;
++ spin_lock(&inode->i_lock);
++ put_layout_hdr_locked(NFS_I(inode)->layout);
++ spin_unlock(&inode->i_lock);
+
-+ put_lseg(rdata->pdata.lseg);
-+ rdata->pdata.lseg = NULL;
-+ rdata->pdata.call_ops->rpc_release(data);
+}
+
-+static void filelayout_write_call_done(struct rpc_task *task, void *data)
++/* Generic initialisation of a layout segment: empty fi_list link,
++ * freshly initialised kref, marked valid, back-pointer set to @lo.
++ * The segment is not inserted into any list here and no reference on @lo
++ * is taken.
++ */
++static void
++init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+{
-+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
-+
-+ if (wdata->fldata.orig_offset) {
-+ dprintk("%s new off %llu orig offset %llu\n", __func__,
-+ wdata->args.offset, wdata->fldata.orig_offset);
-+ wdata->args.offset = wdata->fldata.orig_offset;
-+ }
-+
-+ /* Note this may cause RPC to be resent */
-+ wdata->pdata.call_ops->rpc_call_done(task, data);
++ INIT_LIST_HEAD(&lseg->fi_list);
++ kref_init(&lseg->kref);
++ lseg->valid = true;
++ lseg->layout = lo;
+}
+
-+static void filelayout_write_release(void *data)
++/* kref release callback for a layout segment (handed to kref_put() by
++ * put_lseg()).  Passes the segment to the layout driver's free_lseg() and
++ * then drops one reference on the layout header it pointed to; the header
++ * pointer is saved first because the segment is gone after free_lseg().
++ *
++ * Called without i_lock held: put_layout_hdr() takes the lock itself.
++ */
++static void
++destroy_lseg(struct kref *kref)
+{
-+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
++ struct pnfs_layout_segment *lseg =
++ container_of(kref, struct pnfs_layout_segment, kref);
++ struct pnfs_layout_hdr *local = lseg->layout;
+
-+ put_lseg(wdata->pdata.lseg);
-+ wdata->pdata.lseg = NULL;
-+ wdata->pdata.call_ops->rpc_release(data);
++ dprintk("--> %s\n", __func__);
++ NFS_SERVER(local->inode)->pnfs_curr_ld->free_lseg(lseg);
++ /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
++ put_layout_hdr(local->inode);
+}
+
-+struct rpc_call_ops filelayout_read_call_ops = {
-+ .rpc_call_prepare = nfs_read_prepare,
-+ .rpc_call_done = filelayout_read_call_done,
-+ .rpc_release = filelayout_read_release,
-+};
-+
-+struct rpc_call_ops filelayout_write_call_ops = {
-+ .rpc_call_prepare = nfs_write_prepare,
-+ .rpc_call_done = filelayout_write_call_done,
-+ .rpc_release = filelayout_write_release,
-+};
-+
-+/* Perform sync or async reads.
-+ *
-+ * An optimization for the NFS file layout driver
-+ * allows the original read/write data structs to be passed in the
-+ * last argument.
-+ *
-+ * TODO: join with write_pagelist?
-+ */
-+static enum pnfs_try_status
-+filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages)
++/* Drop one reference on @lseg; a NULL @lseg is ignored.
++ *
++ * The valid flag and the owning nfs_inode are sampled before kref_put(),
++ * since the segment may be freed by that call.  If the segment had
++ * already been marked invalid, tasks sleeping on nfsi->lo_rpcwaitq are
++ * woken afterwards.
++ */
++void
++put_lseg(struct pnfs_layout_segment *lseg)
+{
-+ struct pnfs_layout_segment *lseg = data->pdata.lseg;
-+ struct nfs4_pnfs_ds *ds;
-+ loff_t offset = data->args.offset;
-+ u32 idx;
-+ struct nfs_fh *fh;
++ bool do_wake_up;
++ struct nfs_inode *nfsi;
+
-+ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n",
-+ __func__, data->inode->i_ino, nr_pages,
-+ data->args.pgbase, (size_t)data->args.count, offset);
++ if (!lseg)
++ return;
+
-+ /* Retrieve the correct rpc_client for the byte range */
-+ idx = nfs4_fl_calc_ds_index(lseg, offset);
-+ ds = nfs4_fl_prepare_ds(lseg, idx);
-+ if (!ds) {
-+ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
-+ return PNFS_NOT_ATTEMPTED;
-+ }
-+ dprintk("%s USE DS:ip %x %hu\n", __func__,
-+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
++ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
++ atomic_read(&lseg->kref.refcount), lseg->valid);
++ do_wake_up = !lseg->valid;
++ nfsi = NFS_I(lseg->layout->inode);
++ kref_put(&lseg->kref, destroy_lseg);
++ if (do_wake_up)
++ rpc_wake_up(&nfsi->lo_rpcwaitq);
++}
++EXPORT_SYMBOL_GPL(put_lseg);
+
-+ /* just try the first data server for the index..*/
-+ data->fldata.ds_nfs_client = ds->ds_clp;
-+ fh = nfs4_fl_select_ds_fh(lseg, offset);
-+ if (fh)
-+ data->args.fh = fh;
++/* Take one more reference on @lseg; @lseg must not be NULL.
++ * Released with put_lseg().
++ */
++void get_lseg(struct pnfs_layout_segment *lseg)
++{
++ kref_get(&lseg->kref);
++}
++EXPORT_SYMBOL_GPL(get_lseg);
+
-+ /*
-+ * Now get the file offset on the dserver
-+ * Set the read offset to this offset, and
-+ * save the original offset in orig_offset
-+ * In the case of aync reads, the offset will be reset in the
-+ * call_ops->rpc_call_done() routine.
-+ */
-+ data->args.offset = filelayout_get_dserver_offset(lseg, offset);
-+ data->fldata.orig_offset = offset;
++/* Exclusive end of the range [start, start + len).  If the 64-bit sum
++ * wraps around, the result saturates to NFS4_MAX_UINT64.
++ */
++static inline u64
++end_offset(u64 start, u64 len)
++{
++ u64 end;
+
-+ /* Perform an asynchronous read */
-+ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
-+ &filelayout_read_call_ops);
++ end = start + len;
++ return end >= start ? end: NFS4_MAX_UINT64;
++}
+
-+ data->pdata.pnfs_error = 0;
++/* last octet in a range
++ *
++ * Inclusive offset of the final byte of [start, start + len).  @len must
++ * be non-zero (BUG otherwise).  If start + len wraps around, the result
++ * saturates to NFS4_MAX_UINT64.
++ */
++static inline u64
++last_byte_offset(u64 start, u64 len)
++{
++ u64 end;
+
-+ data->pdata.pnfs_error = 0;
++ BUG_ON(!len);
++ end = start + len;
++ return end > start ? end - 1: NFS4_MAX_UINT64;
+}
+
-+/* Perform async writes. */
-+static enum pnfs_try_status
-+filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync)
++/*
++ * is l2 fully contained in l1?
++ *   start1                             end1
++ *   [----------------------------------)
++ *           start2           end2
++ *           [----------------)
++ *
++ * Both ranges are treated as half-open [offset, offset + length), with
++ * the ends computed by end_offset() (saturating on overflow).  Only
++ * offset and length are compared; iomode is not looked at.
++ * Returns 1 if l2 lies within l1, 0 otherwise.
++ */
++static inline int
++lo_seg_contained(struct pnfs_layout_range *l1,
++ struct pnfs_layout_range *l2)
+{
-+ struct pnfs_layout_segment *lseg = data->pdata.lseg;
-+ struct nfs4_pnfs_ds *ds;
-+ loff_t offset = data->args.offset;
-+ u32 idx;
-+ struct nfs_fh *fh;
++ u64 start1 = l1->offset;
++ u64 end1 = end_offset(start1, l1->length);
++ u64 start2 = l2->offset;
++ u64 end2 = end_offset(start2, l2->length);
+
-+ /* Retrieve the correct rpc_client for the byte range */
-+ idx = nfs4_fl_calc_ds_index(lseg, offset);
-+ ds = nfs4_fl_prepare_ds(lseg, idx);
-+ if (!ds) {
-+ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
-+ return PNFS_NOT_ATTEMPTED;
-+ }
-+ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
-+ data->inode->i_ino, sync, (size_t) data->args.count, offset,
-+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
++ return (start1 <= start2) && (end1 >= end2);
++}
+
-+ data->fldata.ds_nfs_client = ds->ds_clp;
-+ fh = nfs4_fl_select_ds_fh(lseg, offset);
-+ if (fh)
-+ data->args.fh = fh;
-+ /*
-+ * Get the file offset on the dserver. Set the write offset to
-+ * this offset and save the original offset.
-+ */
-+ data->args.offset = filelayout_get_dserver_offset(lseg, offset);
-+ data->fldata.orig_offset = offset;
++/*
++ * are l1 and l2 intersecting?
++ *   start1                             end1
++ *   [----------------------------------)
++ *                              start2           end2
++ *                              [----------------)
++ *
++ * Both ranges are treated as half-open [offset, offset + length), with
++ * the ends computed by end_offset().  An end equal to NFS4_MAX_UINT64 is
++ * treated as lying beyond every start offset.  iomode is not looked at.
++ * Returns 1 if the ranges share at least one byte, 0 otherwise.
++ */
++static inline int
++lo_seg_intersecting(struct pnfs_layout_range *l1,
++ struct pnfs_layout_range *l2)
++{
++ u64 start1 = l1->offset;
++ u64 end1 = end_offset(start1, l1->length);
++ u64 start2 = l2->offset;
++ u64 end2 = end_offset(start2, l2->length);
+
-+ /*
-+ * Perform an asynchronous write The offset will be reset in the
-+ * call_ops->rpc_call_done() routine
-+ */
-+ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
-+ &filelayout_write_call_ops, sync);
++ return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
++ (end2 == NFS4_MAX_UINT64 || end2 > start1);
++}
++
++/*
++ * Decide whether @lseg is selected by @range.
++ *
++ * iomode matching rules:
++ *	range	lseg	match
++ *	-----	-----	-----
++ *	ANY	READ	true
++ *	ANY	RW	true
++ *	RW	READ	false
++ *	RW	RW	true
++ *	READ	READ	true
++ *	READ	RW	false
++ *
++ * A segment whose iomode matches must also overlap @range's byte range
++ * (lo_seg_intersecting) to be selected.  Returns non-zero if selected.
++ */
++static int
++should_free_lseg(struct pnfs_layout_segment *lseg,
++		 struct pnfs_layout_range *range)
++{
++	/* iomode filter first: anything but ANY must match exactly */
++	if (range->iomode != IOMODE_ANY &&
++	    lseg->range.iomode != range->iomode)
++		return 0;
++
++	return lo_seg_intersecting(&lseg->range, range);
++}
+
-+ data->pdata.pnfs_error = 0;
-+ return PNFS_ATTEMPTED;
++/* True when exactly one reference to @lseg is outstanding.
++ * NOTE(review): presumably that last reference is the one taken when the
++ * segment was set up, i.e. nobody else is using the segment -- confirm.
++ */
++static bool
++_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg)
++{
++ return atomic_read(&lseg->kref.refcount) == 1;
+}
+
-+/*
-+ * filelayout_check_layout()
-+ *
-+ * Make sure layout segment parameters are sane WRT the device.
-+ * At this point no generic layer initialization of the lseg has occurred,
-+ * and nothing has been added to the layout_hdr cache.
-+ *
-+ */
-+static int
-+filelayout_check_layout(struct pnfs_layout_hdr *lo,
-+ struct nfs4_filelayout_segment *fl,
-+ struct nfs4_layoutget_res *lgr,
-+ struct nfs4_deviceid *id)
++/* Move every segment of @lo that is selected by @range (see
++ * should_free_lseg()) and has only one reference outstanding onto
++ * @tmp_list.  Segments that are still referenced elsewhere stay on
++ * lo->segs.  The caller holds lo->inode->i_lock (asserted).
++ *
++ * If lo->segs is empty afterwards, @lo is also unlinked from the list it
++ * is on via lo->layouts (under the client's cl_lock) and its layout
++ * stateid is invalidated.
++ */
++static void
++pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
++ struct pnfs_layout_range *range)
+{
-+ struct nfs4_file_layout_dsaddr *dsaddr;
-+ int status = -EINVAL;
-+ struct nfs_server *nfss = NFS_SERVER(lo->inode);
++ struct pnfs_layout_segment *lseg, *next;
+
-+ dprintk("--> %s\n", __func__);
++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
++ __func__, lo, range->offset, range->length, range->iomode);
+
-+ if (fl->pattern_offset > lgr->range.offset) {
-+ dprintk("%s pattern_offset %lld to large\n",
-+ __func__, fl->pattern_offset);
-+ goto out;
++ assert_spin_locked(&lo->inode->i_lock);
++ list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
++ if (!should_free_lseg(lseg, range) ||
++ !_pnfs_can_return_lseg(lseg))
++ continue;
++ dprintk("%s: freeing lseg %p iomode %d "
++ "offset %llu length %llu\n", __func__,
++ lseg, lseg->range.iomode, lseg->range.offset,
++ lseg->range.length);
++ list_move(&lseg->fi_list, tmp_list);
+ }
++ if (list_empty(&lo->segs)) {
++ struct nfs_client *clp;
+
-+ if (fl->stripe_unit % PAGE_SIZE) {
-+ dprintk("%s Stripe unit (%u) not page aligned\n",
-+ __func__, fl->stripe_unit);
-+ goto out;
++ clp = NFS_SERVER(lo->inode)->nfs_client;
++ spin_lock(&clp->cl_lock);
++ /* List does not take a reference, so no need for put here */
++ list_del_init(&lo->layouts);
++ spin_unlock(&clp->cl_lock);
++ pnfs_invalidate_layout_stateid(lo);
+ }
+
-+ /* find and reference the deviceid */
-+ dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
-+ if (dsaddr == NULL) {
-+ dsaddr = get_device_info(lo->inode, id);
-+ if (dsaddr == NULL)
-+ goto out;
-+ }
-+ fl->dsaddr = dsaddr;
++ dprintk("%s:Return\n", __func__);
++}
+
-+ if (fl->first_stripe_index < 0 ||
-+ fl->first_stripe_index >= dsaddr->stripe_count) {
-+ dprintk("%s Bad first_stripe_index %d\n",
-+ __func__, fl->first_stripe_index);
-+ goto out_put;
-+ }
++/* Empty @tmp_list: unlink each segment in turn and drop one reference
++ * on it with put_lseg().  pnfs_destroy_layout() calls this after
++ * releasing i_lock.
++ */
++static void
++pnfs_free_lseg_list(struct list_head *tmp_list)
++{
++ struct pnfs_layout_segment *lseg;
+
-+ if ((fl->stripe_type == STRIPE_SPARSE &&
-+ fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
-+ (fl->stripe_type == STRIPE_DENSE &&
-+ fl->num_fh != dsaddr->stripe_count)) {
-+ dprintk("%s num_fh %u not valid for given packing\n",
-+ __func__, fl->num_fh);
-+ goto out_put;
++ while (!list_empty(tmp_list)) {
++ lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
++ fi_list);
++ dprintk("%s calling put_lseg on %p\n", __func__, lseg);
++ list_del(&lseg->fi_list);
++ put_lseg(lseg);
+ }
++}
+
-+ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
-+ dprintk("%s Stripe unit (%u) not aligned with rsize %u "
-+ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
-+ nfss->wsize);
-+ }
++/* Tear down the layout attached to @nfsi, if there is one.
++ *
++ * Under i_lock, all segments (any iomode, whole file) that have a single
++ * reference outstanding are collected onto a private list and one
++ * reference on the layout header is dropped.  The collected segments are
++ * released after the lock has been dropped.  The WARN_ONs fire if
++ * segments remain, if the header is still on a layouts list, or if its
++ * refcount is not 1 at that point.
++ */
++void
++pnfs_destroy_layout(struct nfs_inode *nfsi)
++{
++ struct pnfs_layout_hdr *lo;
++ struct pnfs_layout_range range = {
++ .iomode = IOMODE_ANY,
++ .offset = 0,
++ .length = NFS4_MAX_UINT64,
++ };
++ LIST_HEAD(tmp_list);
+
-+ status = 0;
-+out:
-+ dprintk("--> %s returns %d\n", __func__, status);
-+ return status;
-+out_put:
-+ pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
-+ goto out;
++ spin_lock(&nfsi->vfs_inode.i_lock);
++ lo = nfsi->layout;
++ if (lo) {
++ pnfs_clear_lseg_list(lo, &tmp_list, &range);
++ WARN_ON(!list_empty(&nfsi->layout->segs));
++ WARN_ON(!list_empty(&nfsi->layout->layouts));
++ WARN_ON(nfsi->layout->refcount != 1);
++
++ /* Matched by refcount set to 1 in alloc_init_layout_hdr */
++ put_layout_hdr_locked(lo);
++ }
++ spin_unlock(&nfsi->vfs_inode.i_lock);
++ pnfs_free_lseg_list(&tmp_list);
+}
+
-+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
++/*
++ * Called by the state manager to remove all layouts established under an
++ * expired lease.
++ *
++ * The client's cl_layouts list is spliced onto a private list under
++ * cl_lock, then pnfs_destroy_layout() is run on the first entry until
++ * the private list is empty.
++ *
++ * NOTE(review): the loop only advances if pnfs_destroy_layout() unlinks
++ * lo->layouts, which pnfs_clear_lseg_list() does only once lo->segs is
++ * empty -- confirm that no segment can still be referenced here.
++ */
++void
++pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
-+ int i;
++ struct pnfs_layout_hdr *lo;
++ LIST_HEAD(tmp_list);
+
-+ for (i = 0; i < fl->num_fh; i++) {
-+ if (!fl->fh_array[i])
-+ break;
-+ kfree(fl->fh_array[i]);
++ spin_lock(&clp->cl_lock);
++ list_splice_init(&clp->cl_layouts, &tmp_list);
++ spin_unlock(&clp->cl_lock);
++
++ while (!list_empty(&tmp_list)) {
++ lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
++ layouts);
++ dprintk("%s freeing layout for inode %lu\n", __func__,
++ lo->inode->i_ino);
++ pnfs_destroy_layout(NFS_I(lo->inode));
+ }
-+ kfree(fl->fh_array);
-+ fl->fh_array = NULL;
+}
+
++/* update lo->stateid with new if it is more recent
++ *
++ * lo->stateid could be the open stateid, in which case we just use what is given.
++ */
+static void
-+_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
++pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
++ const nfs4_stateid *new)
+{
-+ filelayout_free_fh_array(fl);
-+ kfree(fl);
++ nfs4_stateid *old = &lo->stateid;
++ bool overwrite = false;
++
++ write_seqlock(&lo->seqlock);
++ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
++ memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
++ overwrite = true;
++ else {
++ u32 oldseq, newseq;
++
++ oldseq = be32_to_cpu(old->stateid.seqid);
++ newseq = be32_to_cpu(new->stateid.seqid);
++ if ((int)(newseq - oldseq) > 0)
++ overwrite = true;
++ }
++ if (overwrite)
++ memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
++ write_sequnlock(&lo->seqlock);
+}
+
-+static int
-+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
-+ struct nfs4_filelayout_segment *fl,
-+ struct nfs4_layoutget_res *lgr,
-+ struct nfs4_deviceid *id)
++static void
++pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
++ struct nfs4_state *state)
+{
-+ uint32_t *p = (uint32_t *)lgr->layout.buf;
-+ uint32_t nfl_util;
-+ int i;
-+
-+ dprintk("%s: set_layout_map Begin\n", __func__);
++ int seq;
+
-+ memcpy(id, p, sizeof(*id));
-+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-+ print_deviceid(id);
++ dprintk("--> %s\n", __func__);
++ write_seqlock(&lo->seqlock);
++ do {
++ seq = read_seqbegin(&state->seqlock);
++ memcpy(lo->stateid.data, state->stateid.data,
++ sizeof(state->stateid.data));
++ } while (read_seqretry(&state->seqlock, seq));
++ set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
++ write_sequnlock(&lo->seqlock);
++ dprintk("<-- %s\n", __func__);
++}
+
-+ nfl_util = be32_to_cpup(p++);
-+ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
-+ fl->commit_through_mds = 1;
-+ if (nfl_util & NFL4_UFLG_DENSE)
-+ fl->stripe_type = STRIPE_DENSE;
-+ else
-+ fl->stripe_type = STRIPE_SPARSE;
-+ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
++void
++pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
++ struct nfs4_state *open_state)
++{
++ int seq;
+
-+ fl->first_stripe_index = be32_to_cpup(p++);
-+ p = xdr_decode_hyper(p, &fl->pattern_offset);
-+ fl->num_fh = be32_to_cpup(p++);
++ dprintk("--> %s\n", __func__);
++ do {
++ seq = read_seqbegin(&lo->seqlock);
++ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
++ /* This will trigger retry of the read */
++ pnfs_layout_from_open_stateid(lo, open_state);
++ } else
++ memcpy(dst->data, lo->stateid.data,
++ sizeof(lo->stateid.data));
++ } while (read_seqretry(&lo->seqlock, seq));
++ dprintk("<-- %s\n", __func__);
++}
+
-+ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
-+ __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
-+ fl->pattern_offset);
++/*
++* Get layout from server.
++* for now, assume that whole file layouts are requested.
++* arg->offset: 0
++* arg->length: all ones
++*/
++static struct pnfs_layout_segment *
++send_layoutget(struct pnfs_layout_hdr *lo,
++ struct nfs_open_context *ctx,
++ struct pnfs_layout_range *range)
++{
++ struct inode *ino = lo->inode;
++ struct nfs_server *server = NFS_SERVER(ino);
++ struct nfs4_layoutget *lgp;
++ struct pnfs_layout_segment *lseg = NULL;
+
-+ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
-+ GFP_KERNEL);
-+ if (!fl->fh_array)
-+ return -ENOMEM;
++ dprintk("--> %s\n", __func__);
+
-+ for (i = 0; i < fl->num_fh; i++) {
-+ /* Do we want to use a mempool here? */
-+ fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
-+ if (!fl->fh_array[i]) {
-+ filelayout_free_fh_array(fl);
-+ return -ENOMEM;
-+ }
-+ fl->fh_array[i]->size = be32_to_cpup(p++);
-+ if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
-+ printk(KERN_ERR "Too big fh %d received %d\n",
-+ i, fl->fh_array[i]->size);
-+ filelayout_free_fh_array(fl);
-+ return -EIO;
-+ }
-+ memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
-+ p += XDR_QUADLEN(fl->fh_array[i]->size);
-+ dprintk("DEBUG: %s: fh len %d\n", __func__,
-+ fl->fh_array[i]->size);
++ BUG_ON(ctx == NULL);
++ lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
++ if (lgp == NULL) {
++ put_layout_hdr(ino);
++ return NULL;
+ }
++ lgp->args.minlength = PAGE_CACHE_SIZE;
++ if (lgp->args.minlength > range->length)
++ lgp->args.minlength = range->length;
++ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
++ lgp->args.range = *range;
++ lgp->args.type = server->pnfs_curr_ld->id;
++ lgp->args.inode = ino;
++ lgp->args.ctx = get_nfs_open_context(ctx);
++ lgp->lsegpp = &lseg;
+
-+ return 0;
++ /* Synchronously retrieve layout information from server and
++ * store in lseg.
++ */
++ nfs4_proc_layoutget(lgp);
++ if (!lseg) {
++ /* remember that LAYOUTGET failed and suspend trying */
++ set_bit(lo_fail_bit(range->iomode), &lo->state);
++ }
++ return lseg;
+}
+
+static struct pnfs_layout_segment *
-+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
-+ struct nfs4_layoutget_res *lgr)
++has_layout_to_return(struct pnfs_layout_hdr *lo,
++ struct pnfs_layout_range *range)
+{
-+ struct nfs4_filelayout_segment *fl;
-+ int rc;
-+ struct nfs4_deviceid id;
++ struct pnfs_layout_segment *out = NULL, *lseg;
++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
++ __func__, lo, range->offset, range->length, range->iomode);
+
-+ dprintk("--> %s\n", __func__);
-+ fl = kzalloc(sizeof(*fl), GFP_KERNEL);
-+ if (!fl)
-+ return NULL;
++ assert_spin_locked(&lo->inode->i_lock);
++ list_for_each_entry(lseg, &lo->segs, fi_list)
++ if (should_free_lseg(lseg, range)) {
++ out = lseg;
++ break;
++ }
+
-+ rc = filelayout_decode_layout(layoutid, fl, lgr, &id);
-+ if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) {
-+ _filelayout_free_lseg(fl);
-+ return NULL;
-+ }
-+ return &fl->generic_hdr;
++ dprintk("%s:Return lseg=%p\n", __func__, out);
++ return out;
+}
+
-+static void
-+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
++bool
++pnfs_return_layout_barrier(struct nfs_inode *nfsi,
++ struct pnfs_layout_range *range)
+{
-+ struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
-+ struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
++ struct pnfs_layout_segment *lseg;
++ bool ret = false;
+
-+ dprintk("--> %s\n", __func__);
-+ pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
-+ &fl->dsaddr->deviceid);
-+ _filelayout_free_lseg(fl);
++ spin_lock(&nfsi->vfs_inode.i_lock);
++ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) {
++ if (!should_free_lseg(lseg, range))
++ continue;
++ lseg->valid = false;
++ if (!_pnfs_can_return_lseg(lseg)) {
++ dprintk("%s: wait on lseg %p refcount %d\n",
++ __func__, lseg,
++ atomic_read(&lseg->kref.refcount));
++ ret = true;
++ }
++ }
++ spin_unlock(&nfsi->vfs_inode.i_lock);
++ dprintk("%s:Return %d\n", __func__, ret);
++ return ret;
+}
+
-+/* Allocate a new nfs_write_data struct and initialize */
-+static struct nfs_write_data *
-+filelayout_clone_write_data(struct nfs_write_data *old)
++void
++pnfs_layoutreturn_release(struct nfs4_layoutreturn *lrp)
+{
-+ static struct nfs_write_data *new;
++ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
++ LIST_HEAD(tmp_list);
+
-+ new = nfs_commitdata_alloc();
-+ if (!new)
-+ goto out;
-+ kref_init(&new->refcount);
-+ new->parent = old;
-+ kref_get(&old->refcount);
-+ new->inode = old->inode;
-+ new->cred = old->cred;
-+ new->args.offset = 0;
-+ new->args.count = 0;
-+ new->res.count = 0;
-+ new->res.fattr = &new->fattr;
-+ nfs_fattr_init(&new->fattr);
-+ new->res.verf = &new->verf;
-+ new->args.context = get_nfs_open_context(old->args.context);
-+ new->pdata.lseg = NULL;
-+ new->pdata.call_ops = old->pdata.call_ops;
-+ new->pdata.how = old->pdata.how;
-+out:
-+ return new;
++ if (lrp->args.return_type != RETURN_FILE)
++ return;
++ spin_lock(&lrp->args.inode->i_lock);
++ pnfs_clear_lseg_list(lo, &tmp_list, &lrp->args.range);
++ if (!lrp->res.valid)
++ ; /* forgetful model internal release */
++ else if (!lrp->res.lrs_present)
++ pnfs_invalidate_layout_stateid(lo);
++ else
++ pnfs_set_layout_stateid(lo, &lrp->res.stateid);
++ put_layout_hdr_locked(lo); /* Matched in _pnfs_return_layout */
++ spin_unlock(&lrp->args.inode->i_lock);
++ pnfs_free_lseg_list(&tmp_list);
+}
+
-+static void filelayout_commit_call_done(struct rpc_task *task, void *data)
++static int
++return_layout(struct inode *ino, struct pnfs_layout_range *range,
++ enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo,
++ bool wait, const nfs4_stateid *stateid)
+{
-+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
++ struct nfs4_layoutreturn *lrp;
++ struct nfs_server *server = NFS_SERVER(ino);
++ int status = -ENOMEM;
+
-+ wdata->pdata.call_ops->rpc_call_done(task, data);
-+}
++ dprintk("--> %s\n", __func__);
+
-+static struct rpc_call_ops filelayout_commit_call_ops = {
-+ .rpc_call_prepare = nfs_write_prepare,
-+ .rpc_call_done = filelayout_commit_call_done,
-+ .rpc_release = filelayout_write_release,
-+};
++ BUG_ON(type != RETURN_FILE);
+
-+/*
-+ * Execute a COMMIT op to the MDS or to each data server on which a page
-+ * in 'pages' exists.
-+ * Invoke the pnfs_commit_complete callback.
-+ */
-+enum pnfs_try_status
-+filelayout_commit(struct nfs_write_data *data, int sync)
++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
++ if (lrp == NULL) {
++ if (lo && (type == RETURN_FILE))
++ put_layout_hdr(lo->inode);
++ goto out;
++ }
++ lrp->args.reclaim = 0;
++ lrp->args.layout_type = server->pnfs_curr_ld->id;
++ lrp->args.return_type = type;
++ lrp->args.range = *range;
++ lrp->args.inode = ino;
++ lrp->stateid = stateid;
++
++ status = nfs4_proc_layoutreturn(lrp, wait);
++out:
++ dprintk("<-- %s status: %d\n", __func__, status);
++ return status;
++}
++
++int
++_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
++ const nfs4_stateid *stateid, /* optional */
++ enum pnfs_layoutreturn_type type,
++ bool wait)
+{
-+ LIST_HEAD(head);
-+ struct nfs_page *req;
-+ loff_t file_offset = 0;
-+ u16 idx, i;
-+ struct list_head **ds_page_list = NULL;
-+ u16 *indices_used;
-+ int num_indices_seen = 0;
-+ const struct rpc_call_ops *call_ops;
-+ struct rpc_clnt *clnt;
-+ struct nfs_write_data **clone_list = NULL;
-+ struct nfs_write_data *dsdata;
-+ struct nfs4_pnfs_ds *ds;
++ struct pnfs_layout_hdr *lo = NULL;
++ struct nfs_inode *nfsi = NFS_I(ino);
++ struct pnfs_layout_range arg;
++ int status = 0;
+
-+ dprintk("%s data %p sync %d\n", __func__, data, sync);
++ dprintk("--> %s type %d\n", __func__, type);
+
-+ /* Alloc room for both in one go */
-+ ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) *
-+ (sizeof(u16) + sizeof(struct list_head *)),
-+ GFP_KERNEL);
-+ if (!ds_page_list)
-+ goto mem_error;
-+ indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1);
-+ /*
-+ * Sort pages based on which ds to send to.
-+ * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT.
-+ * Note we are assuming there is only a single lseg in play.
-+ * When that is not true, we could first sort on lseg, then
-+ * sort within each as we do here.
-+ */
-+ while (!list_empty(&data->pages)) {
-+ req = nfs_list_entry(data->pages.next);
-+ nfs_list_remove_request(req);
-+ if (!req->wb_lseg ||
-+ ((struct nfs4_filelayout_segment *)
-+ FILELAYOUT_LSEG(req->wb_lseg))->commit_through_mds)
-+ idx = NFS4_PNFS_MAX_MULTI_CNT;
-+ else {
-+ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
-+ idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset);
-+ }
-+ if (ds_page_list[idx]) {
-+ /* Already seen this idx */
-+ list_add(&req->wb_list, ds_page_list[idx]);
-+ } else {
-+ /* New idx not seen so far */
-+ list_add_tail(&req->wb_list, &head);
-+ indices_used[num_indices_seen++] = idx;
-+ }
-+ ds_page_list[idx] = &req->wb_list;
-+ }
-+ /* Once created, clone must be released via call_op */
-+ clone_list = kzalloc(num_indices_seen *
-+ sizeof(struct nfs_write_data *), GFP_KERNEL);
-+ if (!clone_list)
-+ goto mem_error;
-+ for (i = 0; i < num_indices_seen - 1; i++) {
-+ clone_list[i] = filelayout_clone_write_data(data);
-+ if (!clone_list[i])
-+ goto mem_error;
-+ }
-+ clone_list[i] = data;
-+ /*
-+ * Now send off the RPCs to each ds. Note that it is important
-+ * that any RPC to the MDS be sent last (or at least after all
-+ * clones have been made.)
-+ */
-+ for (i = 0; i < num_indices_seen; i++) {
-+ dsdata = clone_list[i];
-+ idx = indices_used[i];
-+ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]);
-+ if (idx == NFS4_PNFS_MAX_MULTI_CNT) {
-+ call_ops = data->pdata.call_ops;;
-+ clnt = NFS_CLIENT(dsdata->inode);
-+ ds = NULL;
-+ } else {
-+ struct nfs_fh *fh;
+
-+ call_ops = &filelayout_commit_call_ops;
-+ req = nfs_list_entry(dsdata->pages.next);
-+ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx);
-+ if (!ds) {
-+ /* Trigger retry of this chunk through MDS */
-+ dsdata->task.tk_status = -EIO;
-+ data->pdata.call_ops->rpc_release(dsdata);
-+ continue;
-+ }
-+ clnt = ds->ds_clp->cl_rpcclient;
-+ dsdata->fldata.ds_nfs_client = ds->ds_clp;
-+ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT;
-+ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset);
-+ if (fh)
-+ dsdata->args.fh = fh;
-+ }
-+ dprintk("%s: Initiating commit: %llu USE DS:\n",
-+ __func__, file_offset);
-+ print_ds(ds);
++ arg.iomode = range ? range->iomode : IOMODE_ANY;
++ arg.offset = 0;
++ arg.length = NFS4_MAX_UINT64;
++
++ if (type == RETURN_FILE) {
++ spin_lock(&ino->i_lock);
++ lo = nfsi->layout;
++ if (lo && !has_layout_to_return(lo, &arg))
++ lo = NULL;
++ if (!lo) {
++ spin_unlock(&ino->i_lock);
++ dprintk("%s: no layout segments to return\n", __func__);
++ goto out;
++ }
+
-+ /* Send COMMIT to data server */
-+ nfs_initiate_commit(dsdata, clnt, call_ops, sync);
-+ }
-+ kfree(clone_list);
-+ kfree(ds_page_list);
-+ data->pdata.pnfs_error = 0;
-+ return PNFS_ATTEMPTED;
++ /* Reference matched in pnfs_layoutreturn_release */
++ get_layout_hdr_locked(lo);
+
-+ mem_error:
-+ if (clone_list) {
-+ for (i = 0; i < num_indices_seen - 1; i++) {
-+ if (!clone_list[i])
-+ break;
-+ data->pdata.call_ops->rpc_release(clone_list[i]);
++ spin_unlock(&ino->i_lock);
++
++ if (layoutcommit_needed(nfsi)) {
++ if (stateid && !wait) { /* callback */
++ dprintk("%s: layoutcommit pending\n", __func__);
++ status = -EAGAIN;
++ goto out_put;
++ }
++ status = pnfs_layoutcommit_inode(ino, wait);
++ if (status) {
++ /* Return layout even if layoutcommit fails */
++ dprintk("%s: layoutcommit failed, status=%d. "
++ "Returning layout anyway\n",
++ __func__, status);
++ }
+ }
-+ kfree(clone_list);
++ status = return_layout(ino, &arg, type, lo, wait, stateid);
+ }
-+ kfree(ds_page_list);
-+ /* One of these will be empty, but doesn't hurt to do both */
-+ nfs_mark_list_commit(&head);
-+ nfs_mark_list_commit(&data->pages);
-+ data->pdata.call_ops->rpc_release(data);
-+ return PNFS_ATTEMPTED;
++out:
++ dprintk("<-- %s status: %d\n", __func__, status);
++ return status;
++out_put:
++ put_layout_hdr(ino);
++ goto out;
+}
+
+/*
-+ * Return the stripesize for the specified file
-+ * Called with inode i_lock held.
++ * Compare two layout segments for sorting into layout cache.
++ * We want to preferentially return RW over RO layouts, so ensure those
++ * are seen first.
+ */
-+ssize_t
-+filelayout_get_stripesize(struct pnfs_layout_hdr *lo)
++static s64
++cmp_layout(struct pnfs_layout_range *l1,
++ struct pnfs_layout_range *l2)
+{
-+ struct pnfs_layout_range range = {
-+ .iomode = IOMODE_READ,
-+ .offset = 0,
-+ .length = NFS4_MAX_UINT64,
-+ };
-+ struct pnfs_layout_segment *lseg;
-+ struct nfs4_filelayout_segment *fl;
-+ ssize_t size;
++ s64 d;
+
-+ /* Horrible hack...ideally upper layer would send lseg */
-+ lseg = pnfs_has_layout(lo, &range);
-+ if (!lseg)
-+ return 0;
-+ fl = container_of(lseg, struct nfs4_filelayout_segment, generic_hdr);
-+ size = fl->stripe_unit;
-+ put_lseg_locked(lseg);
-+ return size;
++ /* higher offset > lower offset */
++ d = l1->offset - l2->offset;
++ if (d)
++ return d;
++
++ /* longer length > shorter length */
++ d = l1->length - l2->length;
++ if (d)
++ return d;
++
++ /* read > read/write */
++ return (int)(l2->iomode == IOMODE_READ) -
++ (int)(l1->iomode == IOMODE_READ);
+}
+
-+/*
-+ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
-+ *
-+ * return 1 : coalesce page
-+ * return 0 : don't coalesce page
-+ */
-+int
-+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
-+ struct nfs_page *req)
++static void
++pnfs_insert_layout(struct pnfs_layout_hdr *lo,
++ struct pnfs_layout_segment *lseg)
+{
-+ u64 p_stripe, r_stripe;
++ struct pnfs_layout_segment *lp;
++ int found = 0;
+
-+ if (pgio->pg_boundary == 0)
-+ return 1;
-+ p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
-+ r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
++ dprintk("%s:Begin\n", __func__);
+
-+ do_div(p_stripe, pgio->pg_boundary);
-+ do_div(r_stripe, pgio->pg_boundary);
++ assert_spin_locked(&lo->inode->i_lock);
++ if (list_empty(&lo->segs)) {
++ struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
+
-+ return (p_stripe == r_stripe);
-+}
++ spin_lock(&clp->cl_lock);
++ BUG_ON(!list_empty(&lo->layouts));
++ list_add_tail(&lo->layouts, &clp->cl_layouts);
++ spin_unlock(&clp->cl_lock);
++ }
++ list_for_each_entry(lp, &lo->segs, fi_list) {
++ if (cmp_layout(&lp->range, &lseg->range) > 0)
++ continue;
++ list_add_tail(&lseg->fi_list, &lp->fi_list);
++ dprintk("%s: inserted lseg %p "
++ "iomode %d offset %llu length %llu before "
++ "lp %p iomode %d offset %llu length %llu\n",
++ __func__, lseg, lseg->range.iomode,
++ lseg->range.offset, lseg->range.length,
++ lp, lp->range.iomode, lp->range.offset,
++ lp->range.length);
++ found = 1;
++ break;
++ }
++ if (!found) {
++ list_add_tail(&lseg->fi_list, &lo->segs);
++ dprintk("%s: inserted lseg %p "
++ "iomode %d offset %llu length %llu at tail\n",
++ __func__, lseg, lseg->range.iomode,
++ lseg->range.offset, lseg->range.length);
++ }
++ get_layout_hdr_locked(lo);
+
-+static struct pnfs_layoutdriver_type filelayout_type = {
-+ .id = LAYOUT_NFSV4_1_FILES,
-+ .name = "LAYOUT_NFSV4_1_FILES",
-+ .owner = THIS_MODULE,
-+ .flags = PNFS_USE_RPC_CODE,
-+ .initialize_mountpoint = filelayout_initialize_mountpoint,
-+ .uninitialize_mountpoint = filelayout_uninitialize_mountpoint,
-+ .alloc_lseg = filelayout_alloc_lseg,
-+ .free_lseg = filelayout_free_lseg,
-+ .get_stripesize = filelayout_get_stripesize,
-+ .pg_test = filelayout_pg_test,
-+ .read_pagelist = filelayout_read_pagelist,
-+ .write_pagelist = filelayout_write_pagelist,
-+ .commit = filelayout_commit,
-+};
++ dprintk("%s:Return\n", __func__);
++}
+
-+static int __init nfs4filelayout_init(void)
++static struct pnfs_layout_hdr *
++alloc_init_layout_hdr(struct inode *ino)
+{
-+ printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
-+ __func__);
-+ return pnfs_register_layoutdriver(&filelayout_type);
++ struct pnfs_layout_hdr *lo;
++
++ lo = pnfs_alloc_layout_hdr(ino);
++ if (!lo)
++ return NULL;
++ lo->refcount = 1;
++ INIT_LIST_HEAD(&lo->layouts);
++ INIT_LIST_HEAD(&lo->segs);
++ seqlock_init(&lo->seqlock);
++ lo->inode = ino;
++ return lo;
+}
+
-+static void __exit nfs4filelayout_exit(void)
++static struct pnfs_layout_hdr *
++pnfs_find_alloc_layout(struct inode *ino)
+{
-+ printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
-+ __func__);
-+ pnfs_unregister_layoutdriver(&filelayout_type);
-+}
++ struct nfs_inode *nfsi = NFS_I(ino);
++ struct pnfs_layout_hdr *new = NULL;
+
-+module_init(nfs4filelayout_init);
-+module_exit(nfs4filelayout_exit);
-diff -up linux-2.6.35.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.35.noarch/fs/nfs/nfs4filelayoutdev.c
---- linux-2.6.35.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-09-30 12:25:08.309280000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/nfs4filelayoutdev.c 2010-09-30 12:25:08.311279000 -0400
-@@ -0,0 +1,627 @@
-+/*
-+ * Device operations for the pnfs nfs4 file layout driver.
-+ *
-+ * Copyright (c) 2002
-+ * The Regents of the University of Michigan
-+ * All Rights Reserved
-+ *
-+ * Dean Hildebrand <dhildebz at umich.edu>
-+ * Garth Goodson <Garth.Goodson at netapp.com>
-+ *
-+ * Permission is granted to use, copy, create derivative works, and
-+ * redistribute this software and such derivative works for any purpose,
-+ * so long as the name of the University of Michigan is not used in
-+ * any advertising or publicity pertaining to the use or distribution
-+ * of this software without specific, written prior authorization. If
-+ * the above copyright notice or any other identification of the
-+ * University of Michigan is included in any copy of any portion of
-+ * this software, then the disclaimer below must also be included.
-+ *
-+ * This software is provided as is, without representation or warranty
-+ * of any kind either express or implied, including without limitation
-+ * the implied warranties of merchantability, fitness for a particular
-+ * purpose, or noninfringement. The Regents of the University of
-+ * Michigan shall not be liable for any damages, including special,
-+ * indirect, incidental, or consequential damages, with respect to any
-+ * claim arising out of or in connection with the use of the software,
-+ * even if it has been or is hereafter advised of the possibility of
-+ * such damages.
-+ */
++ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
+
-+#include <linux/nfs_fs.h>
-+#include <linux/vmalloc.h>
++ assert_spin_locked(&ino->i_lock);
++ if (nfsi->layout)
++ return nfsi->layout;
+
-+#include "internal.h"
-+#include "nfs4filelayout.h"
++ spin_unlock(&ino->i_lock);
++ new = alloc_init_layout_hdr(ino);
++ spin_lock(&ino->i_lock);
+
-+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++ if (likely(nfsi->layout == NULL)) /* Won the race? */
++ nfsi->layout = new;
++ else
++ pnfs_free_layout_hdr(new);
++ return nfsi->layout;
++}
+
+/*
-+ * Data server cache
-+ *
-+ * Data servers can be mapped to different device ids.
-+ * nfs4_pnfs_ds reference counting
-+ * - set to 1 on allocation
-+ * - incremented when a device id maps a data server already in the cache.
-+ * - decremented when deviceid is removed from the cache.
++ * iomode matching rules:
++ * range lseg match
++ * ----- ----- -----
++ * ANY READ true
++ * ANY RW true
++ * RW READ false
++ * RW RW true
++ * READ READ true
++ * READ RW true
+ */
-+DEFINE_SPINLOCK(nfs4_ds_cache_lock);
-+static LIST_HEAD(nfs4_data_server_cache);
-+
-+/* Debug routines */
-+void
-+print_ds(struct nfs4_pnfs_ds *ds)
-+{
-+ if (ds == NULL) {
-+ printk("%s NULL device\n", __func__);
-+ return;
-+ }
-+ printk(" ip_addr %x port %hu\n"
-+ " ref count %d\n"
-+ " client %p\n"
-+ " cl_exchange_flags %x\n",
-+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
-+ atomic_read(&ds->ds_count), ds->ds_clp,
-+ ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
-+}
-+
-+void
-+print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
++static int
++is_matching_lseg(struct pnfs_layout_segment *lseg,
++ struct pnfs_layout_range *range)
+{
-+ int i;
-+
-+ ifdebug(FACILITY) {
-+ printk("%s dsaddr->ds_num %d\n", __func__,
-+ dsaddr->ds_num);
-+ for (i = 0; i < dsaddr->ds_num; i++)
-+ print_ds(dsaddr->ds_list[i]);
-+ }
-+}
++ struct pnfs_layout_range range1;
+
-+void print_deviceid(struct nfs4_deviceid *id)
-+{
-+ u32 *p = (u32 *)id;
++ if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) ||
++ !lo_seg_intersecting(&lseg->range, range))
++ return 0;
+
-+ dprintk("%s: device id= [%x%x%x%x]\n", __func__,
-+ p[0], p[1], p[2], p[3]);
++ /* range1 covers only the first byte in the range */
++ range1 = *range;
++ range1.length = 1;
++ return lo_seg_contained(&lseg->range, &range1);
+}
+
-+/* nfs4_ds_cache_lock is held */
-+static struct nfs4_pnfs_ds *
-+_data_server_lookup_locked(u32 ip_addr, u32 port)
++/*
++ * lookup range in layout
++ */
++struct pnfs_layout_segment *
++pnfs_has_layout(struct pnfs_layout_hdr *lo,
++ struct pnfs_layout_range *range)
+{
-+ struct nfs4_pnfs_ds *ds;
++ struct pnfs_layout_segment *lseg, *ret = NULL;
+
-+ dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
-+ ntohl(ip_addr), ntohs(port));
++ dprintk("%s:Begin\n", __func__);
+
-+ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
-+ if (ds->ds_ip_addr == ip_addr &&
-+ ds->ds_port == port) {
-+ return ds;
++ assert_spin_locked(&lo->inode->i_lock);
++ list_for_each_entry(lseg, &lo->segs, fi_list) {
++ if (is_matching_lseg(lseg, range)) {
++ ret = lseg;
++ break;
+ }
++ if (cmp_layout(range, &lseg->range) > 0)
++ break;
+ }
-+ return NULL;
++
++ dprintk("%s:Return lseg %p ref %d valid %d\n",
++ __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0,
++ ret ? ret->valid : 0);
++ return ret;
+}
+
-+/* Create an rpc to the data server defined in 'dev_list' */
-+static int
-+nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
++/*
++ * Layout segment is retrieved from the server if not cached.
++ * The appropriate layout segment is referenced and returned to the caller.
++ */
++struct pnfs_layout_segment *
++pnfs_update_layout(struct inode *ino,
++ struct nfs_open_context *ctx,
++ loff_t pos,
++ u64 count,
++ enum pnfs_iomode iomode)
+{
-+ struct nfs_server *tmp;
-+ struct sockaddr_in sin;
-+ struct rpc_clnt *mds_clnt = mds_srv->client;
-+ struct nfs_client *clp = mds_srv->nfs_client;
-+ struct sockaddr *mds_addr;
-+ int err = 0;
-+
-+ dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
-+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
-+ mds_clnt->cl_auth->au_flavor);
++ struct pnfs_layout_range arg = {
++ .iomode = iomode,
++ .offset = pos,
++ .length = count,
++ };
++ struct nfs_inode *nfsi = NFS_I(ino);
++ struct pnfs_layout_hdr *lo;
++ struct pnfs_layout_segment *lseg = NULL;
+
-+ sin.sin_family = AF_INET;
-+ sin.sin_addr.s_addr = ds->ds_ip_addr;
-+ sin.sin_port = ds->ds_port;
++ if (!pnfs_enabled_sb(NFS_SERVER(ino)))
++ return NULL;
++ spin_lock(&ino->i_lock);
++ lo = pnfs_find_alloc_layout(ino);
++ if (lo == NULL) {
++ dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
++ goto out_unlock;
++ }
+
-+ /*
-+ * If this DS is also the MDS, use the MDS session only if the
-+ * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role.
-+ */
-+ mds_addr = (struct sockaddr *)&clp->cl_addr;
-+ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) {
-+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
-+ printk(KERN_INFO
-+ "ip:port %x:%hu is not a pNFS Data Server\n",
-+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
-+ err = -ENODEV;
-+ } else {
-+ atomic_inc(&clp->cl_count);
-+ ds->ds_clp = clp;
-+ dprintk("%s Using MDS Session for DS\n", __func__);
++ /* Check to see if the layout for the given range already exists */
++ lseg = pnfs_has_layout(lo, &arg);
++ if (lseg) {
++ if (lseg->valid) {
++ dprintk("%s: Using cached lseg %p for %llu@%llu "
++ "iomode %d)\n",
++ __func__,
++ lseg,
++ arg.length,
++ arg.offset,
++ arg.iomode);
++ get_lseg(lseg);
++ goto out_unlock;
+ }
-+ goto out;
++ /* someone is cleaning the layout */
++ lseg = NULL;
+ }
+
-+ /* Temporay server for nfs4_set_client */
-+ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
-+ if (!tmp)
-+ goto out;
-+
-+ /*
-+ * Set a retrans, timeout interval, and authflavor equual to the MDS
-+ * values. Use the MDS nfs_client cl_ipaddr field so as to use the
-+ * same co_ownerid as the MDS.
-+ */
-+ err = nfs4_set_client(tmp,
-+ mds_srv->nfs_client->cl_hostname,
-+ (struct sockaddr *)&sin,
-+ sizeof(struct sockaddr),
-+ mds_srv->nfs_client->cl_ipaddr,
-+ mds_clnt->cl_auth->au_flavor,
-+ IPPROTO_TCP,
-+ mds_clnt->cl_xprt->timeout,
-+ 1 /* minorversion */);
-+ if (err < 0)
-+ goto out_free;
-+
-+ clp = tmp->nfs_client;
-+
-+ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */
-+ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp);
-+ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS;
-+
-+ err = nfs4_recover_expired_lease(clp);
-+ if (!err)
-+ err = nfs4_check_client_ready(clp);
-+ if (err)
-+ goto out_put;
-+
-+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) {
-+ printk(KERN_INFO "ip:port %x:%hu is not a pNFS Data Server\n",
-+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
-+ err = -ENODEV;
-+ goto out_put;
-+ }
-+ /*
-+ * Mask the (possibly) returned EXCHGID4_FLAG_USE_PNFS_MDS pNFS role
-+ * The is_ds_only_session depends on this.
-+ */
-+ clp->cl_exchange_flags &= ~EXCHGID4_FLAG_USE_PNFS_MDS;
-+ /*
-+ * Set DS lease equal to the MDS lease, renewal is scheduled in
-+ * create_session
-+ */
-+ spin_lock(&mds_srv->nfs_client->cl_lock);
-+ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
-+ spin_unlock(&mds_srv->nfs_client->cl_lock);
-+ clp->cl_last_renewal = jiffies;
++ /* if LAYOUTGET already failed once we don't try again */
++ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
++ goto out_unlock;
+
-+ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-+ ds->ds_clp = clp;
++ get_layout_hdr_locked(lo); /* Matched in pnfs_layoutget_release */
++ spin_unlock(&ino->i_lock);
+
-+ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__,
-+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
-+ clp->cl_rpcclient);
-+out_free:
-+ kfree(tmp);
++ lseg = send_layoutget(lo, ctx, &arg);
+out:
-+ dprintk("%s Returns %d\n", __func__, err);
-+ return err;
-+out_put:
-+ nfs_put_client(clp);
-+ goto out_free;
++ dprintk("%s end, state 0x%lx lseg %p\n", __func__,
++ nfsi->layout->state, lseg);
++ return lseg;
++out_unlock:
++ spin_unlock(&ino->i_lock);
++ goto out;
+}
+
-+static void
-+destroy_ds(struct nfs4_pnfs_ds *ds)
++int
++pnfs_layout_process(struct nfs4_layoutget *lgp)
+{
-+ dprintk("--> %s\n", __func__);
-+ ifdebug(FACILITY)
-+ print_ds(ds);
-+
-+ if (ds->ds_clp)
-+ nfs_put_client(ds->ds_clp);
-+ kfree(ds);
-+}
++ struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
++ struct nfs4_layoutget_res *res = &lgp->res;
++ struct pnfs_layout_segment *lseg;
++ struct inode *ino = lo->inode;
++ int status = 0;
+
-+static void
-+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
-+{
-+ struct nfs4_pnfs_ds *ds;
-+ int i;
++ /* Inject layout blob into I/O device driver */
++ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
++ if (!lseg || IS_ERR(lseg)) {
++ if (!lseg)
++ status = -ENOMEM;
++ else
++ status = PTR_ERR(lseg);
++ dprintk("%s: Could not allocate layout: error %d\n",
++ __func__, status);
++ goto out;
++ }
+
-+ print_deviceid(&dsaddr->deviceid.de_id);
++ spin_lock(&ino->i_lock);
++ init_lseg(lo, lseg);
++ lseg->range = res->range;
++ get_lseg(lseg);
++ *lgp->lsegpp = lseg;
++ pnfs_insert_layout(lo, lseg);
+
-+ for (i = 0; i < dsaddr->ds_num; i++) {
-+ ds = dsaddr->ds_list[i];
-+ if (ds != NULL) {
-+ if (atomic_dec_and_lock(&ds->ds_count,
-+ &nfs4_ds_cache_lock)) {
-+ list_del_init(&ds->ds_node);
-+ spin_unlock(&nfs4_ds_cache_lock);
-+ destroy_ds(ds);
-+ }
-+ }
++ if (res->return_on_close) {
++ /* FI: This needs to be re-examined. At lo level,
++ * all it needs is a bit indicating whether any of
++ * the lsegs in the list have the flags set.
++ */
++ lo->roc_iomode |= res->range.iomode;
+ }
-+ kfree(dsaddr->stripe_indices);
-+ kfree(dsaddr);
++
++ /* Done processing layoutget. Set the layout stateid */
++ pnfs_set_layout_stateid(lo, &res->stateid);
++ spin_unlock(&ino->i_lock);
++out:
++ return status;
+}
+
+void
-+nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset,
++ size_t *count)
+{
-+ struct nfs4_file_layout_dsaddr *dsaddr =
-+ container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
++ struct page *first, *last;
++ loff_t foff, i_size = i_size_read(inode);
++ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
++ size_t range;
+
-+ nfs4_fl_free_deviceid(dsaddr);
++ first = list_entry((pages)->prev, struct page, lru);
++ last = list_entry((pages)->next, struct page, lru);
++
++ foff = (loff_t)first->index << PAGE_CACHE_SHIFT;
++
++ range = (last->index - first->index) * PAGE_CACHE_SIZE;
++ if (last->index == end_index)
++ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
++ else
++ range += PAGE_CACHE_SIZE;
++ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff,
++ range);
++ *offset = foff;
++ *count = range;
+}
+
-+static struct nfs4_pnfs_ds *
-+nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
++void
++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
+{
-+ struct nfs4_pnfs_ds *tmp_ds, *ds;
++ struct pnfs_layout_hdr *lo;
++ struct pnfs_layoutdriver_type *ld;
+
-+ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL);
-+ if (!ds)
-+ goto out;
++ pgio->pg_test = NULL;
+
-+ spin_lock(&nfs4_ds_cache_lock);
-+ tmp_ds = _data_server_lookup_locked(ip_addr, port);
-+ if (tmp_ds == NULL) {
-+ ds->ds_ip_addr = ip_addr;
-+ ds->ds_port = port;
-+ atomic_set(&ds->ds_count, 1);
-+ INIT_LIST_HEAD(&ds->ds_node);
-+ ds->ds_clp = NULL;
-+ list_add(&ds->ds_node, &nfs4_data_server_cache);
-+ dprintk("%s add new data server ip 0x%x\n", __func__,
-+ ds->ds_ip_addr);
-+ } else {
-+ kfree(ds);
-+ atomic_inc(&tmp_ds->ds_count);
-+ dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
-+ __func__, tmp_ds->ds_ip_addr,
-+ atomic_read(&tmp_ds->ds_count));
-+ ds = tmp_ds;
-+ }
-+ spin_unlock(&nfs4_ds_cache_lock);
-+out:
-+ return ds;
++ lo = NFS_I(inode)->layout;
++ ld = NFS_SERVER(inode)->pnfs_curr_ld;
++ if (!ld || !lo)
++ return;
++
++ pgio->pg_test = ld->pg_test;
+}
+
+/*
-+ * Currently only support ipv4, and one multi-path address.
++ * rsize is already set by caller to MDS rsize.
+ */
-+static struct nfs4_pnfs_ds *
-+decode_and_add_ds(__be32 **pp, struct inode *inode)
++void
++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
++ struct inode *inode,
++ struct nfs_open_context *ctx,
++ struct list_head *pages,
++ size_t *rsize)
+{
-+ struct nfs4_pnfs_ds *ds = NULL;
-+ char *buf;
-+ const char *ipend, *pstr;
-+ u32 ip_addr, port;
-+ int nlen, rlen, i;
-+ int tmp[2];
-+ __be32 *r_netid, *r_addr, *p = *pp;
-+
-+ /* r_netid */
-+ nlen = be32_to_cpup(p++);
-+ r_netid = p;
-+ p += XDR_QUADLEN(nlen);
-+
-+ /* r_addr */
-+ rlen = be32_to_cpup(p++);
-+ r_addr = p;
-+ p += XDR_QUADLEN(rlen);
-+ *pp = p;
-+
-+ /* Check that netid is "tcp" */
-+ if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) {
-+ dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
-+ goto out_err;
-+ }
++ struct nfs_server *nfss = NFS_SERVER(inode);
++ size_t count = 0;
++ loff_t loff;
+
-+ /* ipv6 length plus port is legal */
-+ if (rlen > INET6_ADDRSTRLEN + 8) {
-+ dprintk("%s Invalid address, length %d\n", __func__,
-+ rlen);
-+ goto out_err;
-+ }
-+ buf = kmalloc(rlen + 1, GFP_KERNEL);
-+ buf[rlen] = '\0';
-+ memcpy(buf, r_addr, rlen);
++ pgio->pg_iswrite = 0;
++ pgio->pg_test = NULL;
++ pgio->pg_lseg = NULL;
+
-+ /* replace the port dots with dashes for the in4_pton() delimiter*/
-+ for (i = 0; i < 2; i++) {
-+ char *res = strrchr(buf, '.');
-+ *res = '-';
-+ }
++ if (!pnfs_enabled_sb(nfss))
++ return;
+
-+ /* Currently only support ipv4 address */
-+ if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
-+ dprintk("%s: Only ipv4 addresses supported\n", __func__);
-+ goto out_free;
++ readahead_range(inode, pages, &loff, &count);
++ pgio->pg_lseg = pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ);
++ if (pgio->pg_lseg) {
++ pnfs_set_pg_test(inode, pgio);
++ *rsize = NFS_SERVER(inode)->ds_rsize;
+ }
-+
-+ /* port */
-+ pstr = ipend;
-+ sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
-+ port = htons((tmp[0] << 8) | (tmp[1]));
-+
-+ ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
-+ dprintk("%s Decoded address and port %s\n", __func__, buf);
-+out_free:
-+ kfree(buf);
-+out_err:
-+ return ds;
+}
+
-+
-+
-+/*Decode opaque device data and return the result */
-+static struct nfs4_file_layout_dsaddr*
-+decode_device(struct inode *ino, struct pnfs_device *pdev)
++void
++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
++ size_t *wsize)
+{
-+ int i, dummy;
-+ u32 cnt, num;
-+ u8 *indexp;
-+ __be32 *p = (__be32 *)pdev->area, *indicesp;
-+ struct nfs4_file_layout_dsaddr *dsaddr;
-+
-+ /* Get the stripe count (number of stripe index) */
-+ cnt = be32_to_cpup(p++);
-+ dprintk("%s stripe count %d\n", __func__, cnt);
-+ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
-+ printk(KERN_WARNING "%s: stripe count %d greater than "
-+ "supported maximum %d\n", __func__,
-+ cnt, NFS4_PNFS_MAX_STRIPE_CNT);
-+ goto out_err;
-+ }
++ struct nfs_server *server = NFS_SERVER(inode);
+
-+ /* Check the multipath list count */
-+ indicesp = p;
-+ p += XDR_QUADLEN(cnt << 2);
-+ num = be32_to_cpup(p++);
-+ dprintk("%s ds_num %u\n", __func__, num);
-+ if (num > NFS4_PNFS_MAX_MULTI_CNT) {
-+ printk(KERN_WARNING "%s: multipath count %d greater than "
-+ "supported maximum %d\n", __func__,
-+ num, NFS4_PNFS_MAX_MULTI_CNT);
-+ goto out_err;
++ pgio->pg_iswrite = 1;
++ if (!pnfs_enabled_sb(server))
++ pgio->pg_test = NULL;
++ else {
++ pnfs_set_pg_test(inode, pgio);
++ *wsize = server->ds_wsize;
+ }
-+ dsaddr = kzalloc(sizeof(*dsaddr) +
-+ (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
-+ GFP_KERNEL);
-+ if (!dsaddr)
-+ goto out_err;
-+
-+ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
-+ if (!dsaddr->stripe_indices)
-+ goto out_err_free;
-+
-+ dsaddr->stripe_count = cnt;
-+ dsaddr->ds_num = num;
++}
+
-+ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
++/* Set buffer size for data servers */
++void
++pnfs_set_ds_iosize(struct nfs_server *server)
++{
++ unsigned dssize = 0;
+
-+ /* Go back an read stripe indices */
-+ p = indicesp;
-+ indexp = &dsaddr->stripe_indices[0];
-+ for (i = 0; i < dsaddr->stripe_count; i++) {
-+ *indexp = be32_to_cpup(p++);
-+ if (*indexp >= num)
-+ goto out_err_free;
-+ indexp++;
++ if (server->pnfs_curr_ld && server->pnfs_curr_ld->get_blocksize)
++ dssize = server->pnfs_curr_ld->get_blocksize();
++ if (dssize)
++ server->ds_rsize = server->ds_wsize =
++ nfs_block_size(dssize, NULL);
++ else {
++ server->ds_wsize = server->wsize;
++ server->ds_rsize = server->rsize;
+ }
-+ /* Skip already read multipath list count */
-+ p++;
-+
-+ for (i = 0; i < dsaddr->ds_num; i++) {
-+ int j;
++}
+
-+ dummy = be32_to_cpup(p++); /* multipath count */
-+ if (dummy > 1) {
-+ printk(KERN_WARNING
-+ "%s: Multipath count %d not supported, "
-+ "skipping all greater than 1\n", __func__,
-+ dummy);
-+ }
-+ for (j = 0; j < dummy; j++) {
-+ if (j == 0) {
-+ dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
-+ if (dsaddr->ds_list[i] == NULL)
-+ goto out_err_free;
-+ } else {
-+ u32 len;
-+ /* skip extra multipath */
-+ len = be32_to_cpup(p++);
-+ p += XDR_QUADLEN(len);
-+ len = be32_to_cpup(p++);
-+ p += XDR_QUADLEN(len);
-+ continue;
-+ }
-+ }
++static int
++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data)
++{
++ put_lseg(pdata->lseg);
++ pdata->lseg = NULL;
++ pdata->call_ops->rpc_call_done(task, data);
++ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN)
++ return -EAGAIN;
++ if (pdata->pnfsflags & PNFS_NO_RPC) {
++ pdata->call_ops->rpc_release(data);
++ } else {
++ /*
++ * just restore original rpc call ops
++ * rpc_release will be called later by the rpc scheduling layer.
++ */
++ task->tk_ops = pdata->call_ops;
+ }
-+ return dsaddr;
-+
-+out_err_free:
-+ nfs4_fl_free_deviceid(dsaddr);
-+out_err:
-+ dprintk("%s ERROR: returning NULL\n", __func__);
-+ return NULL;
++ return 0;
+}
+
-+/*
-+ * Decode the opaque device specified in 'dev'
-+ * and add it to the list of available devices.
-+ * If the deviceid is already cached, nfs4_add_deviceid will return
-+ * a pointer to the cached struct and throw away the new.
++/* Post-write completion function
++ * Invoked by all layout drivers when write_pagelist is done.
++ *
++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC
++ * so that the NFS cleanup routines perform only the page cache
++ * cleanup.
+ */
-+static struct nfs4_file_layout_dsaddr*
-+decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
++static void
++pnfs_write_retry(struct work_struct *work)
+{
-+ struct nfs4_file_layout_dsaddr *dsaddr;
-+ struct pnfs_deviceid_node *d;
-+
-+ dsaddr = decode_device(inode, dev);
-+ if (!dsaddr) {
-+ printk(KERN_WARNING "%s: Could not decode or add device\n",
-+ __func__);
-+ return NULL;
-+ }
-+
-+ d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
-+ &dsaddr->deviceid);
++ struct rpc_task *task;
++ struct nfs_write_data *wdata;
++ struct pnfs_layout_range range;
+
-+ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
++ dprintk("%s enter\n", __func__);
++ task = container_of(work, struct rpc_task, u.tk_work);
++ wdata = container_of(task, struct nfs_write_data, task);
++ range.iomode = IOMODE_RW;
++ range.offset = wdata->args.offset;
++ range.length = wdata->args.count;
++ _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true);
++ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode),
++ wdata->pdata.call_ops, wdata->pdata.how);
+}
+
-+/*
-+ * Retrieve the information for dev_id, add it to the list
-+ * of available devices, and return it.
-+ */
-+struct nfs4_file_layout_dsaddr *
-+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
++void
++pnfs_writeback_done(struct nfs_write_data *data)
+{
-+ struct pnfs_device *pdev = NULL;
-+ u32 max_resp_sz;
-+ int max_pages;
-+ struct page **pages = NULL;
-+ struct nfs4_file_layout_dsaddr *dsaddr = NULL;
-+ int rc, i;
-+ struct nfs_server *server = NFS_SERVER(inode);
++ struct pnfs_call_data *pdata = &data->pdata;
+
-+ /*
-+ * Use the session max response size as the basis for setting
-+ * GETDEVICEINFO's maxcount
-+ */
-+ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
-+ max_pages = max_resp_sz >> PAGE_SHIFT;
-+ dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
-+ __func__, inode, max_resp_sz, max_pages);
++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
+
-+ pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
-+ if (pdev == NULL)
-+ return NULL;
++ /* update last write offset and need layout commit
++ * for non-files layout types (files layout calls
++ * pnfs4_write_done for this)
++ */
++ if ((pdata->pnfsflags & PNFS_NO_RPC) &&
++ data->task.tk_status >= 0 && data->res.count > 0) {
++ struct nfs_inode *nfsi = NFS_I(data->inode);
+
-+ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
-+ if (pages == NULL) {
-+ kfree(pdev);
-+ return NULL;
-+ }
-+ for (i = 0; i < max_pages; i++) {
-+ pages[i] = alloc_page(GFP_KERNEL);
-+ if (!pages[i])
-+ goto out_free;
++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
++ pnfs_need_layoutcommit(nfsi, data->args.context);
+ }
+
-+ /* set pdev->area */
-+ pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
-+ if (!pdev->area)
-+ goto out_free;
-+
-+ memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
-+ pdev->layout_type = LAYOUT_NFSV4_1_FILES;
-+ pdev->pages = pages;
-+ pdev->pgbase = 0;
-+ pdev->pglen = PAGE_SIZE * max_pages;
-+ pdev->mincount = 0;
-+
-+ rc = nfs4_proc_getdeviceinfo(server, pdev);
-+ dprintk("%s getdevice info returns %d\n", __func__, rc);
-+ if (rc)
-+ goto out_free;
-+
-+ /*
-+ * Found new device, need to decode it and then add it to the
-+ * list of known devices for this mountpoint.
-+ */
-+ dsaddr = decode_and_add_device(inode, pdev);
-+out_free:
-+ if (pdev->area != NULL)
-+ vunmap(pdev->area);
-+ for (i = 0; i < max_pages; i++)
-+ __free_page(pages[i]);
-+ kfree(pages);
-+ kfree(pdev);
-+ dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
-+ return dsaddr;
++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
++ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry);
++ queue_work(nfsiod_workqueue, &data->task.u.tk_work);
++ }
+}
++EXPORT_SYMBOL_GPL(pnfs_writeback_done);
+
-+struct nfs4_file_layout_dsaddr *
-+nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
++static void _pnfs_clear_lseg_from_pages(struct list_head *head)
+{
-+ struct pnfs_deviceid_node *d;
++ struct nfs_page *req;
+
-+ d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
-+ return (d == NULL) ? NULL :
-+ container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
++ list_for_each_entry(req, head, wb_list) {
++ put_lseg(req->wb_lseg);
++ req->wb_lseg = NULL;
++ }
+}
+
+/*
-+ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
-+ * Then: ((res + fsi) % dsaddr->stripe_count)
++ * Call the appropriate parallel I/O subsystem write function.
++ * If no I/O device driver exists, or one does not match the returned
++ * fstype, then return a positive status for regular NFS processing.
++ *
++ * TODO: Is wdata->how and wdata->args.stable always the same value?
++ * TODO: It seems in NFS, the server may not do a stable write even
++ * though it was requested (and vice-versa?). To check, it looks
++ * in data->res.verf->committed. Do we need this ability
++ * for non-file layout drivers?
+ */
-+static inline u32
-+_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
++enum pnfs_try_status
++pnfs_try_to_write_data(struct nfs_write_data *wdata,
++ const struct rpc_call_ops *call_ops, int how)
+{
-+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
-+ u64 tmp;
++ struct inode *inode = wdata->inode;
++ enum pnfs_try_status trypnfs;
++ struct nfs_server *nfss = NFS_SERVER(inode);
++ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg;
+
-+ tmp = offset - flseg->pattern_offset;
-+ do_div(tmp, flseg->stripe_unit);
-+ tmp += flseg->first_stripe_index;
-+ return do_div(tmp, flseg->dsaddr->stripe_count);
-+}
++ wdata->pdata.call_ops = call_ops;
++ wdata->pdata.pnfs_error = 0;
++ wdata->pdata.how = how;
+
-+u32
-+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset)
-+{
-+ u32 j;
++ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
++ inode->i_ino, wdata->args.count, wdata->args.offset, how);
+
-+ j = _nfs4_fl_calc_j_index(lseg, offset);
-+ return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
-+}
++ get_lseg(lseg);
+
-+struct nfs_fh *
-+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset)
-+{
-+ struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
-+ u32 i;
++ if (!pnfs_use_rpc(nfss))
++ wdata->pdata.pnfsflags |= PNFS_NO_RPC;
++ wdata->pdata.lseg = lseg;
++ trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata,
++ nfs_page_array_len(wdata->args.pgbase, wdata->args.count),
++ how);
+
-+ if (flseg->stripe_type == STRIPE_SPARSE) {
-+ if (flseg->num_fh == 1)
-+ i = 0;
-+ else if (flseg->num_fh == 0)
-+ return NULL;
-+ else
-+ i = nfs4_fl_calc_ds_index(lseg, offset);
-+ } else
-+ i = _nfs4_fl_calc_j_index(lseg, offset);
-+ return flseg->fh_array[i];
++ if (trypnfs == PNFS_NOT_ATTEMPTED) {
++ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
++ wdata->pdata.lseg = NULL;
++ put_lseg(lseg);
++ _pnfs_clear_lseg_from_pages(&wdata->pages);
++ } else {
++ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
++ }
++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
++ return trypnfs;
+}
+
-+struct nfs4_pnfs_ds *
-+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
++/* Post-read completion function. Invoked by all layout drivers when
++ * read_pagelist is done
++ */
++static void
++pnfs_read_retry(struct work_struct *work)
+{
-+ struct nfs4_file_layout_dsaddr *dsaddr;
-+
-+ dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
-+ if (dsaddr->ds_list[ds_idx] == NULL) {
-+ printk(KERN_ERR "%s: No data server for device id!\n",
-+ __func__);
-+ return NULL;
-+ }
++ struct rpc_task *task;
++ struct nfs_read_data *rdata;
++ struct pnfs_layout_range range;
+
-+ if (!dsaddr->ds_list[ds_idx]->ds_clp) {
-+ int err;
++ dprintk("%s enter\n", __func__);
++ task = container_of(work, struct rpc_task, u.tk_work);
++ rdata = container_of(task, struct nfs_read_data, task);
++ range.iomode = IOMODE_RW;
++ range.offset = rdata->args.offset;
++ range.length = rdata->args.count;
++ _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true);
++ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode),
++ rdata->pdata.call_ops);
++}
+
-+ err = nfs4_pnfs_ds_create(NFS_SERVER(lseg->layout->inode),
-+ dsaddr->ds_list[ds_idx]);
-+ if (err) {
-+ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n",
-+ __func__, err);
-+ return NULL;
-+ }
++void
++pnfs_read_done(struct nfs_read_data *data)
++{
++ struct pnfs_call_data *pdata = &data->pdata;
++
++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
++
++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
++ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry);
++ queue_work(nfsiod_workqueue, &data->task.u.tk_work);
+ }
-+ return dsaddr->ds_list[ds_idx];
+}
-diff -up linux-2.6.35.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.35.noarch/fs/nfs/nfs4filelayout.h
---- linux-2.6.35.noarch/fs/nfs/nfs4filelayout.h.orig 2010-09-30 12:25:08.304283000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/nfs4filelayout.h 2010-09-30 12:25:08.306279000 -0400
-@@ -0,0 +1,100 @@
++EXPORT_SYMBOL_GPL(pnfs_read_done);
++
+/*
-+ * NFSv4 file layout driver data structures.
-+ *
-+ * Copyright (c) 2002
-+ * The Regents of the University of Michigan
-+ * All Rights Reserved
-+ *
-+ * Dean Hildebrand <dhildebz at umich.edu>
-+ *
-+ * Permission is granted to use, copy, create derivative works, and
-+ * redistribute this software and such derivative works for any purpose,
-+ * so long as the name of the University of Michigan is not used in
-+ * any advertising or publicity pertaining to the use or distribution
-+ * of this software without specific, written prior authorization. If
-+ * the above copyright notice or any other identification of the
-+ * University of Michigan is included in any copy of any portion of
-+ * this software, then the disclaimer below must also be included.
-+ *
-+ * This software is provided as is, without representation or warranty
-+ * of any kind either express or implied, including without limitation
-+ * the implied warranties of merchantability, fitness for a particular
-+ * purpose, or noninfringement. The Regents of the University of
-+ * Michigan shall not be liable for any damages, including special,
-+ * indirect, incidental, or consequential damages, with respect to any
-+ * claim arising out of or in connection with the use of the software,
-+ * even if it has been or is hereafter advised of the possibility of
-+ * such damages.
++ * Call the appropriate parallel I/O subsystem read function.
++ * If no I/O device driver exists, or one does not match the returned
++ * fstype, then return a positive status for regular NFS processing.
+ */
++enum pnfs_try_status
++pnfs_try_to_read_data(struct nfs_read_data *rdata,
++ const struct rpc_call_ops *call_ops)
++{
++ struct inode *inode = rdata->inode;
++ struct nfs_server *nfss = NFS_SERVER(inode);
++ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg;
++ enum pnfs_try_status trypnfs;
+
-+#ifndef FS_NFS_NFS4FILELAYOUT_H
-+#define FS_NFS_NFS4FILELAYOUT_H
++ rdata->pdata.call_ops = call_ops;
++ rdata->pdata.pnfs_error = 0;
+
-+#include "pnfs.h"
++ dprintk("%s: Reading ino:%lu %u@%llu\n",
++ __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
++
++ get_lseg(lseg);
++
++ if (!pnfs_use_rpc(nfss))
++ rdata->pdata.pnfsflags |= PNFS_NO_RPC;
++ rdata->pdata.lseg = lseg;
++ trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata,
++ nfs_page_array_len(rdata->args.pgbase, rdata->args.count));
++ if (trypnfs == PNFS_NOT_ATTEMPTED) {
++ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
++ rdata->pdata.lseg = NULL;
++ put_lseg(lseg);
++ _pnfs_clear_lseg_from_pages(&rdata->pages);
++ } else {
++ nfs_inc_stats(inode, NFSIOS_PNFS_READ);
++ }
++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
++ return trypnfs;
++}
+
+/*
-+ * Field testing shows we need to support upto 4096 stripe indices.
-+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
-+ * reasonable. This in turn means we support a maximum of 256
-+ * RFC 5661 multipath_list4 structures.
++ * This gives the layout driver an opportunity to read in pages "around"
++ * the data to be written. It returns 0 on success, otherwise an error code
++ * which will either be passed up to user, or ignored if
++ * some previous part of write succeeded.
++ * Note the range [pos, pos+len-1] is entirely within the page.
+ */
-+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
-+#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
-+
-+enum stripetype4 {
-+ STRIPE_SPARSE = 1,
-+ STRIPE_DENSE = 2
-+};
++int _pnfs_write_begin(struct inode *inode, struct page *page,
++ loff_t pos, unsigned len,
++ struct pnfs_layout_segment *lseg,
++ struct pnfs_fsdata **fsdata)
++{
++ struct pnfs_fsdata *data;
++ int status = 0;
+
-+/* Individual ip address */
-+struct nfs4_pnfs_ds {
-+ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
-+ u32 ds_ip_addr;
-+ u32 ds_port;
-+ struct nfs_client *ds_clp;
-+ atomic_t ds_count;
-+};
++ dprintk("--> %s: pos=%llu len=%u\n",
++ __func__, (unsigned long long)pos, len);
++ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
++ if (!data) {
++ status = -ENOMEM;
++ goto out;
++ }
++ data->lseg = lseg; /* refcount passed into data to be managed there */
++ status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin(
++ lseg, page, pos, len, data);
++ if (status) {
++ kfree(data);
++ data = NULL;
++ }
++out:
++ *fsdata = data;
++ dprintk("<-- %s: status=%d\n", __func__, status);
++ return status;
++}
+
-+struct nfs4_file_layout_dsaddr {
-+ struct pnfs_deviceid_node deviceid;
-+ u32 stripe_count;
-+ u8 *stripe_indices;
-+ u32 ds_num;
-+ struct nfs4_pnfs_ds *ds_list[1];
-+};
++/* pNFS Commit callback function for all layout drivers */
++void
++pnfs_commit_done(struct nfs_write_data *data)
++{
++ struct pnfs_call_data *pdata = &data->pdata;
+
-+struct nfs4_filelayout_segment {
-+ struct pnfs_layout_segment generic_hdr;
-+ u32 stripe_type;
-+ u32 commit_through_mds;
-+ u32 stripe_unit;
-+ u32 first_stripe_index;
-+ u64 pattern_offset;
-+ struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
-+ unsigned int num_fh;
-+ struct nfs_fh **fh_array;
-+};
++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
+
-+static inline struct nfs4_filelayout_segment *
-+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
-+{
-+ return container_of(lseg,
-+ struct nfs4_filelayout_segment,
-+ generic_hdr);
++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
++ struct pnfs_layout_range range = {
++ .iomode = IOMODE_RW,
++ .offset = data->args.offset,
++ .length = data->args.count,
++ };
++ dprintk("%s: retrying\n", __func__);
++ _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE,
++ true);
++ pnfs_initiate_commit(data, NFS_CLIENT(data->inode),
++ pdata->call_ops, pdata->how, 1);
++ }
+}
++EXPORT_SYMBOL_GPL(pnfs_commit_done);
+
-+extern struct nfs_fh *
-+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset);
++enum pnfs_try_status
++pnfs_try_to_commit(struct nfs_write_data *data,
++ const struct rpc_call_ops *call_ops, int sync)
++{
++ struct inode *inode = data->inode;
++ struct nfs_server *nfss = NFS_SERVER(data->inode);
++ enum pnfs_try_status trypnfs;
+
-+extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
-+extern void print_ds(struct nfs4_pnfs_ds *ds);
-+extern void print_deviceid(struct nfs4_deviceid *dev_id);
-+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset);
-+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
-+ u32 ds_idx);
-+extern struct nfs4_file_layout_dsaddr *
-+nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
-+struct nfs4_file_layout_dsaddr *
-+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
++ dprintk("%s: Begin\n", __func__);
+
-+#endif /* FS_NFS_NFS4FILELAYOUT_H */
-diff -up linux-2.6.35.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.35.noarch/fs/nfs/nfs4_fs.h
---- linux-2.6.35.noarch/fs/nfs/nfs4_fs.h.orig 2010-09-30 12:22:45.152042000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/nfs4_fs.h 2010-09-30 12:25:08.297278000 -0400
-@@ -46,6 +46,7 @@ enum nfs4_client_state {
- NFS4CLNT_DELEGRETURN,
- NFS4CLNT_SESSION_RESET,
- NFS4CLNT_RECALL_SLOT,
-+ NFS4CLNT_LAYOUT_RECALL,
- };
-
- enum nfs4_session_state {
-@@ -256,10 +257,12 @@ static inline struct nfs4_session *nfs4_
- }
-
- extern int nfs4_setup_sequence(const struct nfs_server *server,
-+ struct nfs4_session *ds_session,
- struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
- int cache_reply, struct rpc_task *task);
- extern void nfs4_destroy_session(struct nfs4_session *session);
- extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
-+extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *);
- extern int nfs4_proc_create_session(struct nfs_client *);
- extern int nfs4_proc_destroy_session(struct nfs4_session *);
- extern int nfs4_init_session(struct nfs_server *server);
-@@ -272,6 +275,7 @@ static inline struct nfs4_session *nfs4_
- }
-
- static inline int nfs4_setup_sequence(const struct nfs_server *server,
-+ struct nfs4_session *ds_session,
- struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
- int cache_reply, struct rpc_task *task)
- {
-@@ -289,7 +293,7 @@ extern const struct nfs4_minor_version_o
- extern const u32 nfs4_fattr_bitmap[2];
- extern const u32 nfs4_statfs_bitmap[2];
- extern const u32 nfs4_pathconf_bitmap[2];
--extern const u32 nfs4_fsinfo_bitmap[2];
-+extern const u32 nfs4_fsinfo_bitmap[3];
- extern const u32 nfs4_fs_locations_bitmap[2];
-
- /* nfs4renewd.c */
-@@ -299,13 +303,24 @@ extern void nfs4_kill_renewd(struct nfs_
- extern void nfs4_renew_state(struct work_struct *);
-
- /* nfs4state.c */
-+struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
- struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
- struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
- #if defined(CONFIG_NFS_V4_1)
--struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
- struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
- #endif /* CONFIG_NFS_V4_1 */
-
-+static inline struct rpc_cred *
-+nfs4_get_machine_cred(struct nfs_client *clp)
++ if (!pnfs_use_rpc(nfss))
++ data->pdata.pnfsflags |= PNFS_NO_RPC;
++ /* We need to account for possibility that
++ * each nfs_page can point to a different lseg (or be NULL).
++ * For the immediate case of whole-file-only layouts, we at
++ * least know there can be only a single lseg.
++ * We still have to account for the possibility of some being NULL.
++ * This will be done by passing the buck to the layout driver.
++ */
++ data->pdata.call_ops = call_ops;
++ data->pdata.pnfs_error = 0;
++ data->pdata.how = sync;
++ data->pdata.lseg = NULL;
++ trypnfs = nfss->pnfs_curr_ld->commit(data, sync);
++ if (trypnfs == PNFS_NOT_ATTEMPTED) {
++ data->pdata.pnfsflags &= ~PNFS_NO_RPC;
++ _pnfs_clear_lseg_from_pages(&data->pages);
++ } else
++ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT);
++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
++ return trypnfs;
++}
++
++void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
+{
-+ struct rpc_cred *cred;
++ struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+
-+ spin_lock(&clp->cl_lock);
-+ cred = nfs4_get_machine_cred_locked(clp);
-+ spin_unlock(&clp->cl_lock);
-+ return cred;
++ /* TODO: Maybe we should avoid this by allowing the layout driver
++ * to directly xdr its layout on the wire.
++ */
++ if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
++ nfss->pnfs_curr_ld->cleanup_layoutcommit(
++ NFS_I(data->args.inode)->layout,
++ &data->args, data->status);
+}
+
- extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
- extern void nfs4_put_state_owner(struct nfs4_state_owner *);
- extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
-diff -up linux-2.6.35.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.35.noarch/fs/nfs/nfs4proc.c
---- linux-2.6.35.noarch/fs/nfs/nfs4proc.c.orig 2010-09-30 12:22:45.160048000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/nfs4proc.c 2010-09-30 12:25:08.320280000 -0400
-@@ -55,6 +55,7 @@
- #include "internal.h"
- #include "iostat.h"
- #include "callback.h"
-+#include "pnfs.h"
-
- #define NFSDBG_FACILITY NFSDBG_PROC
-
-@@ -67,7 +68,7 @@ struct nfs4_opendata;
- static int _nfs4_proc_open(struct nfs4_opendata *data);
- static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
- static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
--static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
-+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *);
- static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
- static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
- static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
-@@ -125,11 +126,12 @@ const u32 nfs4_pathconf_bitmap[2] = {
- 0
- };
-
--const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
-+const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
- | FATTR4_WORD0_MAXREAD
- | FATTR4_WORD0_MAXWRITE
- | FATTR4_WORD0_LEASE_TIME,
-- 0
-+ FATTR4_WORD1_FS_LAYOUT_TYPES,
-+ FATTR4_WORD2_LAYOUT_BLKSIZE
- };
-
- const u32 nfs4_fs_locations_bitmap[2] = {
-@@ -562,6 +564,7 @@ static int nfs41_setup_sequence(struct n
- }
-
- int nfs4_setup_sequence(const struct nfs_server *server,
-+ struct nfs4_session *ds_session,
- struct nfs4_sequence_args *args,
- struct nfs4_sequence_res *res,
- int cache_reply,
-@@ -570,6 +573,8 @@ int nfs4_setup_sequence(const struct nfs
- struct nfs4_session *session = nfs4_get_session(server);
- int ret = 0;
-
-+ if (ds_session)
-+ session = ds_session;
- if (session == NULL) {
- args->sa_session = NULL;
- res->sr_session = NULL;
-@@ -599,7 +604,7 @@ static void nfs41_call_sync_prepare(stru
-
- dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
-
-- if (nfs4_setup_sequence(data->seq_server, data->seq_args,
-+ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args,
- data->seq_res, data->cache_reply, task))
- return;
- rpc_call_start(task);
-@@ -1378,7 +1383,7 @@ static void nfs4_open_prepare(struct rpc
- nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
- }
- data->timestamp = jiffies;
-- if (nfs4_setup_sequence(data->o_arg.server,
-+ if (nfs4_setup_sequence(data->o_arg.server, NULL,
- &data->o_arg.seq_args,
- &data->o_res.seq_res, 1, task))
- return;
-@@ -1553,9 +1558,8 @@ static int _nfs4_proc_open(struct nfs4_o
- return 0;
- }
-
--static int nfs4_recover_expired_lease(struct nfs_server *server)
-+int nfs4_recover_expired_lease(struct nfs_client *clp)
- {
-- struct nfs_client *clp = server->nfs_client;
- unsigned int loop;
- int ret;
-
-@@ -1571,6 +1575,7 @@ static int nfs4_recover_expired_lease(st
- }
- return ret;
- }
-+EXPORT_SYMBOL(nfs4_recover_expired_lease);
-
- /*
- * OPEN_EXPIRED:
-@@ -1660,7 +1665,7 @@ static int _nfs4_do_open(struct inode *d
- dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
- goto out_err;
- }
-- status = nfs4_recover_expired_lease(server);
-+ status = nfs4_recover_expired_lease(server->nfs_client);
- if (status != 0)
- goto err_put_state_owner;
- if (path->dentry->d_inode != NULL)
-@@ -1871,7 +1876,7 @@ static void nfs4_close_done(struct rpc_t
- if (calldata->arg.fmode == 0)
- break;
- default:
-- if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
-+ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
- rpc_restart_call_prepare(task);
- }
- nfs_release_seqid(calldata->arg.seqid);
-@@ -1916,7 +1921,7 @@ static void nfs4_close_prepare(struct rp
-
- nfs_fattr_init(calldata->res.fattr);
- calldata->timestamp = jiffies;
-- if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
-+ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL,
- &calldata->arg.seq_args, &calldata->res.seq_res,
- 1, task))
- return;
-@@ -2337,6 +2342,9 @@ nfs4_proc_setattr(struct dentry *dentry,
- struct nfs4_state *state = NULL;
- int status;
-
-+ if (pnfs_ld_layoutret_on_setattr(inode))
-+ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true);
++/*
++ * Set up the argument/result storage required for the RPC call.
++ */
++static int
++pnfs_layoutcommit_setup(struct inode *inode,
++ struct nfs4_layoutcommit_data *data,
++ loff_t write_begin_pos, loff_t write_end_pos)
++{
++ struct nfs_server *nfss = NFS_SERVER(inode);
++ int result = 0;
+
- nfs_fattr_init(fattr);
-
- /* Search for an existing open(O_WRITE) file */
-@@ -2664,7 +2672,7 @@ static int nfs4_proc_unlink_done(struct
-
- if (!nfs4_sequence_done(task, &res->seq_res))
- return 0;
-- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
-+ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN)
- return 0;
- update_changeattr(dir, &res->cinfo);
- nfs_post_op_update_inode(dir, res->dir_attr);
-@@ -3105,19 +3113,31 @@ static int nfs4_proc_pathconf(struct nfs
- static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
- {
- struct nfs_server *server = NFS_SERVER(data->inode);
-+ struct nfs_client *client = server->nfs_client;
-
- dprintk("--> %s\n", __func__);
-
-+#ifdef CONFIG_NFS_V4_1
-+ if (data->pdata.pnfsflags & PNFS_NO_RPC)
-+ return 0;
++ dprintk("--> %s\n", __func__);
+
-+ /* Is this a DS session */
-+ if (data->fldata.ds_nfs_client) {
-+ dprintk("%s DS read\n", __func__);
-+ client = data->fldata.ds_nfs_client;
-+ }
-+#endif /* CONFIG_NFS_V4_1 */
++ data->args.inode = inode;
++ data->args.fh = NFS_FH(inode);
++ data->args.layout_type = nfss->pnfs_curr_ld->id;
++ data->res.fattr = &data->fattr;
++ nfs_fattr_init(&data->fattr);
+
- if (!nfs4_sequence_done(task, &data->res.seq_res))
- return -EAGAIN;
-
-- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
-- nfs_restart_rpc(task, server->nfs_client);
-+ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
-+ nfs_restart_rpc(task, client);
- return -EAGAIN;
- }
-
- nfs_invalidate_atime(data->inode);
-- if (task->tk_status > 0)
-+ if (task->tk_status > 0 && client == server->nfs_client)
- renew_lease(server, data->timestamp);
- return 0;
- }
-@@ -3128,20 +3148,56 @@ static void nfs4_proc_read_setup(struct
- msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
- }
-
-+static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data)
-+{
-+#ifdef CONFIG_NFS_V4_1
-+ pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
-+ pnfs_need_layoutcommit(nfsi, data->args.context);
-+#endif /* CONFIG_NFS_V4_1 */
++ /* TODO: Need to determine the correct values */
++ data->args.time_modify_changed = 0;
++
++ /* Set values from inode so it can be reset
++ */
++ data->args.range.iomode = IOMODE_RW;
++ data->args.range.offset = write_begin_pos;
++ data->args.range.length = write_end_pos - write_begin_pos + 1;
++ data->args.lastbytewritten = min(write_end_pos,
++ i_size_read(inode) - 1);
++ data->args.bitmask = nfss->attr_bitmask;
++ data->res.server = nfss;
++
++ /* Call layout driver to set the arguments */
++ if (nfss->pnfs_curr_ld->setup_layoutcommit)
++ result = nfss->pnfs_curr_ld->setup_layoutcommit(
++ NFS_I(inode)->layout, &data->args);
++
++ dprintk("<-- %s Status %d\n", __func__, result);
++ return result;
+}
+
- static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
- {
- struct inode *inode = data->inode;
--
-+ struct nfs_server *server = NFS_SERVER(inode);
-+ struct nfs_client *client = server->nfs_client;
++/* Issue an async layoutcommit for an inode.
++ */
++int
++pnfs_layoutcommit_inode(struct inode *inode, int sync)
++{
++ struct nfs4_layoutcommit_data *data;
++ struct nfs_inode *nfsi = NFS_I(inode);
++ loff_t write_begin_pos;
++ loff_t write_end_pos;
+
- if (!nfs4_sequence_done(task, &data->res.seq_res))
- return -EAGAIN;
-
-- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
-- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
-+#ifdef CONFIG_NFS_V4_1
-+ /* restore original count after retry? */
-+ if (data->pdata.orig_count) {
-+ dprintk("%s: restoring original count %u\n", __func__,
-+ data->pdata.orig_count);
-+ data->args.count = data->pdata.orig_count;
++ int status = 0;
++
++ dprintk("%s Begin (sync:%d)\n", __func__, sync);
++
++ BUG_ON(!has_layout(nfsi));
++
++ data = kzalloc(sizeof(*data), GFP_NOFS);
++ if (!data)
++ return -ENOMEM;
++
++ spin_lock(&inode->i_lock);
++ if (!layoutcommit_needed(nfsi)) {
++ spin_unlock(&inode->i_lock);
++ goto out_free;
+ }
+
-+ if (data->pdata.pnfsflags & PNFS_NO_RPC)
-+ return 0;
++ /* Clear layoutcommit properties in the inode so
++ * new lc info can be generated
++ */
++ write_begin_pos = nfsi->layout->write_begin_pos;
++ write_end_pos = nfsi->layout->write_end_pos;
++ data->cred = nfsi->layout->cred;
++ nfsi->layout->write_begin_pos = 0;
++ nfsi->layout->write_end_pos = 0;
++ nfsi->layout->cred = NULL;
++ __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state);
++ pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout, NULL);
+
-+ /* Is this a DS session */
-+ if (data->fldata.ds_nfs_client) {
-+ dprintk("%s DS write\n", __func__);
-+ client = data->fldata.ds_nfs_client;
-+ }
-+#endif /* CONFIG_NFS_V4_1 */
++ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */
++ get_layout_hdr_locked(NFS_I(inode)->layout);
+
-+ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
-+ nfs_restart_rpc(task, client);
- return -EAGAIN;
- }
++ spin_unlock(&inode->i_lock);
+
-+ /*
-+ * MDS write: renew lease
-+ * DS write: update lastbyte written, mark for layout commit
-+ */
- if (task->tk_status >= 0) {
-- renew_lease(NFS_SERVER(inode), data->timestamp);
-- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
-+ if (client == server->nfs_client) {
-+ renew_lease(server, data->timestamp);
-+ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
-+ } else
-+ pnfs4_update_write_done(NFS_I(inode), data);
- }
- return 0;
- }
-@@ -3154,21 +3210,42 @@ static void nfs4_proc_write_setup(struct
- data->res.server = server;
- data->timestamp = jiffies;
-
-+#ifdef CONFIG_NFS_V4_1
-+ /* writes to DS use pnfs vector */
-+ if (data->fldata.ds_nfs_client) {
-+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE];
-+ return;
++ /* Set up layout commit args */
++ status = pnfs_layoutcommit_setup(inode, data, write_begin_pos,
++ write_end_pos);
++ if (status) {
++ /* The layout driver failed to setup the layoutcommit */
++ put_rpccred(data->cred);
++ put_layout_hdr(inode);
++ goto out_free;
+ }
-+#endif /* CONFIG_NFS_V4_1 */
- msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
- }
-
- static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
- {
- struct inode *inode = data->inode;
--
-+ struct nfs_server *server = NFS_SERVER(data->inode);
-+ struct nfs_client *client = server->nfs_client;
++ status = nfs4_proc_layoutcommit(data, sync);
++out:
++ dprintk("%s end (err:%d)\n", __func__, status);
++ return status;
++out_free:
++ kfree(data);
++ goto out;
++}
+
-+#ifdef CONFIG_NFS_V4_1
-+ if (data->pdata.pnfsflags & PNFS_NO_RPC)
-+ return 0;
++/*
++ * Release a struct pnfs_fsdata.
++ *
++ * Only the container itself is freed; fsdata->lseg is not touched here
++ * (see the note in the body).
++ */
++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
++{
++ /* lseg refcounting handled directly in nfs_write_end */
++ kfree(fsdata);
++}
+
-+ /* Is this a DS session */
-+ if (data->fldata.ds_nfs_client) {
-+ dprintk("%s DS commit\n", __func__);
-+ client = data->fldata.ds_nfs_client;
-+ }
-+#endif /* CONFIG_NFS_V4_1 */
++/*
++ * Device ID cache. Currently supports one layout type per struct nfs_client.
++ * Add layout type to the lookup key to expand to support multiple types.
++ */
++int
++pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
++ void (*free_callback)(struct pnfs_deviceid_node *))
++{
++ struct pnfs_deviceid_cache *c;
+
- if (!nfs4_sequence_done(task, &data->res.seq_res))
- return -EAGAIN;
-
-- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
-+ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) {
- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
- return -EAGAIN;
- }
-- nfs_refresh_inode(inode, data->res.fattr);
-+ if (client == server->nfs_client)
-+ nfs_refresh_inode(inode, data->res.fattr);
- return 0;
- }
-
-@@ -3178,6 +3255,12 @@ static void nfs4_proc_commit_setup(struc
-
- data->args.bitmask = server->cache_consistency_bitmask;
- data->res.server = server;
-+#if defined(CONFIG_NFS_V4_1)
-+ if (data->fldata.ds_nfs_client) {
-+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT];
-+ return;
++ c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
++ if (!c)
++ return -ENOMEM;
++ spin_lock(&clp->cl_lock);
++ if (clp->cl_devid_cache != NULL) {
++ atomic_inc(&clp->cl_devid_cache->dc_ref);
++ dprintk("%s [kref [%d]]\n", __func__,
++ atomic_read(&clp->cl_devid_cache->dc_ref));
++ kfree(c);
++ } else {
++ /* kzalloc initializes hlists */
++ spin_lock_init(&c->dc_lock);
++ atomic_set(&c->dc_ref, 1);
++ c->dc_free_callback = free_callback;
++ clp->cl_devid_cache = c;
++ dprintk("%s [new]\n", __func__);
+ }
-+#endif /* CONFIG_NFS_V4_1 */
- msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
- }
-
-@@ -3475,9 +3558,10 @@ static int nfs4_proc_set_acl(struct inod
- }
-
- static int
--nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
-+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp)
- {
-- struct nfs_client *clp = server->nfs_client;
-+ if (!clp)
-+ clp = server->nfs_client;
-
- if (task->tk_status >= 0)
- return 0;
-@@ -3504,14 +3588,16 @@ nfs4_async_handle_error(struct rpc_task
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- case -NFS4ERR_SEQ_FALSE_RETRY:
- case -NFS4ERR_SEQ_MISORDERED:
-- dprintk("%s ERROR %d, Reset session\n", __func__,
-- task->tk_status);
-+ dprintk("%s ERROR %d, Reset session. Exchangeid "
-+ "flags 0x%x\n", __func__, task->tk_status,
-+ clp->cl_exchange_flags);
- nfs4_schedule_state_recovery(clp);
- task->tk_status = 0;
- return -EAGAIN;
- #endif /* CONFIG_NFS_V4_1 */
- case -NFS4ERR_DELAY:
-- nfs_inc_server_stats(server, NFSIOS_DELAY);
-+ if (server)
-+ nfs_inc_server_stats(server, NFSIOS_DELAY);
- case -NFS4ERR_GRACE:
- case -EKEYEXPIRED:
- rpc_delay(task, NFS4_POLL_RETRY_MAX);
-@@ -3524,6 +3610,8 @@ nfs4_async_handle_error(struct rpc_task
- task->tk_status = nfs4_map_errors(task->tk_status);
- return 0;
- do_state_recovery:
-+ if (is_ds_only_client(clp))
-+ return 0;
- rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
- nfs4_schedule_state_recovery(clp);
- if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
-@@ -3657,8 +3745,8 @@ static void nfs4_delegreturn_done(struct
- renew_lease(data->res.server, data->timestamp);
- break;
- default:
-- if (nfs4_async_handle_error(task, data->res.server, NULL) ==
-- -EAGAIN) {
-+ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL)
-+ == -EAGAIN) {
- nfs_restart_rpc(task, data->res.server->nfs_client);
- return;
- }
-@@ -3678,7 +3766,7 @@ static void nfs4_delegreturn_prepare(str
-
- d_data = (struct nfs4_delegreturndata *)data;
-
-- if (nfs4_setup_sequence(d_data->res.server,
-+ if (nfs4_setup_sequence(d_data->res.server, NULL,
- &d_data->args.seq_args,
- &d_data->res.seq_res, 1, task))
- return;
-@@ -3913,7 +4001,7 @@ static void nfs4_locku_done(struct rpc_t
- case -NFS4ERR_EXPIRED:
- break;
- default:
-- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
-+ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN)
- nfs_restart_rpc(task,
- calldata->server->nfs_client);
- }
-@@ -3931,7 +4019,7 @@ static void nfs4_locku_prepare(struct rp
- return;
- }
- calldata->timestamp = jiffies;
-- if (nfs4_setup_sequence(calldata->server,
-+ if (nfs4_setup_sequence(calldata->server, NULL,
- &calldata->arg.seq_args,
- &calldata->res.seq_res, 1, task))
- return;
-@@ -4086,7 +4174,7 @@ static void nfs4_lock_prepare(struct rpc
- } else
- data->arg.new_lock_owner = 0;
- data->timestamp = jiffies;
-- if (nfs4_setup_sequence(data->server,
-+ if (nfs4_setup_sequence(data->server, NULL,
- &data->arg.seq_args,
- &data->res.seq_res, 1, task))
- return;
-@@ -4557,7 +4645,7 @@ int nfs4_proc_exchange_id(struct nfs_cli
- nfs4_verifier verifier;
- struct nfs41_exchange_id_args args = {
- .client = clp,
-- .flags = clp->cl_exchange_flags,
-+ .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R,
- };
- struct nfs41_exchange_id_res res = {
- .client = clp,
-@@ -5081,7 +5169,7 @@ int nfs4_init_session(struct nfs_server
- session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
- session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
-
-- ret = nfs4_recover_expired_lease(server);
-+ ret = nfs4_recover_expired_lease(server->nfs_client);
- if (!ret)
- ret = nfs4_check_client_ready(clp);
- return ret;
-@@ -5333,6 +5421,448 @@ out:
- dprintk("<-- %s status=%d\n", __func__, status);
- return status;
- }
++ spin_unlock(&clp->cl_lock);
++ return 0;
++}
++EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
++
++/* Must be called with locked c->dc_lock */
++static struct pnfs_deviceid_node *
++pnfs_unhash_deviceid(struct pnfs_deviceid_cache *c,
++ struct nfs4_deviceid *id)
++{
++ struct pnfs_deviceid_node *d;
++ struct hlist_node *n;
++ long h = nfs4_deviceid_hash(id);
++
++ dprintk("%s hash %ld\n", __func__, h);
++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
++ if (!memcmp(&d->de_id, id, sizeof(*id))) {
++ hlist_del_rcu(&d->de_node);
++ synchronize_rcu();
++ return d;
++ }
+
-+static void
-+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
-+{
-+ struct nfs4_layoutget *lgp = calldata;
-+ struct inode *ino = lgp->args.inode;
-+ struct nfs_inode *nfsi = NFS_I(ino);
-+ struct nfs_server *server = NFS_SERVER(ino);
-+ struct pnfs_layout_segment *lseg;
++ return NULL;
++}
+
-+ dprintk("--> %s\n", __func__);
-+ spin_lock(&ino->i_lock);
-+ lseg = pnfs_has_layout(nfsi->layout, &lgp->args.range);
-+ if (likely(!lseg)) {
-+ spin_unlock(&ino->i_lock);
-+ dprintk("%s: no lseg found, proceeding\n", __func__);
-+ if (!nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
-+ &lgp->res.seq_res, 0, task))
-+ rpc_call_start(task);
++/*
++ * Called from pnfs_layoutdriver_type->free_lseg
++ * last layout segment reference frees deviceid
++ *
++ * Drops one reference on @devid.  When that was the last reference the
++ * node is unhashed from @c under c->dc_lock and then handed to
++ * c->dc_free_callback; otherwise nothing else happens.
++ *
++ * NOTE(review): pnfs_unhash_deviceid() calls synchronize_rcu() and is
++ * invoked here while the c->dc_lock spinlock is held.  synchronize_rcu()
++ * can block, so this ordering should be reviewed.
++ */
++void
++pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
++ struct pnfs_deviceid_node *devid)
++{
++ dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
++ /* Takes c->dc_lock only if the count dropped to zero */
++ if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
+ return;
++
++ pnfs_unhash_deviceid(c, &devid->de_id);
++ spin_unlock(&c->dc_lock);
++
++ /* Release the node through the callback registered for this cache */
++ c->dc_free_callback(devid);
++}
++EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
++
++/*
++ * Remove the device ID @id from cache @c and drop one reference on its node.
++ *
++ * The node is unhashed while holding c->dc_lock.  If the reference dropped
++ * here was the last one, the node is released through c->dc_free_callback.
++ * An @id that is not (or is no longer) present in the cache is ignored.
++ */
++void
++pnfs_delete_deviceid(struct pnfs_deviceid_cache *c,
++		     struct nfs4_deviceid *id)
++{
++	struct pnfs_deviceid_node *devid;
++
++	spin_lock(&c->dc_lock);
++	devid = pnfs_unhash_deviceid(c, id);
++	spin_unlock(&c->dc_lock);
++
++	/*
++	 * pnfs_unhash_deviceid() returns NULL when no node matches @id;
++	 * there is nothing to unreference in that case.
++	 */
++	if (!devid)
++		return;
++
++	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
++	if (atomic_dec_and_test(&devid->de_ref))
++		c->dc_free_callback(devid);
++}
++EXPORT_SYMBOL_GPL(pnfs_delete_deviceid);
++
++/* Find and reference a deviceid */
++struct pnfs_deviceid_node *
++pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
++{
++ struct pnfs_deviceid_node *d;
++ struct hlist_node *n;
++ long hash = nfs4_deviceid_hash(id);
++
++ dprintk("--> %s hash %ld\n", __func__, hash);
++ rcu_read_lock();
++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
++ if (!memcmp(&d->de_id, id, sizeof(*id))) {
++ if (!atomic_inc_not_zero(&d->de_ref)) {
++ goto fail;
++ } else {
++ rcu_read_unlock();
++ return d;
++ }
++ }
+ }
-+ if (!lseg->valid) {
-+ put_lseg_locked(lseg);
-+ spin_unlock(&ino->i_lock);
-+ dprintk("%s: invalid lseg found, waiting\n", __func__);
-+ rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
-+ return;
++fail:
++ rcu_read_unlock();
++ return NULL;
++}
++EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
++
++/*
++ * Add a deviceid to the cache.
++ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
++ */
++struct pnfs_deviceid_node *
++pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
++{
++ struct pnfs_deviceid_node *d;
++ long hash = nfs4_deviceid_hash(&new->de_id);
++
++ dprintk("--> %s hash %ld\n", __func__, hash);
++ spin_lock(&c->dc_lock);
++ d = pnfs_find_get_deviceid(c, &new->de_id);
++ if (d) {
++ spin_unlock(&c->dc_lock);
++ dprintk("%s [discard]\n", __func__);
++ c->dc_free_callback(new);
++ return d;
+ }
-+ *lgp->lsegpp = lseg;
-+ spin_unlock(&ino->i_lock);
-+ dprintk("%s: valid lseg found, no rpc required\n", __func__);
-+ rpc_exit(task, NFS4_OK);
++ INIT_HLIST_NODE(&new->de_node);
++ atomic_set(&new->de_ref, 1);
++ hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
++ spin_unlock(&c->dc_lock);
++ dprintk("%s [new]\n", __func__);
++ return new;
+}
++EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
+
-+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
++void
++pnfs_put_deviceid_cache(struct nfs_client *clp)
+{
-+ struct nfs4_layoutget *lgp = calldata;
++ struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
+
-+ dprintk("--> %s\n", __func__);
++ dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
++ if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
++ int i;
++ /* Verify cache is empty */
++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
++ BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
++ clp->cl_devid_cache = NULL;
++ spin_unlock(&clp->cl_lock);
++ kfree(local);
++ }
++}
++EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
+diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
+new file mode 100644
+index 0000000..0e90b0e
+--- /dev/null
++++ b/fs/nfs/pnfs.h
+@@ -0,0 +1,595 @@
++/*
++ * pNFS client data structures.
++ *
++ * Copyright (c) 2002
++ * The Regents of the University of Michigan
++ * All Rights Reserved
++ *
++ * Dean Hildebrand <dhildebz at umich.edu>
++ *
++ * Permission is granted to use, copy, create derivative works, and
++ * redistribute this software and such derivative works for any purpose,
++ * so long as the name of the University of Michigan is not used in
++ * any advertising or publicity pertaining to the use or distribution
++ * of this software without specific, written prior authorization. If
++ * the above copyright notice or any other identification of the
++ * University of Michigan is included in any copy of any portion of
++ * this software, then the disclaimer below must also be included.
++ *
++ * This software is provided as is, without representation or warranty
++ * of any kind either express or implied, including without limitation
++ * the implied warranties of merchantability, fitness for a particular
++ * purpose, or noninfringement. The Regents of the University of
++ * Michigan shall not be liable for any damages, including special,
++ * indirect, incidental, or consequential damages, with respect to any
++ * claim arising out of or in connection with the use of the software,
++ * even if it has been or is hereafter advised of the possibility of
++ * such damages.
++ */
+
-+ if (!nfs4_sequence_done(task, &lgp->res.seq_res))
-+ return;
++#ifndef FS_NFS_PNFS_H
++#define FS_NFS_PNFS_H
++
++#include <linux/nfs_page.h>
++
++/*
++ * A single layout segment covering @range.
++ *
++ * NOTE(review): @kref is presumably the count manipulated by get_lseg() /
++ * put_lseg(), and @fi_list presumably links the segment on
++ * pnfs_layout_hdr.segs -- confirm against pnfs.c.
++ */
++struct pnfs_layout_segment {
++ struct list_head fi_list;
++ struct pnfs_layout_range range;
++ struct kref kref;
++ bool valid;
++ struct pnfs_layout_hdr *layout;
++};
++
++/*
++ * Return type of the layout driver I/O hooks (read_pagelist, write_pagelist,
++ * commit) and of the pnfs_try_to_*() helpers.
++ *
++ * Per the note on read/write pagelist in struct pnfs_layoutdriver_type:
++ * 0 means the layout code has taken control of the I/O, 1 means it wishes
++ * to fall back to normal NFS.
++ */
++enum pnfs_try_status {
++ PNFS_ATTEMPTED = 0,
++ PNFS_NOT_ATTEMPTED = 1,
++};
++
++/*
++ * Per-write state passed to the layout driver's write_begin and
++ * write_end_cleanup hooks; released with pnfs_free_fsdata().
++ */
++struct pnfs_fsdata {
++ struct pnfs_layout_segment *lseg;
++ int bypass_eof;
++ void *private;
++};
++
++#ifdef CONFIG_NFS_V4_1
+
-+ /* Error handling done later using nfs4_handle_exception to get
-+ * exponential backoff.
++#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
++
++/*
++ * Bit numbers (not masks) for pnfs_layout_hdr.state; they are used with the
++ * bitops helpers, e.g. __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &lo->state).
++ */
++enum {
++ NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
++ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
++ NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */
++ NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */
++};
++
++/*
++ * Layout driver policy bits; each enumerator is a distinct single-bit mask.
++ * NOTE(review): presumably OR-ed into pnfs_layoutdriver_type.flags -- confirm.
++ */
++enum layoutdriver_policy_flags {
++ /* Should the full nfs rpc cleanup code be used after io */
++ PNFS_USE_RPC_CODE = 1 << 0,
++
++ /* Should the pNFS client commit and return the layout upon a setattr */
++ PNFS_LAYOUTRET_ON_SETATTR = 1 << 1,
++};
++
++/* Per-layout driver specific registration structure */
++struct pnfs_layoutdriver_type {
++ struct list_head pnfs_tblid;
++ const u32 id;
++ const char *name;
++ struct module *owner;
++ unsigned flags;
++ int (*initialize_mountpoint) (struct nfs_server *, const struct nfs_fh *);
++ int (*uninitialize_mountpoint) (struct nfs_server *);
++
++ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode);
++ void (*free_layout_hdr) (struct pnfs_layout_hdr *);
++
++ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
++ void (*free_lseg) (struct pnfs_layout_segment *lseg);
++
++ /* test for nfs page cache coalescing */
++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
++
++ /* Retrieve the block size of the file system.
++ * If gather_across_stripes == 1, then the file system will gather
++ * requests into the block size.
++ * TODO: Where will the layout driver get this info? It is hard
++ * coded in PVFS2.
++ */
++ ssize_t (*get_blocksize) (void);
++
++/* read and write pagelist should return just 0 (to indicate that
++ * the layout code has taken control) or 1 (to indicate that the
++ * layout code wishes to fall back to normal nfs.) If 0 is returned,
++ * information can be passed back through nfs_data->res and
++ * nfs_data->task.tk_status, and the appropriate pnfs done function
++ * MUST be called.
++ */
++ enum pnfs_try_status
++ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages);
++ enum pnfs_try_status
++ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how);
++ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page,
++ loff_t pos, unsigned count,
++ struct pnfs_fsdata *fsdata);
++ int (*write_end)(struct inode *inode, struct page *page, loff_t pos,
++ unsigned count, unsigned copied,
++ struct pnfs_layout_segment *lseg);
++ void (*write_end_cleanup)(struct file *filp,
++ struct pnfs_fsdata *fsdata);
++
++ /* Consistency ops */
++ /* 2 problems:
++ * 1) the page list contains nfs_pages, NOT pages
++ * 2) currently the NFS code doesn't create a page array (as it does with read/write)
+ */
-+ lgp->status = task->tk_status;
-+ dprintk("<-- %s\n", __func__);
-+}
++ enum pnfs_try_status
++ (*commit) (struct nfs_write_data *nfs_data, int how);
+
-+static void nfs4_layoutget_release(void *calldata)
-+{
-+ struct nfs4_layoutget *lgp = calldata;
++ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
++ struct nfs4_layoutcommit_args *args);
+
-+ dprintk("--> %s\n", __func__);
-+ pnfs_layoutget_release(NFS_I(lgp->args.inode)->layout);
-+ if (lgp->res.layout.buf != NULL)
-+ free_page((unsigned long) lgp->res.layout.buf);
-+ put_nfs_open_context(lgp->args.ctx);
-+ kfree(calldata);
-+ dprintk("<-- %s\n", __func__);
-+}
++ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
++ struct xdr_stream *xdr,
++ const struct nfs4_layoutcommit_args *args);
+
-+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
-+ .rpc_call_prepare = nfs4_layoutget_prepare,
-+ .rpc_call_done = nfs4_layoutget_done,
-+ .rpc_release = nfs4_layoutget_release,
++ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
++ struct nfs4_layoutcommit_args *args,
++ int status);
++
++ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
++ struct xdr_stream *xdr,
++ const struct nfs4_layoutreturn_args *args);
+};
+
-+static int _nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
-+{
-+ struct nfs_server *server = NFS_SERVER(lgp->args.inode);
-+ struct rpc_task *task;
-+ struct rpc_message msg = {
-+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
-+ .rpc_argp = &lgp->args,
-+ .rpc_resp = &lgp->res,
-+ };
-+ struct rpc_task_setup task_setup_data = {
-+ .rpc_client = server->client,
-+ .rpc_message = &msg,
-+ .callback_ops = &nfs4_layoutget_call_ops,
-+ .callback_data = lgp,
-+ .flags = RPC_TASK_ASYNC,
-+ };
-+ int status = 0;
++/*
++ * Layout header, reached through NFS_I(inode)->layout.
++ *
++ * pnfs_layoutcommit_inode() snapshots and then clears @cred,
++ * @write_begin_pos and @write_end_pos, and clears the
++ * NFS_LAYOUT_NEED_LCOMMIT bit in @state, all while holding inode->i_lock.
++ */
++struct pnfs_layout_hdr {
++ unsigned long refcount;
++ struct list_head layouts; /* other client layouts */
++ struct list_head segs; /* layout segments list */
++ int roc_iomode;/* return on close iomode, 0=none */
++ seqlock_t seqlock; /* Protects the stateid */
++ nfs4_stateid stateid;
++ unsigned long state; /* NFS_LAYOUT_* bit flags */
++ struct rpc_cred *cred; /* layoutcommit credential */
++ /* DH: These vars keep track of the maximum write range
++ * so the values can be used for layoutcommit.
++ */
++ loff_t write_begin_pos;
++ loff_t write_end_pos;
++ struct inode *inode;
++};
+
-+ dprintk("--> %s\n", __func__);
++/*
++ * Argument block handed to nfs4_proc_getdeviceinfo() for the device
++ * identified by @dev_id.
++ *
++ * NOTE(review): @pages, @area, @pgbase and @pglen look like the description
++ * of the reply buffer -- confirm against nfs4proc.c / nfs4xdr.c.
++ */
++struct pnfs_device {
++ struct nfs4_deviceid dev_id;
++ unsigned int layout_type;
++ unsigned int mincount;
++ struct page **pages;
++ void *area;
++ unsigned int pgbase;
++ unsigned int pglen;
++};
+
-+ lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
-+ if (lgp->res.layout.buf == NULL) {
-+ nfs4_layoutget_release(lgp);
-+ return -ENOMEM;
-+ }
++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
+
-+ lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-+ task = rpc_run_task(&task_setup_data);
-+ if (IS_ERR(task))
-+ return PTR_ERR(task);
-+ status = nfs4_wait_for_completion_rpc_task(task);
-+ if (status != 0)
-+ goto out;
-+ status = lgp->status;
-+ if (status != 0)
-+ goto out;
-+ status = pnfs_layout_process(lgp);
-+out:
-+ rpc_put_task(task);
-+ dprintk("<-- %s status=%d\n", __func__, status);
-+ return status;
-+}
++/*
++ * Device ID list buffer passed to nfs4_proc_getdevicelist().  @dev_id has
++ * room for at most NFS4_PNFS_GETDEVLIST_MAXNUM entries.
++ *
++ * NOTE(review): @num_devs is presumably the number of valid entries in
++ * @dev_id and @eof the server's end-of-list indication -- confirm.
++ */
++struct pnfs_devicelist {
++ unsigned int eof;
++ unsigned int num_devs;
++ struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
++};
+
-+int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
-+{
-+ struct nfs_server *server = NFS_SERVER(lgp->args.inode);
-+ struct nfs4_exception exception = { };
-+ int err;
-+ do {
-+ err = _nfs4_proc_layoutget(lgp);
-+ switch (err) {
-+ case -NFS4ERR_LAYOUTTRYLATER:
-+ case -NFS4ERR_RECALLCONFLICT:
-+ err = -NFS4ERR_DELAY;
-+ /* Fall through */
-+ default:
-+ err = nfs4_handle_exception(server, err, &exception);
-+ }
-+ } while (exception.retry);
-+ return err;
-+}
++/*
++ * Device ID RCU cache. A device ID is unique per client ID and layout type.
++ */
++#define NFS4_DEVICE_ID_HASH_BITS 5
++#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
++#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
+
-+static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data)
++static inline u32
++nfs4_deviceid_hash(struct nfs4_deviceid *id)
+{
-+ struct nfs4_layoutcommit_data *ldata =
-+ (struct nfs4_layoutcommit_data *)data;
-+ struct nfs_server *server = NFS_SERVER(ldata->args.inode);
++ unsigned char *cptr = (unsigned char *)id->data;
++ unsigned int nbytes = NFS4_DEVICEID4_SIZE;
++ u32 x = 0;
+
-+ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args,
-+ &ldata->res.seq_res, 1, task))
-+ return;
-+ rpc_call_start(task);
++ while (nbytes--) {
++ x *= 37;
++ x += *cptr++;
++ }
++ return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+
-+static void
-+nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
-+{
-+ struct nfs4_layoutcommit_data *data =
-+ (struct nfs4_layoutcommit_data *)calldata;
-+ struct nfs_server *server = NFS_SERVER(data->args.inode);
-+
-+ if (!nfs4_sequence_done(task, &data->res.seq_res))
-+ return;
-+
-+ if (RPC_ASSASSINATED(task))
-+ return;
-+
-+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
-+ nfs_restart_rpc(task, server->nfs_client);
-+
-+ data->status = task->tk_status;
-+}
++/*
++ * One entry of the device ID cache.
++ *
++ * @de_node links the entry on the pnfs_deviceid_cache.dc_deviceids[] chain
++ * selected by nfs4_deviceid_hash(&de_id).  @de_ref is the reference count:
++ * pnfs_add_deviceid() sets it to 1 and pnfs_find_get_deviceid() raises it.
++ */
++struct pnfs_deviceid_node {
++ struct hlist_node de_node;
++ struct nfs4_deviceid de_id;
++ atomic_t de_ref;
++};
+
-+static void nfs4_layoutcommit_release(void *lcdata)
-+{
-+ struct nfs4_layoutcommit_data *data =
-+ (struct nfs4_layoutcommit_data *)lcdata;
++/*
++ * Device ID cache, hung off nfs_client.cl_devid_cache.
++ *
++ * @dc_lock is taken around insertion into and removal from the hash chains;
++ * lookups in pnfs_find_get_deviceid() walk them under rcu_read_lock().
++ * @dc_ref counts users of the cache: pnfs_alloc_init_deviceid_cache() sets
++ * or raises it and pnfs_put_deviceid_cache() drops it, freeing the cache on
++ * the last put.  @dc_free_callback is called to release a
++ * pnfs_deviceid_node.
++ */
++struct pnfs_deviceid_cache {
++ spinlock_t dc_lock;
++ atomic_t dc_ref;
++ void (*dc_free_callback)(struct pnfs_deviceid_node *);
++ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
++};
+
-+ pnfs_cleanup_layoutcommit(lcdata);
-+ /* Matched by get_layout in pnfs_layoutcommit_inode */
-+ put_layout_hdr(data->args.inode);
-+ put_rpccred(data->cred);
-+ kfree(lcdata);
-+}
++extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
++ void (*free_callback)(struct pnfs_deviceid_node *));
++extern void pnfs_put_deviceid_cache(struct nfs_client *);
++extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
++ struct pnfs_deviceid_cache *,
++ struct nfs4_deviceid *);
++extern struct pnfs_deviceid_node *pnfs_add_deviceid(
++ struct pnfs_deviceid_cache *,
++ struct pnfs_deviceid_node *);
++extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
++ struct pnfs_deviceid_node *devid);
++extern void pnfs_delete_deviceid(struct pnfs_deviceid_cache *,
++ struct nfs4_deviceid *);
+
-+static const struct rpc_call_ops nfs4_layoutcommit_ops = {
-+ .rpc_call_prepare = nfs4_layoutcommit_prepare,
-+ .rpc_call_done = nfs4_layoutcommit_done,
-+ .rpc_release = nfs4_layoutcommit_release,
-+};
++extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
++extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+
-+/* Execute a layoutcommit to the server */
-+static int
-+_nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
-+{
-+ struct rpc_message msg = {
-+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
-+ .rpc_argp = &data->args,
-+ .rpc_resp = &data->res,
-+ .rpc_cred = data->cred,
-+ };
-+ struct rpc_task_setup task_setup_data = {
-+ .task = &data->task,
-+ .rpc_client = NFS_CLIENT(data->args.inode),
-+ .rpc_message = &msg,
-+ .callback_ops = &nfs4_layoutcommit_ops,
-+ .callback_data = data,
-+ .flags = RPC_TASK_ASYNC,
-+ };
-+ struct rpc_task *task;
-+ int status = 0;
++/* nfs4proc.c */
++extern int nfs4_proc_getdevicelist(struct nfs_server *server,
++ const struct nfs_fh *fh,
++ struct pnfs_devicelist *devlist);
++extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
++ struct pnfs_device *dev);
++extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
++extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
++ int issync);
++extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
+
-+ dprintk("NFS: %4d initiating layoutcommit call. %llu@%llu lbw: %llu "
-+ "type: %d issync %d\n",
-+ data->task.tk_pid,
-+ data->args.range.length,
-+ data->args.range.offset,
-+ data->args.lastbytewritten,
-+ data->args.layout_type, issync);
++/* pnfs.c */
++void get_lseg(struct pnfs_layout_segment *lseg);
++void put_lseg(struct pnfs_layout_segment *lseg);
++struct pnfs_layout_segment *
++pnfs_has_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range);
++struct pnfs_layout_segment *
++pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
++ loff_t pos, u64 count, enum pnfs_iomode access_type);
++bool pnfs_return_layout_barrier(struct nfs_inode *, struct pnfs_layout_range *);
++int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *,
++ const nfs4_stateid *stateid, /* optional */
++ enum pnfs_layoutreturn_type, bool wait);
++void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id);
++void unset_pnfs_layoutdriver(struct nfs_server *);
++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
++ const struct rpc_call_ops *, int);
++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
++ const struct rpc_call_ops *);
++void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
++int pnfs_layoutcommit_inode(struct inode *inode, int sync);
++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent);
++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx);
++void pnfs_set_ds_iosize(struct nfs_server *server);
++enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
++ const struct rpc_call_ops *, int);
++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
++ struct nfs_open_context *, struct list_head *,
++ size_t *);
++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *,
++ size_t *);
++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
++int pnfs_layout_process(struct nfs4_layoutget *lgp);
++void pnfs_layoutreturn_release(struct nfs4_layoutreturn *lpr);
++void pnfs_destroy_layout(struct nfs_inode *);
++void pnfs_destroy_all_layouts(struct nfs_client *);
++void put_layout_hdr(struct inode *inode);
++void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
++ struct nfs4_state *open_state);
++void pnfs_read_done(struct nfs_read_data *);
++void pnfs_writeback_done(struct nfs_write_data *);
++void pnfs_commit_done(struct nfs_write_data *);
++int _pnfs_write_begin(struct inode *inode, struct page *page,
++ loff_t pos, unsigned len,
++ struct pnfs_layout_segment *lseg,
++ struct pnfs_fsdata **fsdata);
+
-+ data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-+ task = rpc_run_task(&task_setup_data);
-+ if (IS_ERR(task))
-+ return PTR_ERR(task);
-+ if (!issync)
-+ goto out;
-+ status = nfs4_wait_for_completion_rpc_task(task);
-+ if (status != 0)
-+ goto out;
-+ status = data->status;
-+out:
-+ dprintk("%s: status %d\n", __func__, status);
-+ rpc_put_task(task);
-+ return 0;
++static inline bool
++has_layout(struct nfs_inode *nfsi)
++{
++ return nfsi->layout != NULL;
+}
+
-+int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
++static inline int lo_fail_bit(u32 iomode)
+{
-+ struct nfs4_exception exception = { };
-+ struct nfs_server *server = NFS_SERVER(data->args.inode);
-+ int err;
-+
-+ do {
-+ err = nfs4_handle_exception(server,
-+ _nfs4_proc_layoutcommit(data, issync),
-+ &exception);
-+ } while (exception.retry);
-+ return err;
++ return iomode == IOMODE_RW ?
++ NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
+}
+
-+static void
-+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
++static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)
+{
-+ struct nfs4_layoutreturn *lrp = calldata;
-+ struct inode *ino = lrp->args.inode;
-+ struct nfs_inode *nfsi = NFS_I(ino);
-+ struct nfs_server *server = NFS_SERVER(ino);
-+
-+ dprintk("--> %s\n", __func__);
-+ if ((lrp->args.return_type == RETURN_FILE) &&
-+ pnfs_return_layout_barrier(nfsi, &lrp->args.range)) {
-+ dprintk("%s: waiting on barrier\n", __func__);
-+ rpc_sleep_on(&nfsi->lo_rpcwaitq, task, NULL);
-+ return;
-+ }
-+
-+ if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args,
-+ &lrp->res.seq_res, 0, task))
-+ return;
-+ rpc_call_start(task);
++ write_seqlock(&lo->seqlock);
++ clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
++ write_sequnlock(&lo->seqlock);
+}
+
-+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
++/* Return true if a layout driver is being used for this mountpoint */
++static inline int pnfs_enabled_sb(struct nfs_server *nfss)
+{
-+ struct nfs4_layoutreturn *lrp = calldata;
-+ struct inode *ino = lrp->args.inode;
-+ struct nfs_server *server = NFS_SERVER(ino);
-+
-+ dprintk("--> %s\n", __func__);
-+
-+ if (!nfs4_sequence_done(task, &lrp->res.seq_res))
-+ return;
-+
-+ if (RPC_ASSASSINATED(task))
-+ return;
-+
-+ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
-+ nfs_restart_rpc(task, server->nfs_client);
-+
-+ dprintk("<-- %s\n", __func__);
++ return nfss->pnfs_curr_ld != NULL;
+}
+
-+static void nfs4_layoutreturn_release(void *calldata)
++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
++ struct pnfs_fsdata *fsdata)
+{
-+ struct nfs4_layoutreturn *lrp = calldata;
-+ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
-+
-+ dprintk("--> %s return_type %d lo %p\n", __func__,
-+ lrp->args.return_type, lo);
-+
-+ if (lrp->args.return_type == RETURN_FILE) {
-+ if (!lrp->res.lrs_present)
-+ pnfs_invalidate_layout_stateid(lo);
-+ pnfs_layoutreturn_release(lo, &lrp->args.range);
-+ }
-+ kfree(calldata);
-+ dprintk("<-- %s\n", __func__);
++ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) ||
++ !fsdata->bypass_eof;
+}
+
-+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
-+ .rpc_call_prepare = nfs4_layoutreturn_prepare,
-+ .rpc_call_done = nfs4_layoutreturn_done,
-+ .rpc_release = nfs4_layoutreturn_release,
-+};
-+
-+int _nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
++/* Should the pNFS client commit and return the layout upon a setattr */
++static inline bool
++pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
-+ struct inode *ino = lrp->args.inode;
-+ struct nfs_server *server = NFS_SERVER(ino);
-+ struct rpc_task *task;
-+ struct rpc_message msg = {
-+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
-+ .rpc_argp = &lrp->args,
-+ .rpc_resp = &lrp->res,
-+ };
-+ struct rpc_task_setup task_setup_data = {
-+ .rpc_client = server->client,
-+ .rpc_message = &msg,
-+ .callback_ops = &nfs4_layoutreturn_call_ops,
-+ .callback_data = lrp,
-+ .flags = RPC_TASK_ASYNC,
-+ };
-+ int status = 0;
-+
-+ dprintk("--> %s\n", __func__);
-+ lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-+ task = rpc_run_task(&task_setup_data);
-+ if (IS_ERR(task))
-+ return PTR_ERR(task);
-+ if (!issync)
-+ goto out;
-+ status = nfs4_wait_for_completion_rpc_task(task);
-+ if (status != 0)
-+ goto out;
-+ status = task->tk_status;
-+out:
-+ dprintk("<-- %s\n", __func__);
-+ rpc_put_task(task);
-+ return status;
++ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
++ return false;
++ return NFS_SERVER(inode)->pnfs_curr_ld->flags &
++ PNFS_LAYOUTRET_ON_SETATTR;
+}
+
-+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
++static inline bool pnfs_use_rpc(struct nfs_server *nfss)
+{
-+ struct nfs_server *server = NFS_SERVER(lrp->args.inode);
-+ struct nfs4_exception exception = { };
-+ int err;
-+ do {
-+ err = nfs4_handle_exception(server,
-+ _nfs4_proc_layoutreturn(lrp, issync),
-+ &exception);
-+ } while (exception.retry);
++ if (pnfs_enabled_sb(nfss))
++ return nfss->pnfs_curr_ld->flags & PNFS_USE_RPC_CODE;
+
-+ return err;
++ return true;
+}
+
-+/*
-+ * Retrieve the list of Data Server devices from the MDS.
++/* Should the pNFS client commit and return the layout on close
+ */
-+static int _nfs4_getdevicelist(struct nfs_server *server,
-+ const struct nfs_fh *fh,
-+ struct pnfs_devicelist *devlist)
++static inline int
++pnfs_layout_roc_iomode(struct nfs_inode *nfsi)
+{
-+ struct nfs4_getdevicelist_args args = {
-+ .fh = fh,
-+ .layoutclass = server->pnfs_curr_ld->id,
-+ };
-+ struct nfs4_getdevicelist_res res = {
-+ .devlist = devlist,
-+ };
-+ struct rpc_message msg = {
-+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
-+ .rpc_argp = &args,
-+ .rpc_resp = &res,
-+ .rpc_cred = nfs4_get_machine_cred(server->nfs_client),
-+ };
-+ int status;
++ return nfsi->layout->roc_iomode;
++}
+
-+ dprintk("--> %s\n", __func__);
-+ status = nfs4_call_sync(server, &msg, &args, &res, 0);
-+ put_rpccred(msg.rpc_cred);
-+ dprintk("<-- %s status=%d\n", __func__, status);
++static inline int pnfs_write_begin(struct file *filp, struct page *page,
++ loff_t pos, unsigned len,
++ struct pnfs_layout_segment *lseg,
++ void **fsdata)
++{
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct nfs_server *nfss = NFS_SERVER(inode);
++ int status = 0;
++
++ *fsdata = lseg;
++ if (lseg && nfss->pnfs_curr_ld->write_begin)
++ status = _pnfs_write_begin(inode, page, pos, len, lseg,
++ (struct pnfs_fsdata **) fsdata);
+ return status;
+}
+
-+int nfs4_proc_getdevicelist(struct nfs_server *server,
-+ const struct nfs_fh *fh,
-+ struct pnfs_devicelist *devlist)
++/* CAREFUL - what happens if copied < len??? */
++static inline int pnfs_write_end(struct file *filp, struct page *page,
++ loff_t pos, unsigned len, unsigned copied,
++ struct pnfs_layout_segment *lseg)
+{
-+ struct nfs4_exception exception = { };
-+ int err;
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct nfs_server *nfss = NFS_SERVER(inode);
+
-+ do {
-+ err = nfs4_handle_exception(server,
-+ _nfs4_getdevicelist(server, fh, devlist),
-+ &exception);
-+ } while (exception.retry);
++ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end)
++ return nfss->pnfs_curr_ld->write_end(inode, page, pos, len,
++ copied, lseg);
++ else
++ return 0;
++}
+
-+ dprintk("%s: err=%d, num_devs=%u\n", __func__,
-+ err, devlist->num_devs);
++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
++{
++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
+
-+ return err;
++ if (fsdata && nfss->pnfs_curr_ld) {
++ if (nfss->pnfs_curr_ld->write_end_cleanup)
++ nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata);
++ if (nfss->pnfs_curr_ld->write_begin)
++ pnfs_free_fsdata(fsdata);
++ }
+}
-+EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
+
-+static int
-+_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
-+{
-+ struct nfs4_getdeviceinfo_args args = {
-+ .pdev = pdev,
-+ };
-+ struct nfs4_getdeviceinfo_res res = {
-+ .pdev = pdev,
-+ };
-+ struct rpc_message msg = {
-+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
-+ .rpc_argp = &args,
-+ .rpc_resp = &res,
-+ .rpc_cred = nfs4_get_machine_cred(server->nfs_client),
-+ };
-+ int status;
++static inline int pnfs_return_layout(struct inode *ino,
++ struct pnfs_layout_range *range,
++ const nfs4_stateid *stateid, /* optional */
++ enum pnfs_layoutreturn_type type,
++ bool wait)
++{
++ struct nfs_inode *nfsi = NFS_I(ino);
++ struct nfs_server *nfss = NFS_SERVER(ino);
+
-+ dprintk("--> %s\n", __func__);
-+ status = nfs4_call_sync(server, &msg, &args, &res, 0);
-+ put_rpccred(msg.rpc_cred);
-+ dprintk("<-- %s status=%d\n", __func__, status);
++ if (pnfs_enabled_sb(nfss) &&
++ (type != RETURN_FILE || has_layout(nfsi)))
++ return _pnfs_return_layout(ino, range, stateid, type, wait);
+
-+ return status;
++ return 0;
+}
+
-+int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
++static inline bool
++layoutcommit_needed(struct nfs_inode *nfsi)
+{
-+ struct nfs4_exception exception = { };
-+ int err;
++ return has_layout(nfsi) &&
++ test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state);
++}
+
-+ do {
-+ err = nfs4_handle_exception(server,
-+ _nfs4_proc_getdeviceinfo(server, pdev),
-+ &exception);
-+ } while (exception.retry);
-+ return err;
++static inline int pnfs_get_write_status(struct nfs_write_data *data)
++{
++ return data->pdata.pnfs_error;
+}
-+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
+
- #endif /* CONFIG_NFS_V4_1 */
-
- struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
-diff -up linux-2.6.35.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.35.noarch/fs/nfs/nfs4renewd.c
---- linux-2.6.35.noarch/fs/nfs/nfs4renewd.c.orig 2010-09-30 12:22:45.165044000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/nfs4renewd.c 2010-09-30 12:25:08.325282000 -0400
-@@ -64,7 +64,7 @@ nfs4_renew_state(struct work_struct *wor
- ops = clp->cl_mvops->state_renewal_ops;
- dprintk("%s: start\n", __func__);
- /* Are there any active superblocks? */
-- if (list_empty(&clp->cl_superblocks))
-+ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp))
- goto out;
- spin_lock(&clp->cl_lock);
- lease = clp->cl_lease_time;
-diff -up linux-2.6.35.noarch/fs/nfs/nfs4state.c.orig linux-2.6.35.noarch/fs/nfs/nfs4state.c
---- linux-2.6.35.noarch/fs/nfs/nfs4state.c.orig 2010-09-30 12:22:45.171042000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/nfs4state.c 2010-09-30 12:25:08.331279000 -0400
-@@ -53,6 +53,7 @@
- #include "callback.h"
- #include "delegation.h"
- #include "internal.h"
-+#include "pnfs.h"
-
- #define OPENOWNER_POOL_SIZE 8
-
-@@ -126,6 +127,11 @@ static int nfs41_setup_state_renewal(str
- int status;
- struct nfs_fsinfo fsinfo;
-
-+ if (is_ds_only_client(clp)) {
-+ nfs4_schedule_state_renewal(clp);
-+ return 0;
-+ }
++static inline int pnfs_get_read_status(struct nfs_read_data *data)
++{
++ return data->pdata.pnfs_error;
++}
+
- status = nfs4_proc_get_lease_time(clp, &fsinfo);
- if (status == 0) {
- /* Update lease time and schedule renewal */
-@@ -583,8 +589,24 @@ static void __nfs4_close(struct path *pa
- if (!call_close) {
- nfs4_put_open_state(state);
- nfs4_put_state_owner(owner);
-- } else
-+ } else {
-+ u32 roc_iomode;
-+ struct nfs_inode *nfsi = NFS_I(state->inode);
++static inline struct pnfs_layout_segment *
++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
++{
++ if (fsdata) {
++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
+
-+ if (has_layout(nfsi) &&
-+ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) {
-+ struct pnfs_layout_range range = {
-+ .iomode = roc_iomode,
-+ .offset = 0,
-+ .length = NFS4_MAX_UINT64,
-+ };
++ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin)
++ return ((struct pnfs_fsdata *) fsdata)->lseg;
++ return (struct pnfs_layout_segment *)fsdata;
++ }
++ return NULL;
++}
+
-+ pnfs_return_layout(state->inode, &range, NULL,
-+ RETURN_FILE, wait);
-+ }
++#else /* CONFIG_NFS_V4_1 */
+
- nfs4_do_close(path, state, gfp_mask, wait);
-+ }
- }
-
- void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
-@@ -1447,6 +1469,7 @@ static void nfs4_state_manager(struct nf
- }
- clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
- set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
-+ pnfs_destroy_all_layouts(clp);
- }
-
- if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
-diff -up linux-2.6.35.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.35.noarch/fs/nfs/nfs4xdr.c
---- linux-2.6.35.noarch/fs/nfs/nfs4xdr.c.orig 2010-09-30 12:22:45.180044000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/nfs4xdr.c 2010-09-30 12:25:08.340283000 -0400
-@@ -52,6 +52,7 @@
- #include <linux/nfs_idmap.h>
- #include "nfs4_fs.h"
- #include "internal.h"
-+#include "pnfs.h"
-
- #define NFSDBG_FACILITY NFSDBG_XDR
-
-@@ -89,7 +90,7 @@ static int nfs4_stat_to_errno(int);
- #define encode_getfh_maxsz (op_encode_hdr_maxsz)
- #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \
- ((3+NFS4_FHSIZE) >> 2))
--#define nfs4_fattr_bitmap_maxsz 3
-+#define nfs4_fattr_bitmap_maxsz 4
- #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
- #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
- #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
-@@ -111,7 +112,11 @@ static int nfs4_stat_to_errno(int);
- #define encode_restorefh_maxsz (op_encode_hdr_maxsz)
- #define decode_restorefh_maxsz (op_decode_hdr_maxsz)
- #define encode_fsinfo_maxsz (encode_getattr_maxsz)
--#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11)
-+/* The 5 accounts for the PNFS attributes, and assumes that at most three
-+ * layout types will be returned.
-+ */
-+#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
-+ nfs4_fattr_bitmap_maxsz + 8 + 5)
- #define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
- #define decode_renew_maxsz (op_decode_hdr_maxsz)
- #define encode_setclientid_maxsz \
-@@ -310,6 +315,41 @@ static int nfs4_stat_to_errno(int);
- XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
- #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
- #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
-+#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
-+ encode_verifier_maxsz)
-+#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
-+ 2 /* nfs_cookie4 gdlr_cookie */ + \
-+ decode_verifier_maxsz \
-+ /* verifier4 gdlr_verifier */ + \
-+ 1 /* gdlr_deviceid_list count */ + \
-+ XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
-+ NFS4_DEVICEID4_SIZE) \
-+ /* gdlr_deviceid_list */ + \
-+ 1 /* bool gdlr_eof */)
-+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
-+ XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
-+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
-+ 1 /* layout type */ + \
-+ 1 /* opaque devaddr4 length */ + \
-+ /* devaddr4 payload is read into page */ \
-+ 1 /* notification bitmap length */ + \
-+ 1 /* notification bitmap */)
-+#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
-+ encode_stateid_maxsz)
-+#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
-+ decode_stateid_maxsz + \
-+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
-+#define encode_layoutcommit_maxsz (18 + \
-+ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \
-+ op_encode_hdr_maxsz + \
-+ encode_stateid_maxsz)
-+#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz)
-+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
-+ encode_stateid_maxsz + \
-+ 1 /* FIXME: opaque lrf_body always empty at
-+ *the moment */)
-+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
-+ 1 + decode_stateid_maxsz)
- #else /* CONFIG_NFS_V4_1 */
- #define encode_sequence_maxsz 0
- #define decode_sequence_maxsz 0
-@@ -699,6 +739,60 @@ static int nfs4_stat_to_errno(int);
- #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
- decode_sequence_maxsz + \
- decode_reclaim_complete_maxsz)
-+#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
-+ encode_sequence_maxsz + \
-+ encode_putfh_maxsz + \
-+ encode_getdevicelist_maxsz)
-+#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
-+ decode_sequence_maxsz + \
-+ decode_putfh_maxsz + \
-+ decode_getdevicelist_maxsz)
-+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
-+ encode_sequence_maxsz +\
-+ encode_getdeviceinfo_maxsz)
-+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \
-+ decode_sequence_maxsz + \
-+ decode_getdeviceinfo_maxsz)
-+#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \
-+ encode_sequence_maxsz + \
-+ encode_putfh_maxsz + \
-+ encode_layoutget_maxsz)
-+#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \
-+ decode_sequence_maxsz + \
-+ decode_putfh_maxsz + \
-+ decode_layoutget_maxsz)
-+#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
-+ encode_sequence_maxsz +\
-+ encode_putfh_maxsz + \
-+ encode_layoutcommit_maxsz + \
-+ encode_getattr_maxsz)
-+#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
-+ decode_sequence_maxsz + \
-+ decode_putfh_maxsz + \
-+ decode_layoutcommit_maxsz + \
-+ decode_getattr_maxsz)
-+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
-+ encode_sequence_maxsz + \
-+ encode_putfh_maxsz + \
-+ encode_layoutreturn_maxsz)
-+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
-+ decode_sequence_maxsz + \
-+ decode_putfh_maxsz + \
-+ decode_layoutreturn_maxsz)
-+#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \
-+ encode_sequence_maxsz +\
-+ encode_putfh_maxsz + \
-+ encode_write_maxsz)
-+#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \
-+ decode_sequence_maxsz + \
-+ decode_putfh_maxsz + \
-+ decode_write_maxsz)
-+#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \
-+ encode_putfh_maxsz + \
-+ encode_commit_maxsz)
-+#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \
-+ decode_putfh_maxsz + \
-+ decode_commit_maxsz)
-
- const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
- compound_encode_hdr_maxsz +
-@@ -1003,6 +1097,35 @@ static void encode_getattr_two(struct xd
- hdr->replen += decode_getattr_maxsz;
- }
-
-+static void
-+encode_getattr_three(struct xdr_stream *xdr,
-+ uint32_t bm0, uint32_t bm1, uint32_t bm2,
-+ struct compound_hdr *hdr)
++static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
++{
++}
++
++static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
++{
++}
++
++static inline void get_lseg(struct pnfs_layout_segment *lseg)
+{
-+ __be32 *p;
++}
+
-+ p = reserve_space(xdr, 4);
-+ *p = cpu_to_be32(OP_GETATTR);
-+ if (bm2) {
-+ p = reserve_space(xdr, 16);
-+ *p++ = cpu_to_be32(3);
-+ *p++ = cpu_to_be32(bm0);
-+ *p++ = cpu_to_be32(bm1);
-+ *p = cpu_to_be32(bm2);
-+ } else if (bm1) {
-+ p = reserve_space(xdr, 12);
-+ *p++ = cpu_to_be32(2);
-+ *p++ = cpu_to_be32(bm0);
-+ *p = cpu_to_be32(bm1);
-+ } else {
-+ p = reserve_space(xdr, 8);
-+ *p++ = cpu_to_be32(1);
-+ *p = cpu_to_be32(bm0);
-+ }
-+ hdr->nops++;
-+ hdr->replen += decode_getattr_maxsz;
++static inline void put_lseg(struct pnfs_layout_segment *lseg)
++{
+}
+
- static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
- {
- encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
-@@ -1011,8 +1134,11 @@ static void encode_getfattr(struct xdr_s
++static inline struct pnfs_layout_segment *
++pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
++ loff_t pos, u64 count, enum pnfs_iomode access_type)
++{
++ return NULL;
++}
++
++static inline bool
++has_layout(struct nfs_inode *nfsi)
++{
++ return false;
++}
++
++static inline bool
++layoutcommit_needed(struct nfs_inode *nfsi)
++{
++ return 0;
++}
++
++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
++ struct pnfs_fsdata *fsdata)
++{
++ return 1;
++}
++
++static inline enum pnfs_try_status
++pnfs_try_to_read_data(struct nfs_read_data *data,
++ const struct rpc_call_ops *call_ops)
++{
++ return PNFS_NOT_ATTEMPTED;
++}
++
++static inline enum pnfs_try_status
++pnfs_try_to_write_data(struct nfs_write_data *data,
++ const struct rpc_call_ops *call_ops, int how)
++{
++ return PNFS_NOT_ATTEMPTED;
++}
++
++static inline enum pnfs_try_status
++pnfs_try_to_commit(struct nfs_write_data *data,
++ const struct rpc_call_ops *call_ops, int how)
++{
++ return PNFS_NOT_ATTEMPTED;
++}
++
++static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync)
++{
++ return 0;
++}
++
++static inline bool
++pnfs_ld_layoutret_on_setattr(struct inode *inode)
++{
++ return false;
++}
++
++static inline bool pnfs_use_rpc(struct nfs_server *nfss)
++{
++ return true;
++}
++
++static inline int
++pnfs_layout_roc_iomode(struct nfs_inode *nfsi)
++{
++ return 0;
++}
++
++static inline int pnfs_return_layout(struct inode *ino,
++ struct pnfs_layout_range *range,
++ const nfs4_stateid *stateid, /* optional */
++ enum pnfs_layoutreturn_type type,
++ bool wait)
++{
++ return 0;
++}
++
++static inline void set_pnfs_layoutdriver(struct nfs_server *s, const struct nfs_fh *mntfh, u32 id)
++{
++}
++
++static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
++{
++}
++
++static inline void pnfs_set_ds_iosize(struct nfs_server *server)
++{
++ server->ds_wsize = server->ds_rsize = -1;
++}
++
++static inline int pnfs_write_begin(struct file *filp, struct page *page,
++ loff_t pos, unsigned len,
++ struct pnfs_layout_segment *lseg,
++ void **fsdata)
++{
++ *fsdata = NULL;
++ return 0;
++}
++
++static inline int pnfs_write_end(struct file *filp, struct page *page,
++ loff_t pos, unsigned len, unsigned copied,
++ struct pnfs_layout_segment *lseg)
++{
++ return 0;
++}
++
++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
++{
++}
++
++static inline int pnfs_get_write_status(struct nfs_write_data *data)
++{
++ return 0;
++}
++
++static inline int pnfs_get_read_status(struct nfs_read_data *data)
++{
++ return 0;
++}
++
++static inline void
++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino,
++ struct nfs_open_context *ctx, struct list_head *pages,
++ size_t *rsize)
++{
++ pgio->pg_lseg = NULL;
++}
++
++static inline void
++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino,
++ size_t *wsize)
++{
++ pgio->pg_lseg = NULL;
++}
++
++static inline struct pnfs_layout_segment *
++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
++{
++ return NULL;
++}
++
++#endif /* CONFIG_NFS_V4_1 */
++
++#endif /* FS_NFS_PNFS_H */
+diff --git a/fs/nfs/read.c b/fs/nfs/read.c
+index 87adc27..1df536a 100644
+--- a/fs/nfs/read.c
++++ b/fs/nfs/read.c
+@@ -18,8 +18,12 @@
+ #include <linux/sunrpc/clnt.h>
+ #include <linux/nfs_fs.h>
+ #include <linux/nfs_page.h>
++#include <linux/smp_lock.h>
++#include <linux/module.h>
- static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
- {
-- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
-- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
-+ encode_getattr_three(xdr,
-+ bitmask[0] & nfs4_fsinfo_bitmap[0],
-+ bitmask[1] & nfs4_fsinfo_bitmap[1],
-+ bitmask[2] & nfs4_fsinfo_bitmap[2],
-+ hdr);
- }
+ #include <asm/system.h>
++#include <linux/module.h>
++#include "pnfs.h"
- static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
-@@ -1726,6 +1852,155 @@ static void encode_sequence(struct xdr_s
- #endif /* CONFIG_NFS_V4_1 */
+ #include "nfs4_fs.h"
+ #include "internal.h"
+@@ -117,11 +121,16 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
+ LIST_HEAD(one_request);
+ struct nfs_page *new;
+ unsigned int len;
++ loff_t pgoffs;
++ struct pnfs_layout_segment *lseg;
+
+ len = nfs_page_length(page);
+ if (len == 0)
+ return nfs_return_empty_page(page);
+- new = nfs_create_request(ctx, inode, page, 0, len);
++ pgoffs = (loff_t)page->index << PAGE_CACHE_SHIFT;
++ lseg = pnfs_update_layout(inode, ctx, pgoffs, len, IOMODE_READ);
++ new = nfs_create_request(ctx, inode, page, 0, len, lseg);
++ put_lseg(lseg);
+ if (IS_ERR(new)) {
+ unlock_page(page);
+ return PTR_ERR(new);
+@@ -155,24 +164,20 @@ static void nfs_readpage_release(struct nfs_page *req)
+ nfs_release_request(req);
}
-+#ifdef CONFIG_NFS_V4_1
-+static void
-+encode_getdevicelist(struct xdr_stream *xdr,
-+ const struct nfs4_getdevicelist_args *args,
-+ struct compound_hdr *hdr)
-+{
-+ __be32 *p;
-+ nfs4_verifier dummy = {
-+ .data = "dummmmmy",
-+ };
-+
-+ p = reserve_space(xdr, 20);
-+ *p++ = cpu_to_be32(OP_GETDEVICELIST);
-+ *p++ = cpu_to_be32(args->layoutclass);
-+ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
-+ xdr_encode_hyper(p, 0ULL); /* cookie */
-+ encode_nfs4_verifier(xdr, &dummy);
-+ hdr->nops++;
-+ hdr->replen += decode_getdevicelist_maxsz;
-+}
+-/*
+- * Set up the NFS read request struct
+- */
+-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+- const struct rpc_call_ops *call_ops,
+- unsigned int count, unsigned int offset)
++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops)
+ {
+- struct inode *inode = req->wb_context->path.dentry->d_inode;
++ struct inode *inode = data->inode;
+ int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+- .rpc_cred = req->wb_context->cred,
++ .rpc_cred = data->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .task = &data->task,
+- .rpc_client = NFS_CLIENT(inode),
++ .rpc_client = clnt,
+ .rpc_message = &msg,
+ .callback_ops = call_ops,
+ .callback_data = data,
+@@ -180,9 +185,46 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+ .flags = RPC_TASK_ASYNC | swap_flags,
+ };
+
++ /* Set up the initial task struct. */
++ NFS_PROTO(inode)->read_setup(data, &msg);
+
-+static void
-+encode_getdeviceinfo(struct xdr_stream *xdr,
-+ const struct nfs4_getdeviceinfo_args *args,
-+ struct compound_hdr *hdr)
-+{
-+ __be32 *p;
++ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
++ data->task.tk_pid,
++ inode->i_sb->s_id,
++ (long long)NFS_FILEID(inode),
++ data->args.count,
++ (unsigned long long)data->args.offset);
+
-+ p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
-+ *p++ = cpu_to_be32(OP_GETDEVICEINFO);
-+ p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
-+ NFS4_DEVICEID4_SIZE);
-+ *p++ = cpu_to_be32(args->pdev->layout_type);
-+ *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
-+ *p++ = cpu_to_be32(0); /* bitmap length 0 */
-+ hdr->nops++;
-+ hdr->replen += decode_getdeviceinfo_maxsz;
++ task = rpc_run_task(&task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
++ rpc_put_task(task);
++ return 0;
+}
++EXPORT_SYMBOL(nfs_initiate_read);
+
-+static void
-+encode_layoutget(struct xdr_stream *xdr,
-+ const struct nfs4_layoutget_args *args,
-+ struct compound_hdr *hdr)
++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops)
+{
-+ nfs4_stateid stateid;
-+ __be32 *p;
-+
-+ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
-+ *p++ = cpu_to_be32(OP_LAYOUTGET);
-+ *p++ = cpu_to_be32(0); /* Signal layout available */
-+ *p++ = cpu_to_be32(args->type);
-+ *p++ = cpu_to_be32(args->range.iomode);
-+ p = xdr_encode_hyper(p, args->range.offset);
-+ p = xdr_encode_hyper(p, args->range.length);
-+ p = xdr_encode_hyper(p, args->minlength);
-+ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
-+ args->ctx->state);
-+ p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
-+ *p = cpu_to_be32(args->maxcount);
++ if (data->req->wb_lseg &&
++ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
++ return pnfs_get_read_status(data);
+
-+ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
-+ __func__,
-+ args->type,
-+ args->range.iomode,
-+ (unsigned long)args->range.offset,
-+ (unsigned long)args->range.length,
-+ args->maxcount);
-+ hdr->nops++;
-+ hdr->replen += decode_layoutget_maxsz;
++ return nfs_initiate_read(data, clnt, call_ops);
+}
+
-+static int
-+encode_layoutcommit(struct xdr_stream *xdr,
-+ const struct nfs4_layoutcommit_args *args,
-+ struct compound_hdr *hdr)
++/*
++ * Set up the NFS read request struct
++ */
++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
++ const struct rpc_call_ops *call_ops,
++ unsigned int count, unsigned int offset)
+{
-+ __be32 *p;
-+
-+ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__,
-+ args->range.length, args->range.offset, args->lastbytewritten,
-+ args->layout_type);
++ struct inode *inode = req->wb_context->path.dentry->d_inode;
+
-+ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE);
-+ *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
-+ p = xdr_encode_hyper(p, args->range.offset);
-+ p = xdr_encode_hyper(p, args->range.length);
-+ *p++ = cpu_to_be32(0); /* reclaim */
-+ p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
-+ *p++ = cpu_to_be32(1); /* newoffset = TRUE */
-+ p = xdr_encode_hyper(p, args->lastbytewritten);
-+ *p = cpu_to_be32(args->time_modify_changed != 0);
-+ if (args->time_modify_changed) {
-+ p = reserve_space(xdr, 12);
-+ *p++ = cpu_to_be32(0);
-+ *p++ = cpu_to_be32(args->time_modify.tv_sec);
-+ *p = cpu_to_be32(args->time_modify.tv_nsec);
+ data->req = req;
+ data->inode = inode;
+- data->cred = msg.rpc_cred;
++ data->cred = req->wb_context->cred;
+
+ data->args.fh = NFS_FH(inode);
+ data->args.offset = req_offset(req) + offset;
+@@ -197,21 +239,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+ data->res.eof = 0;
+ nfs_fattr_init(&data->fattr);
+
+- /* Set up the initial task struct. */
+- NFS_PROTO(inode)->read_setup(data, &msg);
+-
+- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+- data->task.tk_pid,
+- inode->i_sb->s_id,
+- (long long)NFS_FILEID(inode),
+- count,
+- (unsigned long long)data->args.offset);
+-
+- task = rpc_run_task(&task_setup_data);
+- if (IS_ERR(task))
+- return PTR_ERR(task);
+- rpc_put_task(task);
+- return 0;
++ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
+ }
+
+ static void
+@@ -355,7 +383,14 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
+ {
+ struct nfs_readargs *argp = &data->args;
+ struct nfs_readres *resp = &data->res;
++ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client;
+
++#ifdef CONFIG_NFS_V4_1
++ if (data->fldata.ds_nfs_client) {
++ dprintk("%s DS read\n", __func__);
++ clp = data->fldata.ds_nfs_client;
+ }
-+
-+ p = reserve_space(xdr, 4);
-+ *p = cpu_to_be32(args->layout_type);
-+
-+ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutcommit) {
-+ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutcommit(
-+ NFS_I(args->inode)->layout, xdr, args);
-+ } else {
-+ p = reserve_space(xdr, 4);
-+ xdr_encode_opaque(p, NULL, 0);
++#endif /* CONFIG_NFS_V4_1 */
+ if (resp->eof || resp->count == argp->count)
+ return;
+
+@@ -369,7 +404,10 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
+ argp->offset += resp->count;
+ argp->pgbase += resp->count;
+ argp->count -= resp->count;
+- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
++#ifdef CONFIG_NFS_V4_1
++ data->pdata.pnfs_error = -EAGAIN;
++#endif /* CONFIG_NFS_V4_1 */
++ nfs_restart_rpc(task, clp);
+ }
+
+ /*
+@@ -410,13 +448,19 @@ static void nfs_readpage_release_partial(void *calldata)
+ void nfs_read_prepare(struct rpc_task *task, void *calldata)
+ {
+ struct nfs_read_data *data = calldata;
++ struct nfs4_session *ds_session = NULL;
+
+- if (nfs4_setup_sequence(NFS_SERVER(data->inode),
++ if (data->fldata.ds_nfs_client) {
++ dprintk("%s DS read\n", __func__);
++ ds_session = data->fldata.ds_nfs_client->cl_session;
+ }
-+
-+ hdr->nops++;
-+ hdr->replen += decode_layoutcommit_maxsz;
-+ return 0;
-+}
-+
-+static void
-+encode_layoutreturn(struct xdr_stream *xdr,
-+ const struct nfs4_layoutreturn_args *args,
-+ struct compound_hdr *hdr)
-+{
-+ nfs4_stateid stateid;
-+ __be32 *p;
-+
-+ p = reserve_space(xdr, 20);
-+ *p++ = cpu_to_be32(OP_LAYOUTRETURN);
-+ *p++ = cpu_to_be32(args->reclaim);
-+ *p++ = cpu_to_be32(args->layout_type);
-+ *p++ = cpu_to_be32(args->range.iomode);
-+ *p = cpu_to_be32(args->return_type);
-+ if (args->return_type == RETURN_FILE) {
-+ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
-+ p = xdr_encode_hyper(p, args->range.offset);
-+ p = xdr_encode_hyper(p, args->range.length);
-+ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
-+ NULL);
-+ p = xdr_encode_opaque_fixed(p, &stateid.data,
-+ NFS4_STATEID_SIZE);
-+ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
-+ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
-+ NFS_I(args->inode)->layout, xdr, args);
-+ } else {
-+ p = reserve_space(xdr, 4);
-+ *p = cpu_to_be32(0);
++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
+ &data->args.seq_args, &data->res.seq_res,
+ 0, task))
+ return;
+ rpc_call_start(task);
+ }
++EXPORT_SYMBOL(nfs_read_prepare);
+ #endif /* CONFIG_NFS_V4_1 */
+
+ static const struct rpc_call_ops nfs_read_partial_ops = {
+@@ -569,7 +613,20 @@ readpage_async_filler(void *data, struct page *page)
+ if (len == 0)
+ return nfs_return_empty_page(page);
+
+- new = nfs_create_request(desc->ctx, inode, page, 0, len);
++ if (desc->pgio->pg_lseg) {
++ loff_t pgoff = (loff_t)page->index << PAGE_CACHE_SHIFT;
++ struct pnfs_layout_range *range = &desc->pgio->pg_lseg->range;
++
++ /* retry later with the right lseg? */
++ if (range->offset > pgoff + len ||
++ range->offset + range->length < pgoff) {
++ new = ERR_PTR(-EAGAIN);
++ goto out_error;
+ }
+ }
-+ hdr->nops++;
-+ hdr->replen += decode_layoutreturn_maxsz;
++
++ new = nfs_create_request(desc->ctx, inode, page, 0, len,
++ desc->pgio->pg_lseg);
+ if (IS_ERR(new))
+ goto out_error;
+
+@@ -625,6 +682,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
+ if (ret == 0)
+ goto read_complete; /* all pages were read */
+
++ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize);
+ if (rsize < PAGE_CACHE_SIZE)
+ nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
+ else
+@@ -633,6 +691,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
+ ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
+
+ nfs_pageio_complete(&pgio);
++ put_lseg(pgio.pg_lseg);
+ npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+ read_complete:
+diff --git a/fs/nfs/super.c b/fs/nfs/super.c
+index f4cbf0c..91606fb 100644
+--- a/fs/nfs/super.c
++++ b/fs/nfs/super.c
+@@ -64,6 +64,7 @@
+ #include "iostat.h"
+ #include "internal.h"
+ #include "fscache.h"
++#include "pnfs.h"
+
+ #define NFSDBG_FACILITY NFSDBG_VFS
+
+@@ -687,6 +688,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+
+ return 0;
+ }
++#ifdef CONFIG_NFS_V4_1
++void show_sessions(struct seq_file *m, struct nfs_server *server)
++{
++ if (nfs4_has_session(server->nfs_client))
++ seq_printf(m, ",sessions");
++}
++#else
++void show_sessions(struct seq_file *m, struct nfs_server *server) {}
++#endif
++
++#ifdef CONFIG_NFS_V4_1
++void show_pnfs(struct seq_file *m, struct nfs_server *server)
++{
++ seq_printf(m, ",pnfs=");
++ if (server->pnfs_curr_ld)
++ seq_printf(m, "%s", server->pnfs_curr_ld->name);
++ else
++ seq_printf(m, "not configured");
+}
++#else /* CONFIG_NFS_V4_1 */
++void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#endif /* CONFIG_NFS_V4_1 */
-+
+
/*
- * END OF "GENERIC" ENCODE ROUTINES.
- */
-@@ -2374,7 +2649,7 @@ static int nfs4_xdr_enc_setclientid_conf
- struct compound_hdr hdr = {
- .nops = 0,
- };
-- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
-+ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
+ * Present statistical information for this VFS mountpoint
+@@ -725,6 +748,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
+ seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
+ seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+ seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
++ show_sessions(m, nfss);
++ show_pnfs(m, nfss);
+ }
+ #endif
- xdr_init_encode(&xdr, &req->rq_snd_buf, p);
- encode_compound_hdr(&xdr, req, &hdr);
-@@ -2513,7 +2788,7 @@ static int nfs4_xdr_enc_get_lease_time(s
- struct compound_hdr hdr = {
- .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
- };
-- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
-+ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 };
+diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
+index 2f84ada..51ae53b 100644
+--- a/fs/nfs/unlink.c
++++ b/fs/nfs/unlink.c
+@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
+ struct nfs_unlinkdata *data = calldata;
+ struct nfs_server *server = NFS_SERVER(data->dir);
- xdr_init_encode(&xdr, &req->rq_snd_buf, p);
- encode_compound_hdr(&xdr, req, &hdr);
-@@ -2543,6 +2818,153 @@ static int nfs4_xdr_enc_reclaim_complete
- return 0;
+- if (nfs4_setup_sequence(server, &data->args.seq_args,
++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args,
+ &data->res.seq_res, 1, task))
+ return;
+ rpc_call_start(task);
+diff --git a/fs/nfs/write.c b/fs/nfs/write.c
+index 874972d..988b65a 100644
+--- a/fs/nfs/write.c
++++ b/fs/nfs/write.c
+@@ -28,6 +28,7 @@
+ #include "iostat.h"
+ #include "nfs4_fs.h"
+ #include "fscache.h"
++#include "pnfs.h"
+
+ #define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
+@@ -59,6 +60,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
+ }
+ return p;
}
++EXPORT_SYMBOL(nfs_commitdata_alloc);
-+/*
-+ * Encode GETDEVICELIST request
-+ */
-+static int
-+nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p,
-+ struct nfs4_getdevicelist_args *args)
+ void nfs_commit_free(struct nfs_write_data *p)
+ {
+@@ -429,6 +431,17 @@ static void nfs_inode_remove_request(struct nfs_page *req)
+ nfs_clear_request(req);
+ nfs_release_request(req);
+ }
++static void
++nfs_mark_request_nopnfs(struct nfs_page *req)
+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr = {
-+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+ };
++ struct pnfs_layout_segment *lseg = req->wb_lseg;
+
-+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+ encode_compound_hdr(&xdr, req, &hdr);
-+ encode_sequence(&xdr, &args->seq_args, &hdr);
-+ encode_putfh(&xdr, args->fh, &hdr);
-+ encode_getdevicelist(&xdr, args, &hdr);
-+ encode_nops(&hdr);
-+ return 0;
++ if (req->wb_lseg == NULL)
++ return;
++ req->wb_lseg = NULL;
++ put_lseg(lseg);
++ dprintk(" retry through MDS\n");
+}
+
+ static void
+ nfs_mark_request_dirty(struct nfs_page *req)
+@@ -534,7 +547,7 @@ nfs_need_commit(struct nfs_inode *nfsi)
+ * The requests are *not* checked to ensure that they form a contiguous set.
+ */
+ static int
+-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs)
+ {
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int ret;
+@@ -542,7 +555,8 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u
+ if (!nfs_need_commit(nfsi))
+ return 0;
+
+- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
++ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT,
++ use_pnfs);
+ if (ret > 0)
+ nfsi->ncommit -= ret;
+ if (nfs_need_commit(NFS_I(inode)))
+@@ -571,7 +585,8 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg
+ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
+ struct page *page,
+ unsigned int offset,
+- unsigned int bytes)
++ unsigned int bytes,
++ struct pnfs_layout_segment *lseg)
+ {
+ struct nfs_page *req;
+ unsigned int rqend;
+@@ -596,8 +611,8 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
+ * Note: nfs_flush_incompatible() will already
+ * have flushed out requests having wrong owners.
+ */
+- if (offset > rqend
+- || end < req->wb_offset)
++ if (offset > rqend || end < req->wb_offset ||
++ req->wb_lseg != lseg)
+ goto out_flushme;
+
+ if (nfs_set_page_tag_locked(req))
+@@ -645,16 +660,17 @@ out_err:
+ * already called nfs_flush_incompatible() if necessary.
+ */
+ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
+- struct page *page, unsigned int offset, unsigned int bytes)
++ struct page *page, unsigned int offset, unsigned int bytes,
++ struct pnfs_layout_segment *lseg)
+ {
+ struct inode *inode = page->mapping->host;
+ struct nfs_page *req;
+ int error;
+
+- req = nfs_try_to_update_request(inode, page, offset, bytes);
++ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg);
+ if (req != NULL)
+ goto out;
+- req = nfs_create_request(ctx, inode, page, offset, bytes);
++ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg);
+ if (IS_ERR(req))
+ goto out;
+ error = nfs_inode_add_request(inode, req);
+@@ -667,23 +683,27 @@ out:
+ }
+
+ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
+- unsigned int offset, unsigned int count)
++ unsigned int offset, unsigned int count,
++ struct pnfs_layout_segment *lseg,
++ void *fsdata)
+ {
+ struct nfs_page *req;
+
+- req = nfs_setup_write_request(ctx, page, offset, count);
++ req = nfs_setup_write_request(ctx, page, offset, count, lseg);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ nfs_mark_request_dirty(req);
+ /* Update file length */
+- nfs_grow_file(page, offset, count);
++ if (pnfs_grow_ok(lseg, fsdata))
++ nfs_grow_file(page, offset, count);
+ nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+ nfs_mark_request_dirty(req);
+ nfs_clear_page_tag_locked(req);
+ return 0;
+ }
+
+-int nfs_flush_incompatible(struct file *file, struct page *page)
++int nfs_flush_incompatible(struct file *file, struct page *page,
++ struct pnfs_layout_segment *lseg)
+ {
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
+ struct nfs_page *req;
+@@ -702,7 +722,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
+ return 0;
+ do_flush = req->wb_page != page || req->wb_context != ctx ||
+ req->wb_lock_context->lockowner != current->files ||
+- req->wb_lock_context->pid != current->tgid;
++ req->wb_lock_context->pid != current->tgid ||
++ req->wb_lseg != lseg;
+ nfs_release_request(req);
+ if (!do_flush)
+ return 0;
+@@ -729,7 +750,8 @@ static int nfs_write_pageuptodate(struct page *page, struct inode *inode)
+ * things with a page scheduled for an RPC call (e.g. invalidate it).
+ */
+ int nfs_updatepage(struct file *file, struct page *page,
+- unsigned int offset, unsigned int count)
++ unsigned int offset, unsigned int count,
++ struct pnfs_layout_segment *lseg, void *fsdata)
+ {
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
+ struct inode *inode = page->mapping->host;
+@@ -754,7 +776,7 @@ int nfs_updatepage(struct file *file, struct page *page,
+ offset = 0;
+ }
+
+- status = nfs_writepage_setup(ctx, page, offset, count);
++ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata);
+ if (status < 0)
+ nfs_set_pageerror(page);
+
+@@ -784,25 +806,21 @@ static int flush_task_priority(int how)
+ return RPC_PRIORITY_NORMAL;
+ }
+
+-/*
+- * Set up the argument/result storage required for the RPC call.
+- */
+-static int nfs_write_rpcsetup(struct nfs_page *req,
+- struct nfs_write_data *data,
+- const struct rpc_call_ops *call_ops,
+- unsigned int count, unsigned int offset,
+- int how)
++int nfs_initiate_write(struct nfs_write_data *data,
++ struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops,
++ int how)
+ {
+- struct inode *inode = req->wb_context->path.dentry->d_inode;
++ struct inode *inode = data->inode;
+ int priority = flush_task_priority(how);
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+- .rpc_cred = req->wb_context->cred,
++ .rpc_cred = data->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+- .rpc_client = NFS_CLIENT(inode),
++ .rpc_client = clnt,
+ .task = &data->task,
+ .rpc_message = &msg,
+ .callback_ops = call_ops,
+@@ -813,12 +831,62 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
+ };
+ int ret = 0;
+
++ /* Set up the initial task struct. */
++ NFS_PROTO(inode)->write_setup(data, &msg);
+
-+/*
-+ * Encode GETDEVICEINFO request
-+ */
-+static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
-+ struct nfs4_getdeviceinfo_args *args)
-+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr = {
-+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+ };
-+
-+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+ encode_compound_hdr(&xdr, req, &hdr);
-+ encode_sequence(&xdr, &args->seq_args, &hdr);
-+ encode_getdeviceinfo(&xdr, args, &hdr);
-+
-+ /* set up reply kvec. Subtract notification bitmap max size (2)
-+ * so that notification bitmap is put in xdr_buf tail */
-+ xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
-+ args->pdev->pages, args->pdev->pgbase,
-+ args->pdev->pglen);
++ dprintk("NFS: %5u initiated write call "
++ "(req %s/%lld, %u bytes @ offset %llu)\n",
++ data->task.tk_pid,
++ inode->i_sb->s_id,
++ (long long)NFS_FILEID(inode),
++ data->args.count,
++ (unsigned long long)data->args.offset);
+
-+ encode_nops(&hdr);
-+ return 0;
++ task = rpc_run_task(&task_setup_data);
++ if (IS_ERR(task)) {
++ ret = PTR_ERR(task);
++ goto out;
++ }
++ if (how & FLUSH_SYNC) {
++ ret = rpc_wait_for_completion_task(task);
++ if (ret == 0)
++ ret = task->tk_status;
++ }
++ rpc_put_task(task);
++out:
++ return ret;
+}
++EXPORT_SYMBOL(nfs_initiate_write);
+
-+/*
-+ * Encode LAYOUTGET request
-+ */
-+static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
-+ struct nfs4_layoutget_args *args)
++int pnfs_initiate_write(struct nfs_write_data *data,
++ struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops,
++ int how)
+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr = {
-+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+ };
++ if (data->req->wb_lseg &&
++ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
++ return pnfs_get_write_status(data);
+
-+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+ encode_compound_hdr(&xdr, req, &hdr);
-+ encode_sequence(&xdr, &args->seq_args, &hdr);
-+ encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
-+ encode_layoutget(&xdr, args, &hdr);
-+ encode_nops(&hdr);
-+ return 0;
++ return nfs_initiate_write(data, clnt, call_ops, how);
+}
+
+/*
-+ * Encode LAYOUTCOMMIT request
++ * Set up the argument/result storage required for the RPC call.
+ */
-+static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p,
-+ struct nfs4_layoutcommit_args *args)
++static int nfs_write_rpcsetup(struct nfs_page *req,
++ struct nfs_write_data *data,
++ const struct rpc_call_ops *call_ops,
++ unsigned int count, unsigned int offset,
++ int how)
+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr = {
-+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+ };
++ struct inode *inode = req->wb_context->path.dentry->d_inode;
+
-+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+ encode_compound_hdr(&xdr, req, &hdr);
-+ encode_sequence(&xdr, &args->seq_args, &hdr);
-+ encode_putfh(&xdr, args->fh, &hdr);
-+ encode_layoutcommit(&xdr, args, &hdr);
-+ encode_getfattr(&xdr, args->bitmask, &hdr);
-+ encode_nops(&hdr);
-+ return 0;
-+}
+ /* Set up the RPC argument and reply structs
+ * NB: take care not to mess about with data->commit et al. */
+
+ data->req = req;
+ data->inode = inode = req->wb_context->path.dentry->d_inode;
+- data->cred = msg.rpc_cred;
++ data->cred = req->wb_context->cred;
+
+ data->args.fh = NFS_FH(inode);
+ data->args.offset = req_offset(req) + offset;
+@@ -839,30 +907,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
+ data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
+
+- /* Set up the initial task struct. */
+- NFS_PROTO(inode)->write_setup(data, &msg);
+-
+- dprintk("NFS: %5u initiated write call "
+- "(req %s/%lld, %u bytes @ offset %llu)\n",
+- data->task.tk_pid,
+- inode->i_sb->s_id,
+- (long long)NFS_FILEID(inode),
+- count,
+- (unsigned long long)data->args.offset);
+-
+- task = rpc_run_task(&task_setup_data);
+- if (IS_ERR(task)) {
+- ret = PTR_ERR(task);
+- goto out;
+- }
+- if (how & FLUSH_SYNC) {
+- ret = rpc_wait_for_completion_task(task);
+- if (ret == 0)
+- ret = task->tk_status;
+- }
+- rpc_put_task(task);
+-out:
+- return ret;
++ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
+ }
+
+ /* If a nfs_flush_* function fails, it should remove reqs from @head and
+@@ -873,6 +918,7 @@ static void nfs_redirty_request(struct nfs_page *req)
+ {
+ struct page *page = req->wb_page;
+
++ nfs_mark_request_nopnfs(req);
+ nfs_mark_request_dirty(req);
+ nfs_clear_page_tag_locked(req);
+ nfs_end_page_writeback(page);
+@@ -985,6 +1031,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+ {
+ size_t wsize = NFS_SERVER(inode)->wsize;
+
++ pnfs_pageio_init_write(pgio, inode, &wsize);
+
-+/*
-+ * Encode LAYOUTRETURN request
-+ */
-+static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p,
-+ struct nfs4_layoutreturn_args *args)
-+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr = {
-+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+ };
+ if (wsize < PAGE_CACHE_SIZE)
+ nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
+ else
+@@ -1050,13 +1098,27 @@ out:
+ void nfs_write_prepare(struct rpc_task *task, void *calldata)
+ {
+ struct nfs_write_data *data = calldata;
++ struct nfs4_session *ds_session = NULL;
++
++ if (data->fldata.ds_nfs_client) {
++ dprintk("%s DS read\n", __func__);
++ ds_session = data->fldata.ds_nfs_client->cl_session;
++ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) {
++ /* retrying via MDS? */
++ data->pdata.orig_count = data->args.count;
++ data->args.count = NFS_SERVER(data->inode)->wsize;
++ dprintk("%s: trimmed count %u to wsize %u\n", __func__,
++ data->pdata.orig_count, data->args.count);
++ } else
++ data->pdata.orig_count = 0;
+
+- if (nfs4_setup_sequence(NFS_SERVER(data->inode),
++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
+ &data->args.seq_args,
+ &data->res.seq_res, 1, task))
+ return;
+ rpc_call_start(task);
+ }
++EXPORT_SYMBOL(nfs_write_prepare);
+ #endif /* CONFIG_NFS_V4_1 */
+
+ static const struct rpc_call_ops nfs_write_partial_ops = {
+@@ -1140,10 +1202,11 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+ struct nfs_writeargs *argp = &data->args;
+ struct nfs_writeres *resp = &data->res;
+ struct nfs_server *server = NFS_SERVER(data->inode);
++ struct nfs_client *clp = server->nfs_client;
+ int status;
+
+- dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
+- task->tk_pid, task->tk_status);
++ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n",
++ task->tk_pid, task->tk_status, resp->count);
+
+ /*
+ * ->write_done will attempt to use post-op attributes to detect
+@@ -1156,6 +1219,13 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+ if (status != 0)
+ return status;
+ nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
++#ifdef CONFIG_NFS_V4_1
++ /* Is this a DS session */
++ if (data->fldata.ds_nfs_client) {
++ dprintk("%s DS write\n", __func__);
++ clp = data->fldata.ds_nfs_client;
++ }
++#endif /* CONFIG_NFS_V4_1 */
+
+ #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+ if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
+@@ -1172,7 +1242,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+ if (time_before(complain, jiffies)) {
+ dprintk("NFS: faulty NFS server %s:"
+ " (committed = %d) != (stable = %d)\n",
+- server->nfs_client->cl_hostname,
++ clp->cl_hostname,
+ resp->verf->committed, argp->stable);
+ complain = jiffies + 300 * HZ;
+ }
+@@ -1198,6 +1268,9 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+ */
+ argp->stable = NFS_FILE_SYNC;
+ }
++#ifdef CONFIG_NFS_V4_1
++ data->pdata.pnfs_error = -EAGAIN;
++#endif /* CONFIG_NFS_V4_1 */
+ nfs_restart_rpc(task, server->nfs_client);
+ return -EAGAIN;
+ }
+@@ -1242,40 +1315,73 @@ static void nfs_commitdata_release(void *data)
+ nfs_commit_free(wdata);
+ }
+
+-/*
+- * Set up the argument/result storage required for the RPC call.
+- */
+-static int nfs_commit_rpcsetup(struct list_head *head,
+- struct nfs_write_data *data,
+- int how)
++int nfs_initiate_commit(struct nfs_write_data *data,
++ struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops,
++ int how)
+ {
+- struct nfs_page *first = nfs_list_entry(head->next);
+- struct inode *inode = first->wb_context->path.dentry->d_inode;
++ struct inode *inode = data->inode;
+ int priority = flush_task_priority(how);
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+- .rpc_cred = first->wb_context->cred,
++ .rpc_cred = data->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .task = &data->task,
+- .rpc_client = NFS_CLIENT(inode),
++ .rpc_client = clnt,
+ .rpc_message = &msg,
+- .callback_ops = &nfs_commit_ops,
++ .callback_ops = call_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC,
+ .priority = priority,
+ };
+
++ /* Set up the initial task struct. */
++ NFS_PROTO(inode)->commit_setup(data, &msg);
+
-+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+ encode_compound_hdr(&xdr, req, &hdr);
-+ encode_sequence(&xdr, &args->seq_args, &hdr);
-+ encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
-+ encode_layoutreturn(&xdr, args, &hdr);
-+ encode_nops(&hdr);
++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
++
++ task = rpc_run_task(&task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
++ rpc_put_task(task);
+ return 0;
+}
++EXPORT_SYMBOL(nfs_initiate_commit);
+
-+/*
-+ * Encode a pNFS File Layout Data Server WRITE request
-+ */
-+static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p,
-+ struct nfs_writeargs *args)
++
++int pnfs_initiate_commit(struct nfs_write_data *data,
++ struct rpc_clnt *clnt,
++ const struct rpc_call_ops *call_ops,
++ int how, int pnfs)
+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr = {
-+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+ };
++ if (pnfs &&
++ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED))
++ return pnfs_get_write_status(data);
+
-+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+ encode_compound_hdr(&xdr, req, &hdr);
-+ encode_sequence(&xdr, &args->seq_args, &hdr);
-+ encode_putfh(&xdr, args->fh, &hdr);
-+ encode_write(&xdr, args, &hdr);
-+ encode_nops(&hdr);
-+ return 0;
++ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how);
+}
+
+/*
-+ * Encode a pNFS File Layout Data Server COMMIT request
++ * Set up the argument/result storage required for the RPC call.
+ */
-+static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p,
-+ struct nfs_writeargs *args)
++static int nfs_commit_rpcsetup(struct list_head *head,
++ struct nfs_write_data *data,
++ int how, int pnfs)
+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr = {
-+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
-+ };
++ struct nfs_page *first = nfs_list_entry(head->next);
++ struct inode *inode = first->wb_context->path.dentry->d_inode;
+
-+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-+ encode_compound_hdr(&xdr, req, &hdr);
-+ encode_sequence(&xdr, &args->seq_args, &hdr);
-+ encode_putfh(&xdr, args->fh, &hdr);
-+ encode_commit(&xdr, args, &hdr);
-+ encode_nops(&hdr);
-+ return 0;
+ /* Set up the RPC argument and reply structs
+ * NB: take care not to mess about with data->commit et al. */
+
+ list_splice_init(head, &data->pages);
+
+ data->inode = inode;
+- data->cred = msg.rpc_cred;
++ data->cred = first->wb_context->cred;
+
+ data->args.fh = NFS_FH(data->inode);
+ /* Note: we always request a commit of the entire inode */
+@@ -1286,45 +1392,47 @@ static int nfs_commit_rpcsetup(struct list_head *head,
+ data->res.fattr = &data->fattr;
+ data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
++ kref_init(&data->refcount);
++ data->parent = NULL;
++ data->args.context = first->wb_context; /* used by commit done */
+
+- /* Set up the initial task struct. */
+- NFS_PROTO(inode)->commit_setup(data, &msg);
++ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops,
++ how, pnfs);
+}
- #endif /* CONFIG_NFS_V4_1 */
- static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
-@@ -2643,14 +3065,17 @@ static int decode_attr_bitmap(struct xdr
- goto out_overflow;
- bmlen = be32_to_cpup(p);
+- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
++/* Handle memory error during commit */
++void nfs_mark_list_commit(struct list_head *head)
++{
++ struct nfs_page *req;
-- bitmap[0] = bitmap[1] = 0;
-+ bitmap[0] = bitmap[1] = bitmap[2] = 0;
- p = xdr_inline_decode(xdr, (bmlen << 2));
- if (unlikely(!p))
- goto out_overflow;
- if (bmlen > 0) {
- bitmap[0] = be32_to_cpup(p++);
-- if (bmlen > 1)
-- bitmap[1] = be32_to_cpup(p);
-+ if (bmlen > 1) {
-+ bitmap[1] = be32_to_cpup(p++);
-+ if (bmlen > 2)
-+ bitmap[2] = be32_to_cpup(p);
-+ }
- }
- return 0;
- out_overflow:
-@@ -2679,8 +3104,9 @@ static int decode_attr_supported(struct
- decode_attr_bitmap(xdr, bitmask);
- bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
- } else
-- bitmask[0] = bitmask[1] = 0;
-- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]);
-+ bitmask[0] = bitmask[1] = bitmask[2] = 0;
-+ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
-+ bitmask[0], bitmask[1], bitmask[2]);
- return 0;
+- task = rpc_run_task(&task_setup_data);
+- if (IS_ERR(task))
+- return PTR_ERR(task);
+- rpc_put_task(task);
+- return 0;
++ while (!list_empty(head)) {
++ req = nfs_list_entry(head->next);
++ nfs_list_remove_request(req);
++ nfs_mark_request_commit(req);
++ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
++ dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
++ BDI_RECLAIMABLE);
++ nfs_clear_page_tag_locked(req);
++ }
}
++EXPORT_SYMBOL(nfs_mark_list_commit);
-@@ -3665,7 +4091,7 @@ out_overflow:
- static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
- {
- __be32 *savep;
-- uint32_t attrlen, bitmap[2] = {0};
-+ uint32_t attrlen, bitmap[3] = {0};
- int status;
-
- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
-@@ -3691,7 +4117,7 @@ xdr_error:
- static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
+ /*
+ * Commit dirty pages
+ */
+ static int
+-nfs_commit_list(struct inode *inode, struct list_head *head, int how)
++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs)
{
- __be32 *savep;
-- uint32_t attrlen, bitmap[2] = {0};
-+ uint32_t attrlen, bitmap[3] = {0};
- int status;
+ struct nfs_write_data *data;
+- struct nfs_page *req;
- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
-@@ -3723,7 +4149,7 @@ xdr_error:
- static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
- {
- __be32 *savep;
-- uint32_t attrlen, bitmap[2] = {0};
-+ uint32_t attrlen, bitmap[3] = {0};
- int status;
+ data = nfs_commitdata_alloc();
+-
+ if (!data)
+ goto out_bad;
- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
-@@ -3749,7 +4175,7 @@ static int decode_getfattr(struct xdr_st
- {
- __be32 *savep;
- uint32_t attrlen,
-- bitmap[2] = {0},
-+ bitmap[3] = {0},
- type;
- int status;
- umode_t fmode = 0;
-@@ -3868,11 +4294,87 @@ xdr_error:
- return status;
+ /* Set up the argument struct */
+- return nfs_commit_rpcsetup(head, data, how);
++ return nfs_commit_rpcsetup(head, data, how, pnfs);
+ out_bad:
+- while (!list_empty(head)) {
+- req = nfs_list_entry(head->next);
+- nfs_list_remove_request(req);
+- nfs_mark_request_commit(req);
+- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+- dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+- BDI_RECLAIMABLE);
+- nfs_clear_page_tag_locked(req);
+- }
++ nfs_mark_list_commit(head);
+ nfs_commit_clear_lock(NFS_I(inode));
+ return -ENOMEM;
+ }
+@@ -1344,6 +1452,19 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
+ return;
}
-+/*
-+ * Decode potentially multiple layout types. Currently we only support
-+ * one layout driver per file system.
-+ */
-+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
-+ uint32_t *layouttype)
-+{
-+ uint32_t *p;
-+ int num;
-+
-+ p = xdr_inline_decode(xdr, 4);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ num = be32_to_cpup(p);
-+
-+ /* pNFS is not supported by the underlying file system */
-+ if (num == 0) {
-+ *layouttype = 0;
-+ return 0;
-+ }
-+ if (num > 1)
-+ printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
-+ "per filesystem not supported\n", __func__);
-+
-+ /* Decode and set first layout type, move xdr->p past unused types */
-+ p = xdr_inline_decode(xdr, num * 4);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ *layouttype = be32_to_cpup(p);
-+ return 0;
-+out_overflow:
-+ print_overflow_msg(__func__, xdr);
-+ return -EIO;
-+}
-+
-+/*
-+ * The type of file system exported.
-+ * Note we must ensure that layouttype is set in any non-error case.
-+ */
-+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
-+ uint32_t *layouttype)
-+{
-+ int status = 0;
-+
-+ dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
-+ if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
-+ return -EIO;
-+ if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
-+ status = decode_first_pnfs_layout_type(xdr, layouttype);
-+ bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
-+ } else
-+ *layouttype = 0;
-+ return status;
-+}
-+
-+/*
-+ * The prefered block size for layout directed io
-+ */
-+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
-+ uint32_t *res)
++static inline void nfs_commit_cleanup(struct kref *kref)
+{
-+ __be32 *p;
++ struct nfs_write_data *data;
+
-+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
-+ *res = 0;
-+ if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
-+ p = xdr_inline_decode(xdr, 4);
-+ if (unlikely(!p)) {
-+ print_overflow_msg(__func__, xdr);
-+ return -EIO;
-+ }
-+ *res = be32_to_cpup(p);
-+ bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
-+ }
-+ return 0;
++ data = container_of(kref, struct nfs_write_data, refcount);
++ /* Clear lock only when all cloned commits are finished */
++ if (data->parent)
++ kref_put(&data->parent->refcount, nfs_commit_cleanup);
++ else
++ nfs_commit_clear_lock(NFS_I(data->inode));
++ nfs_commitdata_release(data);
+}
-
- static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
++
+ static void nfs_commit_release(void *calldata)
{
- __be32 *savep;
-- uint32_t attrlen, bitmap[2];
-+ uint32_t attrlen, bitmap[3];
- int status;
+ struct nfs_write_data *data = calldata;
+@@ -1361,6 +1482,11 @@ static void nfs_commit_release(void *calldata)
+ req->wb_bytes,
+ (long long)req_offset(req));
+ if (status < 0) {
++ if (req->wb_lseg) {
++ nfs_mark_request_nopnfs(req);
++ nfs_mark_request_dirty(req);
++ goto next;
++ }
+ nfs_context_set_write_error(req->wb_context, status);
+ nfs_inode_remove_request(req);
+ dprintk(", error = %d\n", status);
+@@ -1377,12 +1503,12 @@ static void nfs_commit_release(void *calldata)
+ }
+ /* We have a mismatch. Write the page again */
+ dprintk(" mismatch\n");
++ nfs_mark_request_nopnfs(req);
+ nfs_mark_request_dirty(req);
+ next:
+ nfs_clear_page_tag_locked(req);
+ }
+- nfs_commit_clear_lock(NFS_I(data->inode));
+- nfs_commitdata_release(calldata);
++ kref_put(&data->refcount, nfs_commit_cleanup);
+ }
- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
-@@ -3894,6 +4396,12 @@ static int decode_fsinfo(struct xdr_stre
- if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
- goto xdr_error;
- fsinfo->wtpref = fsinfo->wtmax;
-+ status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
-+ if (status)
-+ goto xdr_error;
-+ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
-+ if (status)
-+ goto xdr_error;
+ static const struct rpc_call_ops nfs_commit_ops = {
+@@ -1398,21 +1524,22 @@ int nfs_commit_inode(struct inode *inode, int how)
+ LIST_HEAD(head);
+ int may_wait = how & FLUSH_SYNC;
+ int res = 0;
++ int use_pnfs = 0;
- status = verify_attr_len(xdr, savep, attrlen);
- xdr_error:
-@@ -4382,7 +4890,7 @@ static int decode_getacl(struct xdr_stre
- {
- __be32 *savep;
- uint32_t attrlen,
-- bitmap[2] = {0};
-+ bitmap[3] = {0};
- struct kvec *iov = req->rq_rcv_buf.head;
- int status;
+ if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
+ goto out_mark_dirty;
+ spin_lock(&inode->i_lock);
+- res = nfs_scan_commit(inode, &head, 0, 0);
++ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs);
+ spin_unlock(&inode->i_lock);
+ if (res) {
+- int error = nfs_commit_list(inode, &head, how);
++ int error = nfs_commit_list(inode, &head, how, use_pnfs);
+ if (error < 0)
+ return error;
+- if (may_wait)
++ if (may_wait) {
+ wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
+ nfs_wait_bit_killable,
+ TASK_KILLABLE);
+- else
++ } else
+ goto out_mark_dirty;
+ } else
+ nfs_commit_clear_lock(NFS_I(inode));
+@@ -1465,7 +1592,18 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
-@@ -4731,14 +5239,235 @@ out_overflow:
- #endif /* CONFIG_NFS_V4_1 */
+ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+ {
+- return nfs_commit_unstable_pages(inode, wbc);
++ int ret;
++ ret = nfs_commit_unstable_pages(inode, wbc);
++ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) {
++ int err, sync = wbc->sync_mode;
++
++ if (wbc->nonblocking || wbc->for_background)
++ sync = 0;
++ err = pnfs_layoutcommit_inode(inode, sync);
++ if (err < 0)
++ ret = err;
++ }
++ return ret;
}
-+#if defined(CONFIG_NFS_V4_1)
/*
-- * END OF "GENERIC" DECODE ROUTINES.
-- */
--
--/*
-- * Decode OPEN_DOWNGRADE response
-+ * TODO: Need to handle case when EOF != true;
- */
--static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
-+static int decode_getdevicelist(struct xdr_stream *xdr,
-+ struct pnfs_devicelist *res)
-+{
-+ __be32 *p;
-+ int status, i;
-+ struct nfs_writeverf verftemp;
+diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
+index 4264377..62033eb 100644
+--- a/fs/nfsd/Kconfig
++++ b/fs/nfsd/Kconfig
+@@ -79,3 +79,52 @@ config NFSD_V4
+ available from http://linux-nfs.org/.
+
+ If unsure, say N.
+
-+ status = decode_op_hdr(xdr, OP_GETDEVICELIST);
-+ if (status)
-+ return status;
++config PNFSD
++ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)"
++ depends on NFSD_V4 && EXPERIMENTAL
++ select EXPORTFS_FILE_LAYOUT
++ help
++ This option enables support for the parallel NFS features of the
++ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1)
++ in the kernel's NFS server.
+
-+ p = xdr_inline_decode(xdr, 8 + 8 + 4);
-+ if (unlikely(!p))
-+ goto out_overflow;
++ Unless you're an NFS developer, say N.
+
-+ /* TODO: Skip cookie for now */
-+ p += 2;
++config PNFSD_LOCAL_EXPORT
++ bool "Enable pNFS support for exporting local filesystems for debugging purposes"
++ depends on PNFSD
++ help
++ Say Y here if you want your pNFS server to export local file systems
++ over the files layout type. With this option the MDS (metadata
++ server) functions also as a single DS (data server). This is mostly
++ useful for development and debugging purposes.
+
-+ /* Read verifier */
-+ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
++ If unsure, say N.
+
-+ res->num_devs = be32_to_cpup(p);
++config SPNFS
++ bool "Provide spNFS server support (EXPERIMENTAL)"
++ depends on PNFSD
++ select RPCSEC_GSS_KRB5
++ help
++ Say Y here if you want spNFS server support.
+
-+ dprintk("%s: num_dev %d\n", __func__, res->num_devs);
++ If unsure, say N.
++
++config SPNFS_LAYOUTSEGMENTS
++ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)"
++ depends on SPNFS
++ select RPCSEC_GSS_KRB5
++ help
++ Say Y here if you want spNFS to be able to return layout segments.
++
++ If unsure, say N.
++
++config SPNFS_BLOCK
++ bool "Provide Block Layout server support (EXPERIMENTAL)"
++ depends on SPNFS
++ select EXPORTFS_BLOCK_LAYOUT
++ help
++ Say Y here if you want spNFS block layout support.
++
++ If unsure, say N.
+diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
+index 9b118ee..fed6c25 100644
+--- a/fs/nfsd/Makefile
++++ b/fs/nfsd/Makefile
+@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
+ nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
+ nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
+ nfs4acl.o nfs4callback.o nfs4recover.o
++nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o
++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o
++nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o
++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o
+diff --git a/fs/nfsd/bl_com.c b/fs/nfsd/bl_com.c
+new file mode 100644
+index 0000000..aac98c7
+--- /dev/null
++++ b/fs/nfsd/bl_com.c
+@@ -0,0 +1,292 @@
++#if defined(CONFIG_SPNFS_BLOCK)
++
++#include <linux/module.h>
++#include <linux/mutex.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/socket.h>
++#include <linux/in.h>
++#include <linux/sched.h>
++#include <linux/exportfs.h>
++#include <linux/namei.h>
++#include <linux/mount.h>
++#include <linux/path.h>
++#include <linux/sunrpc/clnt.h>
++#include <linux/workqueue.h>
++#include <linux/sunrpc/rpc_pipe_fs.h>
++#include <linux/proc_fs.h>
++#include <linux/nfs_fs.h>
++
++#include <linux/nfsd/debug.h>
++#include <linux/nfsd4_block.h>
++
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
++
++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
++ char __user *, size_t);
++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
++
++static struct rpc_pipe_ops bl_upcall_ops = {
++ .upcall = bl_pipe_upcall,
++ .downcall = bl_pipe_downcall,
++ .destroy_msg = bl_pipe_destroy_msg,
++};
++
++bl_comm_t *bl_comm_global;
++
++int
++nfsd_bl_start(void)
++{
++ bl_comm_t *bl_comm = NULL;
++ struct path path;
++ struct nameidata nd;
++ int rc;
++
++ dprintk("%s: starting pipe\n", __func__);
++ if (bl_comm_global)
++ return -EEXIST;
++
++ path.mnt = rpc_get_mount();
++ if (IS_ERR(path.mnt))
++ return PTR_ERR(path.mnt);
++
++ /* FIXME: do not abuse rpc_pipefs/nfs */
++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
++ if (rc)
++ goto err;
++
++ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL);
++ if (!bl_comm) {
++ rc = -ENOMEM;
++ goto err;
++ }
+
-+ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM)
-+ return -NFS4ERR_REP_TOO_BIG;
++ /* FIXME: rename to "spnfs_block" */
++ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm,
++ &bl_upcall_ops, 0);
++ if (IS_ERR(bl_comm->pipe_dentry)) {
++ rc = -EPIPE;
++ goto err;
++ }
++ mutex_init(&bl_comm->lock);
++ mutex_init(&bl_comm->pipe_lock);
++ init_waitqueue_head(&bl_comm->pipe_wq);
+
-+ p = xdr_inline_decode(xdr,
-+ res->num_devs * NFS4_DEVICEID4_SIZE + 4);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ for (i = 0; i < res->num_devs; i++)
-+ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
-+ NFS4_DEVICEID4_SIZE);
-+ res->eof = be32_to_cpup(p);
++ bl_comm_global = bl_comm;
+ return 0;
-+out_overflow:
-+ print_overflow_msg(__func__, xdr);
-+ return -EIO;
++err:
++ rpc_put_mount();
++ kfree(bl_comm);
++ return rc;
+}
+
-+static int decode_getdeviceinfo(struct xdr_stream *xdr,
-+ struct pnfs_device *pdev)
++void
++nfsd_bl_stop(void)
+{
-+ __be32 *p;
-+ uint32_t len, type;
-+ int status;
++ bl_comm_t *c = bl_comm_global;
+
-+ status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
-+ if (status) {
-+ if (status == -ETOOSMALL) {
-+ p = xdr_inline_decode(xdr, 4);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ pdev->mincount = be32_to_cpup(p);
-+ dprintk("%s: Min count too small. mincnt = %u\n",
-+ __func__, pdev->mincount);
-+ }
-+ return status;
-+ }
++ dprintk("%s: stopping pipe\n", __func__);
++ if (!c)
++ return;
++ rpc_unlink(c->pipe_dentry);
++ rpc_put_mount();
++ bl_comm_global = NULL;
++ kfree(c);
++}
+
-+ p = xdr_inline_decode(xdr, 8);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ type = be32_to_cpup(p++);
-+ if (type != pdev->layout_type) {
-+ dprintk("%s: layout mismatch req: %u pdev: %u\n",
-+ __func__, pdev->layout_type, type);
-+ return -EINVAL;
-+ }
-+ /*
-+ * Get the length of the opaque device_addr4. xdr_read_pages places
-+ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
-+ * and places the remaining xdr data in xdr_buf->tail
-+ */
-+ pdev->mincount = be32_to_cpup(p);
-+ xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
++static ssize_t
++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst,
++ size_t buflen)
++{
++ char *data = (char *)msg->data + msg->copied;
++ ssize_t mlen = msg->len - msg->copied,
++ left;
+
-+ /* Parse notification bitmap, verifying that it is zero. */
-+ p = xdr_inline_decode(xdr, 4);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ len = be32_to_cpup(p);
-+ if (len) {
-+ int i;
++ if (mlen > buflen)
++ mlen = buflen;
+
-+ p = xdr_inline_decode(xdr, 4 * len);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ for (i = 0; i < len; i++, p++) {
-+ if (be32_to_cpup(p)) {
-+ dprintk("%s: notifications not supported\n",
-+ __func__);
-+ return -EIO;
-+ }
-+ }
++ left = copy_to_user(dst, data, mlen);
++ if (left < 0) {
++ msg->errno = left;
++ return left;
+ }
-+ return 0;
-+out_overflow:
-+ print_overflow_msg(__func__, xdr);
-+ return -EIO;
++ mlen -= left;
++ msg->copied += mlen;
++ msg->errno = 0;
++
++ return mlen;
+}
+
-+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
-+ struct nfs4_layoutget_res *res)
++static ssize_t
++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
-+ __be32 *p;
-+ int status;
-+ u32 layout_count;
++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
++ bl_comm_t *bc = (bl_comm_t *)rpci->private;
++ bl_comm_msg_t *im = &bc->msg;
++ int ret;
++ bl_comm_res_t *res;
++
+
-+ status = decode_op_hdr(xdr, OP_LAYOUTGET);
-+ if (status)
-+ return status;
-+ p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ res->return_on_close = be32_to_cpup(p++);
-+ p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
-+ layout_count = be32_to_cpup(p);
-+ if (!layout_count) {
-+ dprintk("%s: server responded with empty layout array\n",
-+ __func__);
-+ return -EINVAL;
++ if (mlen == 0) {
++ im->msg_status = PNFS_BLOCK_FAILURE;
++ im->msg_res = NULL;
++ wake_up(&bc->pipe_wq);
++ return -EFAULT;
+ }
++
++ if ((res = kmalloc(mlen, GFP_KERNEL)) == NULL)
++ return -ENOMEM;
++
++ if (copy_from_user(res, src, mlen)) {
++ kfree(res);
++ return -EFAULT;
++ }
++
++ mutex_lock(&bc->pipe_lock);
++
++ ret = mlen;
++ im->msg_status = res->res_status;
++ im->msg_res = res;
++
++ wake_up(&bc->pipe_wq);
++ mutex_unlock(&bc->pipe_lock);
++ return ret;
++}
+
-+ p = xdr_inline_decode(xdr, 24);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ p = xdr_decode_hyper(p, &res->range.offset);
-+ p = xdr_decode_hyper(p, &res->range.length);
-+ res->range.iomode = be32_to_cpup(p++);
-+ res->type = be32_to_cpup(p++);
-+
-+ status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
-+ if (unlikely(status))
-+ return status;
-+
-+ dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
-+ __func__,
-+ (unsigned long)res->range.offset,
-+ (unsigned long)res->range.length,
-+ res->range.iomode,
-+ res->type,
-+ res->layout.len);
++static void
++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
++{
++ bl_comm_msg_t *im = msg->data;
++ bl_comm_t *bc = container_of(im, struct bl_comm, msg);
++
++ if (msg->errno >= 0)
++ return;
+
-+ /* nfs4_proc_layoutget allocated a single page */
-+ if (res->layout.len > PAGE_SIZE)
-+ return -ENOMEM;
-+ memcpy(res->layout.buf, p, res->layout.len);
++ mutex_lock(&bc->pipe_lock);
++ im->msg_status = PNFS_BLOCK_FAILURE;
++ wake_up(&bc->pipe_wq);
++ mutex_unlock(&bc->pipe_lock);
++}
+
-+ if (layout_count > 1) {
-+ /* We only handle a length one array at the moment. Any
-+ * further entries are just ignored. Note that this means
-+ * the client may see a response that is less than the
-+ * minimum it requested.
-+ */
-+ dprintk("%s: server responded with %d layouts, dropping tail\n",
-+ __func__, layout_count);
++int
++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res)
++{
++ struct rpc_pipe_msg msg;
++ DECLARE_WAITQUEUE(wq, current);
++ int rval = 1;
++ bl_comm_msg_t *m = &bc->msg;
++
++ if (bc == NULL) {
++ dprintk("%s: No pNFS block daemon available\n", __func__);
++ return 1;
+ }
-+
-+ return 0;
-+out_overflow:
-+ print_overflow_msg(__func__, xdr);
-+ return -EIO;
++
++ mutex_lock(&bc->lock);
++ mutex_lock(&bc->pipe_lock);
++
++ memcpy(m, upmsg, sizeof (*m));
++
++ memset(&msg, 0, sizeof (msg));
++ msg.data = m;
++ msg.len = sizeof (*m);
++
++ add_wait_queue(&bc->pipe_wq, &wq);
++ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg);
++ if (rval < 0) {
++ remove_wait_queue(&bc->pipe_wq, &wq);
++ goto out;
++ }
++
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ mutex_unlock(&bc->pipe_lock);
++ schedule();
++ __set_current_state(TASK_RUNNING);
++ remove_wait_queue(&bc->pipe_wq, &wq);
++ mutex_lock(&bc->pipe_lock);
++
++ if (m->msg_status == PNFS_BLOCK_SUCCESS) {
++ *res = m->msg_res;
++ rval = 0;
++ } else
++ rval = 1;
++
++out:
++ mutex_unlock(&bc->pipe_lock);
++ mutex_unlock(&bc->lock);
++ return rval;
+}
+
-+static int decode_layoutreturn(struct xdr_stream *xdr,
-+ struct nfs4_layoutreturn_res *res)
++static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len,
++ loff_t *offset)
+{
-+ __be32 *p;
-+ int status;
++ int cmd,
++ rc;
++ bl_comm_t *bc = bl_comm_global;
++ bl_comm_msg_t msg;
++ bl_comm_res_t *res;
+
-+ status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
-+ if (status)
-+ return status;
-+ p = xdr_inline_decode(xdr, 4);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ res->lrs_present = be32_to_cpup(p);
-+ if (res->lrs_present)
-+ status = decode_stateid(xdr, &res->stateid);
-+ return status;
-+out_overflow:
-+ print_overflow_msg(__func__, xdr);
-+ return -EIO;
++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int)))
++ return -EFAULT;
++ switch (cmd) {
++ case PNFS_BLOCK_CTL_STOP:
++ msg.msg_type = PNFS_UPCALL_MSG_STOP;
++ (void) bl_upcall(bc, &msg, &res);
++ kfree(res);
++ nfsd_bl_stop();
++ break;
++
++ case PNFS_BLOCK_CTL_START:
++ rc = nfsd_bl_start();
++ if (rc != 0)
++ return rc;
++ break;
++
++ case PNFS_BLOCK_CTL_VERS:
++ msg.msg_type = PNFS_UPCALL_MSG_VERS;
++ msg.u.msg_vers = PNFS_UPCALL_VERS;
++ if (bl_upcall(bc, &msg, &res)) {
++ dprintk("%s: Failed to contact pNFS block daemon\n",
++ __func__);
++ return 0;
++ }
++ kfree(res);
++ break;
++
++ default:
++ dprintk("%s: unknown ctl command %d\n", __func__, cmd);
++ break;
++ }
++ return len;
+}
+
-+static int decode_layoutcommit(struct xdr_stream *xdr,
-+ struct rpc_rqst *req,
-+ struct nfs4_layoutcommit_res *res)
++static struct file_operations ctl_ops = {
++ .write = ctl_write,
++};
++
++/*
++ * bl_init_proc -- set up proc interfaces
++ *
++ * Creating a pnfs_block directory isn't really required at this point
++ * since we've only got a single node in that directory. If the need for
++ * more nodes doesn't present itself shortly this code should revert
++ * to a single top level node. McNeal 11-Aug-2008.
++ */
++int
++bl_init_proc(void)
+{
-+ __be32 *p;
-+ int status;
++ struct proc_dir_entry *e;
+
-+ status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
-+ if (status)
-+ return status;
++ e = proc_mkdir("fs/pnfs_block", NULL);
++ if (!e)
++ return -ENOMEM;
+
-+ p = xdr_inline_decode(xdr, 4);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ res->sizechanged = be32_to_cpup(p);
++ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL);
++ if (!e)
++ return -ENOMEM;
++ e->proc_fops = &ctl_ops;
+
-+ if (res->sizechanged) {
-+ p = xdr_inline_decode(xdr, 8);
-+ if (unlikely(!p))
-+ goto out_overflow;
-+ xdr_decode_hyper(p, &res->newsize);
-+ }
+ return 0;
-+out_overflow:
-+ print_overflow_msg(__func__, xdr);
-+ return -EIO;
+}
-+#endif /* CONFIG_NFS_V4_1 */
-+
++#endif /* CONFIG_SPNFS_BLOCK */
+diff --git a/fs/nfsd/bl_ops.c b/fs/nfsd/bl_ops.c
+new file mode 100644
+index 0000000..e41b61b
+--- /dev/null
++++ b/fs/nfsd/bl_ops.c
+@@ -0,0 +1,1672 @@
+/*
-+ * END OF "GENERIC" DECODE ROUTINES.
++ * bl_ops.c
++ * spNFS
++ *
++ * Created by Rick McNeal on 4/1/08.
++ * Copyright 2008 __MyCompanyName__. All rights reserved.
++ *
+ */
+
+/*
-+ * Decode OPEN_DOWNGRADE response
++ * Block layout operations.
++ *
++ * These functions, with the exception of pnfs_block_enabled, are assigned to
++ * the super block s_export_op structure.
+ */
-+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
- {
- struct xdr_stream xdr;
- struct compound_hdr hdr;
-@@ -5758,6 +6487,186 @@ static int nfs4_xdr_dec_reclaim_complete
- status = decode_reclaim_complete(&xdr, (void *)NULL);
- return status;
- }
++#if defined(CONFIG_SPNFS_BLOCK)
+
-+/*
-+ * Decode GETDEVICELIST response
-+ */
-+static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p,
-+ struct nfs4_getdevicelist_res *res)
-+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr;
-+ int status;
++#include <linux/module.h>
++#include <linux/genhd.h>
++#include <linux/fs.h>
++#include <linux/exportfs.h>
++#include <linux/nfsd4_spnfs.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
++#include <linux/nfsd/export.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
++#include <linux/nfsd/debug.h>
++#include <linux/spinlock_types.h>
++#include <linux/dm-ioctl.h>
++#include <asm/uaccess.h>
++#include <linux/falloc.h>
++#include <linux/nfsd4_block.h>
+
-+ dprintk("encoding getdevicelist!\n");
++#include "pnfsd.h"
+
-+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+ status = decode_compound_hdr(&xdr, &hdr);
-+ if (status != 0)
-+ goto out;
-+ status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+ if (status != 0)
-+ goto out;
-+ status = decode_putfh(&xdr);
-+ if (status != 0)
-+ goto out;
-+ status = decode_getdevicelist(&xdr, res->devlist);
-+out:
-+ return status;
-+}
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
-+/*
-+ * Decode GETDEVINFO response
-+ */
-+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
-+ struct nfs4_getdeviceinfo_res *res)
-+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr;
-+ int status;
++#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
-+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+ status = decode_compound_hdr(&xdr, &hdr);
-+ if (status != 0)
-+ goto out;
-+ status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+ if (status != 0)
-+ goto out;
-+ status = decode_getdeviceinfo(&xdr, res->pdev);
-+out:
-+ return status;
-+}
++#define BL_LAYOUT_HASH_BITS 4
++#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS)
++#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1)
++#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256)
+
-+/*
-+ * Decode LAYOUTGET response
-+ */
-+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
-+ struct nfs4_layoutget_res *res)
-+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr;
-+ int status;
++#define bl_layout_hashval(id) \
++ ((id) & BL_LAYOUT_HASH_MASK)
+
-+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+ status = decode_compound_hdr(&xdr, &hdr);
-+ if (status)
-+ goto out;
-+ status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+ if (status)
-+ goto out;
-+ status = decode_putfh(&xdr);
-+ if (status)
-+ goto out;
-+ status = decode_layoutget(&xdr, rqstp, res);
-+out:
-+ return status;
-+}
++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len)
++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len)
++#define _2SECTS(v) ((v) >> 9)
+
-+/*
-+ * Decode LAYOUTRETURN response
-+ */
-+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p,
-+ struct nfs4_layoutreturn_res *res)
-+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr;
-+ int status;
++#ifndef READ32
++#define READ32(x) (x) = ntohl(*p++)
++#define READ64(x) do { \
++(x) = (u64)ntohl(*p++) << 32; \
++(x) |= ntohl(*p++); \
++} while (0)
++#endif
+
-+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+ status = decode_compound_hdr(&xdr, &hdr);
-+ if (status)
-+ goto out;
-+ status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+ if (status)
-+ goto out;
-+ status = decode_putfh(&xdr);
-+ if (status)
-+ goto out;
-+ status = decode_layoutreturn(&xdr, res);
-+out:
-+ return status;
-+}
+
-+/*
-+ * Decode LAYOUTCOMMIT response
-+ */
-+static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p,
-+ struct nfs4_layoutcommit_res *res)
-+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr;
-+ int status;
++typedef enum {True, False} boolean_t;
++/* ---- block layoutget and commit structure ---- */
++typedef struct bl_layout_rec {
++ struct list_head blr_hash,
++ blr_layouts;
++ dev_t blr_rdev;
++ struct inode *blr_inode;
++ int blr_recalled; // debug
++ u64 blr_orig_size,
++ blr_commit_size,
++ blr_ext_size;
++ spinlock_t blr_lock; // Protects blr_layouts
++} bl_layout_rec_t;
++
++static struct list_head layout_hash;
++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE];
++static spinlock_t layout_hashtbl_lock;
+
-+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+ status = decode_compound_hdr(&xdr, &hdr);
-+ if (status)
-+ goto out;
-+ status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+ if (status)
-+ goto out;
-+ status = decode_putfh(&xdr);
-+ if (status)
-+ goto out;
-+ status = decode_layoutcommit(&xdr, rqstp, res);
-+ if (status)
-+ goto out;
-+ decode_getfattr(&xdr, res->fattr, res->server,
-+ !RPC_IS_ASYNC(rqstp->rq_task));
-+out:
-+ return status;
-+}
++/* ---- prototypes ---- */
++static boolean_t device_slice(dev_t devid);
++static boolean_t device_dm(dev_t devid);
++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **);
++static bl_layout_rec_t *layout_inode_find(struct inode *i);
++static void layout_inode_del(struct inode *i);
++static char *map_state2name(enum pnfs_block_extent_state4 s);
++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type);
++static void bld_free(pnfs_blocklayout_devinfo_t *bld);
++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head *volumes,
++ dev_t devid, int local_index);
++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes,
++ dev_t devid, int my_loc, int idx);
++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h,
++ struct nfsd4_layout_seg *seg);
++struct list_head *layout_cache_iter(bl_layout_rec_t *r,
++ struct list_head *bl_possible, struct nfsd4_layout_seg *seg);
++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h);
++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h);
++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg);
++static void print_bll(pnfs_blocklayout_layout_t *b, char *);
++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r,
++ struct list_head *h, struct nfsd4_layout_seg *seg);
++static inline void bll_collapse(bl_layout_rec_t *r,
++ pnfs_blocklayout_layout_t *c);
++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len,
++ enum bl_cache_state state, struct list_head *h);
++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b,
++ enum bl_cache_state c, struct list_head *h);
++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode,
++ enum pnfs_block_extent_state4 *s);
++static void extents_setup(struct fiemap_extent_info *fei);
++static void extents_count(struct fiemap_extent_info *fei, struct inode *i,
++ u64 foff, u64 len);
++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i,
++ u64 foff, u64 len);
++static boolean_t extents_process(struct fiemap_extent_info *fei,
++ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev,
++ pnfs_blocklayout_layout_t *b);
++static void extents_cleanup(struct fiemap_extent_info *fei);
+
-+/*
-+ * Decode pNFS File Layout Data Server WRITE response
-+ */
-+static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p,
-+ struct nfs_writeres *res)
++void
++nfsd_bl_init(void)
+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr;
-+ int status;
++ int i;
++ dprintk("%s loaded\n", __func__);
+
-+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+ status = decode_compound_hdr(&xdr, &hdr);
-+ if (status)
-+ goto out;
-+ status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+ if (status)
-+ goto out;
-+ status = decode_putfh(&xdr);
-+ if (status)
-+ goto out;
-+ status = decode_write(&xdr, res);
-+ if (!status)
-+ return res->count;
-+out:
-+ return status;
++ spin_lock_init(&layout_hashtbl_lock);
++ INIT_LIST_HEAD(&layout_hash);
++ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++)
++ INIT_LIST_HEAD(&layout_hashtbl[i]);
++ bl_init_proc();
+}
+
+/*
-+ * Decode pNFS File Layout Data Server COMMIT response
++ * pnfs_block_enabled -- check to see if this file system should be exported as
++ * block pnfs
+ */
-+static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p,
-+ struct nfs_writeres *res)
++int
++pnfs_block_enabled(struct inode *inode, int ex_flags)
+{
-+ struct xdr_stream xdr;
-+ struct compound_hdr hdr;
-+ int status;
++ bl_comm_msg_t msg;
++ bl_comm_res_t *res = NULL;
++ static int bl_comm_once = 0;
++
++ dprintk("--> %s\n", __func__);
++ /*
++ * FIXME: Figure out method to determine if this file system should
++ * be exported. The following areas need to be checked.
++ * (1) Validate that this file system was exported as a pNFS
++ * block-layout
++ * (2) Has there been successful communication with the
++ * volume daemon?
++ */
++ /* Check #1 */
++#ifdef notyet
++ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) {
++ dprintk("%s: pnfs_block not set in export\n", __func__);
++ return 0;
++ }
++#endif
++
++ /* Check #2 */
++ if (!bl_comm_once) {
++ msg.msg_type = PNFS_UPCALL_MSG_VERS;
++ msg.u.msg_vers = PNFS_UPCALL_VERS;
++ if (bl_upcall(bl_comm_global, &msg, &res)) {
++ dprintk("%s: Failed to contact pNFS block daemon\n",
++ __func__);
++ return 0;
++ }
++ if (msg.u.msg_vers != res->u.vers) {
++ dprintk("%s: vers mismatch, kernel != daemon\n",
++ __func__);
++ kfree(res);
++ return 0;
++ }
++ }
++ bl_comm_once = 1;
+
-+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
-+ status = decode_compound_hdr(&xdr, &hdr);
-+ if (status)
-+ goto out;
-+ status = decode_sequence(&xdr, &res->seq_res, rqstp);
-+ if (status)
-+ goto out;
-+ status = decode_putfh(&xdr);
-+ if (status)
-+ goto out;
-+ status = decode_commit(&xdr, res);
-+out:
-+ return status;
++ kfree(res);
++
++ dprintk("<-- %s okay\n", __func__);
++ return 1;
+}
- #endif /* CONFIG_NFS_V4_1 */
-
- __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
-@@ -5936,6 +6845,13 @@ struct rpc_procinfo nfs4_procedures[] =
- PROC(SEQUENCE, enc_sequence, dec_sequence),
- PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
- PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
-+ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
-+ PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
-+ PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
-+ PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
-+ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
-+ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite),
-+ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit),
- #endif /* CONFIG_NFS_V4_1 */
- };
-
-diff -up linux-2.6.35.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.35.noarch/fs/nfs/objlayout/Kbuild
---- linux-2.6.35.noarch/fs/nfs/objlayout/Kbuild.orig 2010-09-30 12:25:08.344283000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/objlayout/Kbuild 2010-09-30 12:25:08.346279000 -0400
-@@ -0,0 +1,11 @@
-+#
-+# Makefile for the pNFS Objects Layout Driver kernel module
-+#
-+objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o
-+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
-+
-+#
-+# Panasas pNFS Layout Driver kernel module
-+#
-+panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o
-+obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o
-diff -up linux-2.6.35.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.35.noarch/fs/nfs/objlayout/objio_osd.c
---- linux-2.6.35.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-09-30 12:25:08.349279000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/objlayout/objio_osd.c 2010-09-30 12:25:08.351279000 -0400
-@@ -0,0 +1,1106 @@
-+/*
-+ * objio_osd.c
-+ *
-+ * pNFS Objects layout implementation over open-osd initiator library
-+ *
-+ * Copyright (C) 2009 Panasas Inc.
-+ * All rights reserved.
-+ *
-+ * Benny Halevy <bharrosh at panasas.com>
-+ * Boaz Harrosh <bharrosh at panasas.com>
-+ *
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License version 2
-+ * See the file COPYING included with this distribution for more details.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ * 2. Redistributions in binary form must reproduce the above copyright
-+ * notice, this list of conditions and the following disclaimer in the
-+ * documentation and/or other materials provided with the distribution.
-+ * 3. Neither the name of the Panasas company nor the names of its
-+ * contributors may be used to endorse or promote products derived
-+ * from this software without specific prior written permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ */
+
-+#include <linux/module.h>
-+#include <scsi/scsi_device.h>
-+#include <scsi/osd_attributes.h>
-+#include <scsi/osd_initiator.h>
-+#include <scsi/osd_sec.h>
-+#include <scsi/osd_sense.h>
-+
-+#include "objlayout.h"
++int
++bl_layout_type(struct super_block *sb)
++{
++ return LAYOUT_BLOCK_VOLUME;
++}
+
-+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++int
++bl_getdeviceiter(struct super_block *sb,
++ u32 layout_type,
++ struct nfsd4_pnfs_dev_iter_res *res)
++{
++ res->gd_eof = 1;
++ if (res->gd_cookie)
++ return -ENOENT;
++ res->gd_devid = sb->s_dev;
++ res->gd_verf = 1;
++ res->gd_cookie = 1;
++ return 0;
++}
+
-+#define _LLU(x) ((unsigned long long)x)
++static int
++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_deviceid *devid)
++{
++ pnfs_blocklayout_devinfo_t *bld_slice_p,
++ *bld_simple_p,
++ *bld;
++ int status = -EIO,
++ location = 0;
++ struct list_head volumes;
++
++ dprintk("--> %s\n", __func__);
++ INIT_LIST_HEAD(&volumes);
+
-+enum { BIO_MAX_PAGES_KMALLOC =
-+ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
-+};
++ bld_simple_p = bld_simple(&volumes, devid->devid,
++ location++);
++ if (!bld_simple_p)
++ goto out;
++ bld_slice_p = bld_slice(&volumes, devid->devid, location++,
++ bld_simple_p->bld_index_loc);
+
-+/* A per mountpoint struct currently for device cache */
-+struct objio_mount_type {
-+ struct list_head dev_list;
-+ spinlock_t dev_list_lock;
-+};
++ if (!bld_slice_p)
++ goto out;
++
++ status = blocklayout_encode_devinfo(xdr, &volumes);
+
-+struct _dev_ent {
-+ struct list_head list;
-+ struct nfs4_deviceid d_id;
-+ struct osd_dev *od;
-+};
++out:
++ while (!list_empty(&volumes)) {
++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
++ bld_list);
++ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE)
++ kfree(bld->u.simple.bld_sig);
++ bld_free(bld);
++ }
++
++ dprintk("<-- %s (rval %d)\n", __func__, status);
++ return status;
++}
+
-+static void _dev_list_remove_all(struct objio_mount_type *omt)
++static int
++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_deviceid *devid)
+{
-+ spin_lock(&omt->dev_list_lock);
++ pnfs_blocklayout_devinfo_t *bld = NULL;
++ int status = -EIO, // default to error
++ i,
++ location = 0;
++ struct list_head volumes;
++ bl_comm_msg_t msg;
++ bl_comm_res_t *res;
++
++ dprintk("--> %s\n", __func__);
++ INIT_LIST_HEAD(&volumes);
++
++ msg.msg_type = PNFS_UPCALL_MSG_DMGET;
++ msg.u.msg_dev = devid->devid;
++ if (bl_upcall(bl_comm_global, &msg, &res)) {
++ dprintk("%s: upcall for DMGET failed\n", __func__);
++ goto out;
++ }
++
++ /*
++ * Don't use bld_alloc() here. If used this will be the first volume
++ * type added to the list whereas the protocol requires it to be the
++ * last.
++ */
++ bld = kmalloc(sizeof (*bld), GFP_KERNEL);
++ if (!bld)
++ goto out;
++ memset(bld, 0, sizeof (*bld));
++ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE;
++ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes;
++ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL;
++ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__,
++ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL);
++
++ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes *
++ sizeof (int), GFP_KERNEL);
++ if (!bld->u.stripe.bld_stripe_indexs)
++ goto out;
+
-+ while (!list_empty(&omt->dev_list)) {
-+ struct _dev_ent *de = list_entry(omt->dev_list.next,
-+ struct _dev_ent, list);
++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) {
++ dev_t dev;
++ pnfs_blocklayout_devinfo_t *bldp;
++
++ dev = MKDEV(res->u.stripe.devs[i].major,
++ res->u.stripe.devs[i].minor);
++ if (dev == 0)
++ goto out;
++
++ bldp = bld_simple(&volumes, dev, location++);
++ if (!bldp) {
++ dprintk("%s: bld_simple failed\n", __func__);
++ goto out;
++ }
++ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc);
+
-+ list_del_init(&de->list);
-+ osduld_put_device(de->od);
-+ kfree(de);
-+ }
++ if (!bldp) {
++ dprintk("%s: bld_slice failed\n", __func__);
++ goto out;
++ }
++ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc;
+
-+ spin_unlock(&omt->dev_list_lock);
++ }
++ list_add_tail(&bld->bld_list, &volumes);
++ status = blocklayout_encode_devinfo(xdr, &volumes);
++
++out:
++ while (!list_empty(&volumes)) {
++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
++ bld_list);
++ switch (bld->bld_type) {
++ case PNFS_BLOCK_VOLUME_SLICE:
++ case PNFS_BLOCK_VOLUME_CONCAT:
++ // No memory to release for these
++ break;
++ case PNFS_BLOCK_VOLUME_SIMPLE:
++ kfree(bld->u.simple.bld_sig);
++ break;
++ case PNFS_BLOCK_VOLUME_STRIPE:
++ kfree(bld->u.stripe.bld_stripe_indexs);
++ break;
++ }
++ bld_free(bld);
++ }
++ kfree(res);
++ dprintk("<-- %s (rval %d)\n", __func__, status);
++ return status;
+}
+
-+static struct osd_dev *___dev_list_find(struct objio_mount_type *omt,
-+ struct nfs4_deviceid *d_id)
++/*
++ * bl_getdeviceinfo -- determine device tree for requested devid
++ */
++int
++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
++ u32 layout_type,
++ const struct nfsd4_pnfs_deviceid *devid)
+{
-+ struct list_head *le;
++ if (device_slice(devid->devid) == True)
++ return bl_getdeviceinfo_slice(sb, xdr, devid);
++ else if (device_dm(devid->devid) == True)
++ return bl_getdeviceinfo_dm(sb, xdr, devid);
++ return -EINVAL;
++}
+
-+ list_for_each(le, &omt->dev_list) {
-+ struct _dev_ent *de = list_entry(le, struct _dev_ent, list);
++enum nfsstat4
++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_layoutget_arg *arg,
++ struct nfsd4_pnfs_layoutget_res *res)
++{
++ pnfs_blocklayout_layout_t *b;
++ bl_layout_rec_t *r;
++ struct list_head bl_possible,
++ *bl_candidates = NULL;
++ boolean_t del_on_error = False;
++ int adj;
++ enum nfsstat4 nfserr = NFS4_OK;
++
++ dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n",
++ __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset),
++ _2SECTS(res->lg_seg.length), res->lg_seg.iomode);
+
-+ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id)))
-+ return de->od;
++ if (res->lg_seg.length == 0) {
++ printk("%s: request length of 0, error condition\n", __func__);
++ return NFS4ERR_BADLAYOUT;
++ }
++
++ /*
++ * Adjust the length as required per spec.
++ * - First case is where the length is set to (u64)-1. Cheap means to
++ * define the end of the file.
++ * - Second case is where the I/O mode is read-only, but the request is
++ * past the end of the file so the request needs to be trimmed.
++ */
++ if ((res->lg_seg.length == NFS4_MAX_UINT64) ||
++ (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) &&
++ (res->lg_seg.iomode == IOMODE_READ)))
++ res->lg_seg.length = i->i_size - res->lg_seg.offset;
++
++ adj = (res->lg_seg.offset & 511) ? res->lg_seg.offset & 511 : 0;
++ res->lg_seg.offset -= adj;
++ res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511;
++
++ if (res->lg_seg.iomode != IOMODE_READ)
++ if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE,
++ res->lg_seg.offset, res->lg_seg.length))
++ return NFS4ERR_IO;
++
++ INIT_LIST_HEAD(&bl_possible);
++
++ if ((r = layout_inode_find(i)) == NULL) {
++ if (layout_inode_add(i, &r) == False) {
++ printk("%s: layout_inode_add failed\n", __func__);
++ return NFS4ERR_IO;
++ }
++ del_on_error = True;
++ }
++ BUG_ON(!r);
++
++ spin_lock(&r->blr_lock);
++
++ if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) {
++ /*
++ * This will send a LAYOUTTRYLATER error to the client.
++ */
++ dprintk("%s: layout_cache_fill_from() failed\n", __func__);
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ goto layoutget_cleanup;
++ }
++
++ res->lg_return_on_close = 1;
++ res->lg_seg.length = 0;
++
++ bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg);
++ if (!bl_candidates) {
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ goto layoutget_cleanup;
++ }
++
++ layout_cache_merge(r, bl_candidates);
++ if (layout_cache_update(r, bl_candidates)) {
++ /* ---- Failed to allocate memory. ---- */
++ dprintk("%s: layout_cache_update() failed\n", __func__);
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ goto layoutget_cleanup;
++ }
++
++ nfserr = blocklayout_encode_layout(xdr, bl_candidates);
++ if (nfserr)
++ dprintk("%s: layoutget xdr routine failed\n", __func__);
++
++layoutget_cleanup:
++ if (bl_candidates) {
++ while (!list_empty(bl_candidates)) {
++ b = list_entry(bl_candidates->next,
++ struct pnfs_blocklayout_layout, bll_list);
++ list_del(&b->bll_list);
++ kfree(b);
++ }
+ }
+
-+ return NULL;
++ spin_unlock(&r->blr_lock);
++ if (unlikely(nfserr)) {
++ if (del_on_error == True)
++ layout_inode_del(i);
++ res->lg_seg.length = 0;
++ res->lg_seg.offset = 0;
++ }
++
++ dprintk("<-- %s (rval %u)\n", __func__, nfserr);
++ return nfserr;
+}
+
-+static struct osd_dev *_dev_list_find(struct objio_mount_type *omt,
-+ struct nfs4_deviceid *d_id)
++/*
++ * bl_layoutcommit -- commit changes, especially size, to file system
++ *
++ * Currently this routine isn't called and everything is handled within
++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't
++ * handle a partial return, a set of extents, of the layout. The extents
++ * are decoded here, but nothing is done with them. If this routine is
++ * be called the interface must change to pass the 'dentry' pointer such
++ * that notify_change() can be called.
++ */
++int
++bl_layoutcommit(struct inode *i,
++ const struct nfsd4_pnfs_layoutcommit_arg *args,
++ struct nfsd4_pnfs_layoutcommit_res *res)
+{
-+ struct osd_dev *od;
++ bl_layout_rec_t *r;
++ int status = 0;
++ u64 lw_plus;
++
++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
++ r = layout_inode_find(i);
++ if (r) {
++ lw_plus = args->lc_last_wr + 1;
++ if (args->lc_newoffset) {
++ dprintk(" lc_last_wr %Lu\n", lw_plus);
++ if (r->blr_orig_size < lw_plus) {
++ r->blr_orig_size = lw_plus;
++ res->lc_size_chg = 1;
++ res->lc_newsize = lw_plus;
++ }
++ }
+
-+ spin_lock(&omt->dev_list_lock);
-+ od = ___dev_list_find(omt, d_id);
-+ spin_unlock(&omt->dev_list_lock);
-+ return od;
++ if (args->lc_up_len) {
++ int extents,
++ i;
++ struct pnfs_blocklayout_layout *b;
++ __be32 *p = args->lc_up_layout;
++
++ /*
++ * Client is returning a set of extents which
++ * should/could be used to update the file system.
++ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08
++ */
++ READ32(extents);
++ dprintk(" Client returning %d extents: data size %d\n",
++ extents, args->lc_up_len);
++ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) *
++ extents, GFP_KERNEL);
++ if (b) {
++ for (i = 0; i < extents; i++) {
++ READ64(b[i].bll_vol_id.sbid);
++ READ64(b[i].bll_vol_id.devid);
++ READ64(b[i].bll_foff);
++ READ64(b[i].bll_len);
++ READ64(b[i].bll_soff);
++ READ32(b[i].bll_es);
++ dprintk(" %d: foff %Lu, len %Lu, soff %Lu "
++ "state %s\n",
++ i, _2SECTS(b[i].bll_foff),
++ _2SECTS(b[i].bll_len),
++ _2SECTS(b[i].bll_soff),
++ map_state2name(b[i].bll_es));
++ }
++ kfree(b);
++ } else {
++ status = -ENOMEM;
++ }
++ }
++ } else
++ dprintk("%s: Unexpected commit to inode %p\n", __func__, i);
++
++ dprintk("<-- %s (rval %d)\n", __func__, status);
++ return status;
+}
+
-+static int _dev_list_add(struct objio_mount_type *omt,
-+ struct nfs4_deviceid *d_id, struct osd_dev *od)
++int
++bl_layoutreturn(struct inode *i,
++ const struct nfsd4_pnfs_layoutreturn_arg *args)
+{
-+ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL);
-+
-+ if (!de)
-+ return -ENOMEM;
-+
-+ spin_lock(&omt->dev_list_lock);
++ int status = 0;
++ bl_layout_rec_t *r;
+
-+ if (___dev_list_find(omt, d_id)) {
-+ kfree(de);
-+ goto out;
++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
++
++ r = layout_inode_find(i);
++ if (r) {
++ spin_lock(&r->blr_lock);
++ layout_cache_del(r, &args->lr_seg);
++ spin_unlock(&r->blr_lock);
++ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n",
++ r->blr_ext_size, i->i_size, r->blr_orig_size);
+ }
+
-+ de->d_id = *d_id;
-+ de->od = od;
-+ list_add(&de->list, &omt->dev_list);
-+
-+out:
-+ spin_unlock(&omt->dev_list_lock);
-+ return 0;
++ layout_inode_del(i);
++ dprintk("<-- %s (rval %d)\n", __func__, status);
++ return status;
+}
+
-+struct objio_segment {
-+ struct pnfs_osd_layout *layout;
-+
-+ unsigned mirrors_p1;
-+ unsigned stripe_unit;
-+ unsigned group_width; /* Data stripe_units without integrity comps */
-+ u64 group_depth;
-+ unsigned group_count;
-+
-+ unsigned num_comps;
-+ /* variable length */
-+ struct osd_dev *ods[1];
-+};
-+
-+struct objio_state;
-+typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
-+
-+struct objio_state {
-+ /* Generic layer */
-+ struct objlayout_io_state ol_state;
-+
-+ struct objio_segment *objio_seg;
-+
-+ struct kref kref;
-+ objio_done_fn done;
-+ void *private;
-+
-+ unsigned long length;
-+ unsigned numdevs; /* Actually used devs in this IO */
-+ /* A per-device variable array of size numdevs */
-+ struct _objio_per_comp {
-+ struct bio *bio;
-+ struct osd_request *or;
-+ unsigned long length;
-+ u64 offset;
-+ unsigned dev;
-+ } per_dev[];
-+};
-+
-+/* Send and wait for a get_device_info of devices in the layout,
-+ then look them up with the osd_initiator library */
-+static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay,
-+ struct objio_segment *objio_seg, unsigned comp)
++int
++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
+{
-+ struct pnfs_osd_layout *layout = objio_seg->layout;
-+ struct pnfs_osd_deviceaddr *deviceaddr;
-+ struct nfs4_deviceid *d_id;
-+ struct osd_dev *od;
-+ struct osd_dev_info odi;
-+ struct objio_mount_type *omt = NFS_SERVER(pnfslay->inode)->pnfs_ld_data;
-+ int err;
-+
-+ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id;
-+
-+ od = _dev_list_find(omt, d_id);
-+ if (od)
-+ return od;
-+
-+ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr);
-+ if (unlikely(err)) {
-+ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err);
-+ return ERR_PTR(err);
++ struct super_block *sb;
++ struct nfsd4_pnfs_cb_layout lr;
++ bl_layout_rec_t *r;
++ pnfs_blocklayout_layout_t *b;
++ u64 adj;
++
++ dprintk("--> %s\n", __func__);
++ BUG_ON(!len);
++ switch (type) {
++ case RETURN_FILE:
++ sb = inode->i_sb;
++ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n",
++ inode->i_sb->s_dev, inode->i_ino,
++ _2SECTS(offset), _2SECTS(len));
++ break;
++ case RETURN_FSID:
++ sb = inode->i_sb;
++ dprintk("%s: recalling layout for fsid x (unimplemented)\n",
++ __func__);
++ return 0;
++ case RETURN_ALL:
++ /*
++ * XXX figure out how to get a sb since there's no
++ * inode ptr
++ */
++ dprintk("%s: recalling all layouts (unimplemented)\n",
++ __func__);
++ return 0;
++ default:
++ return -EINVAL;
+ }
++
++restart:
++ r = layout_inode_find(inode);
++ if (r && len && !r->blr_recalled) {
++ spin_lock(&r->blr_lock);
++ list_for_each_entry(b, &r->blr_layouts, bll_list) {
++ if (!r->blr_recalled && !b->bll_recalled &&
++ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) {
++ b->bll_recalled = 1;
++ lr.cbl_recall_type = type;
++ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME;
++ lr.cbl_seg.clientid = 0;
++ lr.cbl_seg.offset = 0;
++ lr.cbl_seg.length = NFS4_MAX_UINT64;
++ r->blr_recalled = 1;
++ dprintk(" FULL LAYOUTRECALL\n");
++ lr.cbl_seg.iomode = IOMODE_ANY;
+
-+ odi.systemid_len = deviceaddr->oda_systemid.len;
-+ if (odi.systemid_len > sizeof(odi.systemid)) {
-+ err = -EINVAL;
-+ goto out;
-+ } else if (odi.systemid_len)
-+ memcpy(odi.systemid, deviceaddr->oda_systemid.data,
-+ odi.systemid_len);
-+ odi.osdname_len = deviceaddr->oda_osdname.len;
-+ odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
-+
-+ if (!odi.osdname_len && !odi.systemid_len) {
-+ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
-+ __func__);
-+ err = -ENODEV;
-+ goto out;
-+ }
++ /*
++ * Currently there are only two cases where the
++ * layout is being returned.
++ * (1) Someone is issuing a NFS_WRITE operation
++ * to this layout.
++ * (2) The file has been truncated which means
++ * the layout is immediately made invalid.
++ * In both cases the client must write any
++ * uncommitted modifications to the server via
++ * NFS_WRITE.
++ */
++ lr.cbl_layoutchanged = 1;
+
-+ od = osduld_info_lookup(&odi);
-+ if (unlikely(IS_ERR(od))) {
-+ err = PTR_ERR(od);
-+ dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
-+ goto out;
++ /*
++ * Need to drop the lock because we'll get a
++ * layoutreturn which will block waiting for
++ * the lock. The request will come in on the
++ * same thread which will cause a deadlock.
++ */
++ spin_unlock(&r->blr_lock);
++ nfsd_layout_recall_cb(sb, inode, &lr);
++ adj = MIN(b->bll_len - (offset - b->bll_foff),
++ len);
++ offset += adj;
++ len -= adj;
++ if (!len) {
++ spin_lock(&r->blr_lock);
++ break;
++ }
++ /*
++ * Since layoutreturn will have been called we
++ * can't assume blr_layouts is still valid,
++ * so restart.
++ */
++ goto restart;
++ }
++ }
++ spin_unlock(&r->blr_lock);
+ }
-+
-+ _dev_list_add(omt, d_id, od);
-+
-+out:
-+ dprintk("%s: return=%d\n", __func__, err);
-+ objlayout_put_deviceinfo(deviceaddr);
-+ return err ? ERR_PTR(err) : od;
++
++ dprintk("<-- %s\n", __func__);
++ return 0;
+}
+
-+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
-+ struct objio_segment *objio_seg)
-+{
-+ struct pnfs_osd_layout *layout = objio_seg->layout;
-+ unsigned i, num_comps = layout->olo_num_comps;
-+ int err;
-+
-+ /* lookup all devices */
-+ for (i = 0; i < num_comps; i++) {
-+ struct osd_dev *od;
++/*
++ * []------------------------------------------------------------------[]
++ * | Support functions from here on down. |
++ * []------------------------------------------------------------------[]
++ */
+
-+ od = _device_lookup(pnfslay, objio_seg, i);
-+ if (unlikely(IS_ERR(od))) {
-+ err = PTR_ERR(od);
-+ goto out;
-+ }
-+ objio_seg->ods[i] = od;
++/*
++ * bld_simple -- given a dev_t build a simple volume structure
++ *
++ * Simple volume contains the device signature and offset to that data in
++ * the storage volume.
++ */
++static pnfs_blocklayout_devinfo_t *
++bld_simple(struct list_head *volumes, dev_t devid, int local_index)
++{
++ pnfs_blocklayout_devinfo_t *bld = NULL;
++ bl_comm_msg_t msg;
++ bl_comm_res_t *res = NULL;
++
++ msg.msg_type = PNFS_UPCALL_MSG_GETSIG;
++ msg.u.msg_dev = devid;
++ if (bl_upcall(bl_comm_global, &msg, &res)) {
++ dprintk("%s: Failed to get signature information\n", __func__);
++ goto error;
+ }
-+ objio_seg->num_comps = num_comps;
-+ err = 0;
-+
-+out:
-+ dprintk("%s: return=%d\n", __func__, err);
-+ return err;
++
++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE);
++ if (!bld)
++ return NULL;
++
++ bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset;
++ bld->u.simple.bld_sig_len = res->u.sig.len;
++ bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL);
++ if (!bld->u.simple.bld_sig)
++ goto error;
++
++ memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len);
++ kfree(res);
++ return bld;
++
++error:
++ if (bld)
++ bld_free(bld);
++ if (res)
++ kfree(res);
++ dprintk("%s: error in bld_simple\n", __func__);
++ return NULL;
+}
+
-+static int _verify_data_map(struct pnfs_osd_layout *layout)
++/*
++ * bld_slice -- given a dev_t build a slice volume structure
++ *
++ * A slice volume contains the length of the slice/partition and its offset
++ * from the beginning of the storage volume. There's also a reference to
++ * the "simple" volume which contains this slice.
++ */
++static pnfs_blocklayout_devinfo_t *
++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc)
+{
-+ struct pnfs_osd_data_map *data_map = &layout->olo_map;
-+ u64 stripe_length;
-+ u32 group_width;
-+
-+/* FIXME: Only raid0 for now. if not go through MDS */
-+ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
-+ printk(KERN_ERR "Only RAID_0 for now\n");
-+ return -ENOTSUPP;
-+ }
-+ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
-+ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
-+ data_map->odm_num_comps, data_map->odm_mirror_cnt);
-+ return -EINVAL;
-+ }
-+
-+ if (data_map->odm_group_width)
-+ group_width = data_map->odm_group_width;
-+ else
-+ group_width = data_map->odm_num_comps /
-+ (data_map->odm_mirror_cnt + 1);
-+
-+ stripe_length = (u64)data_map->odm_stripe_unit * group_width;
-+ if (stripe_length >= (1ULL << 32)) {
-+ printk(KERN_ERR "Total Stripe length(0x%llx)"
-+ " >= 32bit is not supported\n", _LLU(stripe_length));
-+ return -ENOTSUPP;
++ pnfs_blocklayout_devinfo_t *bld;
++ bl_comm_msg_t msg;
++ bl_comm_res_t *res;
++
++ dprintk("--> %s\n", __func__);
++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE);
++ if (!bld)
++ return NULL;
++
++ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE;
++ msg.u.msg_dev = devid;
++ if (bl_upcall(bl_comm_global, &msg, &res)) {
++ dprintk("Upcall to get slice info failed\n");
++ bld_free(bld);
++ return NULL;
+ }
++
++ bld->bld_devid.devid = devid;
++ bld->bld_index_loc = my_loc;
++ bld->u.slice.bld_start = res->u.slice.start * 512LL;
++ bld->u.slice.bld_len = res->u.slice.length * 512LL;
++ bld->u.slice.bld_index = simple_loc;
+
-+ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
-+ printk(KERN_ERR "Stripe Unit(0x%llx)"
-+ " must be Multples of PAGE_SIZE(0x%lx)\n",
-+ _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
-+ return -ENOTSUPP;
-+ }
++ dprintk("%s: start %Lu, len %Lu\n", __func__,
++ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL);
+
-+ return 0;
++ kfree(res);
++ dprintk("<-- %s (rval %p)\n", __func__, bld);
++ return bld;
+}
+
-+int objio_alloc_lseg(void **outp,
-+ struct pnfs_layout_hdr *pnfslay,
-+ struct pnfs_layout_segment *lseg,
-+ struct pnfs_osd_layout *layout)
++static int
++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h,
++ struct nfsd4_layout_seg *seg)
+{
-+ struct objio_segment *objio_seg;
-+ int err;
-+
-+ err = _verify_data_map(layout);
-+ if (unlikely(err))
-+ return err;
-+
-+ objio_seg = kzalloc(sizeof(*objio_seg) +
-+ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]),
-+ GFP_KERNEL);
-+ if (!objio_seg)
-+ return -ENOMEM;
-+
-+ objio_seg->layout = layout;
-+ err = objio_devices_lookup(pnfslay, objio_seg);
-+ if (err)
-+ goto free_seg;
-+
-+ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1;
-+ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit;
-+ if (layout->olo_map.odm_group_width) {
-+ objio_seg->group_width = layout->olo_map.odm_group_width;
-+ objio_seg->group_depth = layout->olo_map.odm_group_depth;
-+ objio_seg->group_count = layout->olo_map.odm_num_comps /
-+ objio_seg->mirrors_p1 /
-+ objio_seg->group_width;
-+ } else {
-+ objio_seg->group_width = layout->olo_map.odm_num_comps /
-+ objio_seg->mirrors_p1;
-+ objio_seg->group_depth = -1;
-+ objio_seg->group_count = 1;
++ pnfs_blocklayout_layout_t *n;
++
++ dprintk("--> %s\n", __func__);
++
++ if (!list_empty(&r->blr_layouts))
++ if (layout_cache_fill_from_list(r, h, seg) == False)
++ return -EIO;
++
++ /*
++ * This deals with two conditions.
++ * (1) When blr_layouts is empty we need to create the first entry
++ * (2) When the range requested falls past the end of any current
++ * layout the residual must be taken care of.
++ */
++ if (seg->length) {
++ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h);
++ if (!n)
++ return -ENOMEM;
++ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff),
++ _2SECTS(n->bll_len));
+ }
-+
-+ *outp = objio_seg;
++
++ dprintk("<-- %s\n", __func__);
+ return 0;
-+
-+free_seg:
-+ dprintk("%s: Error: return %d\n", __func__, err);
-+ kfree(objio_seg);
-+ *outp = NULL;
-+ return err;
-+}
-+
-+void objio_free_lseg(void *p)
-+{
-+ struct objio_segment *objio_seg = p;
-+
-+ kfree(objio_seg);
+}
+
-+int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
++struct list_head *
++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible,
++ struct nfsd4_layout_seg *seg)
+{
-+ struct objio_segment *objio_seg = seg;
-+ struct objio_state *ios;
-+ const unsigned first_size = sizeof(*ios) +
-+ objio_seg->num_comps * sizeof(ios->per_dev[0]);
-+ const unsigned sec_size = objio_seg->num_comps *
-+ sizeof(ios->ol_state.ioerrs[0]);
-+
-+ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps);
-+ ios = kzalloc(first_size + sec_size, GFP_KERNEL);
-+ if (unlikely(!ios))
-+ return -ENOMEM;
-+
-+ ios->objio_seg = objio_seg;
-+ ios->ol_state.ioerrs = ((void *)ios) + first_size;
-+ ios->ol_state.num_comps = objio_seg->num_comps;
++ pnfs_blocklayout_layout_t *b,
++ *n = NULL;
++ struct list_head *bl_candidates = NULL;
++ struct fiemap_extent_info fei;
++ struct inode *i;
++ dev_t dev;
++
++ dev = r->blr_rdev;
++ i = r->blr_inode;
++
++ dprintk("--> %s\n", __func__);
++ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL);
++ if (!bl_candidates)
++ return NULL;
++ INIT_LIST_HEAD(bl_candidates);
++ extents_setup(&fei);
++
++ list_for_each_entry(b, bl_possible, bll_list) {
++ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
++
++ extents_count(&fei, i, b->bll_foff, b->bll_len);
++ if (fei.fi_extents_mapped) {
++
++ /*
++ * Common case here. Got a range which has
++ * extents. Now get those extents and process
++ * them into pNFS extents.
++ */
++ if (extents_get(&fei, i, b->bll_foff,
++ b->bll_len) == False)
++ goto cleanup;
++ if (extents_process(&fei, bl_candidates,
++ seg, dev, b) == False)
++ goto cleanup;
++ extents_cleanup(&fei);
++
++ } else if (seg->iomode == IOMODE_READ) {
++
++ /*
++ * Found a hole in a file while reading. No
++ * problem, just create a pNFS extent for the
++ * range and let the client know there's no
++ * backing store.
++ */
++ n = bll_alloc(b->bll_foff, b->bll_len,
++ BLOCK_LAYOUT_NEW, bl_candidates);
++ n->bll_es = PNFS_BLOCK_NONE_DATA;
++ n->bll_vol_id.sbid = 0;
++ n->bll_vol_id.devid = dev;
++ seg->length += b->bll_len;
++ } else {
++
++ /*
++ * There's a problem here. Since the iomode
++ * is read/write fallocate should have allocated
++ * any necessary storage for the given range.
++ */
++ dprintk(" Extent count for RW is 0\n");
++ goto cleanup;
++ }
++
++ } else {
++ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates);
++ seg->length += n->bll_len;
++ }
+
-+ *outp = &ios->ol_state;
-+ return 0;
++ if (r->blr_ext_size < (b->bll_foff + b->bll_len))
++ r->blr_ext_size = b->bll_foff + b->bll_len;
++ }
++
++ while (!list_empty(bl_possible)) {
++ b = list_entry(bl_possible->next,
++ struct pnfs_blocklayout_layout, bll_list);
++ list_del(&b->bll_list);
++ kfree(b);
++ }
++
++ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout,
++ bll_list);
++ seg->offset = b->bll_foff;
++ dprintk("<-- %s okay\n", __func__);
++ return bl_candidates;
++
++cleanup:
++ extents_cleanup(&fei);
++ if (bl_candidates)
++ kfree(bl_candidates);
++ dprintk("<-- %s, error occurred\n", __func__);
++ return NULL;
+}
+
-+void objio_free_io_state(struct objlayout_io_state *ol_state)
++/*
++ * layout_cache_merge -- collapse layouts which make up a contiguous range.
++ */
++static void
++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h)
+{
-+ struct objio_state *ios = container_of(ol_state, struct objio_state,
-+ ol_state);
-+
-+ kfree(ios);
++ pnfs_blocklayout_layout_t *b,
++ *p;
++
++ dprintk("--> %s\n", __func__);
++restart:
++ p = NULL;
++ list_for_each_entry(b, h, bll_list) {
++ if (p && (BLL_S_END(p) == b->bll_soff) &&
++ (p->bll_es == b->bll_es) &&
++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
++ /*
++ * We've got a candidate.
++ */
++#ifdef too_verbose
++ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n",
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++ _2SECTS(b->bll_soff),
++ _2SECTS(p->bll_foff), _2SECTS(p->bll_len),
++ _2SECTS(b->bll_soff));
++#endif
++
++ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE)
++ p->bll_cache_state = BLOCK_LAYOUT_UPDATE;
++ p->bll_len += b->bll_len;
++ list_del(&b->bll_list);
++ kfree(b);
++ goto restart;
++ } else if (p && (BLL_F_END(p) == b->bll_foff) &&
++ (p->bll_es == b->bll_es) &&
++ (b->bll_es == PNFS_BLOCK_NONE_DATA)) {
++ p->bll_len += b->bll_len;
++ list_del(&b->bll_list);
++ kfree(b);
++ goto restart;
++ } else
++ p = b;
++ }
++ dprintk("<-- %s\n", __func__);
+}
+
-+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
++static int
++layout_cache_update(bl_layout_rec_t *r, struct list_head *h)
+{
-+ switch (oep) {
-+ case OSD_ERR_PRI_NO_ERROR:
-+ return (enum pnfs_osd_errno)0;
-+
-+ case OSD_ERR_PRI_CLEAR_PAGES:
-+ BUG_ON(1);
-+ return 0;
-+
-+ case OSD_ERR_PRI_RESOURCE:
-+ return PNFS_OSD_ERR_RESOURCE;
-+ case OSD_ERR_PRI_BAD_CRED:
-+ return PNFS_OSD_ERR_BAD_CRED;
-+ case OSD_ERR_PRI_NO_ACCESS:
-+ return PNFS_OSD_ERR_NO_ACCESS;
-+ case OSD_ERR_PRI_UNREACHABLE:
-+ return PNFS_OSD_ERR_UNREACHABLE;
-+ case OSD_ERR_PRI_NOT_FOUND:
-+ return PNFS_OSD_ERR_NOT_FOUND;
-+ case OSD_ERR_PRI_NO_SPACE:
-+ return PNFS_OSD_ERR_NO_SPACE;
-+ default:
-+ WARN_ON(1);
-+ /* fallthrough */
-+ case OSD_ERR_PRI_EIO:
-+ return PNFS_OSD_ERR_EIO;
++ pnfs_blocklayout_layout_t *b,
++ *c,
++ *n;
++ boolean_t status = 0;
++
++ dprintk("--> %s\n", __func__);
++ if (list_empty(&r->blr_layouts)) {
++ /* ---- Just add entries and return ---- */
++ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev,
++ r->blr_inode->i_ino);
++ list_for_each_entry(b, h, bll_list) {
++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE,
++ &r->blr_layouts);
++ if (!c) {
++ status = -ENOMEM;
++ break;
++ }
++ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n",
++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len),
++ _2SECTS(c->bll_soff), c->bll_es);
++ }
++ return status;
++ }
++
++ list_for_each_entry(b, h, bll_list) {
++ BUG_ON(!b->bll_vol_id.devid);
++ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) {
++ boolean_t found = False;
++ list_for_each_entry(c, &r->blr_layouts, bll_list) {
++ if ((b->bll_soff >= c->bll_soff) &&
++ (b->bll_soff < BLL_S_END(c)) &&
++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
++ u64 u;
++
++ if ((b->bll_foff < c->bll_foff) ||
++ (b->bll_foff > BLL_F_END(c)))
++ BUG();
++
++ u = BLL_S_END(b) - BLL_S_END(c);
++ /*
++ * The updated cache entry has to be
++ * different than the current.
++ * Otherwise the cache state for 'b'
++ * should be BLOCK_LAYOUT_CACHE.
++ */
++ BUG_ON(BLL_S_END(b) < BLL_S_END(c));
++
++ dprintk(" "
++ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n",
++ _2SECTS(c->bll_foff),
++ _2SECTS(c->bll_len),
++ _2SECTS(c->bll_soff),
++ _2SECTS(c->bll_len + u));
++ c->bll_len += u;
++ bll_collapse(r, c);
++ found = True;
++ break;
++ }
++ }
++
++ if (found == False) {
++ dprintk(" ERROR Expected to find"
++ " %Lu(f):%Lu(l):%Lu(s), but didn't\n",
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++ _2SECTS(b->bll_soff));
++ list_for_each_entry(c, &r->blr_layouts, bll_list)
++ print_bll(c, "Cached");
++ BUG();
++ }
++ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
++
++ c = list_first_entry(&r->blr_layouts,
++ struct pnfs_blocklayout_layout, bll_list);
++ if (b->bll_foff < c->bll_foff) {
++ /*
++ * Special case where new entry is before
++ * first cached entry.
++ */
++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL);
++ list_add(&c->bll_list, &r->blr_layouts);
++ dprintk(" new entry at head of list at %Lu, "
++ "len %Lu\n",
++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len));
++ } else {
++ list_for_each_entry(c, &r->blr_layouts,
++ bll_list) {
++ n = list_entry(c->bll_list.next,
++ struct pnfs_blocklayout_layout,
++ bll_list);
++ /*
++ * This is ugly, but can't think of
++ * another way to examine this case.
++ * Consider the following. Need to
++ * add an entry which starts at 40
++ * and the cache has the following
++ * entries:
++ * Start Length
++ * 10 5
++ * 30 5
++ * 50 5
++ * So, need to look and see if the new
++ * entry starts after the current
++ * cache, but before the next one.
++ * There's a catch in that the next
++ * entry might not be valid as it's
++ * really just a pointer to the list
++ * head.
++ */
++ if (((b->bll_foff >=
++ BLL_F_END(c)) &&
++ (c->bll_list.next == &r->blr_layouts)) ||
++ ((b->bll_foff >=
++ BLL_F_END(c)) &&
++ (b->bll_foff < n->bll_foff))) {
++
++ n = bll_alloc_dup(b,
++ BLOCK_LAYOUT_CACHE, NULL);
++ dprintk(" adding new %Lu:%Lu"
++ " after %Lu:%Lu\n",
++ _2SECTS(n->bll_foff),
++ _2SECTS(n->bll_len),
++ _2SECTS(c->bll_foff),
++ _2SECTS(c->bll_len));
++ list_add(&n->bll_list,
++ &c->bll_list);
++ break;
++ }
++ }
++ }
++ }
+ }
++ dprintk("<-- %s\n", __func__);
++ return status;
+}
+
-+static void _clear_bio(struct bio *bio)
++/*
++ * layout_cache_del -- drop a returned layout range from the cache list
++ *
++ * A length of NFS4_MAX_UINT64 frees every cached entry. Otherwise a local
++ * copy of *seg_in is consumed while walking r->blr_layouts: entries are
++ * shrunk from the front, trimmed at the tail, split in two, or freed.
++ * blr_recalled is cleared once the list is empty.
++ */
++static void
++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in)
+{
-+ struct bio_vec *bv;
-+ unsigned i;
-+
-+ __bio_for_each_segment(bv, bio, i, 0) {
-+ unsigned this_count = bv->bv_len;
++ struct pnfs_blocklayout_layout *b,
++ *n;
++ u64 len;
++ struct nfsd4_layout_seg seg = *seg_in;
++
++ dprintk("--> %s\n", __func__);
++ if (seg.length == NFS4_MAX_UINT64) {
++ r->blr_recalled = 0;
++ dprintk(" Fast return of all layouts\n");
++ while (!list_empty(&r->blr_layouts)) {
++ b = list_entry(r->blr_layouts.next,
++ struct pnfs_blocklayout_layout, bll_list);
++ dprintk(" foff %Lu, len %Lu, soff %Lu\n",
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++ _2SECTS(b->bll_soff));
++ list_del(&b->bll_list);
++ kfree(b);
++ }
++ dprintk("<-- %s\n", __func__);
++ return;
++ }
+
-+ if (likely(PAGE_SIZE == this_count))
-+ clear_highpage(bv->bv_page);
-+ else
-+ zero_user(bv->bv_page, bv->bv_offset, this_count);
++restart:
++ list_for_each_entry(b, &r->blr_layouts, bll_list) {
++ if (seg.offset == b->bll_foff) {
++ /*
++ * This handles the following three cases:
++ * (1) return layout matches entire cache layout
++ * (2) return layout matches beginning portion of cache
++ * (3) return layout matches entire cache layout and
++ * into next entry. Varies from #1 in end case.
++ */
++ dprintk(" match on offsets, %Lu:%Lu\n",
++ _2SECTS(seg.offset), _2SECTS(seg.length));
++ len = MIN(seg.length, b->bll_len);
++ b->bll_foff += len;
++ b->bll_soff += len;
++ b->bll_len -= len;
++ seg.length -= len;
++ seg.offset += len;
++ if (!b->bll_len) {
++ list_del(&b->bll_list);
++ kfree(b);
++ dprintk(" removing cache line\n");
++ if (!seg.length) {
++ dprintk(" also finished\n");
++ goto complete;
++ }
++ /*
++ * Since 'b' was freed we can't continue at the
++ * next entry which is referenced as
++ * b->bll_list.next by the list_for_each_entry
++ * macro. Need to restart the loop.
++ * TODO: Think about creating a dummy 'b' which
++ * would keep list_for_each_entry() happy.
++ */
++ goto restart;
++ }
++ if (!seg.length) {
++ dprintk(" finished, but cache line not"
++ " empty\n");
++ goto complete;
++ }
++ } else if ((seg.offset >= b->bll_foff) &&
++ (seg.offset < BLL_F_END(b))) {
++ /*
++ * layout being returned is within this cache line.
++ */
++ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n",
++ _2SECTS(seg.offset), _2SECTS(seg.length),
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
++ BUG_ON(!seg.length);
++ if ((seg.offset + seg.length) >= BLL_F_END(b)) {
++ /*
++ * Layout returned starts in the middle of
++ * cache entry and just need to trim back
++ * cache to shorter length.
++ */
++ dprintk(" trim back cache line\n");
++ len = seg.offset - b->bll_foff;
++ seg.offset += b->bll_len - len;
++ seg.length -= b->bll_len - len;
++ b->bll_len = len;
++ if (!seg.length)
++ goto complete;
++ } else {
++ /*
++ * Need to split current cache layout because
++ * chunk is being removed from the middle.
++ */
++ dprintk(" split cache line\n");
++ len = seg.offset + seg.length;
++ n = bll_alloc(len,
++ (b->bll_foff + b->bll_len) - len,
++ BLOCK_LAYOUT_CACHE, NULL);
++ if (!n) {
++ /*
++ * Without memory for the tail entry the
++ * cache line is left untouched rather
++ * than dereferencing a NULL pointer.
++ */
++ dprintk(" split failed, no memory\n");
++ goto complete;
++ }
++ /*
++ * 'len' is an absolute file offset, so the
++ * storage offset of the tail is its distance
++ * from the start of 'b'. The tail also keeps
++ * the state and device of the entry it was
++ * carved from.
++ */
++ n->bll_soff = b->bll_soff + (len - b->bll_foff);
++ n->bll_es = b->bll_es;
++ n->bll_vol_id.devid = b->bll_vol_id.devid;
++ list_add(&n->bll_list, &b->bll_list);
++ b->bll_len = seg.offset - b->bll_foff;
++ goto complete;
++ }
++ }
+ }
++complete:
++ if (list_empty(&r->blr_layouts))
++ r->blr_recalled = 0;
++ dprintk("<-- %s\n", __func__);
+}
+
-+static int _io_check(struct objio_state *ios, bool is_write)
++/*
++ * layout_cache_fill_from_list -- fills from cache list
++ *
++ * Walks r->blr_layouts and, for the range described by seg, queues a
++ * BLOCK_LAYOUT_NEW entry for every gap in front of a cached entry and a
++ * BLOCK_LAYOUT_CACHE entry for every overlap. seg->offset and seg->length
++ * are advanced past whatever was covered.
++ *
++ * Returns False when an allocation fails or layout_conflict() rejects the
++ * requested iomode, True otherwise.
++ *
++ * NOTE: This routine was only separated out from layout_cache_file_from()
++ * to reduce the indentation level which makes the code easier to read.
++ */
++static inline boolean_t
++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h,
++ struct nfsd4_layout_seg *seg)
+{
-+ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
-+ int lin_ret = 0;
-+ int i;
-+
-+ for (i = 0; i < ios->numdevs; i++) {
-+ struct osd_sense_info osi;
-+ struct osd_request *or = ios->per_dev[i].or;
-+ int ret;
-+
-+ if (!or)
-+ continue;
-+
-+ ret = osd_req_decode_sense(or, &osi);
-+ if (likely(!ret))
-+ continue;
-+
-+ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
-+ /* start read offset passed endof file */
-+ BUG_ON(is_write);
-+ _clear_bio(ios->per_dev[i].bio);
-+ dprintk("%s: start read offset passed end of file "
-+ "offset=0x%llx, length=0x%lx\n", __func__,
-+ _LLU(ios->per_dev[i].offset),
-+ ios->per_dev[i].length);
-+
-+ continue; /* we recovered */
++ pnfs_blocklayout_layout_t *b,
++ *n;
++ enum pnfs_block_extent_state4 s;
++
++ list_for_each_entry(b, &r->blr_layouts, bll_list) {
++ if (seg->offset < b->bll_foff) {
++ n = bll_alloc(seg->offset,
++ MIN(seg->length, b->bll_foff - seg->offset),
++ BLOCK_LAYOUT_NEW, NULL);
++ if (!n)
++ return False;
++
++ list_add(&n->bll_list, h->prev);
++ dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n",
++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
++ seg->offset += n->bll_len;
++ seg->length -= n->bll_len;
++ if (!seg->length)
++ break;
+ }
-+ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev,
-+ osd_pri_2_pnfs_err(osi.osd_err_pri),
-+ ios->per_dev[i].offset,
-+ ios->per_dev[i].length,
-+ is_write);
-+
-+ if (osi.osd_err_pri >= oep) {
-+ oep = osi.osd_err_pri;
-+ lin_ret = ret;
++
++ if ((seg->offset >= b->bll_foff) &&
++ (seg->offset < BLL_F_END(b))) {
++ if (layout_conflict(b, seg->iomode, &s) == False) {
++ dprintk(" CONFLICT FOUND: "
++ "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n",
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++ _2SECTS(b->bll_soff), b->bll_es,
++ seg->iomode);
++ return False;
++ }
++ n = bll_alloc(seg->offset,
++ MIN(seg->length, BLL_F_END(b) - seg->offset),
++ BLOCK_LAYOUT_CACHE, h);
++ /* Must be checked before the debug print uses 'n'. */
++ if (!n)
++ return False;
++
++ dprintk(" CACHE hit: Found %Lu(f):%Lu(l): "
++ "in %Lu(f):%Lu(l):%Lu(s):%d\n",
++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++ _2SECTS(b->bll_soff), b->bll_es);
++
++ n->bll_soff = b->bll_soff + seg->offset - b->bll_foff;
++ n->bll_vol_id.sbid = 0;
++ n->bll_vol_id.devid = b->bll_vol_id.devid;
++ n->bll_es = s;
++ seg->offset += n->bll_len;
++ seg->length -= n->bll_len;
++ if (!seg->length)
++ break;
+ }
+ }
-+
-+ return lin_ret;
++ return True;
+}
+
-+/*
-+ * Common IO state helpers.
-+ */
-+static void _io_free(struct objio_state *ios)
++static u64
++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length,
++ dev_t dev)
+{
-+ unsigned i;
-+
-+ for (i = 0; i < ios->numdevs; i++) {
-+ struct _objio_per_comp *per_dev = &ios->per_dev[i];
-+
-+ if (per_dev->or) {
-+ osd_end_request(per_dev->or);
-+ per_dev->or = NULL;
-+ }
-+
-+ if (per_dev->bio) {
-+ bio_put(per_dev->bio);
-+ per_dev->bio = NULL;
-+ }
-+ }
++ pnfs_blocklayout_layout_t *n;
++
++ n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates);
++ if (!n)
++ return 0;
++ n->bll_es = PNFS_BLOCK_NONE_DATA;
++ n->bll_vol_id.sbid = 0;
++ n->bll_vol_id.devid = dev;
++
++ return n->bll_len;
+}
+
-+struct osd_dev * _io_od(struct objio_state *ios, unsigned dev)
++static void
++extents_setup(struct fiemap_extent_info *fei)
+{
-+ unsigned min_dev = ios->objio_seg->layout->olo_comps_index;
-+ unsigned max_dev = min_dev + ios->ol_state.num_comps;
-+
-+ BUG_ON(dev < min_dev || max_dev <= dev);
-+ return ios->objio_seg->ods[dev - min_dev];
++ fei->fi_extents_start = NULL;
+}
+
-+struct _striping_info {
-+ u64 obj_offset;
-+ u64 group_length;
-+ u64 total_group_length;
-+ u64 Major;
-+ unsigned dev;
-+ unsigned unit_off;
-+};
-+
-+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
-+ struct _striping_info *si)
++/*
++ * extents_count -- Determine the number of extents for a given range.
++ *
++ * No need to call set_fs() here because the function
++ * doesn't use copy_to_user() if it's only counting
++ * the number of extents needed.
++ */
++static void
++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
+{
-+ u32 stripe_unit = ios->objio_seg->stripe_unit;
-+ u32 group_width = ios->objio_seg->group_width;
-+ u64 group_depth = ios->objio_seg->group_depth;
-+ u32 U = stripe_unit * group_width;
-+
-+ u64 T = U * group_depth;
-+ u64 S = T * ios->objio_seg->group_count;
-+ u64 M = div64_u64(file_offset, S);
++ dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len));
++ fei->fi_flags = FIEMAP_FLAG_SYNC;
++ fei->fi_extents_max = 0;
++ fei->fi_extents_start = NULL;
++ fei->fi_extents_mapped = 0;
++ i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1);
++}
+
++/*
++ * extents_get -- Get list of extents for range
++ *
++ * extents_count() must have been called before this routine such that
++ * fi_extents_mapped is known.
++ *
++ * Returns True with fi_extents_start pointing at an allocated array of
++ * extents (released by extents_cleanup()), or False when there is nothing
++ * to map, the allocation fails or ->fiemap() fails.
++ */
++static boolean_t
++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
++{
++ int rval;
++ struct fiemap_extent *fe;
++ mm_segment_t old_fs = get_fs();
++
+ /*
-+ G = (L - (M * S)) / T
-+ H = (L - (M * S)) % T
-+ */
-+ u64 LmodU = file_offset - M * S;
-+ u32 G = div64_u64(LmodU, T);
-+ u64 H = LmodU - G * T;
-+
-+ u32 N = div_u64(H, U);
-+
-+ div_u64_rem(file_offset, stripe_unit, &si->unit_off);
-+ si->obj_offset = si->unit_off + (N * stripe_unit) +
-+ (M * group_depth * stripe_unit);
-+
-+ /* "H - (N * U)" is just "H % U" so it's bound to u32 */
-+ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
-+ si->dev *= ios->objio_seg->mirrors_p1;
-+
-+ si->group_length = T - H;
-+ si->total_group_length = T;
-+ si->Major = M;
++ * Now malloc the correct amount of space
++ * needed. It's possible for the file to have changed
++ * between calls which would require more space for
++ * the extents. If that occurs the last extent will
++ * not have FIEMAP_EXTENT_LAST set and the error will
++ * be caught in extents_process().
++ */
++ if (!fei->fi_extents_mapped) {
++ /*
++ * Nothing was counted. A zero-sized buffer would put
++ * ->fiemap() back into counting mode and could report
++ * extents that were never copied out.
++ */
++ fei->fi_extents_start = NULL;
++ return False;
++ }
++
++ /* kcalloc() zeroes the array and rejects a count * size overflow. */
++ fe = kcalloc(fei->fi_extents_mapped, sizeof (*fe), GFP_KERNEL);
++ if (!fe)
++ return False;
++
++ fei->fi_extents_max = fei->fi_extents_mapped;
++ fei->fi_extents_mapped = 0;
++ fei->fi_extents_start = fe;
++
++ /* The extent array is kernel memory, so lift the user-copy limit. */
++ set_fs(KERNEL_DS);
++ rval = i->i_op->fiemap(i, fei, foff, len +
++ (1 << i->i_sb->s_blocksize_bits) - 1);
++ set_fs(old_fs);
++
++ if (rval || !fei->fi_extents_mapped) {
++ dprintk(" No extents. Wanted %d, got %d\n",
++ fei->fi_extents_max, fei->fi_extents_mapped);
++ kfree(fe);
++ fei->fi_extents_start = NULL;
++ return False;
++ } else
++ return True;
+}
+
-+static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
-+ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len)
++/*
++ * extents_process -- runs through the extent returned from the file system and
++ * creates block layout entries.
++ */
++static boolean_t
++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates,
++ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b)
+{
-+ unsigned pg = *cur_pg;
-+ struct request_queue *q =
-+ osd_request_queue(_io_od(ios, per_dev->dev));
-+
-+ per_dev->length += cur_len;
-+
-+ if (per_dev->bio == NULL) {
-+ unsigned stripes = ios->ol_state.num_comps /
-+ ios->objio_seg->mirrors_p1;
-+ unsigned pages_in_stripe = stripes *
-+ (ios->objio_seg->stripe_unit / PAGE_SIZE);
-+ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
-+ stripes;
-+
-+ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
-+ if (unlikely(!per_dev->bio)) {
-+ dprintk("Faild to allocate BIO size=%u\n", bio_size);
-+ return -ENOMEM;
++ struct fiemap_extent *fep,
++ *fep_last = NULL;
++ int i;
++ pnfs_blocklayout_layout_t *n;
++ u64 last_end,
++ rval;
++
++ dprintk("--> %s\n", __func__);
++ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped;
++ i++, fep++) {
++
++ BUG_ON(!fep->fe_physical);
++ /*
++ * Deal with corner cases of hole-y files.
++ */
++ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) !=
++ fep->fe_logical)) {
++
++ /*
++ * If the last extent doesn't end logically
++ * at the beginning of the current we've got
++ * hole and need to create a pNFS extent.
++ */
++ dprintk(" Got a hole at %Ld:%Ld \n",
++ _2SECTS(fep_last->fe_logical),
++ _2SECTS(fep_last->fe_length));
++ last_end = fep_last->fe_logical + fep_last->fe_length;
++ rval = bll_alloc_holey(bl_candidates, last_end,
++ fep->fe_logical - last_end, dev);
++ if (!rval)
++ return False;
++ seg->length += rval;
+ }
++
++ n = bll_alloc(fep->fe_logical, fep->fe_length,
++ BLOCK_LAYOUT_NEW, bl_candidates);
++ if (unlikely(n == NULL)) {
++ dprintk("%s: bll_alloc failed\n", __func__);
++ return False;
++ }
++
++ n->bll_soff = fep->fe_physical;
++ n->bll_es = seg->iomode == IOMODE_READ ?
++ PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA;
++ n->bll_vol_id.sbid = 0;
++ n->bll_vol_id.devid = dev;
++ seg->length += fep->fe_length;
++ print_bll(n, "New extent");
++ fep_last = fep;
+ }
-+
-+ while (cur_len > 0) {
-+ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
-+ unsigned added_len;
-+
-+ BUG_ON(ios->ol_state.nr_pages <= pg);
-+ cur_len -= pglen;
-+
-+ added_len = bio_add_pc_page(q, per_dev->bio,
-+ ios->ol_state.pages[pg], pglen, pgbase);
-+ if (unlikely(pglen != added_len))
-+ return -ENOMEM;
-+ pgbase = 0;
-+ ++pg;
-+ }
-+ BUG_ON(cur_len);
-+
-+ *cur_pg = pg;
-+ return 0;
++ dprintk("<-- %s (i=%d)\n", __func__, i);
++
++ return True;
+}
+
-+static int _prepare_one_group(struct objio_state *ios, u64 length,
-+ struct _striping_info *si, unsigned first_comp,
-+ unsigned *last_pg)
++static void
++extents_cleanup(struct fiemap_extent_info *fei)
+{
-+ unsigned stripe_unit = ios->objio_seg->stripe_unit;
-+ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1;
-+ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1;
-+ unsigned dev = si->dev;
-+ unsigned first_dev = dev - (dev % devs_in_group);
-+ unsigned comp = first_comp + (dev - first_dev);
-+ unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
-+ unsigned cur_pg = *last_pg;
-+ int ret = 0;
-+
-+ while (length) {
-+ struct _objio_per_comp *per_dev = &ios->per_dev[comp];
-+ unsigned cur_len, page_off = 0;
-+
-+ if (!per_dev->length) {
-+ per_dev->dev = dev;
-+ if (dev < si->dev) {
-+ per_dev->offset = si->obj_offset + stripe_unit -
-+ si->unit_off;
-+ cur_len = stripe_unit;
-+ } else if (dev == si->dev) {
-+ per_dev->offset = si->obj_offset;
-+ cur_len = stripe_unit - si->unit_off;
-+ page_off = si->unit_off & ~PAGE_MASK;
-+ BUG_ON(page_off &&
-+ (page_off != ios->ol_state.pgbase));
-+ } else { /* dev > si->dev */
-+ per_dev->offset = si->obj_offset - si->unit_off;
-+ cur_len = stripe_unit;
-+ }
-+
-+ if (max_comp < comp)
-+ max_comp = comp;
-+
-+ dev += mirrors_p1;
-+ dev = (dev % devs_in_group) + first_dev;
-+ } else {
-+ cur_len = stripe_unit;
-+ }
-+ if (cur_len >= length)
-+ cur_len = length;
-+
-+ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
-+ cur_len);
-+ if (unlikely(ret))
-+ goto out;
-+
-+ comp += mirrors_p1;
-+ comp = (comp % devs_in_group) + first_comp;
-+
-+ length -= cur_len;
-+ ios->length += cur_len;
++ if (fei->fi_extents_start) {
++ kfree(fei->fi_extents_start);
++ fei->fi_extents_start = NULL;
+ }
-+out:
-+ ios->numdevs = max_comp + mirrors_p1;
-+ *last_pg = cur_pg;
-+ return ret;
+}
+
-+static int _io_rw_pagelist(struct objio_state *ios)
++/*
++ * device_slice -- check to see if device is a slice or DM
++ */
++static boolean_t
++device_slice(dev_t devid)
+{
-+ u64 length = ios->ol_state.count;
-+ struct _striping_info si;
-+ unsigned devs_in_group = ios->objio_seg->group_width *
-+ ios->objio_seg->mirrors_p1;
-+ unsigned first_comp = 0;
-+ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps;
-+ unsigned last_pg = 0;
-+ int ret = 0;
-+
-+ _calc_stripe_info(ios, ios->ol_state.offset, &si);
-+ while (length) {
-+ if (length < si.group_length)
-+ si.group_length = length;
-+
-+ ret = _prepare_one_group(ios, si.group_length, &si, first_comp,
-+ &last_pg);
-+ if (unlikely(ret))
-+ goto out;
-+
-+ length -= si.group_length;
-+
-+ si.group_length = si.total_group_length;
-+ si.unit_off = 0;
-+ ++si.Major;
-+ si.obj_offset = si.Major * ios->objio_seg->stripe_unit *
-+ ios->objio_seg->group_depth;
-+
-+ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
-+ si.dev %= num_comps;
-+
-+ first_comp += devs_in_group;
-+ first_comp %= num_comps;
++ struct block_device *bd = open_by_devnum(devid, FMODE_READ);
++ boolean_t rval = False;
++
++ if (bd) {
++ if (bd->bd_disk->minors > 1)
++ rval = True;
++ blkdev_put(bd, FMODE_READ);
+ }
-+
-+out:
-+ if (!ios->length)
-+ return ret;
-+
-+ return 0;
++ return rval;
+}
+
-+static ssize_t _sync_done(struct objio_state *ios)
++/*
++ * device_dm -- check to see if device is a Device Mapper volume.
++ *
++ * Sends a PNFS_UPCALL_MSG_DMCHK upcall for devid and inspects the reply.
++ *
++ * Returns True for DM or False if not (including when the upcall fails)
++ */
++static boolean_t
++device_dm(dev_t devid)
+{
-+ struct completion *waiting = ios->private;
-+
-+ complete(waiting);
-+ return 0;
++ boolean_t rval = False;
++ bl_comm_msg_t msg;
++ /*
++ * Start from NULL so the kfree() below is a no-op when the upcall
++ * fails; presumably bl_upcall() may leave *res unset on failure --
++ * TODO confirm against bl_upcall().
++ */
++ bl_comm_res_t *res = NULL;
++
++ msg.msg_type = PNFS_UPCALL_MSG_DMCHK;
++ msg.u.msg_dev = devid;
++ if (bl_upcall(bl_comm_global, &msg, &res)) {
++ dprintk("Failed upcall to check on DM status\n");
++ } else if (res->u.dm_vol) {
++ rval = True;
++ dprintk("Device is DM volume\n");
++ } else
++ dprintk("Device is not DM volume\n");
++ kfree(res);
++
++ return rval;
+}
+
-+static void _last_io(struct kref *kref)
++/*
++ * layout_inode_add -- create the layout record for an inode
++ *
++ * Allocates a record, initialises it from the inode and appends it to
++ * layout_hash under layout_hashtbl_lock. The new record is handed back
++ * through *p.
++ *
++ * Returns False when the file system lacks the fiemap or fallocate
++ * methods, or when the allocation fails; True otherwise.
++ */
++static boolean_t
++layout_inode_add(struct inode *i, bl_layout_rec_t **p)
+{
-+ struct objio_state *ios = container_of(kref, struct objio_state, kref);
-+
-+ ios->done(ios);
-+}
++ bl_layout_rec_t *r = NULL;
+
-+static void _done_io(struct osd_request *or, void *p)
-+{
-+ struct objio_state *ios = p;
++ if (!i->i_op->fiemap || !i->i_op->fallocate) {
++ printk("pNFS: file system doesn't support required fiemap or"
++ " fallocate methods\n");
++ return False;
++ }
++
++ /* Nothing has been acquired yet, so a failure needs no unwinding. */
++ r = kmalloc(sizeof (*r), GFP_KERNEL);
++ if (!r)
++ return False;
+
-+ kref_put(&ios->kref, _last_io);
++ r->blr_rdev = i->i_sb->s_dev;
++ r->blr_inode = i;
++ r->blr_orig_size = i->i_size;
++ r->blr_ext_size = 0;
++ r->blr_recalled = 0;
++ INIT_LIST_HEAD(&r->blr_layouts);
++ spin_lock_init(&r->blr_lock);
++ spin_lock(&layout_hashtbl_lock);
++ list_add_tail(&r->blr_hash, &layout_hash);
++ spin_unlock(&layout_hashtbl_lock);
++ *p = r;
++ return True;
+}
+
-+static ssize_t _io_exec(struct objio_state *ios)
++static bl_layout_rec_t *
++__layout_inode_find(struct inode *i)
+{
-+ DECLARE_COMPLETION_ONSTACK(wait);
-+ ssize_t status = 0; /* sync status */
-+ unsigned i;
-+ objio_done_fn saved_done_fn = ios->done;
-+ bool sync = ios->ol_state.sync;
-+
-+ if (sync) {
-+ ios->done = _sync_done;
-+ ios->private = &wait;
++ bl_layout_rec_t *r;
++
++ if (!list_empty(&layout_hash)) {
++ list_for_each_entry(r, &layout_hash, blr_hash) {
++ if ((r->blr_inode->i_ino == i->i_ino) &&
++ (r->blr_rdev == i->i_sb->s_dev)) {
++ return r;
++ }
++ }
+ }
++ return NULL;
++}
+
-+ kref_init(&ios->kref);
-+
-+ for (i = 0; i < ios->numdevs; i++) {
-+ struct osd_request *or = ios->per_dev[i].or;
-+
-+ if (!or)
-+ continue;
-+
-+ kref_get(&ios->kref);
-+ osd_execute_request_async(or, _done_io, ios);
-+ }
++static bl_layout_rec_t *
++layout_inode_find(struct inode *i)
++{
++ bl_layout_rec_t *r;
+
-+ kref_put(&ios->kref, _last_io);
++ spin_lock(&layout_hashtbl_lock);
++ r = __layout_inode_find(i);
++ spin_unlock(&layout_hashtbl_lock);
++
++ return r;
++}
+
-+ if (sync) {
-+ wait_for_completion(&wait);
-+ status = saved_done_fn(ios);
++static void
++layout_inode_del(struct inode *i)
++{
++ bl_layout_rec_t *r;
++
++ spin_lock(&layout_hashtbl_lock);
++ r = __layout_inode_find(i);
++ if (r) {
++ spin_lock(&r->blr_lock);
++ if (list_empty(&r->blr_layouts)) {
++ list_del(&r->blr_hash);
++ spin_unlock(&r->blr_lock);
++ kfree(r);
++ } else {
++ spin_unlock(&r->blr_lock);
++ }
++ } else {
++ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n",
++ __func__, i->i_sb->s_dev, i->i_ino);
+ }
-+
-+ return status;
++ spin_unlock(&layout_hashtbl_lock);
+}
+
+/*
-+ * read
++ * map_state2name -- converts an extent state into an ASCII string.
++ *
++ * Used for debug messages only.
+ */
-+static ssize_t _read_done(struct objio_state *ios)
++static char *
++map_state2name(enum pnfs_block_extent_state4 s)
+{
-+ ssize_t status;
-+ int ret = _io_check(ios, false);
++ switch (s) {
++ case PNFS_BLOCK_READWRITE_DATA: return " RW";
++ case PNFS_BLOCK_READ_DATA: return " RO";
++ case PNFS_BLOCK_INVALID_DATA: return "INVALID";
++ case PNFS_BLOCK_NONE_DATA: return " NONE";
++ default:
++ BUG();
++ }
++}
+
-+ _io_free(ios);
++static pnfs_blocklayout_devinfo_t *
++bld_alloc(struct list_head *volumes, int type)
++{
++ pnfs_blocklayout_devinfo_t *bld;
++
++ bld = kmalloc(sizeof (*bld), GFP_KERNEL);
++ if (!bld)
++ return NULL;
+
-+ if (likely(!ret))
-+ status = ios->length;
-+ else
-+ status = ret;
++ memset(bld, 0, sizeof (*bld));
++ bld->bld_type = type;
++ list_add_tail(&bld->bld_list, volumes);
+
-+ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
-+ return status;
++ return bld;
+}
+
-+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
++static void
++bld_free(pnfs_blocklayout_devinfo_t *bld)
+{
-+ struct osd_request *or = NULL;
-+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
-+ unsigned dev = per_dev->dev;
-+ struct pnfs_osd_object_cred *cred =
-+ &ios->objio_seg->layout->olo_comps[dev];
-+ struct osd_obj_id obj = {
-+ .partition = cred->oc_object_id.oid_partition_id,
-+ .id = cred->oc_object_id.oid_object_id,
-+ };
-+ int ret;
-+
-+ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
-+ if (unlikely(!or)) {
-+ ret = -ENOMEM;
-+ goto err;
-+ }
-+ per_dev->or = or;
++ list_del(&bld->bld_list);
++ kfree(bld);
++}
+
-+ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
++static void
++print_bll(pnfs_blocklayout_layout_t *b, char *text)
++{
++ dprintk(" BLL: %s\n", text);
++ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n",
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len),
++ map_state2name(b->bll_es));
++}
+
-+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
-+ if (ret) {
-+ dprintk("%s: Faild to osd_finalize_request() => %d\n",
-+ __func__, ret);
-+ goto err;
++static inline void
++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c)
++{
++ pnfs_blocklayout_layout_t *n;
++ int dbg_count = 0;
++ u64 endpoint;
++
++ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA);
++ while (c->bll_list.next != &r->blr_layouts) {
++ n = list_entry(c->bll_list.next,
++ struct pnfs_blocklayout_layout, bll_list);
++ endpoint = BLL_S_END(c);
++ if ((n->bll_soff >= c->bll_soff) &&
++ (n->bll_soff < endpoint)) {
++ if (endpoint < BLL_S_END(n)) {
++ /*
++ * The following is possible.
++ *
++ *
++ * Existing: +---+ +---+
++ * New: +-----------------------+
++ * The client request merge entries together
++ * but didn't require picking up all of the
++ * last entry. So, we still need to delete
++ * the last entry and add the remaining space
++ * to the new entry.
++ */
++ c->bll_len += BLL_S_END(n) - endpoint;
++ }
++ dbg_count++;
++ list_del(&n->bll_list);
++ kfree(n);
++ } else {
++ break;
++ }
+ }
-+
-+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
-+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
-+ per_dev->length);
-+
-+err:
-+ return ret;
++ /* ---- Debug only, remove before integration ---- */
++ if (dbg_count)
++ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n",
++ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c)));
+}
+
-+static ssize_t _read_exec(struct objio_state *ios)
++static pnfs_blocklayout_layout_t *
++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h)
+{
-+ unsigned i;
-+ int ret;
-+
-+ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
-+ if (!ios->per_dev[i].length)
-+ continue;
-+ ret = _read_mirrors(ios, i);
-+ if (unlikely(ret))
-+ goto err;
++ pnfs_blocklayout_layout_t *n = NULL;
++
++ n = kmalloc(sizeof (*n), GFP_KERNEL);
++ if (n) {
++ memset(n, 0, sizeof (*n));
++ n->bll_foff = offset;
++ n->bll_len = len;
++ n->bll_cache_state = state;
++ if (h)
++ list_add_tail(&n->bll_list, h);
+ }
-+
-+ ios->done = _read_done;
-+ return _io_exec(ios); /* In sync mode exec returns the io status */
-+
-+err:
-+ _io_free(ios);
-+ return ret;
++ return n;
+}
+
-+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
++static pnfs_blocklayout_layout_t *
++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c,
++ struct list_head *h)
+{
-+ struct objio_state *ios = container_of(ol_state, struct objio_state,
-+ ol_state);
-+ int ret;
-+
-+ ret = _io_rw_pagelist(ios);
-+ if (unlikely(ret))
-+ return ret;
-+
-+ return _read_exec(ios);
++ pnfs_blocklayout_layout_t *n = NULL;
++
++ n = bll_alloc(b->bll_foff, b->bll_len, c, h);
++ if (n) {
++ n->bll_es = b->bll_es;
++ n->bll_soff = b->bll_soff;
++ n->bll_vol_id.devid = b->bll_vol_id.devid;
++ }
++ return n;
+}
+
-+/*
-+ * write
-+ */
-+static ssize_t _write_done(struct objio_state *ios)
++static inline boolean_t
++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode,
++ enum pnfs_block_extent_state4 *s)
+{
-+ ssize_t status;
-+ int ret = _io_check(ios, true);
++ /* ---- Normal case ---- */
++ *s = b->bll_es;
++
++ switch (b->bll_es) {
++ case PNFS_BLOCK_READWRITE_DATA:
++ if (iomode == IOMODE_READ)
++ *s = PNFS_BLOCK_READ_DATA;
++ /* ---- Any use is permitted. ---- */
++ break;
++ case PNFS_BLOCK_READ_DATA:
++ /* ---- Committed as read only data. ---- */
++ if (iomode == IOMODE_RW)
++ return False;
++ break;
++ case PNFS_BLOCK_INVALID_DATA:
++ /* ---- Blocks have been allocated, but not initialized ---- */
++ if (iomode == IOMODE_READ)
++ *s = PNFS_BLOCK_NONE_DATA;
++ break;
++ case PNFS_BLOCK_NONE_DATA:
++ /* ---- Hole-y file. No backing store avail. ---- */
++ if (iomode != IOMODE_READ)
++ return False;
++ break;
++ default:
++ BUG();
++ }
++ return True;
++}
+
-+ _io_free(ios);
++#endif /* CONFIG_SPNFS_BLOCK */
+diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
+index c2a4f71..2e025c2 100644
+--- a/fs/nfsd/export.c
++++ b/fs/nfsd/export.c
+@@ -17,11 +17,19 @@
+ #include <linux/module.h>
+ #include <linux/exportfs.h>
+
++#include <linux/nfsd/nfsd4_pnfs.h>
++#if defined(CONFIG_SPNFS)
++#include <linux/nfsd4_spnfs.h>
++#if defined(CONFIG_SPNFS_BLOCK)
++#include <linux/nfsd4_block.h>
++#endif
++#endif
+ #include <linux/nfsd/syscall.h>
+ #include <net/ipv6.h>
+
+ #include "nfsd.h"
+ #include "nfsfh.h"
++#include "pnfsd.h"
+
+ #define NFSDDBG_FACILITY NFSDDBG_EXPORT
+
+@@ -352,10 +360,84 @@ static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
+ return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
+ }
+
++#if defined(CONFIG_PNFSD)
++static struct pnfsd_cb_operations pnfsd_cb_op = {
++ .cb_layout_recall = nfsd_layout_recall_cb,
++ .cb_device_notify = nfsd_device_notify_cb,
++
++ .cb_get_state = nfs4_pnfs_cb_get_state,
++ .cb_change_state = nfs4_pnfs_cb_change_state,
++};
++
++#if defined(CONFIG_SPNFS)
++static struct pnfs_export_operations spnfs_export_ops = {
++ .layout_type = spnfs_layout_type,
++ .get_device_info = spnfs_getdeviceinfo,
++ .get_device_iter = spnfs_getdeviceiter,
++ .layout_get = spnfs_layoutget,
++ .layout_return = spnfs_layoutreturn,
++};
+
-+ if (likely(!ret)) {
-+ /* FIXME: should be based on the OSD's persistence model
-+ * See OSD2r05 Section 4.13 Data persistence model */
-+ ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC;
-+ status = ios->length;
-+ } else {
-+ status = ret;
-+ }
++static struct pnfs_export_operations spnfs_ds_export_ops = {
++ .get_state = spnfs_get_state,
++};
+
-+ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
-+ return status;
-+}
++#if defined(CONFIG_SPNFS_BLOCK)
++static struct pnfs_export_operations bl_export_ops = {
++ .layout_type = bl_layout_type,
++ .get_device_info = bl_getdeviceinfo,
++ .get_device_iter = bl_getdeviceiter,
++ .layout_get = bl_layoutget,
++ .layout_return = bl_layoutreturn,
++};
++#endif /* CONFIG_SPNFS_BLOCK */
++#endif /* CONFIG_SPNFS */
++#endif /* CONFIG_PNFSD */
+
-+static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+ static struct svc_export *svc_export_update(struct svc_export *new,
+ struct svc_export *old);
+ static struct svc_export *svc_export_lookup(struct svc_export *);
+
++static int pnfsd_check_export(struct inode *inode, int *flags)
+{
-+ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
-+ unsigned dev = ios->per_dev[cur_comp].dev;
-+ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1;
-+ int ret;
-+
-+ for (; cur_comp < last_comp; ++cur_comp, ++dev) {
-+ struct osd_request *or = NULL;
-+ struct pnfs_osd_object_cred *cred =
-+ &ios->objio_seg->layout->olo_comps[dev];
-+ struct osd_obj_id obj = {
-+ .partition = cred->oc_object_id.oid_partition_id,
-+ .id = cred->oc_object_id.oid_object_id,
-+ };
-+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
-+ struct bio *bio;
++#if defined(CONFIG_PNFSD)
+
-+ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
-+ if (unlikely(!or)) {
-+ ret = -ENOMEM;
-+ goto err;
-+ }
-+ per_dev->or = or;
++#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
++ if (!inode->i_sb->s_pnfs_op)
++ pnfsd_lexp_init(inode);
++ return 0;
++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
+
-+ if (per_dev != master_dev) {
-+ bio = bio_kmalloc(GFP_KERNEL,
-+ master_dev->bio->bi_max_vecs);
-+ if (unlikely(!bio)) {
-+ dprintk("Faild to allocate BIO size=%u\n",
-+ master_dev->bio->bi_max_vecs);
-+ ret = -ENOMEM;
-+ goto err;
-+ }
++#if defined(CONFIG_SPNFS)
++#if defined(CONFIG_SPNFS_BLOCK)
++ if (pnfs_block_enabled(inode, *flags)) {
++ dprintk("set pnfs block export structure... \n");
++ inode->i_sb->s_pnfs_op = &bl_export_ops;
++ } else
++#endif /* CONFIG_SPNFS_BLOCK */
++ /*
++ * spnfs_enabled() indicates we're an MDS.
++ * XXX Better to check an export time option as well.
++ */
++ if (spnfs_enabled()) {
++ dprintk("set spnfs export structure...\n");
++ inode->i_sb->s_pnfs_op = &spnfs_export_ops;
++ } else {
++ dprintk("%s spnfs not in use\n", __func__);
+
-+ __bio_clone(bio, master_dev->bio);
-+ bio->bi_bdev = NULL;
-+ bio->bi_next = NULL;
-+ per_dev->bio = bio;
-+ per_dev->dev = dev;
-+ per_dev->length = master_dev->length;
-+ per_dev->offset = master_dev->offset;
-+ } else {
-+ bio = master_dev->bio;
-+ /* FIXME: bio_set_dir() */
-+ bio->bi_rw |= REQ_WRITE;
-+ }
++ /*
++ * get_state is needed if we're a DS using spnfs.
++ * XXX Better to check an export time option instead.
++ */
++ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops;
++ }
++#endif /* CONFIG_SPNFS */
+
-+ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
++#endif /* CONFIG_PNFSD */
+
-+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
-+ if (ret) {
-+ dprintk("%s: Faild to osd_finalize_request() => %d\n",
-+ __func__, ret);
-+ goto err;
-+ }
++ return 0;
++}
+
-+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
-+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
-+ per_dev->length);
+ static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
+ {
+
+@@ -395,8 +477,17 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
+ return -EINVAL;
+ }
+
+- return 0;
++#if !defined(CONFIG_SPNFS)
++ if (inode->i_sb->s_pnfs_op &&
++ (!inode->i_sb->s_pnfs_op->layout_type ||
++ !inode->i_sb->s_pnfs_op->get_device_info ||
++ !inode->i_sb->s_pnfs_op->layout_get)) {
++ dprintk("exp_export: export of invalid fs pnfs export ops.\n");
++ return -EINVAL;
++ }
++#endif /* !CONFIG_SPNFS */
+
++ return pnfsd_check_export(inode, flags);
+ }
+
+ #ifdef CONFIG_NFSD_V4
+@@ -586,6 +677,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
+ if (exp.ex_uuid == NULL)
+ err = -ENOMEM;
+ }
++ } else if (strcmp(buf, "pnfs") == 0) {
++ exp.ex_pnfs = 1;
+ } else if (strcmp(buf, "secinfo") == 0)
+ err = secinfo_parse(&mesg, buf, &exp);
+ else
+@@ -660,6 +753,8 @@ static int svc_export_show(struct seq_file *m,
+ seq_printf(m, "%02x", exp->ex_uuid[i]);
+ }
+ }
++ if (exp->ex_pnfs)
++ seq_puts(m, ",pnfs");
+ show_secinfo(m, exp);
+ }
+ seq_puts(m, ")\n");
+@@ -687,6 +782,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
+ new->ex_fslocs.locations = NULL;
+ new->ex_fslocs.locations_count = 0;
+ new->ex_fslocs.migrated = 0;
++ new->ex_pnfs = 0;
+ }
+
+ static void export_update(struct cache_head *cnew, struct cache_head *citem)
+@@ -699,6 +795,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
+ new->ex_anon_uid = item->ex_anon_uid;
+ new->ex_anon_gid = item->ex_anon_gid;
+ new->ex_fsid = item->ex_fsid;
++ new->ex_pnfs = item->ex_pnfs;
+ new->ex_uuid = item->ex_uuid;
+ item->ex_uuid = NULL;
+ new->ex_pathname = item->ex_pathname;
+@@ -1635,8 +1732,17 @@ nfsd_export_init(void)
+ if (rv)
+ return rv;
+ rv = cache_register(&svc_expkey_cache);
+- if (rv)
++ if (rv) {
+ cache_unregister(&svc_export_cache);
++ goto out;
+ }
++#if defined(CONFIG_PNFSD)
++ spin_lock(&pnfsd_cb_ctl.lock);
++ pnfsd_cb_ctl.module = THIS_MODULE;
++ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op;
++ spin_unlock(&pnfsd_cb_ctl.lock);
++#endif /* CONFIG_PNFSD */
++out:
+ return rv;
+
+ }
+@@ -1664,6 +1770,12 @@ nfsd_export_shutdown(void)
+
+ exp_writelock();
+
++#if defined(CONFIG_PNFSD)
++ spin_lock(&pnfsd_cb_ctl.lock);
++ pnfsd_cb_ctl.module = NULL;
++ pnfsd_cb_ctl.cb_op = NULL;
++ spin_unlock(&pnfsd_cb_ctl.lock);
++#endif /* CONFIG_PNFSD */
+ cache_unregister(&svc_expkey_cache);
+ cache_unregister(&svc_export_cache);
+ svcauth_unix_purge();
+diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
+index 988cbb3..fc8f498 100644
+--- a/fs/nfsd/nfs4callback.c
++++ b/fs/nfsd/nfs4callback.c
+@@ -41,7 +41,6 @@
+
+ #define NFSPROC4_CB_NULL 0
+ #define NFSPROC4_CB_COMPOUND 1
+-#define NFS4_STATEID_SIZE 16
+
+ /* Index of predefined Linux callback client operations */
+
+@@ -49,11 +48,17 @@ enum {
+ NFSPROC4_CLNT_CB_NULL = 0,
+ NFSPROC4_CLNT_CB_RECALL,
+ NFSPROC4_CLNT_CB_SEQUENCE,
++#if defined(CONFIG_PNFSD)
++ NFSPROC4_CLNT_CB_LAYOUT,
++ NFSPROC4_CLNT_CB_DEVICE,
++#endif
+ };
+
+ enum nfs_cb_opnum4 {
+ OP_CB_RECALL = 4,
++ OP_CB_LAYOUT = 5,
+ OP_CB_SEQUENCE = 11,
++ OP_CB_DEVICE = 14,
+ };
+
+ #define NFS4_MAXTAGLEN 20
+@@ -79,6 +84,19 @@ enum nfs_cb_opnum4 {
+ #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
++#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
++ cb_sequence_enc_sz + \
++ 1 + 3 + \
++ enc_nfs4_fh_sz + 4)
++#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
++ cb_sequence_dec_sz + \
++ op_dec_sz)
++#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \
++ cb_sequence_enc_sz + \
++ 1 + 6)
++#define NFS4_dec_cb_device_sz (cb_compound_dec_hdr_sz + \
++ cb_sequence_dec_sz + \
++ op_dec_sz)
+
+ /*
+ * Generic encode routines from fs/nfs/nfs4xdr.c
+@@ -95,6 +113,10 @@ xdr_writemem(__be32 *p, const void *ptr, int nbytes)
+ }
+
+ #define WRITE32(n) *p++ = htonl(n)
++#define WRITE64(n) do { \
++ *p++ = htonl((u32)((n) >> 32)); \
++ *p++ = htonl((u32)(n)); \
++} while (0)
+ #define WRITEMEM(ptr,nbytes) do { \
+ p = xdr_writemem(p, ptr, nbytes); \
+ } while (0)
+@@ -268,6 +290,111 @@ encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
+ hdr->nops++;
+ }
+
++#if defined(CONFIG_PNFSD)
+
-+err:
-+ return ret;
-+}
++#include "pnfsd.h"
+
-+static ssize_t _write_exec(struct objio_state *ios)
++static void
++encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr,
++ struct nfs4_cb_compound_hdr *hdr)
+{
-+ unsigned i;
-+ int ret;
-+
-+ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
-+ if (!ios->per_dev[i].length)
-+ continue;
-+ ret = _write_mirrors(ios, i);
-+ if (unlikely(ret))
-+ goto err;
-+ }
-+
-+ ios->done = _write_done;
-+ return _io_exec(ios); /* In sync mode exec returns the io->status */
++ u32 *p;
+
-+err:
-+ _io_free(ios);
-+ return ret;
-+}
++ BUG_ON(hdr->minorversion == 0);
+
-+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
-+{
-+ struct objio_state *ios = container_of(ol_state, struct objio_state,
-+ ol_state);
-+ int ret;
++ RESERVE_SPACE(20);
++ WRITE32(OP_CB_LAYOUT);
++ WRITE32(clr->cb.cbl_seg.layout_type);
++ WRITE32(clr->cb.cbl_seg.iomode);
++ WRITE32(clr->cb.cbl_layoutchanged);
++ WRITE32(clr->cb.cbl_recall_type);
++ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) {
++ struct nfs4_fsid fsid = clr->cb.cbl_fsid;
+
-+ /* TODO: ios->stable = stable; */
-+ ret = _io_rw_pagelist(ios);
-+ if (unlikely(ret))
-+ return ret;
++ RESERVE_SPACE(16);
++ WRITE64(fsid.major);
++ WRITE64(fsid.minor);
++ dprintk("%s: type %x iomode %d changed %d recall_type %d "
++ "fsid 0x%llx-0x%llx\n",
++ __func__, clr->cb.cbl_seg.layout_type,
++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
++ clr->cb.cbl_recall_type, fsid.major, fsid.minor);
++ } else if (clr->cb.cbl_recall_type == RETURN_FILE) {
++ int len = clr->clr_file->fi_fhlen;
++ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid;
+
-+ return _write_exec(ios);
++ RESERVE_SPACE(20 + len);
++ WRITE32(len);
++ WRITEMEM(clr->clr_file->fi_fhval, len);
++ WRITE64(clr->cb.cbl_seg.offset);
++ WRITE64(clr->cb.cbl_seg.length);
++ encode_stateid(xdr, cbl_sid);
++ dprintk("%s: type %x iomode %d changed %d recall_type %d "
++ "offset %lld length %lld stateid " STATEID_FMT "\n",
++ __func__, clr->cb.cbl_seg.layout_type,
++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
++ clr->cb.cbl_recall_type,
++ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length,
++ STATEID_VAL(cbl_sid));
++ } else {
++ dprintk("%s: type %x iomode %d changed %d recall_type %d\n",
++ __func__, clr->cb.cbl_seg.layout_type,
++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged,
++ clr->cb.cbl_recall_type);
++ }
++ hdr->nops++;
+}
+
-+/*
-+ * Policy Operations
-+ */
-+
-+/*
-+ * Return the stripe size for the specified file
-+ */
-+ssize_t
-+objlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay)
++static void
++encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd,
++ struct nfs4_cb_compound_hdr *hdr)
+{
-+ ssize_t sz, maxsz = -1;
-+ struct pnfs_layout_segment *lseg;
++ u32 *p;
++ int i;
++ int len = nd->nd_list->cbd_len;
++ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list;
+
-+ list_for_each_entry(lseg, &pnfslay->segs, fi_list) {
-+ int n;
-+ struct objlayout_segment *objlseg =
-+ container_of(lseg, struct objlayout_segment, lseg);
-+ struct pnfs_osd_layout *lo =
-+ (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout;
-+ struct pnfs_osd_data_map *map = &lo->olo_map;
++ dprintk("NFSD %s: --> num %d\n", __func__, len);
+
-+ n = map->odm_group_width;
-+ if (n == 0)
-+ n = map->odm_num_comps / (map->odm_mirror_cnt + 1);
++ BUG_ON(hdr->minorversion == 0);
+
-+ switch (map->odm_raid_algorithm) {
-+ case PNFS_OSD_RAID_0:
-+ break;
++ RESERVE_SPACE(8);
++ WRITE32(OP_CB_DEVICE);
+
-+ case PNFS_OSD_RAID_4:
-+ case PNFS_OSD_RAID_5:
-+ n -= 1;
-+ break;
++ /* notify4 cnda_changes<>; */
++ WRITE32(len);
++ for (i = 0; i < len; i++) {
++ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n",
++ __func__, cbd[i].cbd_notify_type,
++ cbd[i].cbd_layout_type,
++ cbd[i].cbd_devid.sbid,
++ cbd[i].cbd_devid.devid,
++ cbd[i].cbd_immediate, i);
+
-+ case PNFS_OSD_RAID_PQ:
-+ n -= 2;
-+ break;
++ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
++ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE);
++ RESERVE_SPACE(32);
++ /* bitmap4 notify_mask; */
++ WRITE32(1);
++ WRITE32(cbd[i].cbd_notify_type);
++ /* opaque notify_vals<>; */
++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
++ WRITE32(24);
++ else
++ WRITE32(20);
++ WRITE32(cbd[i].cbd_layout_type);
++ WRITE64(cbd[i].cbd_devid.sbid);
++ WRITE64(cbd[i].cbd_devid.devid);
+
-+ default:
-+ BUG_ON(1);
++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
++ RESERVE_SPACE(4);
++ WRITE32(cbd[i].cbd_immediate);
+ }
-+ sz = map->odm_stripe_unit * n;
-+ if (sz > maxsz)
-+ maxsz = sz;
+ }
-+ dprintk("%s: Return %Zx\n", __func__, maxsz);
-+ return maxsz;
++ hdr->nops++;
+}
++#endif /* CONFIG_PNFSD */
+
-+/*
-+ * Get the max [rw]size
-+ */
-+static ssize_t
-+objlayout_get_blocksize(void)
+ static int
+ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
+ {
+@@ -297,6 +424,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
+ return 0;
+ }
+
++#if defined(CONFIG_PNFSD)
++static int
++nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p,
++ struct nfs4_rpc_args *rpc_args)
+{
-+ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE;
++ struct xdr_stream xdr;
++ struct nfs4_layoutrecall *args = rpc_args->args_op;
++ struct nfs4_cb_compound_hdr hdr = {
++ .ident = 0,
++ .minorversion = rpc_args->args_seq.cbs_minorversion,
++ };
+
-+ return sz;
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_cb_compound_hdr(&xdr, &hdr);
++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
++ encode_cb_layout(&xdr, args, &hdr);
++ encode_cb_nops(&hdr);
++ return 0;
+}
+
-+/*
-+ * Don't gather across stripes, but rather gather (coalesce) up to
-+ * the stripe size.
-+ *
-+ * FIXME: change interface to use merge_align, merge_count
-+ */
-+static struct pnfs_layoutdriver_type objlayout_type = {
-+ .id = LAYOUT_OSD2_OBJECTS,
-+ .name = "LAYOUT_OSD2_OBJECTS",
-+ .flags = PNFS_LAYOUTRET_ON_SETATTR,
-+
-+ .initialize_mountpoint = objlayout_initialize_mountpoint,
-+ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint,
-+
-+ .alloc_layout_hdr = objlayout_alloc_layout_hdr,
-+ .free_layout_hdr = objlayout_free_layout_hdr,
-+
-+ .alloc_lseg = objlayout_alloc_lseg,
-+ .free_lseg = objlayout_free_lseg,
-+
-+ .get_stripesize = objlayout_get_stripesize,
-+ .get_blocksize = objlayout_get_blocksize,
++static int
++nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p,
++ struct nfs4_rpc_args *rpc_args)
++{
++ struct xdr_stream xdr;
++ struct nfs4_notify_device *args = rpc_args->args_op;
++ struct nfs4_cb_compound_hdr hdr = {
++ .ident = 0,
++ .minorversion = rpc_args->args_seq.cbs_minorversion,
++ };
+
-+ .read_pagelist = objlayout_read_pagelist,
-+ .write_pagelist = objlayout_write_pagelist,
-+ .commit = objlayout_commit,
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_cb_compound_hdr(&xdr, &hdr);
++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
++ encode_cb_device(&xdr, args, &hdr);
++ encode_cb_nops(&hdr);
++ return 0;
++}
++#endif /* CONFIG_PNFSD */
+
+ static int
+ decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
+@@ -413,6 +579,48 @@ out:
+ return status;
+ }
+
++#if defined(CONFIG_PNFSD)
++static int
++nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p,
++ struct nfsd4_cb_sequence *seq)
++{
++ struct xdr_stream xdr;
++ struct nfs4_cb_compound_hdr hdr;
++ int status;
+
-+ .encode_layoutcommit = objlayout_encode_layoutcommit,
-+ .encode_layoutreturn = objlayout_encode_layoutreturn,
-+};
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_cb_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_cb_sequence(&xdr, seq, rqstp);
++ if (status)
++ goto out;
++ status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT);
++out:
++ return status;
++}
+
-+void *objio_init_mt(void)
++static int
++nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p,
++ struct nfsd4_cb_sequence *seq)
+{
-+ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL);
++ struct xdr_stream xdr;
++ struct nfs4_cb_compound_hdr hdr;
++ int status;
+
-+ if (!omt)
-+ return ERR_PTR(-ENOMEM);
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_cb_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_cb_sequence(&xdr, seq, rqstp);
++ if (status)
++ goto out;
++ status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE);
++out:
++ return status;
++}
++#endif /* CONFIG_PNFSD */
+
-+ INIT_LIST_HEAD(&omt->dev_list);
-+ spin_lock_init(&omt->dev_list_lock);
-+ return omt;
+ /*
+ * RPC procedure tables
+ */
+@@ -430,6 +638,10 @@ out:
+ static struct rpc_procinfo nfs4_cb_procedures[] = {
+ PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null),
+ PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall),
++#if defined(CONFIG_PNFSD)
++ PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout),
++ PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device),
++#endif
+ };
+
+ static struct rpc_version nfs_cb_version4 = {
+@@ -615,10 +827,9 @@ out:
+ * TODO: cb_sequence should support referring call lists, cachethis, multiple
+ * slots, and mark callback channel down on communication errors.
+ */
+-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
++static void nfsd4_cb_prepare_sequence(struct rpc_task *task,
++ struct nfs4_client *clp)
+ {
+- struct nfs4_delegation *dp = calldata;
+- struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+ u32 minorversion = clp->cl_cb_conn.cb_minorversion;
+ int status = 0;
+@@ -638,11 +849,15 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
+ rpc_call_start(task);
+ }
+
+-static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata)
+ {
+ struct nfs4_delegation *dp = calldata;
+- struct nfs4_client *clp = dp->dl_client;
++ nfsd4_cb_prepare_sequence(task, dp->dl_client);
+}
-+
-+void objio_fini_mt(void *mountid)
+
++static void nfsd4_cb_done_sequence(struct rpc_task *task,
++ struct nfs4_client *clp)
+{
-+ _dev_list_remove_all(mountid);
-+ kfree(mountid);
-+}
-+
-+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
-+MODULE_AUTHOR("Benny Halevy <bhalevy at panasas.com>");
-+MODULE_LICENSE("GPL");
+ dprintk("%s: minorversion=%d\n", __func__,
+ clp->cl_cb_conn.cb_minorversion);
+
+@@ -666,7 +881,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
+ struct nfs4_client *clp = dp->dl_client;
+ struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
+
+- nfsd4_cb_done(task, calldata);
++ nfsd4_cb_done_sequence(task, clp);
+
+ if (current_rpc_client == NULL) {
+ /* We're shutting down; give up. */
+@@ -713,7 +928,7 @@ static void nfsd4_cb_recall_release(void *calldata)
+ }
+
+ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
+- .rpc_call_prepare = nfsd4_cb_prepare,
++ .rpc_call_prepare = nfsd4_cb_recall_prepare,
+ .rpc_call_done = nfsd4_cb_recall_done,
+ .rpc_release = nfsd4_cb_recall_release,
+ };
+@@ -788,3 +1003,173 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
+ {
+ queue_work(callback_wq, &dp->dl_recall.cb_work);
+ }
+
-+static int __init
-+objlayout_init(void)
++#if defined(CONFIG_PNFSD)
++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata)
+{
-+ int ret = pnfs_register_layoutdriver(&objlayout_type);
-+
-+ if (ret)
-+ printk(KERN_INFO
-+ "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
-+ __func__, ret);
-+ else
-+ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
-+ __func__);
-+ return ret;
++ struct nfs4_layoutrecall *clr = calldata;
++ nfsd4_cb_prepare_sequence(task, clr->clr_client);
+}
+
-+static void __exit
-+objlayout_exit(void)
++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata)
+{
-+ pnfs_unregister_layoutdriver(&objlayout_type);
-+ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
-+ __func__);
-+}
-+
-+module_init(objlayout_init);
-+module_exit(objlayout_exit);
-diff -up linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.c
---- linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-09-30 12:25:08.353285000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.c 2010-09-30 12:25:08.368286000 -0400
-@@ -0,0 +1,773 @@
-+/*
-+ * objlayout.c
-+ *
-+ * pNFS layout driver for Panasas OSDs
-+ *
-+ * Copyright (C) 2007-2009 Panasas Inc.
-+ * All rights reserved.
-+ *
-+ * Benny Halevy <bhalevy at panasas.com>
-+ * Boaz Harrosh <bharrosh at panasas.com>
-+ *
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License version 2
-+ * See the file COPYING included with this distribution for more details.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ * 2. Redistributions in binary form must reproduce the above copyright
-+ * notice, this list of conditions and the following disclaimer in the
-+ * documentation and/or other materials provided with the distribution.
-+ * 3. Neither the name of the Panasas company nor the names of its
-+ * contributors may be used to endorse or promote products derived
-+ * from this software without specific prior written permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ */
-+
-+#include <scsi/osd_initiator.h>
-+#include "objlayout.h"
++ struct nfs4_layoutrecall *clr = calldata;
++ struct nfs4_client *clp = clr->clr_client;
+
-+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++ nfsd4_cb_done_sequence(task, clp);
+
-+struct pnfs_client_operations *pnfs_client_ops;
++ if (!task->tk_status)
++ return;
+
-+/*
-+ * Create a objlayout layout structure for the given inode and return it.
-+ */
-+struct pnfs_layout_hdr *
-+objlayout_alloc_layout_hdr(struct inode *inode)
-+{
-+ struct objlayout *objlay;
++ printk("%s: clp %p cb_client %p fp %p failed with status %d\n",
++ __func__,
++ clp,
++ clp->cl_cb_client,
++ clr->clr_file,
++ task->tk_status);
+
-+ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL);
-+ if (objlay) {
-+ spin_lock_init(&objlay->lock);
-+ INIT_LIST_HEAD(&objlay->err_list);
++ switch (task->tk_status) {
++ case -EIO:
++ /* Network partition? */
++ atomic_set(&clp->cl_cb_set, 0);
++ warn_no_callback_path(clp, task->tk_status);
++ /* FIXME:
++ * The pNFS standard states that we may only expire the
++ * client after at least the lease time (e.g. lease-time * 2)
++ * when we fail to communicate a recall.
++ */
++ break;
++ case -NFS4ERR_DELAY:
++ /* Poll the client until it's done with the layout */
++ rpc_delay(task, HZ/100); /* 10 milliseconds */
++ task->tk_status = 0;
++ rpc_restart_call_prepare(task);
++ break;
++ case -NFS4ERR_NOMATCHING_LAYOUT:
++ task->tk_status = 0;
++ nomatching_layout(clr);
+ }
-+ dprintk("%s: Return %p\n", __func__, objlay);
-+ return &objlay->pnfs_layout;
+}
+
-+/*
-+ * Free an objlayout layout structure
-+ */
-+void
-+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
++static void nfsd4_cb_layout_release(void *calldata)
+{
-+ struct objlayout *objlay = OBJLAYOUT(lo);
-+
-+ dprintk("%s: objlay %p\n", __func__, objlay);
-+
-+ WARN_ON(!list_empty(&objlay->err_list));
-+ kfree(objlay);
++ struct nfs4_layoutrecall *clr = calldata;
++ kfree(clr->clr_args);
++ clr->clr_args = NULL;
++ put_layoutrecall(clr);
+}
+
++static const struct rpc_call_ops nfsd4_cb_layout_ops = {
++ .rpc_call_prepare = nfsd4_cb_layout_prepare,
++ .rpc_call_done = nfsd4_cb_layout_done,
++ .rpc_release = nfsd4_cb_layout_release,
++};
++
+/*
-+ * Unmarshall layout and store it in pnfslay.
++ * Called with state lock.
+ */
-+struct pnfs_layout_segment *
-+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
-+ struct nfs4_layoutget_res *lgr)
++int
++nfsd4_cb_layout(struct nfs4_layoutrecall *clr)
+{
++ struct nfs4_client *clp = clr->clr_client;
++ struct rpc_clnt *clnt = clp->cl_cb_client;
++ struct nfs4_rpc_args *args;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT],
++ .rpc_cred = callback_cred
++ };
+ int status;
-+ void *layout = lgr->layout.buf;
-+ struct objlayout_segment *objlseg;
-+ struct pnfs_osd_layout *pnfs_osd_layout;
-+
-+ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout);
-+
-+ BUG_ON(!layout);
-+
-+ status = -ENOMEM;
-+ objlseg = kzalloc(sizeof(*objlseg) +
-+ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL);
-+ if (!objlseg)
-+ goto err;
-+
-+ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout;
-+ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout);
-+
-+ status = objio_alloc_lseg(&objlseg->internal, pnfslay, &objlseg->lseg,
-+ pnfs_osd_layout);
-+ if (status)
-+ goto err;
+
-+ dprintk("%s: Return %p\n", __func__, &objlseg->lseg);
-+ return &objlseg->lseg;
++ args = kzalloc(sizeof(*args), GFP_KERNEL);
++ if (!args) {
++ status = -ENOMEM;
++ goto out;
++ }
++ clr->clr_args = args;
++ args->args_op = clr;
++ msg.rpc_argp = args;
++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
++ &nfsd4_cb_layout_ops, clr);
++out:
++ if (status) {
++ kfree(args);
++ put_layoutrecall(clr);
++ }
++ dprintk("NFSD: nfsd4_cb_layout: status %d\n", status);
++ return status;
++}
+
-+ err:
-+ kfree(objlseg);
-+ return ERR_PTR(status);
++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata)
++{
++ struct nfs4_notify_device *cbnd = calldata;
++ nfsd4_cb_prepare_sequence(task, cbnd->nd_client);
+}
+
-+/*
-+ * Free a layout segement
-+ */
-+void
-+objlayout_free_lseg(struct pnfs_layout_segment *lseg)
++static void nfsd4_cb_device_done(struct rpc_task *task, void *calldata)
+{
-+ struct objlayout_segment *objlseg;
++ struct nfs4_notify_device *cbnd = calldata;
++ struct nfs4_client *clp = cbnd->nd_client;
+
-+ dprintk("%s: freeing layout segment %p\n", __func__, lseg);
++ nfsd4_cb_done_sequence(task, clp);
+
-+ if (unlikely(!lseg))
-+ return;
++ dprintk("%s: clp %p cb_client %p: status %d\n",
++ __func__,
++ clp,
++ clp->cl_cb_client,
++ task->tk_status);
+
-+ objlseg = container_of(lseg, struct objlayout_segment, lseg);
-+ objio_free_lseg(objlseg->internal);
-+ kfree(objlseg);
++ if (task->tk_status == -EIO) {
++ /* Network partition? */
++ atomic_set(&clp->cl_cb_set, 0);
++ warn_no_callback_path(clp, task->tk_status);
++ }
+}
+
-+/*
-+ * I/O Operations
-+ */
-+static inline u64
-+end_offset(u64 start, u64 len)
++static void nfsd4_cb_device_release(void *calldata)
+{
-+ u64 end;
-+
-+ end = start + len;
-+ return end >= start ? end : NFS4_MAX_UINT64;
++ struct nfs4_notify_device *cbnd = calldata;
++ kfree(cbnd->nd_args);
++ cbnd->nd_args = NULL;
++ kfree(cbnd);
+}
+
-+/* last octet in a range */
-+static inline u64
-+last_byte_offset(u64 start, u64 len)
-+{
-+ u64 end;
-+
-+ BUG_ON(!len);
-+ end = start + len;
-+ return end > start ? end - 1 : NFS4_MAX_UINT64;
-+}
++static const struct rpc_call_ops nfsd4_cb_device_ops = {
++ .rpc_call_prepare = nfsd4_cb_device_prepare,
++ .rpc_call_done = nfsd4_cb_device_done,
++ .rpc_release = nfsd4_cb_device_release,
++};
+
-+static struct objlayout_io_state *
-+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
-+ struct page **pages,
-+ unsigned pgbase,
-+ unsigned nr_pages,
-+ loff_t offset,
-+ size_t count,
-+ struct pnfs_layout_segment *lseg,
-+ void *rpcdata)
++/*
++ * Called with state lock.
++ */
++int
++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd)
+{
-+ struct objlayout_segment *objlseg =
-+ container_of(lseg, struct objlayout_segment, lseg);
-+ struct objlayout_io_state *state;
-+ u64 lseg_end_offset;
-+ size_t size_nr_pages;
++ struct nfs4_client *clp = cbnd->nd_client;
++ struct rpc_clnt *clnt = clp->cl_cb_client;
++ struct nfs4_rpc_args *args;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE],
++ .rpc_cred = callback_cred
++ };
++ int status = -EIO;
+
-+ dprintk("%s: allocating io_state\n", __func__);
-+ if (objio_alloc_io_state(objlseg->internal, &state))
-+ return NULL;
++ dprintk("%s: clp %p\n", __func__, clp);
+
-+ BUG_ON(offset < lseg->range.offset);
-+ lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length);
-+ BUG_ON(offset >= lseg_end_offset);
-+ if (offset + count > lseg_end_offset) {
-+ count = lseg->range.length - (offset - lseg->range.offset);
-+ dprintk("%s: truncated count %Zd\n", __func__, count);
++ args = kzalloc(sizeof(*args), GFP_KERNEL);
++ if (!args) {
++ status = -ENOMEM;
++ goto out;
+ }
++ args->args_op = cbnd;
++ msg.rpc_argp = args;
+
-+ if (pgbase > PAGE_SIZE) {
-+ unsigned n = pgbase >> PAGE_SHIFT;
-+
-+ pgbase &= ~PAGE_MASK;
-+ pages += n;
-+ nr_pages -= n;
-+ }
++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
++ &nfsd4_cb_device_ops, cbnd);
++out:
++ if (status)
++ kfree(args);
++ dprintk("%s: status %d\n", __func__, status);
++ return status;
++}
++#endif /* CONFIG_PNFSD */
+diff --git a/fs/nfsd/nfs4pnfsd.c b/fs/nfsd/nfs4pnfsd.c
+new file mode 100644
+index 0000000..8e8bae3
+--- /dev/null
++++ b/fs/nfsd/nfs4pnfsd.c
+@@ -0,0 +1,1688 @@
++/******************************************************************************
++ *
++ * (c) 2007 Network Appliance, Inc. All Rights Reserved.
++ * (c) 2009 NetApp. All Rights Reserved.
++ *
++ * NetApp provides this source code under the GPL v2 License.
++ * The GPL v2 license is available at
++ * http://opensource.org/licenses/gpl-license.php.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ *****************************************************************************/
+
-+ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
-+ BUG_ON(nr_pages < size_nr_pages);
-+ if (nr_pages > size_nr_pages)
-+ nr_pages = size_nr_pages;
++#include "pnfsd.h"
+
-+ INIT_LIST_HEAD(&state->err_list);
-+ state->objlseg = objlseg;
-+ state->rpcdata = rpcdata;
-+ state->pages = pages;
-+ state->pgbase = pgbase;
-+ state->nr_pages = nr_pages;
-+ state->offset = offset;
-+ state->count = count;
-+ state->sync = 0;
++#define NFSDDBG_FACILITY NFSDDBG_PROC
+
-+ return state;
-+}
++/* Globals */
++static u32 current_layoutid = 1;
+
-+static void
-+objlayout_free_io_state(struct objlayout_io_state *state)
-+{
-+ dprintk("%s: freeing io_state\n", __func__);
-+ if (unlikely(!state))
-+ return;
++/*
++ * Currently used for manipulating the layout state.
++ */
++static DEFINE_SPINLOCK(layout_lock);
+
-+ objio_free_io_state(state);
-+}
++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP)
++# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock))
++#else
++# define BUG_ON_UNLOCKED_LAYOUT()
++#endif
+
+/*
-+ * I/O done common code
++ * Layout state - NFSv4.1 pNFS
+ */
-+static void
-+objlayout_iodone(struct objlayout_io_state *state)
-+{
-+ dprintk("%s: state %p status\n", __func__, state);
++static struct kmem_cache *pnfs_layout_slab;
++static struct kmem_cache *pnfs_layoutrecall_slab;
+
-+ if (likely(state->status >= 0)) {
-+ objlayout_free_io_state(state);
-+ } else {
-+ struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.layout);
++/* hash table for nfsd4_pnfs_deviceid.sbid */
++#define SBID_HASH_BITS 8
++#define SBID_HASH_SIZE (1 << SBID_HASH_BITS)
++#define SBID_HASH_MASK (SBID_HASH_SIZE - 1)
+
-+ spin_lock(&objlay->lock);
-+ objlay->delta_space_valid = OBJ_DSU_INVALID;
-+ list_add(&objlay->err_list, &state->err_list);
-+ spin_unlock(&objlay->lock);
-+ }
-+}
++struct sbid_tracker {
++ u64 id;
++ struct super_block *sb;
++ struct list_head hash;
++};
+
-+/*
-+ * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
-+ *
-+ * The @index component IO failed (error returned from target). Register
-+ * the error for later reporting at layout-return.
-+ */
-+void
-+objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
-+ int osd_error, u64 offset, u64 length, bool is_write)
-+{
-+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
++static u64 current_sbid;
++static struct list_head sbid_hashtbl[SBID_HASH_SIZE];
+
-+ BUG_ON(index >= state->num_comps);
-+ if (osd_error) {
-+ struct pnfs_osd_layout *layout =
-+ (typeof(layout))state->objlseg->pnfs_osd_layout;
++static inline unsigned long
++sbid_hashval(struct super_block *sb)
++{
++ return hash_ptr(sb, SBID_HASH_BITS);
++}
+
-+ ioerr->oer_component = layout->olo_comps[index].oc_object_id;
-+ ioerr->oer_comp_offset = offset;
-+ ioerr->oer_comp_length = length;
-+ ioerr->oer_iswrite = is_write;
-+ ioerr->oer_errno = osd_error;
++static inline struct sbid_tracker *
++alloc_sbid(void)
++{
++ return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL);
++}
+
-+ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
-+ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
-+ __func__, index, ioerr->oer_errno,
-+ ioerr->oer_iswrite,
-+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
-+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
-+ ioerr->oer_component.oid_partition_id,
-+ ioerr->oer_component.oid_object_id,
-+ ioerr->oer_comp_offset,
-+ ioerr->oer_comp_length);
-+ } else {
-+ /* User need not call if no error is reported */
-+ ioerr->oer_errno = 0;
-+ }
++static void
++destroy_sbid(struct sbid_tracker *sbid)
++{
++ spin_lock(&layout_lock);
++ list_del(&sbid->hash);
++ spin_unlock(&layout_lock);
++ kfree(sbid);
+}
+
-+static void _rpc_commit_complete(struct work_struct *work)
++void
++nfsd4_free_pnfs_slabs(void)
+{
-+ struct rpc_task *task;
-+ struct nfs_write_data *wdata;
++ int i;
++ struct sbid_tracker *sbid;
+
-+ dprintk("%s enter\n", __func__);
-+ task = container_of(work, struct rpc_task, u.tk_work);
-+ wdata = container_of(task, struct nfs_write_data, task);
++ nfsd4_free_slab(&pnfs_layout_slab);
++ nfsd4_free_slab(&pnfs_layoutrecall_slab);
+
-+ pnfs_commit_done(wdata);
++ for (i = 0; i < SBID_HASH_SIZE; i++) {
++ while (!list_empty(&sbid_hashtbl[i])) {
++ sbid = list_first_entry(&sbid_hashtbl[i],
++ struct sbid_tracker,
++ hash);
++ destroy_sbid(sbid);
++ }
++ }
+}
+
-+/*
-+ * Commit data remotely on OSDs
-+ */
-+enum pnfs_try_status
-+objlayout_commit(struct nfs_write_data *wdata, int how)
++int
++nfsd4_init_pnfs_slabs(void)
+{
-+ int status = PNFS_ATTEMPTED;
-+
-+ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete);
-+ schedule_work(&wdata->task.u.tk_work);
-+ dprintk("%s: Return %d\n", __func__, status);
-+ return status;
-+}
++ int i;
+
-+/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
-+ * This is because the osd completion is called with ints-off from
-+ * the block layer
-+ */
-+static void _rpc_read_complete(struct work_struct *work)
-+{
-+ struct rpc_task *task;
-+ struct nfs_read_data *rdata;
++ pnfs_layout_slab = kmem_cache_create("pnfs_layouts",
++ sizeof(struct nfs4_layout), 0, 0, NULL);
++ if (pnfs_layout_slab == NULL)
++ return -ENOMEM;
++ pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls",
++ sizeof(struct nfs4_layoutrecall), 0, 0, NULL);
++ if (pnfs_layoutrecall_slab == NULL)
++ return -ENOMEM;
+
-+ dprintk("%s enter\n", __func__);
-+ task = container_of(work, struct rpc_task, u.tk_work);
-+ rdata = container_of(task, struct nfs_read_data, task);
++ for (i = 0; i < SBID_HASH_SIZE; i++) {
++ INIT_LIST_HEAD(&sbid_hashtbl[i]);
++ }
+
-+ pnfs_read_done(rdata);
++ return 0;
+}
+
-+void
-+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
++/* XXX: Need to implement the notify types and track which
++ * clients have which devices. */
++void pnfs_set_device_notify(clientid_t *clid, unsigned int types)
+{
-+ int eof = state->eof;
-+ struct nfs_read_data *rdata;
++ struct nfs4_client *clp;
++ dprintk("%s: -->\n", __func__);
+
-+ state->status = status;
-+ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
-+ rdata = state->rpcdata;
-+ rdata->task.tk_status = status;
-+ if (status >= 0) {
-+ rdata->res.count = status;
-+ rdata->res.eof = eof;
++ nfs4_lock_state();
++ /* Indicate that client has a device so we can only notify
++ * the correct clients */
++ clp = find_confirmed_client(clid);
++ if (clp) {
++ atomic_inc(&clp->cl_deviceref);
++ dprintk("%s: Incr device count (clnt %p) to %d\n",
++ __func__, clp, atomic_read(&clp->cl_deviceref));
+ }
-+ objlayout_iodone(state);
-+ /* must not use state after this point */
++ nfs4_unlock_state();
++}
+
-+ if (sync)
-+ pnfs_read_done(rdata);
-+ else {
-+ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
-+ schedule_work(&rdata->task.u.tk_work);
-+ }
++/* Clear notifications for this client
++ * XXX: Do we need to loop through and clean up all
++ * krefs when nfsd cleans up the client? */
++void pnfs_clear_device_notify(struct nfs4_client *clp)
++{
++ atomic_dec(&clp->cl_deviceref);
++ dprintk("%s: Decr device count (clnt %p) to %d\n",
++ __func__, clp, atomic_read(&clp->cl_deviceref));
+}
+
-+/*
-+ * Perform sync or async reads.
-+ */
-+enum pnfs_try_status
-+objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages)
++static struct nfs4_layout_state *
++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp,
++ stateid_t *stateid)
+{
-+ loff_t offset = rdata->args.offset;
-+ size_t count = rdata->args.count;
-+ struct objlayout_io_state *state;
-+ ssize_t status = 0;
-+ loff_t eof;
++ struct nfs4_layout_state *new;
+
-+ dprintk("%s: Begin inode %p offset %llu count %d\n",
-+ __func__, rdata->inode, offset, (int)count);
++ /* FIXME: use a kmem_cache */
++ new = kzalloc(sizeof(*new), GFP_KERNEL);
++ if (!new)
++ return new;
++ get_nfs4_file(fp);
++ INIT_LIST_HEAD(&new->ls_perfile);
++ INIT_LIST_HEAD(&new->ls_layouts);
++ kref_init(&new->ls_ref);
++ new->ls_client = clp;
++ new->ls_file = fp;
++ new->ls_stateid.si_boot = stateid->si_boot;
++ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */
++ new->ls_stateid.si_generation = 1;
++ spin_lock(&layout_lock);
++ new->ls_stateid.si_fileid = current_layoutid++;
++ list_add(&new->ls_perfile, &fp->fi_layout_states);
++ spin_unlock(&layout_lock);
++ return new;
++}
+
-+ eof = i_size_read(rdata->inode);
-+ if (unlikely(offset + count > eof)) {
-+ if (offset >= eof) {
-+ status = 0;
-+ rdata->res.count = 0;
-+ rdata->res.eof = 1;
-+ goto out;
-+ }
-+ count = eof - offset;
-+ }
++static inline void
++get_layout_state(struct nfs4_layout_state *ls)
++{
++ kref_get(&ls->ls_ref);
++}
+
-+ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
-+ rdata->args.pages, rdata->args.pgbase,
-+ nr_pages, offset, count,
-+ rdata->pdata.lseg, rdata);
-+ if (unlikely(!state)) {
-+ status = -ENOMEM;
-+ goto out;
-+ }
++static void
++destroy_layout_state_common(struct nfs4_layout_state *ls)
++{
++ struct nfs4_file *fp = ls->ls_file;
+
-+ state->eof = state->offset + state->count >= eof;
++ dprintk("pNFS %s: ls %p fp %p clp %p\n", __func__, ls, fp,
++ ls->ls_client);
++ BUG_ON(!list_empty(&ls->ls_layouts));
++ kfree(ls);
++ put_nfs4_file(fp);
++}
+
-+ status = objio_read_pagelist(state);
-+ out:
-+ dprintk("%s: Return status %Zd\n", __func__, status);
-+ rdata->pdata.pnfs_error = status;
-+ return PNFS_ATTEMPTED;
++static void
++destroy_layout_state(struct kref *kref)
++{
++ struct nfs4_layout_state *ls =
++ container_of(kref, struct nfs4_layout_state, ls_ref);
++
++ spin_lock(&layout_lock);
++ list_del(&ls->ls_perfile);
++ spin_unlock(&layout_lock);
++ destroy_layout_state_common(ls);
+}
+
-+/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
-+ * This is because the osd completion is called with ints-off from
-+ * the block layer
-+ */
-+static void _rpc_write_complete(struct work_struct *work)
++static void
++destroy_layout_state_locked(struct kref *kref)
+{
-+ struct rpc_task *task;
-+ struct nfs_write_data *wdata;
++ struct nfs4_layout_state *ls =
++ container_of(kref, struct nfs4_layout_state, ls_ref);
+
-+ dprintk("%s enter\n", __func__);
-+ task = container_of(work, struct rpc_task, u.tk_work);
-+ wdata = container_of(task, struct nfs_write_data, task);
++ list_del(&ls->ls_perfile);
++ destroy_layout_state_common(ls);
++}
+
-+ pnfs_writeback_done(wdata);
++static inline void
++put_layout_state(struct nfs4_layout_state *ls)
++{
++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
++ atomic_read(&ls->ls_ref.refcount));
++ kref_put(&ls->ls_ref, destroy_layout_state);
+}
+
-+void
-+objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
-+ bool sync)
++static inline void
++put_layout_state_locked(struct nfs4_layout_state *ls)
+{
-+ struct nfs_write_data *wdata;
++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
++ atomic_read(&ls->ls_ref.refcount));
++ kref_put(&ls->ls_ref, destroy_layout_state_locked);
++}
+
-+ dprintk("%s: Begin\n", __func__);
-+ wdata = state->rpcdata;
-+ state->status = status;
-+ wdata->task.tk_status = status;
-+ if (status >= 0) {
-+ wdata->res.count = status;
-+ wdata->verf.committed = state->committed;
-+ dprintk("%s: Return status %d committed %d\n",
-+ __func__, wdata->task.tk_status,
-+ wdata->verf.committed);
-+ } else
-+ dprintk("%s: Return status %d\n",
-+ __func__, wdata->task.tk_status);
-+ objlayout_iodone(state);
-+ /* must not use state after this point */
++/*
++ * Search the fp->fi_layout_states list for a layout state with the clientid.
++ * If not found, then this is a 'first open/delegation/lock stateid' from
++ * the client for this file.
++ * Called under the layout_lock.
++ */
++static struct nfs4_layout_state *
++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp)
++{
++ struct nfs4_layout_state *ls;
+
-+ if (sync)
-+ pnfs_writeback_done(wdata);
-+ else {
-+ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
-+ schedule_work(&wdata->task.u.tk_work);
++ BUG_ON_UNLOCKED_LAYOUT();
++ list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) {
++ if (ls->ls_client == clp) {
++ dprintk("pNFS %s: before GET ls %p ls_ref %d\n",
++ __func__, ls,
++ atomic_read(&ls->ls_ref.refcount));
++ get_layout_state(ls);
++ return ls;
++ }
+ }
++ return NULL;
++}
++
++static __be32
++verify_stateid(struct nfs4_file *fp, stateid_t *stateid)
++{
++ struct nfs4_stateid *local = NULL;
++ struct nfs4_delegation *temp = NULL;
++
++ /* check if open or lock stateid */
++ local = find_stateid(stateid, RD_STATE);
++ if (local)
++ return 0;
++ temp = find_delegation_stateid(fp->fi_inode, stateid);
++ if (temp)
++ return 0;
++ return nfserr_bad_stateid;
+}
+
+/*
-+ * Perform sync or async writes.
++ * nfs4_process_layout_stateid ()
++ *
++ * We have looked up the nfs4_file corresponding to the current_fh, and
++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op()
++ * that make sense with a layout stateid.
++ *
++ * Called with the state_lock held
++ * Returns zero and stateid is updated, or error.
++ *
++ * Note: the struct nfs4_layout_state pointer is only set by layoutget.
+ */
-+enum pnfs_try_status
-+objlayout_write_pagelist(struct nfs_write_data *wdata,
-+ unsigned nr_pages,
-+ int how)
++static __be32
++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp,
++ stateid_t *stateid, struct nfs4_layout_state **lsp)
+{
-+ struct objlayout_io_state *state;
-+ ssize_t status;
++ struct nfs4_layout_state *ls = NULL;
++ __be32 status = 0;
+
-+ dprintk("%s: Begin inode %p offset %llu count %u\n",
-+ __func__, wdata->inode, wdata->args.offset, wdata->args.count);
++ dprintk("--> %s clp %p fp %p \n", __func__, clp, fp);
+
-+ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
-+ wdata->args.pages,
-+ wdata->args.pgbase,
-+ nr_pages,
-+ wdata->args.offset,
-+ wdata->args.count,
-+ wdata->pdata.lseg, wdata);
-+ if (unlikely(!state)) {
-+ status = -ENOMEM;
++ dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__,
++ STATEID_VAL(stateid));
++
++ status = nfs4_check_stateid(stateid);
++ if (status)
+ goto out;
-+ }
+
-+ state->sync = how & FLUSH_SYNC;
++ /* Is this the first use of this layout ? */
++ spin_lock(&layout_lock);
++ ls = find_get_layout_state(clp, fp);
++ spin_unlock(&layout_lock);
++ if (!ls) {
++ /* Only alloc layout state on layoutget (which sets lsp). */
++ if (!lsp) {
++ dprintk("%s ERROR: Not layoutget & no layout stateid\n",
++ __func__);
++ status = nfserr_bad_stateid;
++ goto out;
++ }
++ dprintk("%s Initial stateid for layout: file %p client %p\n",
++ __func__, fp, clp);
+
-+ status = objio_write_pagelist(state, how & FLUSH_STABLE);
-+ out:
-+ dprintk("%s: Return status %Zd\n", __func__, status);
-+ wdata->pdata.pnfs_error = status;
-+ return PNFS_ATTEMPTED;
-+}
++ /* verify input stateid */
++ status = verify_stateid(fp, stateid);
++ if (status) {
++ dprintk("%s ERROR: invalid open/deleg/lock stateid\n",
++ __func__);
++ goto out;
++ }
++ ls = alloc_init_layout_state(clp, fp, stateid);
++ if (!ls) {
++ dprintk("%s pNFS ERROR: no memory for layout state\n",
++ __func__);
++ status = nfserr_resource;
++ goto out;
++ }
++ } else {
++ dprintk("%s Not initial stateid. Layout state %p file %p\n",
++ __func__, ls, fp);
+
-+void
-+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
-+ struct xdr_stream *xdr,
-+ const struct nfs4_layoutcommit_args *args)
-+{
-+ struct objlayout *objlay = OBJLAYOUT(pnfslay);
-+ struct pnfs_osd_layoutupdate lou;
-+ __be32 *start;
++ /* BAD STATEID */
++ status = nfserr_bad_stateid;
++ if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque,
++ sizeof(stateid_opaque_t)) != 0) {
+
-+ dprintk("%s: Begin\n", __func__);
++ /* if a LAYOUTGET operation and stateid is a valid
++ * open/deleg/lock stateid, accept it as a parallel
++ * initial layout stateid
++ */
++ if (lsp && ((verify_stateid(fp, stateid)) == 0)) {
++ dprintk("%s parallel initial layout state\n",
++ __func__);
++ goto verified;
++ }
+
-+ spin_lock(&objlay->lock);
-+ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
-+ lou.dsu_delta = objlay->delta_space_used;
-+ objlay->delta_space_used = 0;
-+ objlay->delta_space_valid = OBJ_DSU_INIT;
-+ lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
-+ spin_unlock(&objlay->lock);
++ dprintk("%s ERROR bad opaque in stateid 1\n", __func__);
++ goto out_put;
++ }
+
-+ start = xdr_reserve_space(xdr, 4);
++ /* stateid is a valid layout stateid for this file. */
++ if (stateid->si_generation > ls->ls_stateid.si_generation) {
++ dprintk("%s bad stateid 1\n", __func__);
++ goto out_put;
++ }
++ }
++verified:
++ status = 0;
+
-+ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
++ /* Return the layout state if requested */
++ if (lsp) {
++ get_layout_state(ls);
++ *lsp = ls;
++ }
++ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__,
++ STATEID_VAL(&ls->ls_stateid));
++out_put:
++ dprintk("%s PUT LO STATE:\n", __func__);
++ put_layout_state(ls);
++out:
++ dprintk("<-- %s status %d\n", __func__, htonl(status));
+
-+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
++ return status;
++}
+
-+ dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
-+ lou.dsu_delta, lou.olu_ioerr_flag);
++static inline struct nfs4_layout *
++alloc_layout(void)
++{
++ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL);
+}
+
-+static int
-+err_prio(u32 oer_errno)
++static inline void
++free_layout(struct nfs4_layout *lp)
+{
-+ switch (oer_errno) {
-+ case 0:
-+ return 0;
++ kmem_cache_free(pnfs_layout_slab, lp);
++}
+
-+ case PNFS_OSD_ERR_RESOURCE:
-+ return OSD_ERR_PRI_RESOURCE;
-+ case PNFS_OSD_ERR_BAD_CRED:
-+ return OSD_ERR_PRI_BAD_CRED;
-+ case PNFS_OSD_ERR_NO_ACCESS:
-+ return OSD_ERR_PRI_NO_ACCESS;
-+ case PNFS_OSD_ERR_UNREACHABLE:
-+ return OSD_ERR_PRI_UNREACHABLE;
-+ case PNFS_OSD_ERR_NOT_FOUND:
-+ return OSD_ERR_PRI_NOT_FOUND;
-+ case PNFS_OSD_ERR_NO_SPACE:
-+ return OSD_ERR_PRI_NO_SPACE;
-+ default:
-+ WARN_ON(1);
-+ /* fallthrough */
-+ case PNFS_OSD_ERR_EIO:
-+ return OSD_ERR_PRI_EIO;
-+ }
++#define update_layout_stateid(ls, sid) { \
++ update_stateid(&(ls)->ls_stateid); \
++ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", \
++ __func__, (ls)->ls_stateid.si_generation, (ls)); \
++ memcpy((sid), &(ls)->ls_stateid, sizeof(stateid_t)); \
+}
+
+static void
-+merge_ioerr(struct pnfs_osd_ioerr *dest_err,
-+ const struct pnfs_osd_ioerr *src_err)
++init_layout(struct nfs4_layout_state *ls,
++ struct nfs4_layout *lp,
++ struct nfs4_file *fp,
++ struct nfs4_client *clp,
++ struct svc_fh *current_fh,
++ struct nfsd4_layout_seg *seg,
++ stateid_t *stateid)
+{
-+ u64 dest_end, src_end;
-+
-+ if (!dest_err->oer_errno) {
-+ *dest_err = *src_err;
-+ /* accumulated device must be blank */
-+ memset(&dest_err->oer_component.oid_device_id, 0,
-+ sizeof(dest_err->oer_component.oid_device_id));
-+
-+ return;
-+ }
-+
-+ if (dest_err->oer_component.oid_partition_id !=
-+ src_err->oer_component.oid_partition_id)
-+ dest_err->oer_component.oid_partition_id = 0;
-+
-+ if (dest_err->oer_component.oid_object_id !=
-+ src_err->oer_component.oid_object_id)
-+ dest_err->oer_component.oid_object_id = 0;
-+
-+ if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
-+ dest_err->oer_comp_offset = src_err->oer_comp_offset;
-+
-+ dest_end = end_offset(dest_err->oer_comp_offset,
-+ dest_err->oer_comp_length);
-+ src_end = end_offset(src_err->oer_comp_offset,
-+ src_err->oer_comp_length);
-+ if (dest_end < src_end)
-+ dest_end = src_end;
++ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__,
++ ls, lp, clp, fp, fp->fi_inode);
+
-+ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
++ get_nfs4_file(fp);
++ lp->lo_client = clp;
++ lp->lo_file = fp;
++ get_layout_state(ls);
++ lp->lo_state = ls;
++ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg));
++ spin_lock(&layout_lock);
++ update_layout_stateid(ls, stateid);
++ list_add_tail(&lp->lo_perstate, &ls->ls_layouts);
++ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts);
++ list_add_tail(&lp->lo_perfile, &fp->fi_layouts);
++ spin_unlock(&layout_lock);
++ dprintk("pNFS %s end\n", __func__);
++}
+
-+ if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
-+ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
-+ dest_err->oer_errno = src_err->oer_errno;
-+ } else if (src_err->oer_iswrite) {
-+ dest_err->oer_iswrite = true;
-+ dest_err->oer_errno = src_err->oer_errno;
-+ }
++static void
++dequeue_layout(struct nfs4_layout *lp)
++{
++ BUG_ON_UNLOCKED_LAYOUT();
++ list_del(&lp->lo_perclnt);
++ list_del(&lp->lo_perfile);
++ list_del(&lp->lo_perstate);
+}
+
+static void
-+encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr)
++destroy_layout(struct nfs4_layout *lp)
+{
-+ struct objlayout_io_state *state, *tmp;
-+ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
++ struct nfs4_client *clp;
++ struct nfs4_file *fp;
++ struct nfs4_layout_state *ls;
+
-+ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
-+ unsigned i;
++ BUG_ON_UNLOCKED_LAYOUT();
++ clp = lp->lo_client;
++ fp = lp->lo_file;
++ ls = lp->lo_state;
++ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n",
++ __func__, lp, clp, fp, fp->fi_inode,
++ list_empty(&ls->ls_layouts));
+
-+ for (i = 0; i < state->num_comps; i++) {
-+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
++ kmem_cache_free(pnfs_layout_slab, lp);
++ /* release references taken by init_layout */
++ put_layout_state_locked(ls);
++ put_nfs4_file(fp);
++}
+
-+ if (!ioerr->oer_errno)
-+ continue;
++void fs_layout_return(struct super_block *sb, struct inode *ino,
++ struct nfsd4_pnfs_layoutreturn *lrp, int flags,
++ void *recall_cookie)
++{
++ int ret;
+
-+ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
-+ "dev(%llx:%llx) par=0x%llx obj=0x%llx "
-+ "offset=0x%llx length=0x%llx\n",
-+ __func__, i, ioerr->oer_errno,
-+ ioerr->oer_iswrite,
-+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
-+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
-+ ioerr->oer_component.oid_partition_id,
-+ ioerr->oer_component.oid_object_id,
-+ ioerr->oer_comp_offset,
-+ ioerr->oer_comp_length);
++ if (unlikely(!sb->s_pnfs_op->layout_return))
++ return;
+
-+ merge_ioerr(&accumulated_err, ioerr);
-+ }
-+ list_del(&state->err_list);
-+ objlayout_free_io_state(state);
-+ }
++ lrp->lr_flags = flags;
++ lrp->args.lr_cookie = recall_cookie;
+
-+ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err));
++ if (!ino) /* FSID or ALL */
++ ino = sb->s_root->d_inode;
++
++ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args);
++ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx "
++ "cookie = %p flags 0x%x status=%d\n",
++ __func__, ino->i_ino, lrp->args.lr_seg.iomode,
++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length,
++ recall_cookie, flags, ret);
+}
+
-+void
-+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
-+ struct xdr_stream *xdr,
-+ const struct nfs4_layoutreturn_args *args)
++static u64
++alloc_init_sbid(struct super_block *sb)
+{
-+ struct objlayout *objlay = OBJLAYOUT(pnfslay);
-+ struct objlayout_io_state *state, *tmp;
-+ __be32 *start, *uninitialized_var(last_xdr);
-+
-+ dprintk("%s: Begin\n", __func__);
-+ start = xdr_reserve_space(xdr, 4);
-+ BUG_ON(!start);
-+
-+ spin_lock(&objlay->lock);
++ struct sbid_tracker *sbid;
++ struct sbid_tracker *new = alloc_sbid();
++ unsigned long hash_idx = sbid_hashval(sb);
++ u64 id = 0;
+
-+ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
-+ unsigned i;
-+ int res = 0;
++ if (likely(new)) {
++ spin_lock(&layout_lock);
++ id = ++current_sbid;
++ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK);
++ id = new->id;
++ BUG_ON(id == 0);
++ new->sb = sb;
+
-+ for (i = 0; i < state->num_comps && !res; i++) {
-+ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash)
++ if (sbid->sb == sb) {
++ kfree(new);
++ id = sbid->id;
++ spin_unlock(&layout_lock);
++ return id;
++ }
++ list_add(&new->hash, &sbid_hashtbl[hash_idx]);
++ spin_unlock(&layout_lock);
++ }
++ return id;
++}
+
-+ if (!ioerr->oer_errno)
-+ continue;
++struct super_block *
++find_sbid_id(u64 id)
++{
++ struct sbid_tracker *sbid;
++ struct super_block *sb = NULL;
++ unsigned long hash_idx = id & SBID_HASH_MASK;
++ int pos = 0;
+
-+ dprintk("%s: err[%d]: errno=%d is_write=%d "
-+ "dev(%llx:%llx) par=0x%llx obj=0x%llx "
-+ "offset=0x%llx length=0x%llx\n",
-+ __func__, i, ioerr->oer_errno,
-+ ioerr->oer_iswrite,
-+ _DEVID_LO(&ioerr->oer_component.oid_device_id),
-+ _DEVID_HI(&ioerr->oer_component.oid_device_id),
-+ ioerr->oer_component.oid_partition_id,
-+ ioerr->oer_component.oid_object_id,
-+ ioerr->oer_comp_offset,
-+ ioerr->oer_comp_length);
++ spin_lock(&layout_lock);
++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
++ pos++;
++ if (sbid->id != id)
++ continue;
++ if (pos > 1)
++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
++ sb = sbid->sb;
++ break;
++ }
++ spin_unlock(&layout_lock);
++ return sb;
++}
+
-+ last_xdr = xdr->p;
-+ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]);
-+ }
-+ if (unlikely(res)) {
-+ /* no space for even one error descriptor */
-+ BUG_ON(last_xdr == start + 1);
++u64
++find_create_sbid(struct super_block *sb)
++{
++ struct sbid_tracker *sbid;
++ unsigned long hash_idx = sbid_hashval(sb);
++ int pos = 0;
++ u64 id = 0;
+
-+ /* we've encountered a situation with lots and lots of
-+ * errors and no space to encode them all. Use the last
-+ * available slot to report the union of all the
-+ * remaining errors.
-+ */
-+ xdr_rewind_stream(xdr, last_xdr -
-+ pnfs_osd_ioerr_xdr_sz() / 4);
-+ encode_accumulated_error(objlay, xdr);
-+ goto loop_done;
-+ }
-+ list_del(&state->err_list);
-+ objlayout_free_io_state(state);
++ spin_lock(&layout_lock);
++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) {
++ pos++;
++ if (sbid->sb != sb)
++ continue;
++ if (pos > 1)
++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]);
++ id = sbid->id;
++ break;
+ }
-+loop_done:
-+ spin_unlock(&objlay->lock);
++ spin_unlock(&layout_lock);
+
-+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
-+ dprintk("%s: Return\n", __func__);
-+}
++ if (!id)
++ id = alloc_init_sbid(sb);
+
-+struct objlayout_deviceinfo {
-+ struct page *page;
-+ struct pnfs_osd_deviceaddr da; /* This must be last */
-+};
++ return id;
++}
+
-+/* Initialize and call nfs_getdeviceinfo, then decode and return a
-+ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
-+ * should be called.
++/*
++ * Create a layoutrecall structure
++ * An optional layoutrecall can be cloned (except for the layoutrecall lists)
+ */
-+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
-+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr)
++static struct nfs4_layoutrecall *
++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl,
++ struct nfs4_client *clp,
++ struct nfs4_file *lrfile)
+{
-+ struct objlayout_deviceinfo *odi;
-+ struct pnfs_device pd;
-+ struct super_block *sb;
-+ struct page *page;
-+ size_t sz;
-+ u32 *p;
-+ int err;
++ struct nfs4_layoutrecall *clr;
+
-+ page = alloc_page(GFP_KERNEL);
-+ if (!page)
-+ return -ENOMEM;
++ dprintk("NFSD %s\n", __func__);
++ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL);
++ if (clr == NULL)
++ return clr;
+
-+ pd.area = page_address(page);
++ dprintk("NFSD %s -->\n", __func__);
+
-+ memcpy(&pd.dev_id, d_id, sizeof(*d_id));
-+ pd.layout_type = LAYOUT_OSD2_OBJECTS;
-+ pd.pages = &page;
-+ pd.pgbase = 0;
-+ pd.pglen = PAGE_SIZE;
-+ pd.mincount = 0;
++ memset(clr, 0, sizeof(*clr));
++ if (lrfile)
++ get_nfs4_file(lrfile);
++ clr->clr_client = clp;
++ clr->clr_file = lrfile;
++ clr->cb = *cbl;
+
-+ sb = pnfslay->inode->i_sb;
-+ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->inode), &pd);
-+ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
-+ if (err)
-+ goto err_out;
++ kref_init(&clr->clr_ref);
++ INIT_LIST_HEAD(&clr->clr_perclnt);
+
-+ p = pd.area;
-+ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p);
-+ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL);
-+ if (!odi) {
-+ err = -ENOMEM;
-+ goto err_out;
-+ }
-+ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
-+ odi->page = page;
-+ *deviceaddr = &odi->da;
-+ return 0;
++ dprintk("NFSD %s return %p\n", __func__, clr);
++ return clr;
++}
+
-+err_out:
-+ __free_page(page);
-+ return err;
++static void
++get_layoutrecall(struct nfs4_layoutrecall *clr)
++{
++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
++ atomic_read(&clr->clr_ref.refcount));
++ kref_get(&clr->clr_ref);
+}
+
-+void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
++static void
++destroy_layoutrecall(struct kref *kref)
+{
-+ struct objlayout_deviceinfo *odi = container_of(deviceaddr,
-+ struct objlayout_deviceinfo,
-+ da);
-+
-+ __free_page(odi->page);
-+ kfree(odi);
++ struct nfs4_layoutrecall *clr =
++ container_of(kref, struct nfs4_layoutrecall, clr_ref);
++ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr,
++ clr->clr_file, clr->clr_client);
++ BUG_ON(!list_empty(&clr->clr_perclnt));
++ if (clr->clr_file)
++ put_nfs4_file(clr->clr_file);
++ kmem_cache_free(pnfs_layoutrecall_slab, clr);
+}
+
-+/*
-+ * Initialize a mountpoint by retrieving the list of
-+ * available devices for it.
-+ * Return the pnfs_mount_type structure so the
-+ * pNFS_client can refer to the mount point later on.
-+ */
+int
-+objlayout_initialize_mountpoint(struct nfs_server *server,
-+ const struct nfs_fh *mntfh)
++put_layoutrecall(struct nfs4_layoutrecall *clr)
+{
-+ void *data;
++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
++ atomic_read(&clr->clr_ref.refcount));
++ return kref_put(&clr->clr_ref, destroy_layoutrecall);
++}
+
-+ data = objio_init_mt();
-+ if (IS_ERR(data)) {
-+ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n",
-+ __func__, PTR_ERR(data));
-+ return PTR_ERR(data);
-+ }
-+ server->pnfs_ld_data = data;
++void *
++layoutrecall_done(struct nfs4_layoutrecall *clr)
++{
++ void *recall_cookie = clr->cb.cbl_cookie;
++ struct nfs4_layoutrecall *parent = clr->parent;
+
-+ dprintk("%s: Return data=%p\n", __func__, data);
-+ return 0;
++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr,
++ atomic_read(&clr->clr_ref.refcount));
++ BUG_ON_UNLOCKED_LAYOUT();
++ list_del_init(&clr->clr_perclnt);
++ put_layoutrecall(clr);
++
++ if (parent && !put_layoutrecall(parent))
++ recall_cookie = NULL;
++
++ return recall_cookie;
+}
+
+/*
-+ * Uninitialize a mountpoint
++ * get_state() and cb_get_state() are
+ */
-+int
-+objlayout_uninitialize_mountpoint(struct nfs_server *server)
++void
++release_pnfs_ds_dev_list(struct nfs4_stateid *stp)
+{
-+ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data);
-+ objio_fini_mt(server->pnfs_ld_data);
-+ return 0;
++ struct pnfs_ds_dev_entry *ddp;
++
++ while (!list_empty(&stp->st_pnfs_ds_id)) {
++ ddp = list_entry(stp->st_pnfs_ds_id.next,
++ struct pnfs_ds_dev_entry, dd_dev_entry);
++ list_del(&ddp->dd_dev_entry);
++ kfree(ddp);
++ }
+}
-diff -up linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.h
---- linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-09-30 12:25:08.371283000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.h 2010-09-30 12:25:08.373280000 -0400
-@@ -0,0 +1,206 @@
-+/*
-+ * objlayout.h
-+ *
-+ * Data types and function declerations for interfacing with the
-+ * pNFS standard object layout driver.
-+ *
-+ * Copyright (C) 2007-2009 Panasas Inc.
-+ * All rights reserved.
-+ *
-+ * Benny Halevy <bhalevy at panasas.com>
-+ * Boaz Harrosh <bharrosh at panasas.com>
-+ *
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License version 2
-+ * See the file COPYING included with this distribution for more details.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ * 2. Redistributions in binary form must reproduce the above copyright
-+ * notice, this list of conditions and the following disclaimer in the
-+ * documentation and/or other materials provided with the distribution.
-+ * 3. Neither the name of the Panasas company nor the names of its
-+ * contributors may be used to endorse or promote products derived
-+ * from this software without specific prior written permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ */
+
-+#ifndef _OBJLAYOUT_H
-+#define _OBJLAYOUT_H
++static int
++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid)
++{
++ struct pnfs_ds_dev_entry *ddp;
+
-+#include <linux/nfs_fs.h>
-+#include <linux/pnfs_osd_xdr.h>
-+#include "../pnfs.h"
++ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL);
++ if (!ddp)
++ return -ENOMEM;
+
-+/*
-+ * in-core layout segment
-+ */
-+struct objlayout_segment {
-+ struct pnfs_layout_segment lseg;
-+ void *internal; /* for provider internal use */
-+ u8 pnfs_osd_layout[];
-+};
++ INIT_LIST_HEAD(&ddp->dd_dev_entry);
++ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id);
++ ddp->dd_dsid = dsid;
++ return 0;
++}
+
+/*
-+ * per-inode layout
++ * are two octet ranges overlapping?
++ * start1 last1
++ * |-----------------|
++ * start2 last2
++ * |----------------|
+ */
-+struct objlayout {
-+ struct pnfs_layout_hdr pnfs_layout;
++static inline int
++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
++{
++ u64 start1 = l1->offset;
++ u64 last1 = last_byte_offset(start1, l1->length);
++ u64 start2 = l2->offset;
++ u64 last2 = last_byte_offset(start2, l2->length);
++ int ret;
+
-+ /* for layout_commit */
-+ enum osd_delta_space_valid_enum {
-+ OBJ_DSU_INIT = 0,
-+ OBJ_DSU_VALID,
-+ OBJ_DSU_INVALID,
-+ } delta_space_valid;
-+ s64 delta_space_used; /* consumed by write ops */
++ /* if last1 == start2 there's a single byte overlap */
++ ret = (last2 >= start1) && (last1 >= start2);
++ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__,
++ l1->offset, l1->length, l2->offset, l2->length, ret);
++ return ret;
++}
+
-+ /* for layout_return */
-+ spinlock_t lock;
-+ struct list_head err_list;
-+};
++static inline int
++same_fsid_major(struct nfs4_fsid *fsid, u64 major)
++{
++ return fsid->major == major;
++}
+
-+static inline struct objlayout *
-+OBJLAYOUT(struct pnfs_layout_hdr *lo)
++static inline int
++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh)
+{
-+ return container_of(lo, struct objlayout, pnfs_layout);
++ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid);
+}
+
+/*
-+ * per-I/O operation state
-+ * embedded in objects provider io_state data structure
++ * find a layout recall conflicting with the specified layoutget
+ */
-+struct objlayout_io_state {
-+ struct objlayout_segment *objlseg;
++static int
++is_layout_recalled(struct nfs4_client *clp,
++ struct svc_fh *current_fh,
++ struct nfsd4_layout_seg *seg)
++{
++ struct nfs4_layoutrecall *clr;
+
-+ struct page **pages;
-+ unsigned pgbase;
-+ unsigned nr_pages;
-+ unsigned long count;
-+ loff_t offset;
-+ bool sync;
++ spin_lock(&layout_lock);
++ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) {
++ if (clr->cb.cbl_seg.layout_type != seg->layout_type)
++ continue;
++ if (clr->cb.cbl_recall_type == RETURN_ALL)
++ goto found;
++ if (clr->cb.cbl_recall_type == RETURN_FSID) {
++ if (same_fsid(&clr->cb.cbl_fsid, current_fh))
++ goto found;
++ else
++ continue;
++ }
++ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE);
++ if (clr->cb.cbl_seg.clientid == seg->clientid &&
++ lo_seg_overlapping(&clr->cb.cbl_seg, seg))
++ goto found;
++ }
++ spin_unlock(&layout_lock);
++ return 0;
++found:
++ spin_unlock(&layout_lock);
++ return 1;
++}
+
-+ void *rpcdata;
-+ int status; /* res */
-+ int eof; /* res */
-+ int committed; /* res */
++/*
++ * are two octet ranges overlapping or adjacent?
++ */
++static inline int
++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2)
++{
++ u64 start1 = l1->offset;
++ u64 end1 = end_offset(start1, l1->length);
++ u64 start2 = l2->offset;
++ u64 end2 = end_offset(start2, l2->length);
+
-+ /* Error reporting (layout_return) */
-+ struct list_head err_list;
-+ unsigned num_comps;
-+ /* Pointer to array of error descriptors of size num_comps.
-+ * It should contain as many entries as devices in the osd_layout
-+ * that participate in the I/O. It is up to the io_engine to allocate
-+ * needed space and set num_comps.
-+ */
-+ struct pnfs_osd_ioerr *ioerrs;
-+};
++ /* if end1 == start2 the ranges are adjacent */
++ return (end2 >= start1) && (end1 >= start2);
++}
++
++static void
++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg)
++{
++ u64 lo_start = lo->offset;
++ u64 lo_end = end_offset(lo_start, lo->length);
++ u64 lg_start = lg->offset;
++ u64 lg_end = end_offset(lg_start, lg->length);
+
-+/*
-+ * Raid engine I/O API
-+ */
-+extern void *objio_init_mt(void);
-+extern void objio_fini_mt(void *mt);
++ /* lo already covers lg? */
++ if (lo_start <= lg_start && lg_end <= lo_end)
++ return;
+
-+extern int objio_alloc_lseg(void **outp,
-+ struct pnfs_layout_hdr *pnfslay,
-+ struct pnfs_layout_segment *lseg,
-+ struct pnfs_osd_layout *layout);
-+extern void objio_free_lseg(void *p);
++ /* extend start offset */
++ if (lo_start > lg_start)
++ lo_start = lg_start;
+
-+extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp);
-+extern void objio_free_io_state(struct objlayout_io_state *state);
++ /* extend end offset */
++ if (lo_end < lg_end)
++ lo_end = lg_end;
+
-+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
-+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
-+ bool stable);
++ lo->offset = lo_start;
++ lo->length = (lo_end == NFS4_MAX_UINT64) ?
++ lo_end : lo_end - lo_start;
++}
+
-+/*
-+ * callback API
-+ */
-+extern void objlayout_io_set_result(struct objlayout_io_state *state,
-+ unsigned index, int osd_error,
-+ u64 offset, u64 length, bool is_write);
++/*
++ * Try to fold @seg into a layout already held on @fp.
++ *
++ * Walks fp->fi_layouts under layout_lock looking for a layout with the
++ * same layout type, clientid and iomode whose range overlaps or is
++ * adjacent to @seg, and extends the first such layout to cover @seg.
++ * @clp is not consulted here.
++ *
++ * Returns the layout that absorbed @seg, or NULL when none matched.
++ */
++static struct nfs4_layout *
++merge_layout(struct nfs4_file *fp,
++ struct nfs4_client *clp,
++ struct nfsd4_layout_seg *seg)
++{
++ struct nfs4_layout *lp;
++ struct nfs4_layout *merged = NULL;
 +
-+static inline void
-+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
++ spin_lock(&layout_lock);
++ list_for_each_entry (lp, &fp->fi_layouts, lo_perfile)
++ if (lp->lo_seg.layout_type == seg->layout_type &&
++ lp->lo_seg.clientid == seg->clientid &&
++ lp->lo_seg.iomode == seg->iomode &&
++ lo_seg_mergeable(&lp->lo_seg, seg)) {
++ extend_layout(&lp->lo_seg, seg);
++ /*
++ * Record the hit separately: when the loop runs to
++ * completion the cursor is not NULL, it points at the
++ * container computed from the list head.
++ */
++ merged = lp;
++ break;
++ }
++ spin_unlock(&layout_lock);
++
++ return merged;
++}
++
++/*
++ * Service a layoutget request described by @lgp.
++ *
++ * Validates the client and layout stateid under the nfs4 state lock,
++ * rejects the request if it conflicts with a pending layout recall,
++ * then drops the state lock while calling the exported file system's
++ * layout_get operation, which encodes into @xdr. On success the result
++ * segment is either merged into an existing layout (when the file
++ * system allows merging) or recorded in the pre-allocated layout.
++ *
++ * Returns a big-endian NFSv4 status (__be32); NFS4_OK on success.
++ */
++__be32
++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp,
++ struct exp_xdr_stream *xdr)
 +{
-+ struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.layout);
++ u32 status;
++ __be32 nfserr;
++ struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode;
++ struct super_block *sb = ino->i_sb;
++ int can_merge;
++ struct nfs4_file *fp;
++ struct nfs4_client *clp;
++ struct nfs4_layout *lp = NULL;
++ struct nfs4_layout_state *ls = NULL;
++ struct nfsd4_pnfs_layoutget_arg args = {
++ .lg_minlength = lgp->lg_minlength,
++ .lg_fh = &lgp->lg_fhp->fh_handle,
++ };
++ struct nfsd4_pnfs_layoutget_res res = {
++ .lg_seg = lgp->lg_seg,
++ };
 +
-+ /* If one of the I/Os errored out and the delta_space_used was
-+ * invalid we render the complete report as invalid. Protocol mandate
-+ * the DSU be accurate or not reported.
-+ */
-+ spin_lock(&objlay->lock);
-+ if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
-+ objlay->delta_space_valid = OBJ_DSU_VALID;
-+ objlay->delta_space_used += space_used;
++ dprintk("NFSD: %s Begin\n", __func__);
++
++ /* nothing is locked or referenced yet, so a plain exit is fine */
++ args.lg_sbid = find_create_sbid(sb);
++ if (!args.lg_sbid) {
++ nfserr = nfserr_layouttrylater;
++ goto out;
 + }
-+ spin_unlock(&objlay->lock);
-+}
 +
-+extern void objlayout_read_done(struct objlayout_io_state *state,
-+ ssize_t status, bool sync);
-+extern void objlayout_write_done(struct objlayout_io_state *state,
-+ ssize_t status, bool sync);
++ can_merge = sb->s_pnfs_op->can_merge_layouts != NULL &&
++ sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type);
 +
-+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
-+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr);
-+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
++ nfs4_lock_state();
++ fp = find_alloc_file(ino, lgp->lg_fhp);
++ clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid);
++ dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp);
++ if (!fp || !clp) {
++ nfserr = nfserr_inval;
++ goto out_unlock;
++ }
 +
-+/*
-+ * exported generic objects function vectors
-+ */
++ /* Check decoded layout stateid */
++ nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls);
++ if (nfserr)
++ goto out_unlock;
 +
-+extern int objlayout_initialize_mountpoint(
-+ struct nfs_server *,
-+ const struct nfs_fh *);
-+extern int objlayout_uninitialize_mountpoint(struct nfs_server *);
++ if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) {
++ nfserr = nfserr_recallconflict;
++ /* state lock is held and fp/ls are referenced: unwind them */
++ goto out_unlock;
++ }
 +
-+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *);
-+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
++ /* pre-alloc layout in case we can't merge after we call
++ * the file system
++ */
++ lp = alloc_layout();
++ if (!lp) {
++ nfserr = nfserr_layouttrylater;
++ goto out_unlock;
++ }
 +
-+extern struct pnfs_layout_segment *objlayout_alloc_lseg(
-+ struct pnfs_layout_hdr *,
-+ struct nfs4_layoutget_res *);
-+extern void objlayout_free_lseg(struct pnfs_layout_segment *);
++ dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd "
++ "iomode %u offset %llu length %llu\n",
++ __func__, lgp->lg_seg.layout_type,
++ exp_xdr_qbytes(xdr->end - xdr->p),
++ lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length);
 +
-+extern enum pnfs_try_status objlayout_read_pagelist(
-+ struct nfs_read_data *,
-+ unsigned nr_pages);
++ /* FIXME: need to eliminate the use of the state lock */
++ nfs4_unlock_state();
++ status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res);
++ nfs4_lock_state();
 +
-+extern enum pnfs_try_status objlayout_write_pagelist(
-+ struct nfs_write_data *,
-+ unsigned nr_pages,
-+ int how);
++ dprintk("pNFS %s: post-export status %u "
++ "iomode %u offset %llu length %llu\n",
++ __func__, status, res.lg_seg.iomode,
++ res.lg_seg.offset, res.lg_seg.length);
 +
-+extern enum pnfs_try_status objlayout_commit(
-+ struct nfs_write_data *,
-+ int how);
++ /*
++ * The allowable error codes for the layout_get pNFS export
++ * operations vector function (from the file system) can be
++ * expanded as needed to include other errors defined for
++ * the RFC 5661 LAYOUTGET operation.
++ */
++ switch (status) {
++ case 0:
++ nfserr = NFS4_OK;
++ break;
++ case NFS4ERR_ACCESS:
++ case NFS4ERR_BADIOMODE:
++ /* No support for LAYOUTIOMODE4_RW layouts */
++ case NFS4ERR_BADLAYOUT:
++ /* No layout matching loga_minlength rules */
++ case NFS4ERR_INVAL:
++ case NFS4ERR_IO:
++ case NFS4ERR_LAYOUTTRYLATER:
++ case NFS4ERR_LAYOUTUNAVAILABLE:
++ case NFS4ERR_LOCKED:
++ case NFS4ERR_NOSPC:
++ case NFS4ERR_RECALLCONFLICT:
++ case NFS4ERR_SERVERFAULT:
++ case NFS4ERR_TOOSMALL:
++ /* Requested layout too big for loga_maxcount */
++ case NFS4ERR_WRONG_TYPE:
++ /* Not a regular file */
++ nfserr = cpu_to_be32(status);
++ goto out_freelayout;
++ default:
++ BUG();
++ nfserr = nfserr_serverfault;
++ }
 +
-+extern void objlayout_encode_layoutcommit(
-+ struct pnfs_layout_hdr *,
-+ struct xdr_stream *,
-+ const struct nfs4_layoutcommit_args *);
++ lgp->lg_seg = res.lg_seg;
++ lgp->lg_roc = res.lg_return_on_close;
 +
-+extern void objlayout_encode_layoutreturn(
-+ struct pnfs_layout_hdr *,
-+ struct xdr_stream *,
-+ const struct nfs4_layoutreturn_args *);
++ /* SUCCESS!
++ * Can the new layout be merged into an existing one?
++ * If so, free unused layout struct
++ */
++ if (can_merge && merge_layout(fp, clp, &res.lg_seg))
++ goto out_freelayout;
 +
-+#endif /* _OBJLAYOUT_H */
-diff -up linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.c
---- linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-09-30 12:25:08.382280000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.c 2010-09-30 12:25:08.383286000 -0400
-@@ -0,0 +1,751 @@
-+/*
-+ * panfs_shim.c
-+ *
-+ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack
-+ *
-+ * Copyright (C) 2007-2009 Panasas Inc.
-+ * All rights reserved.
-+ *
-+ * Benny Halevy <bhalevy at panasas.com>
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ * 2. Redistributions in binary form must reproduce the above copyright
-+ * notice, this list of conditions and the following disclaimer in the
-+ * documentation and/or other materials provided with the distribution.
-+ * 3. Neither the name of the Panasas company nor the names of its
-+ * contributors may be used to endorse or promote products derived
-+ * from this software without specific prior written permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
-+ * See the file COPYING included with this distribution for more details.
-+ *
-+ */
++ /* Can't merge, so let's initialize this new layout */
++ init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg, &lgp->lg_sid);
++out_unlock:
++ if (ls)
++ put_layout_state(ls);
++ if (fp)
++ put_nfs4_file(fp);
++ nfs4_unlock_state();
++out:
++ dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp,
++ be32_to_cpu(nfserr));
++ return nfserr;
++out_freelayout:
++ free_layout(lp);
++ goto out_unlock;
++}
+
-+#include <linux/module.h>
-+#include <linux/slab.h>
-+#include <asm/byteorder.h>
++static void
++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr)
++{
++ u64 lo_start = lo->offset;
++ u64 lo_end = end_offset(lo_start, lo->length);
++ u64 lr_start = lr->offset;
++ u64 lr_end = end_offset(lr_start, lr->length);
+
-+#include "objlayout.h"
-+#include "panfs_shim.h"
++ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__,
++ lo->offset, lo->length, lr->offset, lr->length);
+
-+#include <linux/panfs_shim_api.h>
++ /* lr fully covers lo? */
++ if (lr_start <= lo_start && lo_end <= lr_end) {
++ lo->length = 0;
++ goto out;
++ }
+
-+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++ /*
++ * split not supported yet. retain layout segment.
++ * remains must be returned by the client
++ * on the final layout return.
++ */
++ if (lo_start < lr_start && lr_end < lo_end) {
++ dprintk("%s: split not supported\n", __func__);
++ goto out;
++ }
+
-+struct panfs_export_operations *panfs_export_ops;
++ if (lo_start < lr_start)
++ lo_end = lr_start - 1;
++ else /* lr_end < lo_end */
++ lo_start = lr_end + 1;
+
-+void *
-+objio_init_mt(void)
-+{
-+ return panfs_export_ops == NULL ? ERR_PTR(-EAGAIN) : NULL;
++ lo->offset = lo_start;
++ lo->length = (lo_end == NFS4_MAX_UINT64) ? lo_end : lo_end - lo_start;
++out:
++ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length);
+}
+
-+void objio_fini_mt(void *mountid)
++/*
++ * Apply a layout return to the layouts that @clp holds on @fp.
++ *
++ * Under layout_lock, every layout on fp->fi_layouts that belongs to
++ * @clp, has the returned layout type, matches the returned iomode (or
++ * the return uses IOMODE_ANY) and overlaps the returned range is
++ * trimmed by that range. A layout trimmed down to zero length is
++ * dequeued and destroyed, and lrp->lrs_present is cleared.
++ *
++ * When @ls is given, at least one layout matched and lrp->lrs_present
++ * is still set, the layout stateid in lrp->lr_sid is updated.
++ *
++ * Returns the number of layouts that matched.
++ */
++static int
++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp,
++ struct nfsd4_pnfs_layoutreturn *lrp,
++ struct nfs4_layout_state *ls)
 +{
++ struct nfsd4_layout_seg *ret_seg = &lrp->args.lr_seg;
++ struct nfs4_layout *cur, *tmp;
++ int nr_found = 0;
++
++ dprintk("%s: clp %p fp %p\n", __func__, clp, fp);
++ spin_lock(&layout_lock);
++ list_for_each_entry_safe (cur, tmp, &fp->fi_layouts, lo_perfile) {
++ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n",
++ __func__, cur,
++ cur->lo_client, clp,
++ cur->lo_seg.layout_type, ret_seg->layout_type,
++ cur->lo_seg.iomode, ret_seg->iomode);
++
++ /* held by a different client */
++ if (cur->lo_client != clp)
++ continue;
++ /* different layout type */
++ if (cur->lo_seg.layout_type != ret_seg->layout_type)
++ continue;
++ /* iomode must agree unless the return says IOMODE_ANY */
++ if (cur->lo_seg.iomode != ret_seg->iomode &&
++ ret_seg->iomode != IOMODE_ANY)
++ continue;
++ /* returned range does not touch this layout */
++ if (!lo_seg_overlapping(&cur->lo_seg, ret_seg))
++ continue;
++
++ nr_found++;
++ trim_layout(&cur->lo_seg, ret_seg);
++ if (cur->lo_seg.length)
++ continue;
++
++ /* nothing is left of this layout: drop it */
++ lrp->lrs_present = 0;
++ dequeue_layout(cur);
++ destroy_layout(cur);
++ }
++ if (ls && nr_found && lrp->lrs_present)
++ update_layout_stateid(ls, &lrp->lr_sid);
++ spin_unlock(&layout_lock);
++
++ return nr_found;
 +}
+
+static int
-+panfs_shim_conv_raid01(struct pnfs_osd_layout *layout,
-+ struct pnfs_osd_data_map *lo_map,
-+ pan_agg_layout_hdr_t *hdr)
++pnfs_return_client_layouts(struct nfs4_client *clp,
++ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid)
+{
-+ if (lo_map->odm_mirror_cnt) {
-+ hdr->type = PAN_AGG_RAID1;
-+ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1;
-+ } else if (layout->olo_num_comps > 1) {
-+ hdr->type = PAN_AGG_RAID0;
-+ hdr->hdr.raid0.num_comps = layout->olo_num_comps;
-+ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit;
-+ } else
-+ hdr->type = PAN_AGG_SIMPLE;
-+ return 0;
++ int layouts_found = 0;
++ struct nfs4_layout *lp, *nextlp;
++
++ spin_lock(&layout_lock);
++ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) {
++ if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type ||
++ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode &&
++ lrp->args.lr_seg.iomode != IOMODE_ANY))
++ continue;
++
++ if (lrp->args.lr_return_type == RETURN_FSID &&
++ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid))
++ continue;
++
++ layouts_found++;
++ dequeue_layout(lp);
++ destroy_layout(lp);
++ }
++ spin_unlock(&layout_lock);
++
++ return layouts_found;
+}
+
+static int
-+panfs_shim_conv_raid5(struct pnfs_osd_layout *layout,
-+ struct pnfs_osd_data_map *lo_map,
-+ pan_agg_layout_hdr_t *hdr)
++recall_return_perfect_match(struct nfs4_layoutrecall *clr,
++ struct nfsd4_pnfs_layoutreturn *lrp,
++ struct nfs4_file *fp,
++ struct svc_fh *current_fh)
+{
-+ if (lo_map->odm_mirror_cnt)
-+ goto err;
++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode ||
++ clr->cb.cbl_recall_type != lrp->args.lr_return_type)
++ return 0;
+
-+ if (lo_map->odm_group_width || lo_map->odm_group_depth) {
-+ if (!lo_map->odm_group_width || !lo_map->odm_group_depth)
-+ goto err;
++ return (clr->cb.cbl_recall_type == RETURN_FILE &&
++ clr->clr_file == fp &&
++ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset &&
++ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) ||
+
-+ hdr->type = PAN_AGG_GRP_RAID5_LEFT;
-+ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps;
-+ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps)
-+ goto err;
-+ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit;
-+ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width;
-+ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth;
-+ /* this is a guess, panasas server is not supposed to
-+ hand out layotu otherwise */
-+ hdr->hdr.grp_raid5_left.group_layout_policy =
-+ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN;
-+ } else {
-+ hdr->type = PAN_AGG_RAID5_LEFT;
-+ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps;
-+ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps)
-+ goto err;
-+ hdr->hdr.raid5_left.stripe_unit2 =
-+ hdr->hdr.raid5_left.stripe_unit1 =
-+ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit;
-+ }
++ (clr->cb.cbl_recall_type == RETURN_FSID &&
++ same_fsid(&clr->cb.cbl_fsid, current_fh)) ||
+
-+ return 0;
-+err:
-+ return -EINVAL;
++ clr->cb.cbl_recall_type == RETURN_ALL;
+}
+
-+/*
-+ * Convert a pnfs_osd data map into Panasas aggregation layout header
-+ */
+static int
-+panfs_shim_conv_pnfs_osd_data_map(
-+ struct pnfs_osd_layout *layout,
-+ pan_agg_layout_hdr_t *hdr)
++recall_return_partial_match(struct nfs4_layoutrecall *clr,
++ struct nfsd4_pnfs_layoutreturn *lrp,
++ struct nfs4_file *fp,
++ struct svc_fh *current_fh)
+{
-+ int status = -EINVAL;
-+ struct pnfs_osd_data_map *lo_map = &layout->olo_map;
++ /* iomode matching? */
++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode &&
++ clr->cb.cbl_seg.iomode != IOMODE_ANY &&
++ lrp->args.lr_seg.iomode != IOMODE_ANY)
++ return 0;
+
-+ if (!layout->olo_num_comps) {
-+ dprintk("%s: !!layout.n_comps(%u)\n", __func__,
-+ layout->olo_num_comps);
-+ goto err;
-+ }
++ if (clr->cb.cbl_recall_type == RETURN_ALL ||
++ lrp->args.lr_return_type == RETURN_ALL)
++ return 1;
+
-+ switch (lo_map->odm_raid_algorithm) {
-+ case PNFS_OSD_RAID_0:
-+ if (layout->olo_num_comps != lo_map->odm_num_comps ||
-+ layout->olo_comps_index) {
-+ dprintk("%s: !!PNFS_OSD_RAID_0 "
-+ "layout.n_comps(%u) map.n_comps(%u) "
-+ "comps_index(%u)\n", __func__,
-+ layout->olo_num_comps,
-+ lo_map->odm_num_comps,
-+ layout->olo_comps_index);
-+ goto err;
++ /* fsid matches? */
++ if (clr->cb.cbl_recall_type == RETURN_FSID ||
++ lrp->args.lr_return_type == RETURN_FSID)
++ return same_fsid(&clr->cb.cbl_fsid, current_fh);
++
++ /* file matches, range overlapping? */
++ return clr->clr_file == fp &&
++ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg);
++}
++
++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh,
++ struct nfsd4_pnfs_layoutreturn *lrp)
++{
++ int status = 0;
++ int layouts_found = 0;
++ struct inode *ino = current_fh->fh_dentry->d_inode;
++ struct nfs4_file *fp = NULL;
++ struct nfs4_client *clp;
++ struct nfs4_layout_state *ls = NULL;
++ struct nfs4_layoutrecall *clr, *nextclr;
++ u64 ex_fsid = current_fh->fh_export->ex_fsid;
++ void *recall_cookie = NULL;
++
++ dprintk("NFSD: %s\n", __func__);
++
++ nfs4_lock_state();
++ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid);
++ if (!clp)
++ goto out;
++
++ if (lrp->args.lr_return_type == RETURN_FILE) {
++ fp = find_file(ino);
++ if (!fp) {
++ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for "
++ "ino %p:%lu\n",
++ __func__, ino, ino ? ino->i_ino : 0L);
++ goto out;
+ }
-+ status = panfs_shim_conv_raid01(layout, lo_map, hdr);
-+ break;
+
-+ case PNFS_OSD_RAID_5:
-+ if (!lo_map->odm_group_width) {
-+ if (layout->olo_num_comps != lo_map->odm_num_comps ||
-+ layout->olo_comps_index) {
-+ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width "
-+ "layout.n_comps(%u)!=map.n_comps(%u) "
-+ "|| comps_index(%u)\n", __func__,
-+ layout->olo_num_comps,
-+ lo_map->odm_num_comps,
-+ layout->olo_comps_index);
-+ goto err;
-+ }
-+ } else if ((layout->olo_num_comps != lo_map->odm_num_comps &&
-+ layout->olo_num_comps > lo_map->odm_group_width) ||
-+ (layout->olo_comps_index % lo_map->odm_group_width)){
-+ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) "
-+ "layout.n_comps(%u) map.n_comps(%u) "
-+ "comps_index(%u)\n", __func__,
-+ lo_map->odm_group_width,
-+ layout->olo_num_comps,
-+ lo_map->odm_num_comps,
-+ layout->olo_comps_index);
-+ goto err;
-+ }
-+ status = panfs_shim_conv_raid5(layout, lo_map, hdr);
-+ break;
++ /* Check the stateid */
++ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino);
++ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, &ls);
++ if (status)
++ goto out_put_file;
++
++ /* update layouts */
++ layouts_found = pnfs_return_file_layouts(clp, fp, lrp, ls);
++ /* optimize for the all-empty case */
++ if (list_empty(&fp->fi_layouts))
++ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
++ } else {
++ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid);
++ }
++
++ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d "
++ "return_type %d fsid 0x%llx offset %llu length %llu: "
++ "layouts_found %d\n",
++ __func__, clp, fp, lrp->args.lr_seg.layout_type,
++ lrp->args.lr_seg.iomode, lrp->args.lr_return_type,
++ ex_fsid,
++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found);
++
++ /* update layoutrecalls
++ * note: for RETURN_{FSID,ALL}, fp may be NULL
++ */
++ spin_lock(&layout_lock);
++ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls,
++ clr_perclnt) {
++ if (clr->cb.cbl_seg.layout_type != lrp->args.lr_seg.layout_type)
++ continue;
+
-+ case PNFS_OSD_RAID_4:
-+ case PNFS_OSD_RAID_PQ:
-+ default:
-+ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__,
-+ lo_map->odm_raid_algorithm);
-+ goto err;
++ if (recall_return_perfect_match(clr, lrp, fp, current_fh))
++ recall_cookie = layoutrecall_done(clr);
++ else if (layouts_found &&
++ recall_return_partial_match(clr, lrp, fp, current_fh))
++ clr->clr_time = CURRENT_TIME;
+ }
++ spin_unlock(&layout_lock);
+
-+ return 0;
++out_put_file:
++ if (fp)
++ put_nfs4_file(fp);
++ if (ls)
++ put_layout_state(ls);
++out:
++ nfs4_unlock_state();
+
-+err:
++ /* call exported filesystem layout_return (ignore return-code) */
++ fs_layout_return(sb, ino, lrp, 0, recall_cookie);
++
++ dprintk("pNFS %s: exit status %d \n", __func__, status);
+ return status;
+}
+
+/*
-+ * Convert pnfs_osd layout into Panasas map and caps type
++ * PNFS Metadata server export operations callback for get_state
++ *
++ * called by the cluster fs when it receives a get_state() from a data
++ * server.
++ * returns status, or pnfs_get_state* with pnfs_get_state->status set.
++ *
+ */
+int
-+objio_alloc_lseg(void **outp,
-+ struct pnfs_layout_hdr *pnfslay,
-+ struct pnfs_layout_segment *lseg,
-+ struct pnfs_osd_layout *layout)
++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg)
+{
-+ int i, total_comps;
-+ int status;
-+ struct pnfs_osd_object_cred *lo_comp;
-+ pan_size_t alloc_sz, local_sz;
-+ pan_sm_map_cap_t *mcs = NULL;
-+ u8 *buf;
-+ pan_agg_comp_obj_t *pan_comp;
-+ pan_sm_sec_t *pan_sec;
-+
-+ status = -EINVAL;
-+ if (layout->olo_num_comps < layout->olo_map.odm_group_width) {
-+ total_comps = layout->olo_comps_index + layout->olo_num_comps;
-+ } else {
-+ /* allocate full map, otherwise SAM gets confused */
-+ total_comps = layout->olo_map.odm_num_comps;
-+ }
-+ alloc_sz = total_comps *
-+ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t));
-+ for (i = 0; i < layout->olo_num_comps; i++) {
-+ void *p = layout->olo_comps[i].oc_cap.cred;
-+ if (panfs_export_ops->sm_sec_t_get_size_otw(
-+ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL))
-+ goto err;
-+ alloc_sz += local_sz;
-+ }
++ struct nfs4_stateid *stp;
++ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */
++ int status = -EINVAL;
++ struct inode *ino;
++ struct nfs4_delegation *dl;
++ stateid_t *stid = (stateid_t *)&arg->stid;
+
-+ status = -ENOMEM;
-+ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL);
-+ if (!mcs)
-+ goto err;
-+ buf = (u8 *)&mcs[1];
++ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__,
++ STATEID_VAL(stid), arg->ino);
+
-+ mcs->offset = lseg->range.offset;
-+ mcs->length = lseg->range.length;
-+#if 0
-+ /* FIXME: for now */
-+ mcs->expiration_time.ts_sec = 0;
-+ mcs->expiration_time.ts_nsec = 0;
-+#endif
-+ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL;
-+ status = panfs_shim_conv_pnfs_osd_data_map(layout,
-+ &mcs->full_map.layout_hdr);
-+ if (status)
-+ goto err;
++ nfs4_lock_state();
++ stp = find_stateid(stid, flags);
++ if (!stp) {
++ ino = iget_locked(sb, arg->ino);
++ if (!ino)
++ goto out;
+
-+ mcs->full_map.components.size = total_comps;
-+ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf;
-+ buf += total_comps * sizeof(pan_agg_comp_obj_t);
++ if (ino->i_state & I_NEW) {
++ iget_failed(ino);
++ goto out;
++ }
+
-+ mcs->secs.size = total_comps;
-+ mcs->secs.data = (pan_sm_sec_t *)buf;
-+ buf += total_comps * sizeof(pan_sm_sec_t);
++ dl = find_delegation_stateid(ino, stid);
++ if (dl)
++ status = 0;
+
-+ lo_comp = layout->olo_comps;
-+ pan_comp = mcs->full_map.components.data + layout->olo_comps_index;
-+ pan_sec = mcs->secs.data + layout->olo_comps_index;
-+ for (i = 0; i < layout->olo_num_comps; i++) {
-+ void *p;
-+ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id;
-+ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id;
-+ u64 dev_id = __be64_to_cpup(
-+ (__be64 *)oc_obj_id->oid_device_id.data + 1);
++ iput(ino);
++ } else {
++ /* XXX ANDROS: marc removed nfs4_check_fh - how come? */
+
-+ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n",
-+ __func__, i,
-+ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data),
-+ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1),
-+ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id);
++ /* arg->dsid is the data server id, set by the cluster fs */
++ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid);
++ if (status)
++ goto out;
+
-+ if (i == 0) {
-+ /* make up mgr_id to calm sam down */
-+ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0,
-+ &obj_id->dev_id);
-+ obj_id->grp_id = oc_obj_id->oid_partition_id;
-+ obj_id->obj_id = oc_obj_id->oid_object_id;
-+ }
++ arg->access = stp->st_access_bmap;
++ *(clientid_t *)&arg->clid =
++ stp->st_stateowner->so_client->cl_clientid;
++ }
++out:
++ nfs4_unlock_state();
++ return status;
++}
+
-+ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) {
-+ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n",
-+ __func__, i, (u64)obj_id->grp_id,
-+ lo_comp->oc_object_id.oid_partition_id);
-+ status = -EINVAL;
-+ goto err;
-+ }
++static int
++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile,
++ stateid_t *lsid)
++{
++ int found = 0;
++ struct nfs4_layout *lp;
++ struct nfs4_layout_state *ls;
+
-+ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) {
-+ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n",
-+ __func__, i, obj_id->obj_id,
-+ lo_comp->oc_object_id.oid_object_id);
-+ status = -EINVAL;
-+ goto err;
-+ }
++ spin_lock(&layout_lock);
++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) {
++ if (lp->lo_file != lrfile)
++ continue;
+
-+ pan_comp->dev_id = dev_id;
-+ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) {
-+ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n",
-+ __func__, i, obj_id->dev_id);
-+ status = -EINVAL;
-+ goto err;
-+ }
-+ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) {
-+ dprintk("%s: degraded maps not supported yet\n",
-+ __func__);
-+ status = -ENOTSUPP;
-+ goto err;
-+ }
-+ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL;
-+ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) {
-+ dprintk("%s: cap key security not supported yet\n",
-+ __func__);
-+ status = -ENOTSUPP;
-+ goto err;
++ ls = find_get_layout_state(clp, lrfile);
++ if (!ls) {
++ /* This shouldn't happen as the file should have a
++ * layout stateid if it has a layout.
++ */
++ printk(KERN_ERR "%s: file %p has no layout stateid\n",
++ __func__, lrfile);
++ WARN_ON(1);
++ break;
+ }
-+
-+ p = lo_comp->oc_cap.cred;
-+ panfs_export_ops->sm_sec_t_unmarshall(
-+ (pan_sm_sec_otw_t *)&p,
-+ pan_sec,
-+ buf,
-+ alloc_sz,
-+ NULL,
-+ &local_sz);
-+ buf += local_sz;
-+ alloc_sz -= local_sz;
-+
-+ lo_comp++;
-+ pan_comp++;
-+ pan_sec++;
++ update_layout_stateid(ls, lsid);
++ put_layout_state_locked(ls);
++ found = 1;
++ break;
+ }
++ spin_unlock(&layout_lock);
+
-+ *outp = mcs;
-+ dprintk("%s:Return mcs=%p\n", __func__, mcs);
-+ return 0;
-+
-+err:
-+ objio_free_lseg(mcs);
-+ dprintk("%s:Error %d\n", __func__, status);
-+ return status;
++ return found;
+}
+
-+/*
-+ * Free a Panasas map and caps type
-+ */
-+void
-+objio_free_lseg(void *p)
++static int
++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid)
+{
-+ kfree(p);
++ int found = 0;
++ struct nfs4_layout *lp;
++
++ /* note: minor version unused */
++ spin_lock(&layout_lock);
++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt)
++ if (lp->lo_file->fi_fsid.major == fsid->major) {
++ found = 1;
++ break;
++ }
++ spin_unlock(&layout_lock);
++ return found;
+}
+
-+/*
-+ * I/O routines
-+ */
-+int
-+objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
++static int
++cl_has_any_layout(struct nfs4_client *clp)
+{
-+ struct panfs_shim_io_state *p;
-+
-+ dprintk("%s: allocating io_state\n", __func__);
-+ p = kzalloc(sizeof(*p), GFP_KERNEL);
-+ if (!p)
-+ return -ENOMEM;
++ return !list_empty(&clp->cl_layouts);
++}
+
-+ *outp = &p->ol_state;
-+ return 0;
++/*
++ * Does @clp hold a layout matching the recall described by @cbl?
++ *
++ * Dispatches on cbl->cbl_recall_type: RETURN_FILE checks @lrfile (and
++ * hands @lsid to the per-file helper), RETURN_FSID checks the recall's
++ * fsid, and any other type checks for any layout at all.
++ */
++static int
++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl,
++ struct nfs4_file *lrfile, stateid_t *lsid)
++{
++ if (cbl->cbl_recall_type == RETURN_FILE)
++ return cl_has_file_layout(clp, lrfile, lsid);
++
++ if (cbl->cbl_recall_type == RETURN_FSID)
++ return cl_has_fsid_layout(clp, &cbl->cbl_fsid);
++
++ /* every other recall type */
++ return cl_has_any_layout(clp);
 +}
+
+/*
-+ * Free an I/O state
++ * Called without the layout_lock.
+ */
+void
-+objio_free_io_state(struct objlayout_io_state *ol_state)
++nomatching_layout(struct nfs4_layoutrecall *clr)
+{
-+ struct panfs_shim_io_state *state = container_of(ol_state,
-+ struct panfs_shim_io_state, ol_state);
-+ int i;
++ struct nfsd4_pnfs_layoutreturn lr = {
++ .args.lr_return_type = clr->cb.cbl_recall_type,
++ .args.lr_seg = clr->cb.cbl_seg,
++ };
++ struct inode *inode;
++ void *recall_cookie;
+
-+ dprintk("%s: freeing io_state\n", __func__);
-+ for (i = 0; i < state->ol_state.nr_pages; i++)
-+ kunmap(state->ol_state.pages[i]);
++ if (clr->clr_file) {
++ inode = igrab(clr->clr_file->fi_inode);
++ if (WARN_ON(!inode))
++ return;
++ } else {
++ inode = NULL;
++ }
+
-+ if (state->ucreds)
-+ panfs_export_ops->ucreds_put(state->ucreds);
-+ kfree(state->sg_list);
-+ kfree(state);
++ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__,
++ clr->clr_client, clr->clr_file);
++
++ if (clr->cb.cbl_recall_type == RETURN_FILE)
++ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr,
++ NULL);
++ else
++ pnfs_return_client_layouts(clr->clr_client, &lr,
++ clr->cb.cbl_fsid.major);
++
++ spin_lock(&layout_lock);
++ recall_cookie = layoutrecall_done(clr);
++ spin_unlock(&layout_lock);
++
++ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN,
++ recall_cookie);
++ iput(inode);
+}
+
-+static int
-+panfs_shim_pages_to_sg(
-+ struct panfs_shim_io_state *state,
-+ struct page **pages,
-+ unsigned int pgbase,
-+ unsigned nr_pages,
-+ size_t count)
++void pnfs_expire_client(struct nfs4_client *clp)
+{
-+ unsigned i, n;
-+ pan_sg_entry_t *sg;
++ for (;;) {
++ struct nfs4_layoutrecall *lrp = NULL;
+
-+ dprintk("%s pgbase %u nr_pages %u count %d "
-+ "pg0 %p flags 0x%x index %llu\n",
-+ __func__, pgbase, nr_pages, (int)count, pages[0],
-+ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index);
++ spin_lock(&layout_lock);
++ if (!list_empty(&clp->cl_layoutrecalls)) {
++ lrp = list_entry(clp->cl_layoutrecalls.next,
++ struct nfs4_layoutrecall, clr_perclnt);
++ get_layoutrecall(lrp);
++ }
++ spin_unlock(&layout_lock);
++ if (!lrp)
++ break;
+
-+ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL);
-+ if (sg == NULL)
-+ return -ENOMEM;
++ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file);
++ BUG_ON(lrp->clr_client != clp);
++ nomatching_layout(lrp);
++ put_layoutrecall(lrp);
++ }
+
-+ dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n",
-+ __func__, sg, pages, pgbase, nr_pages);
++ for (;;) {
++ struct nfs4_layout *lp = NULL;
++ struct inode *inode = NULL;
++ struct nfsd4_pnfs_layoutreturn lr;
++ bool empty = false;
+
-+ for (i = 0; i < nr_pages; i++) {
-+ sg[i].buffer = (char *)kmap(pages[i]) + pgbase;
-+ n = PAGE_SIZE - pgbase;
-+ pgbase = 0;
-+ if (n > count)
-+ n = count;
-+ sg[i].chunk_size = n;
-+ count -= n;
-+ if (likely(count)) {
-+ sg[i].next = &sg[i+1];
-+ } else {
-+ /* we're done */
-+ sg[i].next = NULL;
-+ break;
++ spin_lock(&layout_lock);
++ if (!list_empty(&clp->cl_layouts)) {
++ lp = list_entry(clp->cl_layouts.next,
++ struct nfs4_layout, lo_perclnt);
++ inode = igrab(lp->lo_file->fi_inode);
++ memset(&lr, 0, sizeof(lr));
++ lr.args.lr_return_type = RETURN_FILE;
++ lr.args.lr_seg = lp->lo_seg;
++ empty = list_empty(&lp->lo_file->fi_layouts);
++ BUG_ON(lp->lo_client != clp);
++ dequeue_layout(lp);
++ destroy_layout(lp); /* do not access lp after this */
+ }
-+ }
-+ BUG_ON(count);
++ spin_unlock(&layout_lock);
++ if (!lp)
++ break;
+
-+ state->sg_list = sg;
-+ return 0;
++ if (WARN_ON(!inode))
++ break;
++
++ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino,
++ lp, clp);
++
++ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE,
++ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL);
++ iput(inode);
++ }
+}
+
++struct create_recall_list_arg {
++ struct nfsd4_pnfs_cb_layout *cbl;
++ struct nfs4_file *lrfile;
++ struct list_head *todolist;
++ unsigned todo_count;
++};
++
+/*
-+ * Callback function for async reads
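++ * Look for a layout matching the recall request for the given client
++ * and, if one is found, add a pending layout recall to the todo list.
++ * arg->todo_count is incremented for every recall that is queued.
++ * Returns:
++ * 0 on success (including when no layout matched), or -ENOMEM.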
+ */
-+static void
-+panfs_shim_read_done(
-+ void *arg1,
-+ void *arg2,
-+ pan_sam_read_res_t *res_p,
-+ pan_status_t rc)
++static int
++lo_recall_per_client(struct nfs4_client *clp, void *p)
+{
-+ struct panfs_shim_io_state *state = arg1;
-+ ssize_t status;
++ stateid_t lsid;
++ struct nfs4_layoutrecall *pending;
++ struct create_recall_list_arg *arg = p;
+
-+ dprintk("%s: Begin\n", __func__);
-+ if (!res_p)
-+ res_p = &state->u.read.res;
-+ if (rc == PAN_SUCCESS)
-+ rc = res_p->result;
-+ if (rc == PAN_SUCCESS) {
-+ status = res_p->length;
-+ WARN_ON(status < 0);
++ memset(&lsid, 0, sizeof(lsid));
++ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid))
++ return 0;
++
++ /* Matching put done by layoutreturn */
++ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile);
++ /* out of memory, drain todo queue */
++ if (!pending)
++ return -ENOMEM;
++
++ *(stateid_t *)&pending->cb.cbl_sid = lsid;
++ list_add(&pending->clr_perclnt, arg->todolist);
++ arg->todo_count++;
++ return 0;
++}
++
++/* Create a layoutrecall structure for each client based on the
++ * original structure. */
++int
++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len,
++ struct nfsd4_pnfs_cb_layout *cbl,
++ struct nfs4_file *lrfile)
++{
++ struct nfs4_client *clp;
++ struct create_recall_list_arg arg = {
++ .cbl = cbl,
++ .lrfile = lrfile,
++ .todolist = todolist,
++ };
++ int status = 0;
++
++ dprintk("%s: -->\n", __func__);
++
++ /* If client given by fs, just do single client */
++ if (cbl->cbl_seg.clientid) {
++ clp = find_confirmed_client(
++ (clientid_t *)&cbl->cbl_seg.clientid);
++ if (!clp) {
++ status = -ENOENT;
++ dprintk("%s: clientid %llx not found\n", __func__,
++ (unsigned long long)cbl->cbl_seg.clientid);
++ goto out;
++ }
++
++ status = lo_recall_per_client(clp, &arg);
+ } else {
-+ status = -panfs_export_ops->convert_rc(rc);
-+ dprintk("%s: pan_sam_read rc %d: status %Zd\n",
-+ __func__, rc, status);
++ /* Check all clients for layout matches */
++ status = filter_confirmed_clients(lo_recall_per_client, &arg);
+ }
-+ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc);
-+ objlayout_read_done(&state->ol_state, status, true);
++
++out:
++ *todo_len = arg.todo_count;
++ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status);
++ return status;
+}
+
-+ssize_t
-+objio_read_pagelist(struct objlayout_io_state *ol_state)
++/*
++ * Recall layouts asynchronously
++ * Called with state lock.
++ */
++static int
++spawn_layout_recall(struct super_block *sb, struct list_head *todolist,
++ unsigned todo_len)
+{
-+ struct panfs_shim_io_state *state = container_of(ol_state,
-+ struct panfs_shim_io_state, ol_state);
-+ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal;
-+ ssize_t status = 0;
-+ pan_status_t rc = PAN_SUCCESS;
++ struct nfs4_layoutrecall *pending;
++ struct nfs4_layoutrecall *parent = NULL;
++ int status = 0;
+
-+ dprintk("%s: Begin\n", __func__);
++ dprintk("%s: -->\n", __func__);
+
-+ status = panfs_shim_pages_to_sg(state, ol_state->pages,
-+ ol_state->pgbase, ol_state->nr_pages,
-+ ol_state->count);
-+ if (unlikely(status))
-+ goto err;
++ if (todo_len > 1) {
++ pending = list_entry(todolist->next, struct nfs4_layoutrecall,
++ clr_perclnt);
+
-+ state->obj_sec.min_security = 0;
-+ state->obj_sec.map_ccaps = mcs;
++ parent = alloc_init_layoutrecall(&pending->cb, NULL,
++ pending->clr_file);
++ if (unlikely(!parent)) {
++ /* We want forward progress. If the parent cannot be
++ * allocated, take the first entry as the parent but do
++ * not execute it. The caller must check for -EAGAIN; if
++ * it is returned, nfsd_layout_recall_cb should be called
++ * again once the partial recalls have returned.
++ */
++ list_del_init(&pending->clr_perclnt);
++ if (todo_len > 2) {
++ parent = pending;
++ } else {
++ parent = NULL;
++ put_layoutrecall(pending);
++ }
++ --todo_len;
++ status = -ENOMEM;
++ }
++ }
+
-+ rc = panfs_export_ops->ucreds_get(&state->ucreds);
-+ if (unlikely(rc)) {
-+ status = -EACCES;
-+ goto err;
++ while (!list_empty(todolist)) {
++ pending = list_entry(todolist->next, struct nfs4_layoutrecall,
++ clr_perclnt);
++ list_del_init(&pending->clr_perclnt);
++ dprintk("%s: clp %p cb_client %p fp %p\n", __func__,
++ pending->clr_client,
++ pending->clr_client->cl_cb_client,
++ pending->clr_file);
++ if (unlikely(!pending->clr_client->cl_cb_client)) {
++ printk(KERN_INFO
++ "%s: clientid %08x/%08x has no callback path\n",
++ __func__,
++ pending->clr_client->cl_clientid.cl_boot,
++ pending->clr_client->cl_clientid.cl_id);
++ put_layoutrecall(pending);
++ continue;
++ }
++
++ pending->clr_time = CURRENT_TIME;
++ pending->clr_sb = sb;
++ if (parent) {
++ /* If we created a parent its initial ref count is 1.
++ * We will need to de-ref it eventually. So we just
++ * don't increment on behalf of the last one.
++ */
++ if (todo_len != 1)
++ get_layoutrecall(parent);
++ }
++ pending->parent = parent;
++ get_layoutrecall(pending);
++ /* Add to list so corresponding layoutreturn can find req */
++ list_add(&pending->clr_perclnt,
++ &pending->clr_client->cl_layoutrecalls);
++
++ nfsd4_cb_layout(pending);
++ --todo_len;
+ }
+
-+ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id;
-+ state->u.read.args.offset = ol_state->offset;
-+ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP,
-+ &state->u.read.args,
-+ &state->obj_sec,
-+ state->sg_list,
-+ state->ucreds,
-+ ol_state->sync ?
-+ NULL : panfs_shim_read_done,
-+ state, NULL,
-+ &state->u.read.res);
-+ if (rc != PAN_ERR_IN_PROGRESS)
-+ panfs_shim_read_done(state, NULL, &state->u.read.res, rc);
-+ err:
-+ dprintk("%s: Return %Zd\n", __func__, status);
+ return status;
+}
+
+/*
-+ * Callback function for async writes
++ * Spawn a thread to perform a recall layout
++ *
+ */
-+static void
-+panfs_shim_write_done(
-+ void *arg1,
-+ void *arg2,
-+ pan_sam_write_res_t *res_p,
-+ pan_status_t rc)
++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode,
++ struct nfsd4_pnfs_cb_layout *cbl)
+{
-+ struct panfs_shim_io_state *state = arg1;
-+ ssize_t status;
++ int status;
++ struct nfs4_file *lrfile = NULL;
++ struct list_head todolist;
++ unsigned todo_len = 0;
+
-+ dprintk("%s: Begin\n", __func__);
-+ if (!res_p)
-+ res_p = &state->u.write.res;
-+ if (rc == PAN_SUCCESS)
-+ rc = res_p->result;
-+ if (rc == PAN_SUCCESS) {
-+/* state->ol_state.committed = NFS_FILE_SYNC;*/
-+ state->ol_state.committed = NFS_UNSTABLE;
-+ status = res_p->length;
-+ WARN_ON(status < 0);
++ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl);
++ BUG_ON(!cbl);
++ BUG_ON(cbl->cbl_recall_type != RETURN_FILE &&
++ cbl->cbl_recall_type != RETURN_FSID &&
++ cbl->cbl_recall_type != RETURN_ALL);
++ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode);
++ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ &&
++ cbl->cbl_seg.iomode != IOMODE_RW &&
++ cbl->cbl_seg.iomode != IOMODE_ANY);
+
-+ objlayout_add_delta_space_used(&state->ol_state,
-+ res_p->delta_capacity_used);
-+ } else {
-+ status = -panfs_export_ops->convert_rc(rc);
-+ dprintk("%s: pan_sam_write rc %u: status %Zd\n",
-+ __func__, rc, status);
++ if (nfsd_serv == NULL) {
++ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n");
++ return -ENOENT;
+ }
-+ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc);
-+ objlayout_write_done(&state->ol_state, status, true);
-+}
-+
-+ssize_t
-+objio_write_pagelist(struct objlayout_io_state *ol_state,
-+ bool stable /* unused, PanOSD writes are stable */)
-+{
-+ struct panfs_shim_io_state *state = container_of(ol_state,
-+ struct panfs_shim_io_state, ol_state);
-+ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)ol_state->objlseg->internal;
-+ ssize_t status = 0;
-+ pan_status_t rc = PAN_SUCCESS;
+
-+ dprintk("%s: Begin\n", __func__);
++ nfs4_lock_state();
++ status = -ENOENT;
++ if (inode) {
++ lrfile = find_file(inode);
++ if (!lrfile) {
++ dprintk("NFSD nfsd_layout_recall_cb: "
++ "nfs4_file not found\n");
++ goto err;
++ }
++ if (cbl->cbl_recall_type == RETURN_FSID)
++ cbl->cbl_fsid = lrfile->fi_fsid;
++ }
+
-+ status = panfs_shim_pages_to_sg(state, ol_state->pages,
-+ ol_state->pgbase, ol_state->nr_pages,
-+ ol_state->count);
-+ if (unlikely(status))
-+ goto err;
++ INIT_LIST_HEAD(&todolist);
+
-+ state->obj_sec.min_security = 0;
-+ state->obj_sec.map_ccaps = mcs;
++ /* If no cookie provided by FS, return a default one */
++ if (!cbl->cbl_cookie)
++ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
+
-+ rc = panfs_export_ops->ucreds_get(&state->ucreds);
-+ if (unlikely(rc)) {
-+ status = -EACCES;
-+ goto err;
++ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile);
++ if (list_empty(&todolist)) {
++ status = -ENOENT;
++ } else {
++ /* process todolist even if create_layout_recall_list
++ * returned an error */
++ int status2 = spawn_layout_recall(sb, &todolist, todo_len);
++ if (status2)
++ status = status2;
+ }
+
-+ state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id;
-+ state->u.write.args.offset = ol_state->offset;
-+ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE,
-+ &state->u.write.args,
-+ &state->obj_sec,
-+ state->sg_list,
-+ state->ucreds,
-+ ol_state->sync ?
-+ NULL : panfs_shim_write_done,
-+ state,
-+ NULL,
-+ &state->u.write.res);
-+ if (rc != PAN_ERR_IN_PROGRESS)
-+ panfs_shim_write_done(state, NULL, &state->u.write.res, rc);
-+ err:
-+ dprintk("%s: Return %Zd\n", __func__, status);
-+ return status;
++err:
++ nfs4_unlock_state();
++ if (lrfile)
++ put_nfs4_file(lrfile);
++ return (todo_len && status) ? -EAGAIN : status;
+}
+
-+int
-+panfs_shim_register(struct panfs_export_operations *ops)
++struct create_device_notify_list_arg {
++ struct list_head *todolist;
++ struct nfsd4_pnfs_cb_dev_list *ndl;
++};
++
++static int
++create_device_notify_per_cl(struct nfs4_client *clp, void *p)
+{
-+ if (panfs_export_ops) {
-+ printk(KERN_INFO
-+ "%s: panfs already registered (panfs ops %p)\n",
-+ __func__, panfs_export_ops);
-+ return -EINVAL;
-+ }
++ struct nfs4_notify_device *cbnd;
++ struct create_device_notify_list_arg *arg = p;
++
++ if (atomic_read(&clp->cl_deviceref) <= 0)
++ return 0;
+
-+ printk(KERN_INFO "%s: registering panfs ops %p\n",
-+ __func__, ops);
++ cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL);
++ if (!cbnd)
++ return -ENOMEM;
+
-+ panfs_export_ops = ops;
++ cbnd->nd_list = arg->ndl;
++ cbnd->nd_client = clp;
++ list_add(&cbnd->nd_perclnt, arg->todolist);
+ return 0;
+}
-+EXPORT_SYMBOL(panfs_shim_register);
+
++/* Create a list of clients to send device notifications. */
+int
-+panfs_shim_unregister(void)
++create_device_notify_list(struct list_head *todolist,
++ struct nfsd4_pnfs_cb_dev_list *ndl)
+{
-+ if (!panfs_export_ops) {
-+ printk(KERN_INFO "%s: panfs is not registered\n", __func__);
-+ return -EINVAL;
-+ }
++ int status;
++ struct create_device_notify_list_arg arg = {
++ .todolist = todolist,
++ .ndl = ndl,
++ };
+
-+ printk(KERN_INFO "%s: unregistering panfs ops %p\n",
-+ __func__, panfs_export_ops);
++ nfs4_lock_state();
++ status = filter_confirmed_clients(create_device_notify_per_cl, &arg);
++ nfs4_unlock_state();
+
-+ panfs_export_ops = NULL;
-+ return 0;
++ return status;
+}
-+EXPORT_SYMBOL(panfs_shim_unregister);
+
+/*
-+ * Policy Operations
-+ */
-+
-+/*
-+ * Return the stripe size for the specified file
++ * For each client that has a device, send a device notification.
++ * XXX: Need to track which clients have which devices.
+ */
-+ssize_t
-+panlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay)
++int nfsd_device_notify_cb(struct super_block *sb,
++ struct nfsd4_pnfs_cb_dev_list *ndl)
+{
-+ ssize_t sz, maxsz = -1;
-+ struct pnfs_layout_segment *lseg;
-+
-+ dprintk("%s: Begin\n", __func__);
++ struct nfs4_notify_device *cbnd;
++ unsigned int notify_num = 0;
++ int status2, status = 0;
++ struct list_head todolist;
+
-+ list_for_each_entry(lseg, &pnfslay->segs, fi_list) {
-+ int n;
-+ struct objlayout_segment *panlseg =
-+ container_of(lseg, struct objlayout_segment, lseg);
-+ struct pnfs_osd_layout *lo =
-+ (struct pnfs_osd_layout *)panlseg->pnfs_osd_layout;
-+ struct pnfs_osd_data_map *map = &lo->olo_map;
++ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list);
+
-+ n = map->odm_group_width;
-+ if (n == 0)
-+ n = map->odm_num_comps / (map->odm_mirror_cnt + 1);
++ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len);
+
-+ switch (map->odm_raid_algorithm) {
-+ case PNFS_OSD_RAID_0:
-+ break;
++ if (nfsd_serv == NULL)
++ return -ENOENT;
+
-+ case PNFS_OSD_RAID_4:
-+ case PNFS_OSD_RAID_5:
-+ n -= 1;
-+ n *= 8; /* FIXME: until we have 2-D coalescing */
-+ break;
++ INIT_LIST_HEAD(&todolist);
+
-+ case PNFS_OSD_RAID_PQ:
-+ n -= 2;
-+ break;
++ status = create_device_notify_list(&todolist, ndl);
+
-+ default:
-+ BUG_ON(1);
++ while (!list_empty(&todolist)) {
++ cbnd = list_entry(todolist.next, struct nfs4_notify_device,
++ nd_perclnt);
++ list_del_init(&cbnd->nd_perclnt);
++ status2 = nfsd4_cb_notify_device(cbnd);
++ pnfs_clear_device_notify(cbnd->nd_client);
++ if (status2) {
++ kfree(cbnd);
++ status = status2;
+ }
-+ sz = map->odm_stripe_unit * n;
-+ if (sz > maxsz)
-+ maxsz = sz;
++ notify_num++;
+ }
-+ dprintk("%s: Return %Zd\n", __func__, maxsz);
-+ return maxsz;
-+}
-+
-+#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024)
-+#define PANLAYOUT_DEF_STRIPE_WIDTH 9
-+#define PANLAYOUT_MAX_STRIPE_WIDTH 11
-+#define PANLAYOUT_MAX_GATHER_STRIPES 8
+
-+/*
-+ * Get the max [rw]size
-+ */
-+static ssize_t
-+panlayout_get_blocksize(void)
-+{
-+ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) *
-+ PANLAYOUT_DEF_STRIPE_UNIT *
-+ PANLAYOUT_MAX_GATHER_STRIPES;
-+ dprintk("%s: Return %Zd\n", __func__, sz);
-+ return sz;
++ dprintk("NFSD %s: status %d clients %u\n",
++ __func__, status, notify_num);
++ return status;
+}
-+
-+/*
-+ * Don't gather across stripes, but rather gather (coalesce) up to
-+ * the stripe size.
+diff --git a/fs/nfsd/nfs4pnfsdlm.c b/fs/nfsd/nfs4pnfsdlm.c
+new file mode 100644
+index 0000000..006ded5
+--- /dev/null
++++ b/fs/nfsd/nfs4pnfsdlm.c
+@@ -0,0 +1,461 @@
++/******************************************************************************
+ *
-+ * FIXME: change interface to use merge_align, merge_count
-+ */
-+#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS)
-+
-+static struct pnfs_layoutdriver_type panlayout_type = {
-+ .id = PNFS_LAYOUT_PANOSD,
-+ .name = "PNFS_LAYOUT_PANOSD",
-+ .flags = PNFS_LAYOUTRET_ON_SETATTR,
++ * (c) 2007 Network Appliance, Inc. All Rights Reserved.
++ * (c) 2009 NetApp. All Rights Reserved.
++ *
++ * NetApp provides this source code under the GPL v2 License.
++ * The GPL v2 license is available at
++ * http://opensource.org/licenses/gpl-license.php.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ ******************************************************************************/
+
-+ .initialize_mountpoint = objlayout_initialize_mountpoint,
-+ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint,
++#include <linux/nfs4.h>
++#include <linux/nfsd/const.h>
++#include <linux/nfsd/debug.h>
++#include <linux/nfsd/nfs4pnfsdlm.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
++#include <linux/sunrpc/clnt.h>
+
-+ .alloc_layout_hdr = objlayout_alloc_layout_hdr,
-+ .free_layout_hdr = objlayout_free_layout_hdr,
++#include "nfsfh.h"
++#include "nfsd.h"
+
-+ .alloc_lseg = objlayout_alloc_lseg,
-+ .free_lseg = objlayout_free_lseg,
++#define NFSDDBG_FACILITY NFSDDBG_PROC
+
-+ .get_stripesize = panlayout_get_stripesize,
-+ .get_blocksize = panlayout_get_blocksize,
++/* Just use a linked list: we do not expect more than 32 dlm_device_entries.
++ * The first implementation will just use one device per cluster file system.
++ */
+
-+ .read_pagelist = objlayout_read_pagelist,
-+ .write_pagelist = objlayout_write_pagelist,
-+ .commit = objlayout_commit,
++static LIST_HEAD(dlm_device_list);
++static DEFINE_SPINLOCK(dlm_device_list_lock);
+
-+ .encode_layoutcommit = objlayout_encode_layoutcommit,
-+ .encode_layoutreturn = objlayout_encode_layoutreturn,
++struct dlm_device_entry {
++ struct list_head dlm_dev_list;
++ char disk_name[DISK_NAME_LEN];
++ int num_ds;
++ char ds_list[NFSD_DLM_DS_LIST_MAX];
+};
+
-+MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs");
-+MODULE_AUTHOR("Benny Halevy <bhalevy at panasas.com>");
-+MODULE_LICENSE("GPL");
-+
-+static int __init
-+panlayout_init(void)
++static struct dlm_device_entry *
++_nfsd4_find_pnfs_dlm_device(char *disk_name)
+{
-+ int ret = pnfs_register_layoutdriver(&panlayout_type);
++ struct dlm_device_entry *dlm_pdev;
+
-+ if (ret)
-+ printk(KERN_INFO
-+ "%s: Registering Panasas OSD pNFS Layout Driver failed: error=%d\n",
-+ __func__, ret);
-+ else
-+ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n",
-+ __func__);
-+ return ret;
++ dprintk("--> %s disk name %s\n", __func__, disk_name);
++ spin_lock(&dlm_device_list_lock);
++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) {
++ dprintk("%s Look for dlm_pdev %s\n", __func__,
++ dlm_pdev->disk_name);
++ if (!strncmp(dlm_pdev->disk_name, disk_name, DISK_NAME_LEN)) { /* exact match, not a prefix match */
++ spin_unlock(&dlm_device_list_lock);
++ return dlm_pdev; /* NOTE(review): returned after the list lock is dropped */
++ }
++ }
++ spin_unlock(&dlm_device_list_lock);
++ return NULL; /* no entry with this disk name */
+}
+
-+static void __exit
-+panlayout_exit(void)
-+{
-+ pnfs_unregister_layoutdriver(&panlayout_type);
-+ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n",
-+ __func__);
++static struct dlm_device_entry *
++nfsd4_find_pnfs_dlm_device(struct super_block *sb) {
++ char dname[BDEVNAME_SIZE];
++
++ bdevname(sb->s_bdev, dname);
++ return _nfsd4_find_pnfs_dlm_device(dname);
+}
+
-+module_init(panlayout_init);
-+module_exit(panlayout_exit);
-diff -up linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.h
---- linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-09-30 12:25:08.386283000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.h 2010-09-30 12:25:08.388281000 -0400
-@@ -0,0 +1,482 @@
-+/*
-+ * panfs_shim.h
-+ *
-+ * Data types and external function declerations for interfacing with
-+ * panfs (Panasas DirectFlow) I/O stack
-+ *
-+ * Copyright (C) 2007 Panasas Inc.
-+ * All rights reserved.
-+ *
-+ * Benny Halevy <bhalevy at panasas.com>
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ * 2. Redistributions in binary form must reproduce the above copyright
-+ * notice, this list of conditions and the following disclaimer in the
-+ * documentation and/or other materials provided with the distribution.
-+ * 3. Neither the name of the Panasas company nor the names of its
-+ * contributors may be used to endorse or promote products derived
-+ * from this software without specific prior written permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
-+ * See the file COPYING included with this distribution for more details.
-+ *
-+ */
++ssize_t
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
++{
++ char *pos = buf;
++ ssize_t size = 0;
++ struct dlm_device_entry *dlm_pdev;
++ int ret = -EINVAL;
++
++ spin_lock(&dlm_device_list_lock);
++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list)
++ {
++ int advanced;
++ advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list);
++ if (advanced >= buflen - size)
++ goto out;
++ size += advanced;
++ pos += advanced;
++ }
++ ret = size;
+
-+#ifndef _PANLAYOUT_PANFS_SHIM_H
-+#define _PANLAYOUT_PANFS_SHIM_H
++out:
++ spin_unlock(&dlm_device_list_lock);
++ return ret;
++}
+
-+typedef s8 pan_int8_t;
-+typedef u8 pan_uint8_t;
-+typedef s16 pan_int16_t;
-+typedef u16 pan_uint16_t;
-+typedef s32 pan_int32_t;
-+typedef u32 pan_uint32_t;
-+typedef s64 pan_int64_t;
-+typedef u64 pan_uint64_t;
++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
++{
++ char *start = ds_list;
+
-+/*
-+ * from pan_base_types.h
-+ */
-+typedef pan_uint64_t pan_rpc_none_t;
-+typedef pan_uint32_t pan_rpc_arrdim_t;
-+typedef pan_uint32_t pan_status_t;
-+typedef pan_uint8_t pan_otw_t;
-+typedef pan_uint8_t pan_pad_t;
++ *num_ds = 0; /* counts the addresses that parsed successfully */
+
-+typedef pan_uint32_t pan_timespec_sec_t;
-+typedef pan_uint32_t pan_timespec_nsec_t;
++ while (*start) {
++ struct sockaddr_storage tempAddr;
++ int ipLen = strcspn(start, ","); /* length of the current comma-delimited entry */
+
-+typedef struct pan_timespec_s pan_timespec_t;
-+struct pan_timespec_s {
-+ pan_timespec_sec_t ts_sec;
-+ pan_timespec_nsec_t ts_nsec;
-+};
++ if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
++ return false;
++ (*num_ds)++;
++ start += start[ipLen] ? ipLen + 1 : ipLen; /* skip the ',' but never step past the NUL */
++ }
++ return true;
++}
+
+/*
-+ * from pan_std_types.h
++ * pnfs_dlm_device string format:
++ * block-device-path:<ds1 ipv4 address>,<ds2 ipv4 address>
++ *
++ * Examples
++ * '/dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
++ * two data servers for the dlm cluster file system mounted on /dev/sda.
++ *
++ * '/dev/sda:192.168.1.96,192.168.1.100'
++ * replaces the data server list for /dev/sda
++ *
++ * Only the deviceid == 1 is supported. Can add device id to
++ * pnfs_dlm_device string when needed.
++ *
++ * Only the round robin each data server once stripe index is supported.
+ */
-+typedef pan_uint32_t pan_size_t;
-+typedef int pan_bool_t;
++int
++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
+
-+/*
-+ * from pan_common_error.h
-+ */
-+#define PAN_SUCCESS ((pan_status_t)0)
-+#define PAN_ERR_IN_PROGRESS ((pan_status_t)55)
++{
++ struct dlm_device_entry *new, *found;
++ char *bufp = pnfs_dlm_device;
++ char *endp = bufp + strlen(bufp);
++ int err = -ENOMEM;
+
-+/*
-+ * from pan_sg.h
-+ */
-+typedef struct pan_sg_entry_s pan_sg_entry_t;
-+struct pan_sg_entry_s {
-+ void *buffer; /* pointer to memory */
-+ pan_uint32_t chunk_size; /* size of each chunk (bytes) */
-+ pan_sg_entry_t *next;
-+};
++ dprintk("--> %s len %d\n", __func__, len);
+
-+/*
-+ * from pan_storage.h
-+ */
-+typedef pan_uint64_t pan_stor_dev_id_t;
-+typedef pan_uint32_t pan_stor_obj_grp_id_t;
-+typedef pan_uint64_t pan_stor_obj_uniq_t;
-+typedef pan_uint32_t pan_stor_action_t;
-+typedef pan_uint8_t pan_stor_cap_key_t[20];
++ new = kzalloc(sizeof(*new), GFP_KERNEL);
++ if (!new)
++ return err;
+
-+typedef pan_uint8_t pan_stor_key_type_t;
-+typedef pan_uint64_t pan_stor_len_t;
-+typedef pan_int64_t pan_stor_delta_len_t;
-+typedef pan_uint64_t pan_stor_offset_t;
-+typedef pan_uint16_t pan_stor_op_t;
++ err = -EINVAL;
++ /* disk_name */
++ /* FIXME: need to check for valid disk_name. search superblocks?
++ * check for slash dev slash ?
++ */
++ len = strcspn(bufp, ":");
++ if (!len || len >= DISK_NAME_LEN) /* reject empty names; keep room for the NUL */
++ goto out_free;
++ memcpy(new->disk_name, bufp, len); /* new is zeroed, so the copy stays NUL-terminated */
+
-+typedef pan_uint16_t pan_stor_sec_level_t;
++ err = -EINVAL;
++ bufp += len + 1;
++ if (bufp >= endp)
++ goto out_free;
+
-+struct pan_stor_obj_id_s {
-+ pan_stor_dev_id_t dev_id;
-+ pan_stor_obj_uniq_t obj_id;
-+ pan_stor_obj_grp_id_t grp_id;
-+};
++ /* data server list */
++ /* FIXME: need to check for comma separated valid ip format */
++ len = strcspn(bufp, ":");
++ if (len >= NFSD_DLM_DS_LIST_MAX) /* keep room for the NUL */
++ goto out_free;
++ memcpy(new->ds_list, bufp, len);
+
-+typedef struct pan_stor_obj_id_s pan_stor_obj_id_t;
+
-+#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U)
-+#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U)
-+#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U)
-+#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U)
-+#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U)
-+#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U)
-+#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U)
-+#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U)
++ /* validate the ips */
++ if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds)))
++ goto out_free;
+
-+/*
-+ * from pan_aggregation_map.h
-+ */
-+typedef pan_uint8_t pan_agg_type_t;
-+typedef pan_uint64_t pan_agg_map_version_t;
-+typedef pan_uint8_t pan_agg_obj_state_t;
-+typedef pan_uint8_t pan_agg_comp_state_t;
-+typedef pan_uint8_t pan_agg_comp_flag_t;
++ dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__,
++ new->disk_name, new->num_ds, new->ds_list);
+
-+#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00)
-+#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01)
-+#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02)
-+#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03)
-+#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04)
-+#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05)
-+#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06)
-+#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07)
-+#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00)
-+#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01)
-+#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02)
-+#define PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03)
-+#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00)
-+#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01)
-+#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02)
-+#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04)
++ found = _nfsd4_find_pnfs_dlm_device(new->disk_name);
++ if (found) {
++ /* FIXME: should compare found->ds_list with new->ds_list
++ * and if it is different, kick off a CB_NOTIFY change
++ * deviceid.
++ */
++ dprintk("%s pnfs_dlm_device %s:%s already in cache "
++ " replace ds_list with new ds_list %s\n", __func__,
++ found->disk_name, found->ds_list, new->ds_list);
++ memset(found->ds_list, 0, sizeof(found->ds_list)); /* clear the whole old list, not just DISK_NAME_LEN bytes */
++ memcpy(found->ds_list, new->ds_list, strlen(new->ds_list));
++ found->num_ds = new->num_ds;
++ kfree(new);
++ } else {
++ dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__,
++ new->disk_name, new->ds_list);
++ spin_lock(&dlm_device_list_lock);
++ list_add(&new->dlm_dev_list, &dlm_device_list);
++ spin_unlock(&dlm_device_list_lock);
++ }
++ dprintk("<-- %s Success\n", __func__);
++ return 0;
+
-+struct pan_aggregation_map_s {
-+ pan_agg_map_version_t version;
-+ pan_agg_obj_state_t avail_state;
-+ pan_stor_obj_id_t obj_id;
-+};
++out_free:
++ kfree(new);
++ dprintk("<-- %s returns %d\n", __func__, err);
++ return err;
++}
+
-+typedef struct pan_aggregation_map_s pan_aggregation_map_t;
++void nfsd4_pnfs_dlm_shutdown(void)
++{
++ struct dlm_device_entry *dlm_pdev, *next;
+
-+struct pan_agg_comp_obj_s {
-+ pan_stor_dev_id_t dev_id;
-+ pan_agg_comp_state_t avail_state;
-+ pan_agg_comp_flag_t comp_flags;
-+};
++ dprintk("--> %s\n", __func__);
+
-+typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t;
++ spin_lock(&dlm_device_list_lock);
++ list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list,
++ dlm_dev_list) {
++ list_del(&dlm_pdev->dlm_dev_list);
++ kfree(dlm_pdev);
++ }
++ spin_unlock(&dlm_device_list_lock);
++}
+
-+struct pan_agg_simple_header_s {
-+ pan_uint8_t unused;
-+};
++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb,
++ u32 layout_type,
++ struct nfsd4_pnfs_dev_iter_res *res)
++{
++ if (layout_type != LAYOUT_NFSV4_1_FILES) {
++ printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
++ "(type: %x)\n", __func__, layout_type);
++ return -ENOTSUPP;
++ }
+
-+typedef struct pan_agg_simple_header_s pan_agg_simple_header_t;
++ res->gd_eof = 1;
++ if (res->gd_cookie)
++ return -ENOENT;
+
-+struct pan_agg_raid1_header_s {
-+ pan_uint16_t num_comps;
-+};
++ res->gd_cookie = 1;
++ res->gd_verf = 1;
++ res->gd_devid = 1;
++ return 0;
++}
++
++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
++ struct exp_xdr_stream *xdr,
++ u32 layout_type,
++ const struct nfsd4_pnfs_deviceid *devid)
++{
++ int err, len, i = 0;
++ struct pnfs_filelayout_device fdev;
++ struct pnfs_filelayout_devaddr *daddr;
++ struct dlm_device_entry *dlm_pdev;
++ char *bufp;
++
++ err = -ENOTSUPP;
++ if (layout_type != LAYOUT_NFSV4_1_FILES) {
++ dprintk("%s: ERROR: layout type isn't 'file' "
++ "(type: %x)\n", __func__, layout_type);
++ return err;
++ }
++
++ /* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
++ * with a gdia_device_id != 1 is invalid.
++ */
++ err = -EINVAL;
++ if (devid->devid != 1) {
++ dprintk("%s: WARNING: didn't receive a deviceid of "
++ "1 (got: 0x%llx)\n", __func__, devid->devid);
++ return err;
++ }
+
-+typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t;
++ /*
++ * If the DS list has not been established, return -EINVAL
++ */
++ dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
++ if (!dlm_pdev) {
++ dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
++ sb->s_bdev->bd_disk->disk_name);
++ return err;
++ }
+
-+struct pan_agg_raid0_header_s {
-+ pan_uint16_t num_comps;
-+ pan_uint32_t stripe_unit;
-+};
++ dprintk("%s: Found disk %s with DS list |%s|\n",
++ __func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
+
-+typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t;
++ memset(&fdev, '\0', sizeof(fdev));
++ fdev.fl_device_length = dlm_pdev->num_ds;
+
-+struct pan_agg_raid5_left_header_s {
-+ pan_uint16_t num_comps;
-+ pan_uint32_t stripe_unit0;
-+ pan_uint32_t stripe_unit1;
-+ pan_uint32_t stripe_unit2;
-+};
++ err = -ENOMEM;
++ len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
++ fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
++ if (!fdev.fl_device_list) {
++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
++ "buffer for %d DSes.\n", __func__, i);
++ fdev.fl_device_length = 0;
++ goto out;
++ }
+
-+typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t;
++ /* Set a simple stripe indicie */
++ fdev.fl_stripeindices_length = fdev.fl_device_length;
++ fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
++ fdev.fl_stripeindices_length, GFP_KERNEL);
+
-+typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t;
++ if (!fdev.fl_stripeindices_list) {
++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
++ "list buffer for %d DSes.\n", __func__, i);
++ goto out;
++ }
++ for (i = 0; i < fdev.fl_stripeindices_length; i++)
++ fdev.fl_stripeindices_list[i] = i;
+
-+struct pan_agg_grp_raid5_left_header_s {
-+ pan_uint16_t num_comps;
-+ pan_uint32_t stripe_unit;
-+ pan_uint16_t rg_width;
-+ pan_uint16_t rg_depth;
-+ pan_uint8_t group_layout_policy;
-+};
++ /* Transfer the data server list with a single multipath entry */
++ bufp = dlm_pdev->ds_list;
++ for (i = 0; i < fdev.fl_device_length; i++) {
++ daddr = kmalloc(sizeof(*daddr), GFP_KERNEL);
++ if (!daddr) {
++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++ "addr buffer.\n", __func__);
++ goto out;
++ }
+
-+#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00)
-+#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01)
++ daddr->r_netid.data = "tcp";
++ daddr->r_netid.len = 3;
+
-+#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00)
-+#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01)
-+#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02)
-+#define PAN_AGG_RAID0 ((pan_agg_type_t) 0x03)
-+#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04)
-+#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06)
-+#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01)
-+#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06)
++ len = strcspn(bufp, ",");
++ daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
++ memcpy(daddr->r_addr.data, bufp, len);
++ /*
++ * append the port number. interpreted as two more bytes
++ * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
++ */
++ memcpy(daddr->r_addr.data + len, ".8.1", 4);
++ daddr->r_addr.len = len + 4;
+
-+struct pan_agg_layout_hdr_s {
-+ pan_agg_type_t type;
-+ pan_pad_t pad[3];
-+ union {
-+ pan_uint64_t null;
-+ pan_agg_simple_header_t simple;
-+ pan_agg_raid1_header_t raid1;
-+ pan_agg_raid0_header_t raid0;
-+ pan_agg_raid5_left_header_t raid5_left;
-+ pan_agg_grp_raid5_left_header_t grp_raid5_left;
-+ } hdr;
-+};
++ fdev.fl_device_list[i].fl_multipath_length = 1;
++ fdev.fl_device_list[i].fl_multipath_list = daddr;
+
-+typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t;
++ dprintk("%s: encoding DS |%s|\n", __func__, bufp);
+
-+struct pan_agg_comp_obj_a_s {
-+ pan_rpc_arrdim_t size;
-+ pan_agg_comp_obj_t *data;
-+};
-+typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a;
++ bufp += len + 1;
++ }
+
-+struct pan_agg_full_map_s {
-+ pan_aggregation_map_t map_hdr;
-+ pan_agg_layout_hdr_t layout_hdr;
-+ pan_agg_comp_obj_a components;
-+};
++ /* have nfsd encode the device info */
++ err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++ for (i = 0; i < fdev.fl_device_length; i++)
++ kfree(fdev.fl_device_list[i].fl_multipath_list);
++ kfree(fdev.fl_device_list);
++ kfree(fdev.fl_stripeindices_list);
++ dprintk("<-- %s returns %d\n", __func__, err);
++ return err;
++}
+
-+typedef struct pan_agg_full_map_s pan_agg_full_map_t;
++static int get_stripe_unit(int blocksize)
++{
++ if (blocksize >= NFSSVC_MAXBLKSIZE)
++ return blocksize;
++ return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++}
+
+/*
-+ * from pan_obsd_rpc_types.h
++ * Look up inode block device in pnfs_dlm_device list.
++ * Hash on the inode->i_ino and number of data servers.
+ */
-+typedef pan_uint8_t pan_obsd_security_key_a[16];
++static int dlm_ino_hash(struct inode *ino)
++{
++ struct dlm_device_entry *de;
++ u32 hash_mask = 0;
+
-+typedef pan_uint8_t pan_obsd_capability_key_a[20];
++ /* If can't find the inode block device in the pnfs_dlm_deivce list
++ * then don't hand out a layout
++ */
++ de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
++ if (!de)
++ return -1;
++ hash_mask = de->num_ds - 1;
++ return ino->i_ino & hash_mask;
++}
+
-+typedef pan_uint8_t pan_obsd_key_holder_id_t;
++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
++ struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_layoutget_arg *args,
++ struct nfsd4_pnfs_layoutget_res *res)
++{
++ struct pnfs_filelayout_layout *layout = NULL;
++ struct knfsd_fh *fhp = NULL;
++ int index;
++ enum nfsstat4 rc = NFS4_OK;
+
-+#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01)
-+#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02)
++ dprintk("%s: LAYOUT_GET\n", __func__);
+
-+struct pan_obsd_key_holder_s {
-+ pan_obsd_key_holder_id_t select;
-+ pan_pad_t pad[3];
-+ union {
-+ pan_obsd_security_key_a basis_key;
-+ pan_obsd_capability_key_a cap_key;
-+ } key;
-+};
++ /* DLM exported file systems only support layouts for READ */
++ if (res->lg_seg.iomode == IOMODE_RW)
++ return NFS4ERR_BADIOMODE;
+
-+typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t;
++ index = dlm_ino_hash(inode);
++ dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
++ inode->i_ino);
++ if (index < 0)
++ return NFS4ERR_LAYOUTUNAVAILABLE;
+
-+/*
-+ * from pan_sm_sec.h
-+ */
-+typedef pan_uint8_t pan_sm_sec_type_t;
-+typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t;
++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++ /* Always give out whole file layouts */
++ res->lg_seg.offset = 0;
++ res->lg_seg.length = NFS4_MAX_UINT64;
++ /* Always give out READ ONLY layouts */
++ res->lg_seg.iomode = IOMODE_READ;
+
-+struct pan_obsd_capability_generic_otw_t_s {
-+ pan_rpc_arrdim_t size;
-+ pan_uint8_t *data;
-+};
-+typedef struct pan_obsd_capability_generic_otw_t_s
-+ pan_obsd_capability_generic_otw_t;
++ layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++ if (layout == NULL) {
++ rc = NFS4ERR_LAYOUTTRYLATER;
++ goto error;
++ }
+
-+struct pan_sm_sec_obsd_s {
-+ pan_obsd_key_holder_t key;
-+ pan_obsd_capability_generic_otw_t cap_otw;
-+ pan_sm_sec_otw_allo_mode_t allo_mode;
-+};
++ /* Set file layout response args */
++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++ layout->lg_stripe_type = STRIPE_SPARSE;
++ layout->lg_commit_through_mds = false;
++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++ layout->lg_fh_length = 1;
++ layout->device_id.sbid = args->lg_sbid;
++ layout->device_id.devid = 1; /*FSFTEMP*/
++ layout->lg_first_stripe_index = index; /*FSFTEMP*/
++ layout->lg_pattern_offset = 0;
+
-+typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t;
++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++ if (fhp == NULL) {
++ rc = NFS4ERR_LAYOUTTRYLATER;
++ goto error;
++ }
+
-+struct pan_sm_sec_s {
-+ pan_sm_sec_type_t type;
-+ pan_pad_t pad[3];
-+ union {
-+ pan_rpc_none_t none;
-+ pan_sm_sec_obsd_t obsd;
-+ } variant;
-+};
++ memcpy(fhp, args->lg_fh, sizeof(*fhp));
++ pnfs_fh_mark_ds(fhp);
++ layout->lg_fh_list = fhp;
+
-+typedef struct pan_sm_sec_s pan_sm_sec_t;
++ /* Call nfsd to encode layout */
++ rc = filelayout_encode_layout(xdr, layout);
++exit:
++ kfree(layout);
++ kfree(fhp);
++ return rc;
++
++error:
++ res->lg_seg.length = 0;
++ goto exit;
++}
++
++static int
++nfsd4_pnfs_dlm_layouttype(struct super_block *sb)
++{
++ return LAYOUT_NFSV4_1_FILES;
++}
+
-+struct pan_sm_sec_a_s {
-+ pan_rpc_arrdim_t size;
-+ pan_sm_sec_t *data;
++/* For use by DLM cluster file systems exported by pNFSD */
++const struct pnfs_export_operations pnfs_dlm_export_ops = {
++ .layout_type = nfsd4_pnfs_dlm_layouttype,
++ .get_device_info = nfsd4_pnfs_dlm_getdevinfo,
++ .get_device_iter = nfsd4_pnfs_dlm_getdeviter,
++ .layout_get = nfsd4_pnfs_dlm_layoutget,
+};
-+typedef struct pan_sm_sec_a_s pan_sm_sec_a;
-+typedef pan_otw_t *pan_sm_sec_otw_t;
-+
++EXPORT_SYMBOL(pnfs_dlm_export_ops);
+diff --git a/fs/nfsd/nfs4pnfsds.c b/fs/nfsd/nfs4pnfsds.c
+new file mode 100644
+index 0000000..8ebc64d
+--- /dev/null
++++ b/fs/nfsd/nfs4pnfsds.c
+@@ -0,0 +1,620 @@
+/*
-+ * from pan_sm_types.h
-+ */
-+typedef pan_uint64_t pan_sm_cap_handle_t;
++* linux/fs/nfsd/nfs4pnfsds.c
++*
++* Copyright (c) 2005 The Regents of the University of Michigan.
++* All rights reserved.
++*
++* Andy Adamson <andros at umich.edu>
++*
++* Redistribution and use in source and binary forms, with or without
++* modification, are permitted provided that the following conditions
++* are met:
++*
++* 1. Redistributions of source code must retain the above copyright
++* notice, this list of conditions and the following disclaimer.
++* 2. Redistributions in binary form must reproduce the above copyright
++* notice, this list of conditions and the following disclaimer in the
++* documentation and/or other materials provided with the distribution.
++* 3. Neither the name of the University nor the names of its
++* contributors may be used to endorse or promote products derived
++* from this software without specific prior written permission.
++*
++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*
++*/
++#if defined(CONFIG_PNFSD)
+
-+struct pan_sm_map_cap_s {
-+ pan_agg_full_map_t full_map;
-+ pan_stor_offset_t offset;
-+ pan_stor_len_t length;
-+ pan_sm_sec_a secs;
-+ pan_sm_cap_handle_t handle;
-+ pan_timespec_t expiration_time;
-+ pan_stor_action_t action_mask;
-+ pan_uint32_t flags;
-+};
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
-+typedef struct pan_sm_map_cap_s pan_sm_map_cap_t;
++#include <linux/param.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/debug.h>
++#include <linux/nfs4.h>
++#include <linux/exportfs.h>
++#include <linux/sched.h>
++
++#include "nfsd.h"
++#include "pnfsd.h"
++#include "state.h"
+
+/*
-+ * from pan_sm_ops.h
++ *******************
++ * PNFS
++ *******************
+ */
-+typedef pan_rpc_none_t pan_sm_cache_ptr_t;
-+
+/*
-+ * from pan_sam_api.h
++ * Hash tables for pNFS Data Server state
++ *
++ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using
++ * this data server (DS).
++ *
++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained
++ * from any MDS.
++ *
++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained
++ * from any MDS.
++ *
+ */
-+typedef pan_uint32_t pan_sam_access_flags_t;
-+
-+typedef struct pan_sam_dev_error_s pan_sam_dev_error_t;
-+struct pan_sam_dev_error_s {
-+ pan_stor_dev_id_t dev_id;
-+ pan_stor_op_t stor_op;
-+ pan_status_t error;
-+};
-+
-+typedef struct pan_sam_ext_status_s pan_sam_ext_status_t;
-+struct pan_sam_ext_status_s {
-+ pan_uint32_t available;
-+ pan_uint32_t size;
-+ pan_sam_dev_error_t *errors;
-+};
-+
-+enum pan_sam_rpc_sec_sel_e {
-+ PAN_SAM_RPC_SEC_DEFAULT,
-+ PAN_SAM_RPC_SEC_ATLEAST,
-+ PAN_SAM_RPC_SEC_EXACTLY
-+};
-+typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t;
++/* Hash tables for clientid state */
++#define CLIENT_HASH_BITS 4
++#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
++#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
+
-+typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t;
-+struct pan_sam_obj_sec_s {
-+ pan_stor_sec_level_t min_security;
-+ pan_sm_map_cap_t *map_ccaps;
-+};
++#define clientid_hashval(id) \
++ ((id) & CLIENT_HASH_MASK)
+
-+typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t;
-+struct pan_sam_rpc_sec_s {
-+ pan_sam_rpc_sec_sel_t selector;
-+};
++/* hash table for pnfs_ds_stateid */
++#define STATEID_HASH_BITS 10
++#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
++#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1)
+
-+typedef struct pan_sam_read_args_s pan_sam_read_args_t;
-+struct pan_sam_read_args_s {
-+ pan_stor_obj_id_t obj_id;
-+ pan_sm_cache_ptr_t obj_ent;
-+ void *return_attr;
-+ void *checksum;
-+ pan_stor_offset_t offset;
-+ pan_uint16_t sm_options;
-+ void *callout;
-+ void *callout_arg;
-+};
++#define stateid_hashval(owner_id, file_id) \
++ (((owner_id) + (file_id)) & STATEID_HASH_MASK)
+
-+typedef struct pan_sam_read_res_s pan_sam_read_res_t;
-+struct pan_sam_read_res_s {
-+ pan_status_t result;
-+ pan_sam_ext_status_t ext_status;
-+ pan_stor_len_t length;
-+ void *attr;
-+ void *checksum;
-+};
++static struct list_head mds_id_tbl;
++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE];
++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE];
+
-+typedef void (*pan_sam_read_cb_t)(
-+ void *user_arg1,
-+ void *user_arg2,
-+ pan_sam_read_res_t *res_p,
-+ pan_status_t status);
++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp);
++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp);
+
-+#define PAN_SAM_ACCESS_NONE 0x0000
-+#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020
++/* Mutex for data server state. Needs to be separate from
++ * mds state mutex since a node can be both mds and ds */
++static DEFINE_MUTEX(ds_mutex);
++static struct thread_info *ds_mutex_owner;
+
-+typedef struct pan_sam_write_args_s pan_sam_write_args_t;
-+struct pan_sam_write_args_s {
-+ pan_stor_obj_id_t obj_id;
-+ pan_sm_cache_ptr_t obj_ent;
-+ pan_stor_offset_t offset;
-+ void *attr;
-+ void *return_attr;
-+};
++static void
++ds_lock_state(void)
++{
++ mutex_lock(&ds_mutex);
++ ds_mutex_owner = current_thread_info();
++}
+
-+typedef struct pan_sam_write_res_s pan_sam_write_res_t;
-+struct pan_sam_write_res_s {
-+ pan_status_t result;
-+ pan_sam_ext_status_t ext_status;
-+ pan_stor_len_t length;
-+ pan_stor_delta_len_t delta_capacity_used;
-+ pan_bool_t parity_dirty;
-+ void *attr;
-+};
++static void
++ds_unlock_state(void)
++{
++ BUG_ON(ds_mutex_owner != current_thread_info());
++ ds_mutex_owner = NULL;
++ mutex_unlock(&ds_mutex);
++}
+
-+typedef void (*pan_sam_write_cb_t)(
-+ void *user_arg1,
-+ void *user_arg2,
-+ pan_sam_write_res_t *res_p,
-+ pan_status_t status);
++static int
++cmp_clid(const clientid_t *cl1, const clientid_t *cl2)
++{
++ return (cl1->cl_boot == cl2->cl_boot) &&
++ (cl1->cl_id == cl2->cl_id);
++}
+
-+/*
-+ * from pan_mgr_types.h
-+ */
-+#define PAN_MGR_ID_TYPE_SHIFT 56
-+#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL)
-+#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL)
++void
++nfs4_pnfs_state_init(void)
++{
++ int i;
+
-+typedef pan_uint16_t pan_mgr_type_t;
-+typedef pan_uint64_t pan_mgr_id_t;
++ for (i = 0; i < CLIENT_HASH_SIZE; i++)
++ INIT_LIST_HEAD(&mds_clid_hashtbl[i]);
+
-+#define PAN_MGR_SM ((pan_mgr_type_t) 2U)
-+#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U)
++ for (i = 0; i < STATEID_HASH_SIZE; i++)
++ INIT_LIST_HEAD(&ds_stid_hashtbl[i]);
+
-+/*
-+ * from pan_mgr_types_c.h
-+ */
-+#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \
-+ pan_mgr_id_t _id1, _id2; \
-+\
-+ _id1 = (_mgr_type_); \
-+ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \
-+ _id1 &= PAN_MGR_ID_TYPE_MASK; \
-+ _id2 = (_mgr_uniq_); \
-+ _id2 &= PAN_MGR_ID_UNIQ_MASK; \
-+ _id1 |= _id2; \
-+ *(_mgr_id_p_) = _id1; \
++ INIT_LIST_HEAD(&mds_id_tbl);
+}
+
-+/*
-+ * from pan_storage_c.h
-+ */
-+#define pan_stor_is_device_id_an_obsd_id(_device_id_) \
-+ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \
-+ == PAN_MGR_OBSD)
++static struct pnfs_mds_id *
++find_pnfs_mds_id(u32 mdsid)
++{
++ struct pnfs_mds_id *local = NULL;
+
-+/*
-+ * pnfs_shim internal definitions
-+ */
++ dprintk("pNFSD: %s\n", __func__);
++ list_for_each_entry(local, &mds_id_tbl, di_hash) {
++ if (local->di_mdsid == mdsid)
++ return local;
++ }
++ return NULL;
++}
+
-+struct panfs_shim_io_state {
-+ struct objlayout_io_state ol_state;
++static struct pnfs_ds_clientid *
++find_pnfs_ds_clientid(const clientid_t *clid)
++{
++ struct pnfs_ds_clientid *local = NULL;
++ unsigned int hashval;
+
-+ pan_sg_entry_t *sg_list;
-+ pan_sam_obj_sec_t obj_sec;
-+ void *ucreds;
-+ union {
-+ struct {
-+ pan_sam_read_args_t args;
-+ pan_sam_read_res_t res;
-+ } read;
-+ struct {
-+ pan_sam_write_args_t args;
-+ pan_sam_write_res_t res;
-+ } write;
-+ } u;
-+};
++ dprintk("pNFSD: %s\n", __func__);
+
-+#endif /* _PANLAYOUT_PANFS_SHIM_H */
-diff -up linux-2.6.35.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.35.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
---- linux-2.6.35.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-09-30 12:25:08.391280000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-09-30 12:25:08.392286000 -0400
-@@ -0,0 +1,435 @@
-+/*
-+ * pnfs_osd_xdr.c
-+ *
-+ * Object-Based pNFS Layout XDR layer
-+ *
-+ * Copyright (C) 2007-2009 Panasas Inc.
-+ * All rights reserved.
-+ *
-+ * Benny Halevy <bhalevy at panasas.com>
-+ *
-+ * This program is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License version 2
-+ * See the file COPYING included with this distribution for more details.
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ * 2. Redistributions in binary form must reproduce the above copyright
-+ * notice, this list of conditions and the following disclaimer in the
-+ * documentation and/or other materials provided with the distribution.
-+ * 3. Neither the name of the Panasas company nor the names of its
-+ * contributors may be used to endorse or promote products derived
-+ * from this software without specific prior written permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ */
++ hashval = clientid_hashval(clid->cl_id);
++ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) {
++ if (cmp_clid(&local->dc_mdsclid, clid))
++ return local;
++ }
++ return NULL;
++}
+
-+#include <linux/pnfs_osd_xdr.h>
++static struct pnfs_ds_stateid *
++find_pnfs_ds_stateid(stateid_t *stid)
++{
++ struct pnfs_ds_stateid *local = NULL;
++ u32 st_id = stid->si_stateownerid;
++ u32 f_id = stid->si_fileid;
++ unsigned int hashval;
+
-+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
++ dprintk("pNFSD: %s\n", __func__);
+
-+/*
-+ * The following implementation is based on these Internet Drafts:
-+ *
-+ * draft-ietf-nfsv4-minorversion-21
-+ * draft-ietf-nfsv4-pnfs-obj-12
-+ */
++ hashval = stateid_hashval(st_id, f_id);
++ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash)
++ if ((local->ds_stid.si_stateownerid == st_id) &&
++ (local->ds_stid.si_fileid == f_id) &&
++ (local->ds_stid.si_boot == stid->si_boot)) {
++ stateid_t *sid = &local->ds_stid;
++ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n",
++ __func__, local, local->ds_flags,
++ STATEID_VAL(sid));
++ return local;
++ }
++ return NULL;
++}
+
-+/*
-+ * struct pnfs_osd_objid {
-+ * struct pnfs_deviceid oid_device_id;
-+ * u64 oid_partition_id;
-+ * u64 oid_object_id;
-+ * };
-+ */
-+static inline u32 *
-+pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid)
++static void
++release_ds_mdsid(struct kref *kref)
+{
-+ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data));
-+ READ64(objid->oid_partition_id);
-+ READ64(objid->oid_object_id);
-+ return p;
++ struct pnfs_mds_id *mdp =
++ container_of(kref, struct pnfs_mds_id, di_ref);
++ dprintk("pNFSD: %s\n", __func__);
++
++ list_del(&mdp->di_hash);
++ list_del(&mdp->di_mdsclid);
++ kfree(mdp);
+}
+
-+static inline u32 *
-+pnfs_osd_xdr_decode_opaque_cred(u32 *p,
-+ struct pnfs_osd_opaque_cred *opaque_cred)
++static void
++release_ds_clientid(struct kref *kref)
+{
-+ READ32(opaque_cred->cred_len);
-+ COPYMEM(opaque_cred->cred, opaque_cred->cred_len);
-+ return p;
++ struct pnfs_ds_clientid *dcp =
++ container_of(kref, struct pnfs_ds_clientid, dc_ref);
++ struct pnfs_mds_id *mdp;
++ dprintk("pNFSD: %s\n", __func__);
++
++ mdp = find_pnfs_mds_id(dcp->dc_mdsid);
++ if (mdp)
++ put_ds_mdsid(mdp);
++
++ list_del(&dcp->dc_hash);
++ list_del(&dcp->dc_stateid);
++ list_del(&dcp->dc_permdsid);
++ kfree(dcp);
+}
+
-+/*
-+ * struct pnfs_osd_object_cred {
-+ * struct pnfs_osd_objid oc_object_id;
-+ * u32 oc_osd_version;
-+ * u32 oc_cap_key_sec;
-+ * struct pnfs_osd_opaque_cred oc_cap_key
-+ * struct pnfs_osd_opaque_cred oc_cap;
-+ * };
-+ */
-+static inline u32 *
-+pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp,
-+ u8 **credp)
++static void
++release_ds_stateid(struct kref *kref)
+{
-+ u8 *cred;
-+
-+ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id);
-+ READ32(comp->oc_osd_version);
-+ READ32(comp->oc_cap_key_sec);
++ struct pnfs_ds_stateid *dsp =
++ container_of(kref, struct pnfs_ds_stateid, ds_ref);
++ struct pnfs_ds_clientid *dcp;
++ dprintk("pNFS %s: dsp %p\n", __func__, dsp);
+
-+ cred = *credp;
-+ comp->oc_cap_key.cred = cred;
-+ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key);
-+ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len));
-+ comp->oc_cap.cred = cred;
-+ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap);
-+ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len));
-+ *credp = cred;
++ dcp = find_pnfs_ds_clientid(&dsp->ds_mdsclid);
++ if (dcp)
++ put_ds_clientid(dcp);
+
-+ return p;
++ list_del(&dsp->ds_hash);
++ list_del(&dsp->ds_perclid);
++ kfree(dsp);
+}
+
-+/*
-+ * struct pnfs_osd_data_map {
-+ * u32 odm_num_comps;
-+ * u64 odm_stripe_unit;
-+ * u32 odm_group_width;
-+ * u32 odm_group_depth;
-+ * u32 odm_mirror_cnt;
-+ * u32 odm_raid_algorithm;
-+ * };
-+ */
-+static inline u32 *
-+pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map)
++static inline void
++put_ds_clientid(struct pnfs_ds_clientid *dcp)
+{
-+ READ32(data_map->odm_num_comps);
-+ READ64(data_map->odm_stripe_unit);
-+ READ32(data_map->odm_group_width);
-+ READ32(data_map->odm_group_depth);
-+ READ32(data_map->odm_mirror_cnt);
-+ READ32(data_map->odm_raid_algorithm);
-+ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
-+ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
-+ __func__,
-+ data_map->odm_num_comps,
-+ (unsigned long long)data_map->odm_stripe_unit,
-+ data_map->odm_group_width,
-+ data_map->odm_group_depth,
-+ data_map->odm_mirror_cnt,
-+ data_map->odm_raid_algorithm);
-+ return p;
++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
++ atomic_read(&dcp->dc_ref.refcount));
++ kref_put(&dcp->dc_ref, release_ds_clientid);
+}
+
-+struct pnfs_osd_layout *
-+pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p)
++static inline void
++get_ds_clientid(struct pnfs_ds_clientid *dcp)
+{
-+ int i;
-+ u32 *start = p;
-+ struct pnfs_osd_object_cred *comp;
-+ u8 *cred;
++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp,
++ atomic_read(&dcp->dc_ref.refcount));
++ kref_get(&dcp->dc_ref);
++}
+
-+ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map);
-+ READ32(layout->olo_comps_index);
-+ READ32(layout->olo_num_comps);
-+ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1);
-+ comp = layout->olo_comps;
-+ cred = (u8 *)(comp + layout->olo_num_comps);
-+ dprintk("%s: comps_index=%u num_comps=%u\n",
-+ __func__, layout->olo_comps_index, layout->olo_num_comps);
-+ for (i = 0; i < layout->olo_num_comps; i++) {
-+ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred);
-+ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx "
-+ "key_len=%u cap_len=%u\n",
-+ __func__, i,
-+ _DEVID_LO(&comp->oc_object_id.oid_device_id),
-+ _DEVID_HI(&comp->oc_object_id.oid_device_id),
-+ comp->oc_object_id.oid_partition_id,
-+ comp->oc_object_id.oid_object_id,
-+ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
-+ comp++;
-+ }
-+ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__,
-+ (char *)p - (char *)start, cred, (char *)cred - (char *)layout);
-+ return layout;
++static inline void
++put_ds_mdsid(struct pnfs_mds_id *mdp)
++{
++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
++ atomic_read(&mdp->di_ref.refcount));
++ kref_put(&mdp->di_ref, release_ds_mdsid);
+}
+
-+/*
-+ * Get Device Information Decoding
-+ *
-+ * Note: since Device Information is currently done synchronously, most
-+ * of the actual fields are left inside the rpc buffer and are only
-+ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
-+ * should not be freed while the returned information is in use.
-+ */
++static inline void
++get_ds_mdsid(struct pnfs_mds_id *mdp)
++{
++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp,
++ atomic_read(&mdp->di_ref.refcount));
++ kref_get(&mdp->di_ref);
++}
+
-+u32 *__xdr_read_calc_nfs4_string(
-+ u32 *p, struct nfs4_string *str, u8 **freespace)
++static inline void
++put_ds_stateid(struct pnfs_ds_stateid *dsp)
+{
-+ u32 len;
-+ char *data;
-+ bool need_copy;
++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
++ atomic_read(&dsp->ds_ref.refcount));
++ kref_put(&dsp->ds_ref, release_ds_stateid);
++}
+
-+ READ32(len);
-+ data = (char *)p;
++static inline void
++get_ds_stateid(struct pnfs_ds_stateid *dsp)
++{
++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp,
++ atomic_read(&dsp->ds_ref.refcount));
++ kref_get(&dsp->ds_ref);
++}
+
-+ if (data[len]) { /* Not null terminated we'll need extra space */
-+ data = *freespace;
-+ *freespace += len + 1;
-+ need_copy = true;
-+ } else {
-+ need_copy = false;
-+ }
++void
++nfs4_pnfs_state_shutdown(void)
++{
++ struct pnfs_ds_stateid *dsp;
++ int i;
+
-+ if (str) {
-+ str->len = len;
-+ str->data = data;
-+ if (need_copy) {
-+ memcpy(data, p, len);
-+ data[len] = 0;
++ dprintk("pNFSD %s: -->\n", __func__);
++
++ ds_lock_state();
++ for (i = 0; i < STATEID_HASH_SIZE; i++) {
++ while (!list_empty(&ds_stid_hashtbl[i])) {
++ dsp = list_entry(ds_stid_hashtbl[i].next,
++ struct pnfs_ds_stateid, ds_hash);
++ put_ds_stateid(dsp);
+ }
+ }
-+
-+ p += XDR_QUADLEN(len);
-+ return p;
++ ds_unlock_state();
+}
+
-+u32 *__xdr_read_calc_u8_opaque(
-+ u32 *p, struct nfs4_string *str)
++static struct pnfs_mds_id *
++alloc_init_mds_id(struct pnfs_get_state *gsp)
+{
-+ u32 len;
-+
-+ READ32(len);
++ struct pnfs_mds_id *mdp;
+
-+ if (str) {
-+ str->len = len;
-+ str->data = (char *)p;
-+ }
++ dprintk("pNFSD: %s\n", __func__);
+
-+ p += XDR_QUADLEN(len);
-+ return p;
++ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL);
++ if (!mdp)
++ return NULL;
++ INIT_LIST_HEAD(&mdp->di_hash);
++ INIT_LIST_HEAD(&mdp->di_mdsclid);
++ list_add(&mdp->di_hash, &mds_id_tbl);
++ mdp->di_mdsid = gsp->dsid;
++ mdp->di_mdsboot = 0;
++ kref_init(&mdp->di_ref);
++ return mdp;
+}
+
-+/*
-+ * struct pnfs_osd_targetid {
-+ * u32 oti_type;
-+ * struct nfs4_string oti_scsi_device_id;
-+ * };
-+ */
-+u32 *__xdr_read_calc_targetid(
-+ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace)
++static struct pnfs_ds_clientid *
++alloc_init_ds_clientid(struct pnfs_get_state *gsp)
+{
-+ u32 oti_type;
++ struct pnfs_mds_id *mdp;
++ struct pnfs_ds_clientid *dcp;
++ clientid_t *clid = (clientid_t *)&gsp->clid;
++ unsigned int hashval = clientid_hashval(clid->cl_id);
+
-+ READ32(oti_type);
-+ if (targetid)
-+ targetid->oti_type = oti_type;
++ dprintk("pNFSD: %s\n", __func__);
+
-+ switch (oti_type) {
-+ case OBJ_TARGET_SCSI_NAME:
-+ case OBJ_TARGET_SCSI_DEVICE_ID:
-+ p = __xdr_read_calc_u8_opaque(p,
-+ targetid ? &targetid->oti_scsi_device_id : NULL);
++ mdp = find_pnfs_mds_id(gsp->dsid);
++ if (!mdp) {
++ mdp = alloc_init_mds_id(gsp);
++ if (!mdp)
++ return NULL;
++ } else {
++ get_ds_mdsid(mdp);
+ }
+
-+ return p;
++ dcp = kmalloc(sizeof(*dcp), GFP_KERNEL);
++ if (!dcp)
++ return NULL;
++
++ INIT_LIST_HEAD(&dcp->dc_hash);
++ INIT_LIST_HEAD(&dcp->dc_stateid);
++ INIT_LIST_HEAD(&dcp->dc_permdsid);
++ list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]);
++ list_add(&dcp->dc_permdsid, &mdp->di_mdsclid);
++ dcp->dc_mdsclid = *clid;
++ kref_init(&dcp->dc_ref);
++ dcp->dc_mdsid = gsp->dsid;
++ return dcp;
+}
+
-+/*
-+ * struct pnfs_osd_net_addr {
-+ * struct nfs4_string r_netid;
-+ * struct nfs4_string r_addr;
-+ * };
-+ */
-+u32 *__xdr_read_calc_net_addr(
-+ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace)
++static struct pnfs_ds_stateid *
++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp)
+{
++ struct pnfs_ds_stateid *dsp;
++ u32 st_id = stidp->si_stateownerid;
++ u32 f_id = stidp->si_fileid;
++ unsigned int hashval;
+
-+ p = __xdr_read_calc_nfs4_string(p,
-+ netaddr ? &netaddr->r_netid : NULL,
-+ freespace);
++ dprintk("pNFSD: %s\n", __func__);
+
-+ p = __xdr_read_calc_nfs4_string(p,
-+ netaddr ? &netaddr->r_addr : NULL,
-+ freespace);
++ dsp = kmalloc(sizeof(*dsp), GFP_KERNEL);
++ if (!dsp)
++ return dsp;
+
-+ return p;
++ INIT_LIST_HEAD(&dsp->ds_hash);
++ INIT_LIST_HEAD(&dsp->ds_perclid);
++ memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t));
++ fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle);
++ dsp->ds_access = 0;
++ dsp->ds_status = 0;
++ dsp->ds_flags = 0L;
++ kref_init(&dsp->ds_ref);
++ set_bit(DS_STATEID_NEW, &dsp->ds_flags);
++ clear_bit(DS_STATEID_VALID, &dsp->ds_flags);
++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++ init_waitqueue_head(&dsp->ds_waitq);
++
++ hashval = stateid_hashval(st_id, f_id);
++ list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]);
++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++ return dsp;
+}
+
-+/*
-+ * struct pnfs_osd_targetaddr {
-+ * u32 ota_available;
-+ * struct pnfs_osd_net_addr ota_netaddr;
-+ * };
-+ */
-+u32 *__xdr_read_calc_targetaddr(
-+ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace)
++static int
++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh,
++ struct pnfs_get_state *gsp)
+{
-+ u32 ota_available;
++ struct pnfs_ds_clientid *dcp;
++ int new = 0;
+
-+ READ32(ota_available);
-+ if (targetaddr)
-+ targetaddr->ota_available = ota_available;
++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
+
-+ if (ota_available) {
-+ p = __xdr_read_calc_net_addr(p,
-+ targetaddr ? &targetaddr->ota_netaddr : NULL,
-+ freespace);
++ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid);
++ if (!dcp) {
++ dcp = alloc_init_ds_clientid(gsp);
++ if (!dcp)
++ return 1;
++ new = 1;
++ }
++ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) {
++ list_add(&dsp->ds_perclid, &dcp->dc_stateid);
++ if (!new)
++ get_ds_clientid(dcp);
+ }
+
-+ return p;
++ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t));
++ dsp->ds_access = gsp->access;
++ dsp->ds_status = 0;
++ dsp->ds_verifier[0] = gsp->verifier[0];
++ dsp->ds_verifier[1] = gsp->verifier[1];
++ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t));
++ set_bit(DS_STATEID_VALID, &dsp->ds_flags);
++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++ clear_bit(DS_STATEID_NEW, &dsp->ds_flags);
++ return 0;
+}
+
-+/*
-+ * struct pnfs_osd_deviceaddr {
-+ * struct pnfs_osd_targetid oda_targetid;
-+ * struct pnfs_osd_targetaddr oda_targetaddr;
-+ * u8 oda_lun[8];
-+ * struct nfs4_string oda_systemid;
-+ * struct pnfs_osd_object_cred oda_root_obj_cred;
-+ * struct nfs4_string oda_osdname;
-+ * };
++/*
++ * Look up the DS stateid named by gs->stid while holding the DS state
++ * lock and, if one is found, drop a reference on it with
++ * put_ds_stateid().
++ *
++ * Returns 0 if a matching stateid was found, -ENOENT otherwise.
++ */
++int
++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs)
++{
++ stateid_t *stid = (stateid_t *)&gs->stid;
++ struct pnfs_ds_stateid *dsp;
++
++ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__,
++ STATEID_VAL(stid));
++
++ ds_lock_state();
++ dsp = find_pnfs_ds_stateid(stid);
++ if (dsp)
++ put_ds_stateid(dsp);
++ ds_unlock_state();
++
++ /* From here on dsp is only printed and tested against NULL;
++ * it is not dereferenced after the put above. */
++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++
++ if (dsp)
++ return 0;
++ return -ENOENT;
++}
++
++/* Retrieves and validates stateid.
++ * If stateid exists and its fields match, return it.
++ * If stateid exists but either the generation or
++ * ownerids don't match, check with mds to see if it is valid.
++ * If the stateid doesn't exist, the first thread creates an
++ * invalid *marker* stateid, then checks to see if the
++ * stateid exists on the mds. If so, it validates the *marker*
++ * stateid and updates its fields. Subsequent threads that
++ * find the *marker* stateid wait until it is valid or an error
++ * occurs.
++ * Called with ds_state_lock.
+ */
-+u32 *__xdr_read_calc_deviceaddr(
-+ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace)
++static struct pnfs_ds_stateid *
++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
+{
-+ p = __xdr_read_calc_targetid(p,
-+ deviceaddr ? &deviceaddr->oda_targetid : NULL,
-+ freespace);
++ struct inode *ino = cfh->fh_dentry->d_inode;
++ struct super_block *sb;
++ struct pnfs_ds_stateid *dsp = NULL;
++ struct pnfs_get_state gs = {
++ .access = 0,
++ };
++ int status = 0, waiter = 0;
+
-+ p = __xdr_read_calc_targetaddr(p,
-+ deviceaddr ? &deviceaddr->oda_targetaddr : NULL,
-+ freespace);
++ dprintk("pNFSD: %s -->\n", __func__);
+
-+ if (deviceaddr)
-+ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun));
-+ else
-+ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun));
++ dsp = find_pnfs_ds_stateid(stidp);
++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
++ (stidp->si_generation == dsp->ds_stid.si_generation))
++ goto out_noput;
+
-+ p = __xdr_read_calc_u8_opaque(p,
-+ deviceaddr ? &deviceaddr->oda_systemid : NULL);
++ sb = ino->i_sb;
++ if (!sb || !sb->s_pnfs_op->get_state)
++ goto out_noput;
+
-+ if (deviceaddr) {
-+ p = pnfs_osd_xdr_decode_object_cred(p,
-+ &deviceaddr->oda_root_obj_cred, freespace);
++ /* Uninitialize current state if it exists yet it doesn't match.
++ * If it is already invalid, another thread is checking state */
++ if (dsp) {
++ if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags))
++ waiter = 1;
+ } else {
-+ *freespace += pnfs_osd_object_cred_incore_sz(p);
-+ p += pnfs_osd_object_cred_xdr_sz(p);
++ dsp = alloc_init_ds_stateid(cfh, stidp);
++ if (!dsp)
++ goto out_noput;
+ }
+
-+ p = __xdr_read_calc_u8_opaque(p,
-+ deviceaddr ? &deviceaddr->oda_osdname : NULL);
-+
-+ return p;
-+}
-+
-+size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p)
-+{
-+ u8 *null_freespace = NULL;
-+ size_t sz;
++ dprintk("pNFSD: %s Starting loop\n", __func__);
++ get_ds_stateid(dsp);
++ while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++ ds_unlock_state();
+
-+ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace);
-+ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace;
++ /* Another thread is checking the state */
++ if (waiter) {
++ dprintk("pNFSD: %s waiting\n", __func__);
++ wait_event_interruptible_timeout(dsp->ds_waitq,
++ (test_bit(DS_STATEID_VALID, &dsp->ds_flags) ||
++ test_bit(DS_STATEID_ERROR, &dsp->ds_flags)),
++ msecs_to_jiffies(1024));
++ dprintk("pNFSD: %s awake\n", __func__);
++ ds_lock_state();
++ if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++ goto out;
+
-+ return sz;
-+}
++ continue;
++ }
+
-+void pnfs_osd_xdr_decode_deviceaddr(
-+ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p)
-+{
-+ u8 *freespace = (u8 *)(deviceaddr + 1);
++ /* Validate stateid on mds */
++ dprintk("pNFSD: %s Checking state on MDS\n", __func__);
++ memcpy(&gs.stid, stidp, sizeof(stateid_t));
++ status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs);
++ dprintk("pNFSD: %s from MDS status %d\n", __func__, status);
++ ds_lock_state();
++ /* if !status and stateid is valid, update id and mark valid */
++ if (status || update_ds_stateid(dsp, cfh, &gs)) {
++ set_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++ /* remove invalid stateid from list */
++ put_ds_stateid(dsp);
++ wake_up(&dsp->ds_waitq);
++ goto out;
++ }
+
-+ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace);
++ wake_up(&dsp->ds_waitq);
++ }
++out:
++ if (dsp)
++ put_ds_stateid(dsp);
++out_noput:
++ if (dsp)
++ dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n",
++ __func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid));
++ /* If error, return null */
++ if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags))
++ dsp = NULL;
++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++ return dsp;
+}
+
-+/*
-+ * struct pnfs_osd_layoutupdate {
-+ * u32 dsu_valid;
-+ * s64 dsu_delta;
-+ * u32 olu_ioerr_flag;
-+ * };
-+ */
+int
-+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
-+ struct pnfs_osd_layoutupdate *lou)
++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid)
+{
-+ __be32 *p = xdr_reserve_space(xdr, 16);
-+
-+ if (!p)
-+ return -E2BIG;
++ struct pnfs_ds_stateid *dsp;
++ int status = 0;
+
-+ *p++ = cpu_to_be32(lou->dsu_valid);
-+ if (lou->dsu_valid)
-+ p = xdr_encode_hyper(p, lou->dsu_delta);
-+ *p++ = cpu_to_be32(lou->olu_ioerr_flag);
-+ return 0;
-+}
++ dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__,
++ STATEID_VAL(stateid));
+
-+/*
-+ * struct pnfs_osd_objid {
-+ * struct pnfs_deviceid oid_device_id;
-+ * u64 oid_partition_id;
-+ * u64 oid_object_id;
-+ */
-+static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr,
-+ struct pnfs_osd_objid *object_id)
-+{
-+ __be32 *p;
++ /* Must release state lock while verifying stateid on mds */
++ nfs4_unlock_state();
++ ds_lock_state();
++ dsp = nfsv4_ds_get_state(cfh, stateid);
++ if (dsp) {
++ get_ds_stateid(dsp);
++ dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__,
++ STATEID_VAL(&dsp->ds_stid));
+
-+ p = xdr_reserve_space(xdr, 32);
-+ if (!p)
-+ return -E2BIG;
++ dprintk("NFSD: %s: dsp %p fh_size %u:%u "
++ "fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] "
++ "gen %x:%x\n",
++ __func__, dsp,
++ cfh->fh_handle.fh_size, dsp->ds_fh.fh_size,
++ ((unsigned *)&cfh->fh_handle.fh_base)[0],
++ ((unsigned *)&cfh->fh_handle.fh_base)[1],
++ ((unsigned *)&cfh->fh_handle.fh_base)[2],
++ ((unsigned *)&cfh->fh_handle.fh_base)[3],
++ ((unsigned *)&dsp->ds_fh.fh_base)[0],
++ ((unsigned *)&dsp->ds_fh.fh_base)[1],
++ ((unsigned *)&dsp->ds_fh.fh_base)[2],
++ ((unsigned *)&dsp->ds_fh.fh_base)[3],
++ stateid->si_generation, dsp->ds_stid.si_generation);
++ }
+
-+ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
-+ sizeof(object_id->oid_device_id.data));
-+ p = xdr_encode_hyper(p, object_id->oid_partition_id);
-+ p = xdr_encode_hyper(p, object_id->oid_object_id);
++ if (!dsp ||
++ (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) ||
++ (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base,
++ dsp->ds_fh.fh_size) != 0) ||
++ (stateid->si_generation > dsp->ds_stid.si_generation))
++ status = nfserr_bad_stateid;
++ else if (stateid->si_generation < dsp->ds_stid.si_generation)
++ status = nfserr_old_stateid;
+
-+ return 0;
++ if (dsp)
++ put_ds_stateid(dsp);
++ ds_unlock_state();
++ nfs4_lock_state();
++ dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status));
++ return status;
+}
+
-+/*
-+ * struct pnfs_osd_ioerr {
-+ * struct pnfs_osd_objid oer_component;
-+ * u64 oer_comp_offset;
-+ * u64 oer_comp_length;
-+ * u32 oer_iswrite;
-+ * u32 oer_errno;
-+ * };
-+ */
-+int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
-+ struct pnfs_osd_ioerr *ioerr)
++/*
++ * Store the two-word verifier for @stateid at @p.
++ *
++ * If @stateid is non-NULL and a DS stateid with DS_STATEID_VALID set is
++ * found for it, the verifier recorded in that stateid (ds_verifier[])
++ * is copied out.  Otherwise the filesystem's
++ * sb->s_pnfs_op->get_verifier() is called with the DS state lock
++ * dropped.  sb->s_pnfs_op->get_verifier is not checked here; the
++ * caller must ensure it is set.
++ */
++void
++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p)
+{
-+ __be32 *p;
-+ int ret;
-+
-+ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component);
-+ if (ret)
-+ return ret;
++ struct pnfs_ds_stateid *dsp = NULL;
+
-+ p = xdr_reserve_space(xdr, 24);
-+ if (!p)
-+ return -E2BIG;
++ dprintk("pNFSD: %s --> stid %p\n", __func__, stateid);
+
-+ p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
-+ p = xdr_encode_hyper(p, ioerr->oer_comp_length);
-+ *p++ = cpu_to_be32(ioerr->oer_iswrite);
-+ *p = cpu_to_be32(ioerr->oer_errno);
++ ds_lock_state();
++ if (stateid != NULL) {
++ dsp = find_pnfs_ds_stateid(stateid);
++ if (dsp)
++ get_ds_stateid(dsp);
++ }
+
-+ return 0;
++ /* XXX: Should we fetch the stateid or wait if some other
++ * thread is currently retrieving the stateid ? */
++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) {
++ *p++ = dsp->ds_verifier[0];
++ *p++ = dsp->ds_verifier[1];
++ } else {
++ /* must be on MDS */
++ ds_unlock_state();
++ sb->s_pnfs_op->get_verifier(sb, p);
++ ds_lock_state();
++ p += 2;
++ }
++ /* Balance get_ds_stateid() above on both paths: previously a
++ * stateid that was found but not valid kept its reference. */
++ if (dsp)
++ put_ds_stateid(dsp);
++ ds_unlock_state();
++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++ return;
+}
-diff -up linux-2.6.35.noarch/fs/nfs/pagelist.c.orig linux-2.6.35.noarch/fs/nfs/pagelist.c
---- linux-2.6.35.noarch/fs/nfs/pagelist.c.orig 2010-09-30 12:22:45.186045000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/pagelist.c 2010-09-30 12:25:08.413285000 -0400
-@@ -20,6 +20,7 @@
- #include <linux/nfs_mount.h>
-
- #include "internal.h"
-+#include "pnfs.h"
++
++#endif /* CONFIG_PNFSD */
+diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
+index 59ec449..00cbf11 100644
+--- a/fs/nfsd/nfs4proc.c
++++ b/fs/nfsd/nfs4proc.c
+@@ -34,10 +34,14 @@
+ */
+ #include <linux/file.h>
+ #include <linux/slab.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
++#include <linux/nfsd4_spnfs.h>
++#include <linux/nfsd4_block.h>
- static struct kmem_cache *nfs_page_cachep;
+ #include "cache.h"
+ #include "xdr4.h"
+ #include "vfs.h"
++#include "pnfsd.h"
-@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p)
- struct nfs_page *
- nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
- struct page *page,
-- unsigned int offset, unsigned int count)
-+ unsigned int offset, unsigned int count,
-+ struct pnfs_layout_segment *lseg)
- {
- struct nfs_page *req;
+ #define NFSDDBG_FACILITY NFSDDBG_PROC
-@@ -81,6 +83,9 @@ nfs_create_request(struct nfs_open_conte
- req->wb_context = get_nfs_open_context(ctx);
- req->wb_lock_context = nfs_get_lock_context(ctx);
- kref_init(&req->wb_kref);
-+ req->wb_lseg = lseg;
-+ if (lseg)
-+ get_lseg(lseg);
- return req;
+@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
+ */
+ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
++#if defined(CONFIG_SPNFS)
++ if (!status && spnfs_enabled()) {
++ struct inode *inode = cstate->current_fh.fh_dentry->d_inode;
++
++ status = spnfs_open(inode, open);
++ if (status) {
++ dprintk(
++ "nfsd: pNFS could not be enabled for inode: %lu\n",
++ inode->i_ino);
++ /*
++ * XXX When there's a failure then need to indicate to
++ * future ops that no pNFS is available. Should I save
++ * the status in the inode? It's kind of a big hammer.
++ * But there may be no stripes available?
++ */
++ }
++ }
++#endif /* CONFIG_SPNFS */
+ out:
+ if (open->op_stateowner) {
+ nfs4_get_stateowner(open->op_stateowner);
+@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ &access->ac_supported);
}
-@@ -156,9 +161,12 @@ void nfs_clear_request(struct nfs_page *
- put_nfs_open_context(ctx);
- req->wb_context = NULL;
- }
-+ if (req->wb_lseg != NULL) {
-+ put_lseg(req->wb_lseg);
-+ req->wb_lseg = NULL;
++/*
++ * Fill in @verf with the verifier to return to the client.
++ *
++ * With CONFIG_PNFSD, if the superblock provides
++ * s_pnfs_op->get_verifier, the verifier is obtained through
++ * nfs4_ds_get_verifier() (called with a NULL stateid).  Otherwise the
++ * two words are taken from nfssvc_boot (tv_sec, tv_usec).
++ */
++static void
++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf)
++{
++ u32 *p = (u32 *)verf->data;
++
++#if defined(CONFIG_PNFSD)
++ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) {
++ nfs4_ds_get_verifier(NULL, sb, p);
++ return;
+ }
- }
++#endif /* CONFIG_PNFSD */
++
++ /* Default: server boot time serves as the verifier */
++ *p++ = nfssvc_boot.tv_sec;
++ *p++ = nfssvc_boot.tv_usec;
++}
++
+ static __be32
+ nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_commit *commit)
+ {
+ __be32 status;
+- u32 *p = (u32 *)commit->co_verf.data;
+- *p++ = nfssvc_boot.tv_sec;
+- *p++ = nfssvc_boot.tv_usec;
-
- /**
- * nfs_release_request - Release the count on an NFS read/write request
- * @req: request to release
-@@ -237,7 +245,8 @@ void nfs_pageio_init(struct nfs_pageio_d
- * Return 'true' if this is the case, else return 'false'.
- */
- static int nfs_can_coalesce_requests(struct nfs_page *prev,
-- struct nfs_page *req)
-+ struct nfs_page *req,
-+ struct nfs_pageio_descriptor *pgio)
++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb,
++ &commit->co_verf);
+ status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+ commit->co_count);
+ if (status == nfserr_symlink)
+@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
- if (req->wb_context->cred != prev->wb_context->cred)
- return 0;
-@@ -251,6 +260,12 @@ static int nfs_can_coalesce_requests(str
- return 0;
- if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
- return 0;
-+ if (req->wb_lseg != prev->wb_lseg)
-+ return 0;
-+#ifdef CONFIG_NFS_V4_1
-+ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
-+ return 0;
-+#endif /* CONFIG_NFS_V4_1 */
- return 1;
- }
+ stateid_t *stateid = &write->wr_stateid;
+ struct file *filp = NULL;
+- u32 *p;
+ __be32 status = nfs_ok;
+ unsigned long cnt;
-@@ -283,7 +298,7 @@ static int nfs_pageio_do_add_request(str
- if (newlen > desc->pg_bsize)
- return 0;
- prev = nfs_list_entry(desc->pg_list.prev);
-- if (!nfs_can_coalesce_requests(prev, req))
-+ if (!nfs_can_coalesce_requests(prev, req, desc))
- return 0;
- } else
- desc->pg_base = req->wb_pgbase;
-@@ -372,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs
- * @idx_start: lower bound of page->index to scan
- * @npages: idx_start + npages sets the upper bound to scan.
- * @tag: tag to scan for
-+ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver
- *
- * Moves elements from one of the inode request lists.
- * If the number of requests is set to 0, the entire address_space
-@@ -381,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs
- */
- int nfs_scan_list(struct nfs_inode *nfsi,
- struct list_head *dst, pgoff_t idx_start,
-- unsigned int npages, int tag)
-+ unsigned int npages, int tag, int *use_pnfs)
- {
- struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
- struct nfs_page *req;
-@@ -412,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi
- radix_tree_tag_clear(&nfsi->nfs_page_tree,
- req->wb_index, tag);
- nfs_list_add_request(req, dst);
-+ if (req->wb_lseg)
-+ *use_pnfs = 1;
- res++;
- if (res == INT_MAX)
- goto out;
-diff -up linux-2.6.35.noarch/fs/nfs/pnfs.c.orig linux-2.6.35.noarch/fs/nfs/pnfs.c
---- linux-2.6.35.noarch/fs/nfs/pnfs.c.orig 2010-09-30 12:25:08.417283000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/pnfs.c 2010-09-30 12:25:08.419282000 -0400
-@@ -0,0 +1,1762 @@
-+/*
-+ * pNFS functions to call and manage layout drivers.
-+ *
-+ * Copyright (c) 2002 [year of first publication]
-+ * The Regents of the University of Michigan
-+ * All Rights Reserved
-+ *
-+ * Dean Hildebrand <dhildebz at umich.edu>
-+ *
-+ * Permission is granted to use, copy, create derivative works, and
-+ * redistribute this software and such derivative works for any purpose,
-+ * so long as the name of the University of Michigan is not used in
-+ * any advertising or publicity pertaining to the use or distribution
-+ * of this software without specific, written prior authorization. If
-+ * the above copyright notice or any other identification of the
-+ * University of Michigan is included in any copy of any portion of
-+ * this software, then the disclaimer below must also be included.
-+ *
-+ * This software is provided as is, without representation or warranty
-+ * of any kind either express or implied, including without limitation
-+ * the implied warranties of merchantability, fitness for a particular
-+ * purpose, or noninfringement. The Regents of the University of
-+ * Michigan shall not be liable for any damages, including special,
-+ * indirect, incidental, or consequential damages, with respect to any
-+ * claim arising out of or in connection with the use of the software,
-+ * even if it has been or is hereafter advised of the possibility of
-+ * such damages.
-+ */
+@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+
+ cnt = write->wr_buflen;
+ write->wr_how_written = write->wr_stable_how;
+- p = (u32 *)write->wr_verifier.data;
+- *p++ = nfssvc_boot.tv_sec;
+- *p++ = nfssvc_boot.tv_usec;
+
++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb,
++ &write->wr_verifier);
++#if defined(CONFIG_SPNFS)
++#if defined(CONFIG_SPNFS_BLOCK)
++ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) {
++ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode,
++ RETURN_FILE, write->wr_offset, write->wr_buflen);
++ if (!status) {
++ status = nfsd_write(rqstp, &cstate->current_fh, filp,
++ write->wr_offset, rqstp->rq_vec, write->wr_vlen,
++ &cnt, &write->wr_how_written);
++ }
++ } else
++#endif
++
++ if (spnfs_enabled()) {
++ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode,
++ write->wr_offset, write->wr_buflen, write->wr_vlen,
++ rqstp);
++ if (status == nfs_ok) {
++ /* DMXXX: HACK to get filesize set */
++ /* write one byte at offset+length-1 */
++ struct kvec k[1];
++ char zero = 0;
++ unsigned long cnt = 1;
+
-+#include <linux/nfs_fs.h>
-+#include "internal.h"
-+#include "pnfs.h"
-+#include "iostat.h"
++ k[0].iov_base = (void *)&zero;
++ k[0].iov_len = 1;
++ nfsd_write(rqstp, &cstate->current_fh, filp,
++ write->wr_offset+write->wr_buflen-1, k, 1,
++ &cnt, &write->wr_how_written);
++ }
++ } else /* we're not an MDS */
++ status = nfsd_write(rqstp, &cstate->current_fh, filp,
++ write->wr_offset, rqstp->rq_vec, write->wr_vlen,
++ &cnt, &write->wr_how_written);
++#else
+ status = nfsd_write(rqstp, &cstate->current_fh, filp,
+ write->wr_offset, rqstp->rq_vec, write->wr_vlen,
+ &cnt, &write->wr_how_written);
++#endif /* CONFIG_SPNFS */
+
-+#define NFSDBG_FACILITY NFSDBG_PNFS
+ if (filp)
+ fput(filp);
+
+@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ return status == nfserr_same ? nfs_ok : status;
+ }
+
++#if defined(CONFIG_PNFSD)
+
-+/* Locking:
-+ *
-+ * pnfs_spinlock:
-+ * protects pnfs_modules_tbl.
-+ */
-+static DEFINE_SPINLOCK(pnfs_spinlock);
++static __be32
++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp,
++ unsigned int layout_type)
++{
++ int status, type;
+
-+/*
-+ * pnfs_modules_tbl holds all pnfs modules
-+ */
-+static LIST_HEAD(pnfs_modules_tbl);
++ /* check to see if pNFS is supported. */
++ status = nfserr_layoutunavailable;
++ if (exp && exp->ex_pnfs == 0) {
++ dprintk("%s: Underlying file system "
++ "is not exported over pNFS\n", __func__);
++ goto out;
++ }
++ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) {
++ dprintk("%s: Underlying file system "
++ "does not support pNFS\n", __func__);
++ goto out;
++ }
+
-+/* Return the registered pnfs layout driver module matching given id */
-+static struct pnfs_layoutdriver_type *
-+find_pnfs_driver_locked(u32 id)
-+{
-+ struct pnfs_layoutdriver_type *local;
++ type = sb->s_pnfs_op->layout_type(sb);
+
-+ list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
-+ if (local->id == id)
-+ goto out;
-+ local = NULL;
++ /* check to see if requested layout type is supported. */
++ status = nfserr_unknown_layouttype;
++ if (!type)
++ dprintk("BUG: %s: layout_type 0 is reserved and must not be "
++ "used by filesystem\n", __func__);
++ else if (type != layout_type)
++ dprintk("%s: requested layout type %d "
++ "does not match supported type %d\n",
++ __func__, layout_type, type);
++ else
++ status = nfs_ok;
+out:
-+ dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
-+ return local;
++ return status;
+}
+
-+static struct pnfs_layoutdriver_type *
-+find_pnfs_driver(u32 id)
++static __be32
++nfsd4_getdevlist(struct svc_rqst *rqstp,
++ struct nfsd4_compound_state *cstate,
++ struct nfsd4_pnfs_getdevlist *gdlp)
+{
-+ struct pnfs_layoutdriver_type *local;
++ struct super_block *sb;
++ struct svc_fh *current_fh = &cstate->current_fh;
++ int status;
+
-+ spin_lock(&pnfs_spinlock);
-+ local = find_pnfs_driver_locked(id);
-+ spin_unlock(&pnfs_spinlock);
-+ return local;
-+}
++ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n",
++ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices,
++ gdlp->gd_cookie, gdlp->gd_verf);
+
-+/* Set cred to indicate we require a layoutcommit
-+ * If we don't even have a layout, we don't need to commit it.
-+ */
-+void
-+pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx)
-+{
-+ dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx);
-+ spin_lock(&nfsi->vfs_inode.i_lock);
-+ if (has_layout(nfsi) &&
-+ !test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state)) {
-+ nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred);
-+ __set_bit(NFS_LAYOUT_NEED_LCOMMIT,
-+ &nfsi->layout->state);
-+ nfsi->change_attr++;
-+ spin_unlock(&nfsi->vfs_inode.i_lock);
-+ dprintk("%s: Set layoutcommit\n", __func__);
-+ return;
-+ }
-+ spin_unlock(&nfsi->vfs_inode.i_lock);
-+}
+
-+/* Update last_write_offset for layoutcommit.
-+ * TODO: We should only use commited extents, but the current nfs
-+ * implementation does not calculate the written range in nfs_commit_done.
-+ * We therefore update this field in writeback_done.
-+ */
-+void
-+pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent)
-+{
-+ loff_t end_pos;
++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
++ if (status)
++ goto out;
+
-+ spin_lock(&nfsi->vfs_inode.i_lock);
-+ if (offset < nfsi->layout->write_begin_pos)
-+ nfsi->layout->write_begin_pos = offset;
-+ end_pos = offset + extent - 1; /* I'm being inclusive */
-+ if (end_pos > nfsi->layout->write_end_pos)
-+ nfsi->layout->write_end_pos = end_pos;
-+ dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n",
-+ __func__,
-+ (unsigned long) extent,
-+ (unsigned long) offset ,
-+ (unsigned long) nfsi->layout->write_begin_pos,
-+ (unsigned long) nfsi->layout->write_end_pos);
-+ spin_unlock(&nfsi->vfs_inode.i_lock);
-+}
++ status = nfserr_inval;
++ sb = current_fh->fh_dentry->d_inode->i_sb;
++ if (!sb)
++ goto out;
+
-+void
-+unset_pnfs_layoutdriver(struct nfs_server *nfss)
-+{
-+ if (nfss->pnfs_curr_ld) {
-+ nfss->pnfs_curr_ld->uninitialize_mountpoint(nfss);
-+ module_put(nfss->pnfs_curr_ld->owner);
++ /* We must be able to encode at least one device */
++ if (!gdlp->gd_maxdevices)
++ goto out;
++
++ /* Ensure underlying file system supports pNFS and,
++ * if so, the requested layout type
++ */
++ status = nfsd4_layout_verify(sb, current_fh->fh_export,
++ gdlp->gd_layout_type);
++ if (status)
++ goto out;
++
++ /* Do nothing if underlying file system does not support
++ * getdevicelist */
++ if (!sb->s_pnfs_op->get_device_iter) {
++ status = nfserr_notsupp;
++ goto out;
+ }
-+ nfss->pnfs_curr_ld = NULL;
++
++ /* Set up arguments so device can be retrieved at encode time */
++ gdlp->gd_fhp = &cstate->current_fh;
++out:
++ return status;
+}
+
-+/*
-+ * Try to set the server's pnfs module to the pnfs layout type specified by id.
-+ * Currently only one pNFS layout driver per filesystem is supported.
-+ *
-+ * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
-+ */
-+void
-+set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
-+ u32 id)
++static __be32
++nfsd4_getdevinfo(struct svc_rqst *rqstp,
++ struct nfsd4_compound_state *cstate,
++ struct nfsd4_pnfs_getdevinfo *gdp)
+{
-+ struct pnfs_layoutdriver_type *ld_type = NULL;
++ struct super_block *sb;
++ int status;
++ clientid_t clid;
+
-+ if (id == 0)
-+ goto out_no_driver;
-+ if (!(server->nfs_client->cl_exchange_flags &
-+ (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
-+ printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
-+ id, server->nfs_client->cl_exchange_flags);
-+ goto out_no_driver;
-+ }
-+ ld_type = find_pnfs_driver(id);
-+ if (!ld_type) {
-+ request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
-+ ld_type = find_pnfs_driver(id);
-+ if (!ld_type) {
-+ dprintk("%s: No pNFS module found for %u.\n",
-+ __func__, id);
-+ goto out_no_driver;
-+ }
-+ }
-+ if (!try_module_get(ld_type->owner)) {
-+ dprintk("%s: Could not grab reference on module\n", __func__);
-+ goto out_no_driver;
-+ }
-+ server->pnfs_curr_ld = ld_type;
-+ if (ld_type->initialize_mountpoint(server, mntfh)) {
-+ printk(KERN_ERR
-+ "%s: Error initializing mount point for layout driver %u.\n",
-+ __func__, id);
-+ module_put(ld_type->owner);
-+ goto out_no_driver;
++ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n",
++ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid,
++ gdp->gd_devid.devid, gdp->gd_maxcount);
++
++ status = nfserr_inval;
++ sb = find_sbid_id(gdp->gd_devid.sbid);
++ dprintk("%s: sb %p\n", __func__, sb);
++ if (!sb) {
++ status = nfserr_noent;
++ goto out;
+ }
-+ dprintk("%s: pNFS module for %u set\n", __func__, id);
-+ return;
+
-+out_no_driver:
-+ dprintk("%s: Using NFSv4 I/O\n", __func__);
-+ server->pnfs_curr_ld = NULL;
++ /* Ensure underlying file system supports pNFS and,
++ * if so, the requested layout type
++ */
++ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type);
++ if (status)
++ goto out;
++
++ /* Set up arguments so device can be retrieved at encode time */
++ gdp->gd_sb = sb;
++
++ /* Update notifications */
++ copy_clientid(&clid, cstate->session);
++ pnfs_set_device_notify(&clid, gdp->gd_notify_types);
++out:
++ return status;
+}
+
-+int
-+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
++static __be32
++nfsd4_layoutget(struct svc_rqst *rqstp,
++ struct nfsd4_compound_state *cstate,
++ struct nfsd4_pnfs_layoutget *lgp)
+{
-+ int status = -EINVAL;
-+ struct pnfs_layoutdriver_type *tmp;
++ int status;
++ struct super_block *sb;
++ struct svc_fh *current_fh = &cstate->current_fh;
+
-+ if (ld_type->id == 0) {
-+ printk(KERN_ERR "%s id 0 is reserved\n", __func__);
-+ return status;
-+ }
-+ if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
-+ printk(KERN_ERR "%s Layout driver must provide "
-+ "alloc_lseg and free_lseg.\n", __func__);
-+ return status;
-+ }
++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
++ if (status)
++ goto out;
+
-+ if (!ld_type->read_pagelist || !ld_type->write_pagelist ||
-+ !ld_type->commit) {
-+ printk(KERN_ERR "%s Layout driver must provide "
-+ "read_pagelist, write_pagelist, and commit.\n",
-+ __func__);
-+ return status;
-+ }
++ status = nfserr_inval;
++ sb = current_fh->fh_dentry->d_inode->i_sb;
++ if (!sb)
++ goto out;
+
-+ spin_lock(&pnfs_spinlock);
-+ tmp = find_pnfs_driver_locked(ld_type->id);
-+ if (!tmp) {
-+ list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
-+ status = 0;
-+ dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
-+ ld_type->name);
-+ } else {
-+ printk(KERN_ERR "%s Module with id %d already loaded!\n",
-+ __func__, ld_type->id);
++ /* Ensure underlying file system supports pNFS and,
++ * if so, the requested layout type
++ */
++ status = nfsd4_layout_verify(sb, current_fh->fh_export,
++ lgp->lg_seg.layout_type);
++ if (status)
++ goto out;
++
++ status = nfserr_badiomode;
++ if (lgp->lg_seg.iomode != IOMODE_READ &&
++ lgp->lg_seg.iomode != IOMODE_RW) {
++ dprintk("pNFS %s: invalid iomode %d\n", __func__,
++ lgp->lg_seg.iomode);
++ goto out;
+ }
-+ spin_unlock(&pnfs_spinlock);
+
++ /* Set up arguments so layout can be retrieved at encode time */
++ lgp->lg_fhp = current_fh;
++ copy_clientid((clientid_t *)&lgp->lg_seg.clientid, cstate->session);
++ status = nfs_ok;
++out:
+ return status;
+}
-+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
+
-+void
-+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
++static __be32
++nfsd4_layoutcommit(struct svc_rqst *rqstp,
++ struct nfsd4_compound_state *cstate,
++ struct nfsd4_pnfs_layoutcommit *lcp)
+{
-+ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
-+ spin_lock(&pnfs_spinlock);
-+ list_del(&ld_type->pnfs_tblid);
-+ spin_unlock(&pnfs_spinlock);
-+}
-+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
++ int status;
++ struct inode *ino = NULL;
++ struct iattr ia;
++ struct super_block *sb;
++ struct svc_fh *current_fh = &cstate->current_fh;
+
-+/*
-+ * pNFS client layout cache
-+ */
++ dprintk("NFSD: nfsd4_layoutcommit \n");
++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
++ if (status)
++ goto out;
+
-+static struct pnfs_layout_hdr *
-+pnfs_alloc_layout_hdr(struct inode *ino)
-+{
-+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
-+ return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino) :
-+ kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
-+}
++ status = nfserr_inval;
++ ino = current_fh->fh_dentry->d_inode;
++ if (!ino)
++ goto out;
+
-+static void
-+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
-+{
-+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->inode)->pnfs_curr_ld;
-+ return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
-+}
++ status = nfserr_inval;
++ sb = ino->i_sb;
++ if (!sb)
++ goto out;
+
-+static void
-+get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
-+{
-+ assert_spin_locked(&lo->inode->i_lock);
-+ lo->refcount++;
-+}
++ /* Ensure underlying file system supports pNFS and,
++ * if so, the requested layout type
++ */
++ status = nfsd4_layout_verify(sb, current_fh->fh_export,
++ lcp->args.lc_seg.layout_type);
++ if (status)
++ goto out;
+
-+static void
-+put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
-+{
-+ assert_spin_locked(&lo->inode->i_lock);
-+ BUG_ON(lo->refcount == 0);
++ /* This will only extend the file length. Do a quick
++ * check to see if there is any point in waiting for the update
++ * locks.
++ * TODO: Is this correct for all back ends?
++ */
++ dprintk("%s:new offset: %d new size: %llu old size: %lld\n",
++ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1,
++ ino->i_size);
+
-+ lo->refcount--;
-+ if (!lo->refcount) {
-+ dprintk("%s: freeing layout cache %p\n", __func__, lo);
-+ BUG_ON(!list_empty(&lo->layouts));
-+ NFS_I(lo->inode)->layout = NULL;
-+ pnfs_free_layout_hdr(lo);
++ /* Set clientid from sessionid */
++ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session);
++ lcp->res.lc_size_chg = 0;
++ if (sb->s_pnfs_op->layout_commit) {
++ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res);
++ dprintk("%s:layout_commit result %d\n", __func__, status);
++ } else {
++ fh_lock(current_fh);
++ if ((lcp->args.lc_newoffset == 0) ||
++ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) {
++ status = 0;
++ lcp->res.lc_size_chg = 0;
++ fh_unlock(current_fh);
++ goto out;
++ }
++
++ /* Try our best to update the file size */
++ dprintk("%s: Modifying file size\n", __func__);
++ ia.ia_valid = ATTR_SIZE;
++ ia.ia_size = lcp->args.lc_last_wr + 1;
++ status = notify_change(current_fh->fh_dentry, &ia);
++ fh_unlock(current_fh);
++ dprintk("%s:notify_change result %d\n", __func__, status);
++ }
++
++ if (!status && lcp->res.lc_size_chg &&
++ EX_ISSYNC(current_fh->fh_export)) {
++ dprintk("%s: Synchronously writing inode size %llu\n",
++ __func__, ino->i_size);
++ write_inode_now(ino, 1);
++ lcp->res.lc_newsize = i_size_read(ino);
+ }
++out:
++ return status;
+}
+
-+void
-+put_layout_hdr(struct inode *inode)
++static __be32
++nfsd4_layoutreturn(struct svc_rqst *rqstp,
++ struct nfsd4_compound_state *cstate,
++ struct nfsd4_pnfs_layoutreturn *lrp)
+{
-+ spin_lock(&inode->i_lock);
-+ put_layout_hdr_locked(NFS_I(inode)->layout);
-+ spin_unlock(&inode->i_lock);
++ int status;
++ struct super_block *sb;
++ struct svc_fh *current_fh = &cstate->current_fh;
+
-+}
++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
++ if (status)
++ goto out;
+
-+static void
-+init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
-+{
-+ INIT_LIST_HEAD(&lseg->fi_list);
-+ kref_init(&lseg->kref);
-+ lseg->valid = true;
-+ lseg->layout = lo;
-+}
++ status = nfserr_inval;
++ sb = current_fh->fh_dentry->d_inode->i_sb;
++ if (!sb)
++ goto out;
+
-+static void
-+destroy_lseg(struct kref *kref)
-+{
-+ struct pnfs_layout_segment *lseg =
-+ container_of(kref, struct pnfs_layout_segment, kref);
-+ struct pnfs_layout_hdr *local = lseg->layout;
++ /* Ensure underlying file system supports pNFS and,
++ * if so, the requested layout type
++ */
++ status = nfsd4_layout_verify(sb, current_fh->fh_export,
++ lrp->args.lr_seg.layout_type);
++ if (status)
++ goto out;
+
-+ dprintk("--> %s\n", __func__);
-+ NFS_SERVER(local->inode)->pnfs_curr_ld->free_lseg(lseg);
-+ /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
-+ put_layout_hdr_locked(local);
++ status = nfserr_inval;
++ if (lrp->args.lr_return_type != RETURN_FILE &&
++ lrp->args.lr_return_type != RETURN_FSID &&
++ lrp->args.lr_return_type != RETURN_ALL) {
++ dprintk("pNFS %s: invalid return_type %d\n", __func__,
++ lrp->args.lr_return_type);
++ goto out;
++ }
++
++ status = nfserr_inval;
++ if (lrp->args.lr_seg.iomode != IOMODE_READ &&
++ lrp->args.lr_seg.iomode != IOMODE_RW &&
++ lrp->args.lr_seg.iomode != IOMODE_ANY) {
++ dprintk("pNFS %s: invalid iomode %d\n", __func__,
++ lrp->args.lr_seg.iomode);
++ goto out;
++ }
++
++ /* Set clientid from sessionid */
++ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session);
++ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE);
++ status = nfs4_pnfs_return_layout(sb, current_fh, lrp);
++out:
++ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n",
++ __func__, status, lrp->args.lr_return_type, lrp->lrs_present);
++ return status;
+}
++#endif /* CONFIG_PNFSD */
+
-+static void
-+put_lseg_common(struct nfs_inode *nfsi, struct pnfs_layout_segment *lseg)
-+{
-+ bool do_wake_up;
+ /*
+ * NULL call.
+ */
+@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[] = {
+ .op_flags = ALLOWED_WITHOUT_FH,
+ .op_name = "OP_RECLAIM_COMPLETE",
+ },
++#if defined(CONFIG_PNFSD)
++ [OP_GETDEVICELIST] = {
++ .op_func = (nfsd4op_func)nfsd4_getdevlist,
++ .op_name = "OP_GETDEVICELIST",
++ },
++ [OP_GETDEVICEINFO] = {
++ .op_func = (nfsd4op_func)nfsd4_getdevinfo,
++ .op_flags = ALLOWED_WITHOUT_FH,
++ .op_name = "OP_GETDEVICEINFO",
++ },
++ [OP_LAYOUTGET] = {
++ .op_func = (nfsd4op_func)nfsd4_layoutget,
++ .op_name = "OP_LAYOUTGET",
++ },
++ [OP_LAYOUTCOMMIT] = {
++ .op_func = (nfsd4op_func)nfsd4_layoutcommit,
++ .op_name = "OP_LAYOUTCOMMIT",
++ },
++ [OP_LAYOUTRETURN] = {
++ .op_func = (nfsd4op_func)nfsd4_layoutreturn,
++ .op_name = "OP_LAYOUTRETURN",
++ },
++#endif /* CONFIG_PNFSD */
+ };
+
+ static const char *nfsd4_op_name(unsigned opnum)
+diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
+index cf0d2ff..770b494 100644
+--- a/fs/nfsd/nfs4state.c
++++ b/fs/nfsd/nfs4state.c
+@@ -42,6 +42,8 @@
+ #include "xdr4.h"
+ #include "vfs.h"
+
++#include "pnfsd.h"
+
-+ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
-+ atomic_read(&lseg->kref.refcount), lseg->valid);
-+ do_wake_up = !lseg->valid;
-+ nfsi = NFS_I(lseg->layout->inode);
-+ kref_put(&lseg->kref, destroy_lseg);
-+ if (do_wake_up)
-+ rpc_wake_up(&nfsi->lo_rpcwaitq);
+ #define NFSDDBG_FACILITY NFSDDBG_PROC
+
+ /* Globals */
+@@ -59,8 +61,6 @@ static u64 current_sessionid = 1;
+ #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
+
+ /* forward declarations */
+-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
+-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
+ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
+ static void nfs4_set_recdir(char *recdir);
+
+@@ -68,6 +68,7 @@ static void nfs4_set_recdir(char *recdir);
+
+ /* Currently used for almost all code touching nfsv4 state: */
+ static DEFINE_MUTEX(client_mutex);
++struct task_struct *client_mutex_owner;
+
+ /*
+ * Currently used for the del_recall_lru and file hash table. In an
+@@ -85,11 +86,21 @@ void
+ nfs4_lock_state(void)
+ {
+ mutex_lock(&client_mutex);
++ client_mutex_owner = current;
+}
+
-+void
-+put_lseg_locked(struct pnfs_layout_segment *lseg)
-+{
-+ if (!lseg)
-+ return;
-+
-+ assert_spin_locked(&lseg->layout->inode->i_lock);
-+ put_lseg_common(NFS_I(lseg->layout->inode), lseg);
-+}
-+EXPORT_SYMBOL_GPL(put_lseg_locked);
++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current)
+
+void
-+put_lseg(struct pnfs_layout_segment *lseg)
++nfs4_bug_on_unlocked_state(void)
+{
-+ struct nfs_inode *nfsi;
++ BUG_ON(client_mutex_owner != current);
+ }
+
+ void
+ nfs4_unlock_state(void)
+ {
++ client_mutex_owner = NULL;
+ mutex_unlock(&client_mutex);
+ }
+
+@@ -108,7 +119,7 @@ opaque_hashval(const void *ptr, int nbytes)
+
+ static struct list_head del_recall_lru;
+
+-static inline void
++inline void
+ put_nfs4_file(struct nfs4_file *fi)
+ {
+ if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
+@@ -119,7 +130,7 @@ put_nfs4_file(struct nfs4_file *fi)
+ }
+ }
+
+-static inline void
++inline void
+ get_nfs4_file(struct nfs4_file *fi)
+ {
+ atomic_inc(&fi->fi_ref);
+@@ -179,10 +190,16 @@ static void nfs4_file_get_access(struct nfs4_file *fp, int oflag)
+
+ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
+ {
+- if (fp->fi_fds[oflag]) {
+- fput(fp->fi_fds[oflag]);
+- fp->fi_fds[oflag] = NULL;
+- }
++ struct file *fd = fp->fi_fds[oflag];
+
-+ if (!lseg)
++ if (!fd)
+ return;
+
-+ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
-+ atomic_read(&lseg->kref.refcount), lseg->valid);
-+ nfsi = NFS_I(lseg->layout->inode);
-+ spin_lock(&nfsi->vfs_inode.i_lock);
-+ put_lseg_common(nfsi, lseg);
-+ spin_unlock(&nfsi->vfs_inode.i_lock);
-+}
-+EXPORT_SYMBOL_GPL(put_lseg);
-+
-+void get_lseg(struct pnfs_layout_segment *lseg)
-+{
-+ kref_get(&lseg->kref);
-+}
-+EXPORT_SYMBOL_GPL(get_lseg);
-+
-+static inline u64
-+end_offset(u64 start, u64 len)
-+{
-+ u64 end;
-+
-+ end = start + len;
-+ return end >= start ? end: NFS4_MAX_UINT64;
-+}
-+
-+/* last octet in a range */
-+static inline u64
-+last_byte_offset(u64 start, u64 len)
-+{
-+ u64 end;
-+
-+ BUG_ON(!len);
-+ end = start + len;
-+ return end > start ? end - 1: NFS4_MAX_UINT64;
-+}
++ fp->fi_fds[oflag] = NULL;
++ BUG_ON_UNLOCKED_STATE();
++ nfs4_unlock_state(); /* allow nested layout recall/return */
++ fput(fd);
++ nfs4_lock_state();
+ }
+
+ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
+@@ -308,8 +325,8 @@ static DEFINE_SPINLOCK(client_lock);
+ * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
+ * used in reboot/reset lease grace period processing
+ *
+- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
+- * setclientid_confirmed info.
++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold
++ * confirmed setclientid_confirmed info.
+ *
+ * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed
+ * setclientid info.
+@@ -334,6 +351,7 @@ static void unhash_generic_stateid(struct nfs4_stateid *stp)
+ list_del(&stp->st_hash);
+ list_del(&stp->st_perfile);
+ list_del(&stp->st_perstateowner);
++ release_pnfs_ds_dev_list(stp);
+ }
+
+ static void free_generic_stateid(struct nfs4_stateid *stp)
+@@ -856,6 +874,8 @@ expire_client(struct nfs4_client *clp)
+ struct nfs4_delegation *dp;
+ struct list_head reaplist;
+
++ BUG_ON_UNLOCKED_STATE();
+
-+/*
-+ * is l2 fully contained in l1?
-+ * start1 end1
-+ * [----------------------------------)
-+ * start2 end2
-+ * [----------------)
-+ */
-+static inline int
-+lo_seg_contained(struct pnfs_layout_range *l1,
-+ struct pnfs_layout_range *l2)
+ INIT_LIST_HEAD(&reaplist);
+ spin_lock(&recall_lock);
+ while (!list_empty(&clp->cl_delegations)) {
+@@ -875,6 +895,7 @@ expire_client(struct nfs4_client *clp)
+ sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
+ release_openowner(sop);
+ }
++ pnfs_expire_client(clp);
+ nfsd4_set_callback_client(clp, NULL);
+ if (clp->cl_cb_conn.cb_xprt)
+ svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+@@ -887,6 +908,13 @@ expire_client(struct nfs4_client *clp)
+ spin_unlock(&client_lock);
+ }
+
++void expire_client_lock(struct nfs4_client *clp)
+{
-+ u64 start1 = l1->offset;
-+ u64 end1 = end_offset(start1, l1->length);
-+ u64 start2 = l2->offset;
-+ u64 end2 = end_offset(start2, l2->length);
-+
-+ return (start1 <= start2) && (end1 >= end2);
++ nfs4_lock_state();
++ expire_client(clp);
++ nfs4_unlock_state();
+}
+
-+/*
-+ * is l1 and l2 intersecting?
-+ * start1 end1
-+ * [----------------------------------)
-+ * start2 end2
-+ * [----------------)
-+ */
-+static inline int
-+lo_seg_intersecting(struct pnfs_layout_range *l1,
-+ struct pnfs_layout_range *l2)
+ static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
+ {
+ memcpy(target->cl_verifier.data, source->data,
+@@ -976,6 +1004,11 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
+ INIT_LIST_HEAD(&clp->cl_strhash);
+ INIT_LIST_HEAD(&clp->cl_openowners);
+ INIT_LIST_HEAD(&clp->cl_delegations);
++#if defined(CONFIG_PNFSD)
++ INIT_LIST_HEAD(&clp->cl_layouts);
++ INIT_LIST_HEAD(&clp->cl_layoutrecalls);
++ atomic_set(&clp->cl_deviceref, 0);
++#endif /* CONFIG_PNFSD */
+ INIT_LIST_HEAD(&clp->cl_sessions);
+ INIT_LIST_HEAD(&clp->cl_lru);
+ clp->cl_time = get_seconds();
+@@ -1025,7 +1058,7 @@ move_to_confirmed(struct nfs4_client *clp)
+ renew_client(clp);
+ }
+
+-static struct nfs4_client *
++struct nfs4_client *
+ find_confirmed_client(clientid_t *clid)
+ {
+ struct nfs4_client *clp;
+@@ -1095,6 +1128,24 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
+ return NULL;
+ }
+
++int
++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *),
++ void *arg)
+{
-+ u64 start1 = l1->offset;
-+ u64 end1 = end_offset(start1, l1->length);
-+ u64 start2 = l2->offset;
-+ u64 end2 = end_offset(start2, l2->length);
-+
-+ return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
-+ (end2 == NFS4_MAX_UINT64 || end2 > start1);
-+}
++ struct nfs4_client *clp, *next;
++ int i, status = 0;
+
-+/*
-+ * iomode matching rules:
-+ * range lseg match
-+ * ----- ----- -----
-+ * ANY READ true
-+ * ANY RW true
-+ * RW READ false
-+ * RW RW true
-+ * READ READ true
-+ * READ RW false
-+ */
-+static inline int
-+should_free_lseg(struct pnfs_layout_segment *lseg,
-+ struct pnfs_layout_range *range)
-+{
-+ return (range->iomode == IOMODE_ANY ||
-+ lseg->range.iomode == range->iomode) &&
-+ lo_seg_intersecting(&lseg->range, range);
-+}
++ for (i = 0; i < CLIENT_HASH_SIZE; i++)
++ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i],
++ cl_strhash) {
++ status = func(clp, arg);
++ if (status)
++ break;
++ }
+
-+static inline bool
-+_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg)
-+{
-+ return atomic_read(&lseg->kref.refcount) == 1;
++ return status;
+}
+
-+static void
-+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo,
-+ struct pnfs_layout_range *range)
-+{
-+ struct pnfs_layout_segment *lseg, *next;
-+ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
-+ __func__, lo, range->offset, range->length, range->iomode);
+ static void
+ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
+ {
+@@ -1227,8 +1278,12 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+ static void
+ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
+ {
+- /* pNFS is not supported */
++#if defined(CONFIG_PNFSD)
++ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS |
++ EXCHGID4_FLAG_USE_PNFS_DS;
++#else /* CONFIG_PNFSD */
+ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
++#endif /* CONFIG_PNFSD */
+
+ /* Referrals are supported, Migration is not. */
+ new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
+@@ -1418,6 +1473,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
+ struct nfsd4_clid_slot *cs_slot = NULL;
+ int status = 0;
+
++#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
++ /* XXX hack to get local ip address */
++ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local,
++ sizeof(pnfsd_lexp_addr));
++ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen;
++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
+
-+ assert_spin_locked(&lo->inode->i_lock);
-+ list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
-+ if (!should_free_lseg(lseg, range) ||
-+ !_pnfs_can_return_lseg(lseg))
-+ continue;
-+ dprintk("%s: freeing lseg %p iomode %d "
-+ "offset %llu length %llu\n", __func__,
-+ lseg, lseg->range.iomode, lseg->range.offset,
-+ lseg->range.length);
-+ list_del(&lseg->fi_list);
-+ put_lseg_locked(lseg);
-+ }
-+ if (list_empty(&lo->segs)) {
-+ struct nfs_client *clp;
+ nfs4_lock_state();
+ unconf = find_unconfirmed_client(&cr_ses->clientid);
+ conf = find_confirmed_client(&cr_ses->clientid);
+@@ -1457,25 +1519,26 @@ nfsd4_create_session(struct svc_rqst *rqstp,
+ cs_slot->sl_seqid++; /* from 0 to 1 */
+ move_to_confirmed(unconf);
+
+- if (cr_ses->flags & SESSION4_BACK_CHAN) {
+- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+- svc_xprt_get(rqstp->rq_xprt);
+- rpc_copy_addr(
+- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
+- sa);
+- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+- unconf->cl_cb_conn.cb_minorversion =
+- cstate->minorversion;
+- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
+- unconf->cl_cb_seq_nr = 1;
+- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
+- }
++ if (is_ds_only_session(unconf->cl_exchange_flags))
++ cr_ses->flags &= ~SESSION4_BACK_CHAN;
+
-+ clp = NFS_SERVER(lo->inode)->nfs_client;
-+ spin_lock(&clp->cl_lock);
-+ /* List does not take a reference, so no need for put here */
-+ list_del_init(&lo->layouts);
-+ spin_unlock(&clp->cl_lock);
-+ pnfs_invalidate_layout_stateid(lo);
+ conf = unconf;
+ } else {
+ status = nfserr_stale_clientid;
+ goto out;
+ }
+
++ if (cr_ses->flags & SESSION4_BACK_CHAN) {
++ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
++ svc_xprt_get(rqstp->rq_xprt);
++ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa);
++ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
++ conf->cl_cb_conn.cb_minorversion = cstate->minorversion;
++ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
++ conf->cl_cb_seq_nr = 1;
++ nfsd4_probe_callback(conf, &conf->cl_cb_conn);
+ }
+
-+ dprintk("%s:Return\n", __func__);
-+}
-+
+ /*
+ * We do not support RDMA or persistent sessions
+ */
+@@ -1863,7 +1926,7 @@ out:
+
+ /* OPEN Share state helper functions */
+ static inline struct nfs4_file *
+-alloc_init_file(struct inode *ino)
++alloc_init_file(struct inode *ino, struct svc_fh *current_fh)
+ {
+ struct nfs4_file *fp;
+ unsigned int hashval = file_hashval(ino);
+@@ -1879,6 +1942,16 @@ alloc_init_file(struct inode *ino)
+ fp->fi_had_conflict = false;
+ memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+ memset(fp->fi_access, 0, sizeof(fp->fi_access));
++#if defined(CONFIG_PNFSD)
++ INIT_LIST_HEAD(&fp->fi_layouts);
++ INIT_LIST_HEAD(&fp->fi_layout_states);
++ fp->fi_fsid.major = current_fh->fh_export->ex_fsid;
++ fp->fi_fsid.minor = 0;
++ fp->fi_fhlen = current_fh->fh_handle.fh_size;
++ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval));
++ memcpy(fp->fi_fhval, ¤t_fh->fh_handle.fh_base,
++ fp->fi_fhlen);
++#endif /* CONFIG_PNFSD */
+ spin_lock(&recall_lock);
+ list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+ spin_unlock(&recall_lock);
+@@ -1887,7 +1960,7 @@ alloc_init_file(struct inode *ino)
+ return NULL;
+ }
+
+-static void
+void
-+pnfs_layoutget_release(struct pnfs_layout_hdr *lo)
+ nfsd4_free_slab(struct kmem_cache **slab)
+ {
+ if (*slab == NULL)
+@@ -1903,6 +1976,7 @@ nfsd4_free_slabs(void)
+ nfsd4_free_slab(&file_slab);
+ nfsd4_free_slab(&stateid_slab);
+ nfsd4_free_slab(&deleg_slab);
++ nfsd4_free_pnfs_slabs();
+ }
+
+ static int
+@@ -1924,6 +1998,8 @@ nfsd4_init_slabs(void)
+ sizeof(struct nfs4_delegation), 0, 0, NULL);
+ if (deleg_slab == NULL)
+ goto out_nomem;
++ if (nfsd4_init_pnfs_slabs())
++ goto out_nomem;
+ return 0;
+ out_nomem:
+ nfsd4_free_slabs();
+@@ -1997,6 +2073,9 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
+ INIT_LIST_HEAD(&stp->st_perstateowner);
+ INIT_LIST_HEAD(&stp->st_lockowners);
+ INIT_LIST_HEAD(&stp->st_perfile);
++#if defined(CONFIG_PNFSD)
++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
++#endif /* CONFIG_PNFSD */
+ list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
+ list_add(&stp->st_perstateowner, &sop->so_stateids);
+ list_add(&stp->st_perfile, &fp->fi_stateids);
+@@ -2038,6 +2117,7 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
+ {
+ struct nfs4_stateowner *so = NULL;
+
++ BUG_ON_UNLOCKED_STATE();
+ list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
+ if (same_owner_str(so, &open->op_owner, &open->op_clientid))
+ return so;
+@@ -2046,7 +2126,7 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
+ }
+
+ /* search file_hashtbl[] for file */
+-static struct nfs4_file *
++struct nfs4_file *
+ find_file(struct inode *ino)
+ {
+ unsigned int hashval = file_hashval(ino);
+@@ -2064,6 +2144,18 @@ find_file(struct inode *ino)
+ return NULL;
+ }
+
++struct nfs4_file *
++find_alloc_file(struct inode *ino, struct svc_fh *current_fh)
+{
-+ struct nfs_inode *nfsi = NFS_I(lo->inode);
++ struct nfs4_file *fp;
+
-+ spin_lock(&nfsi->vfs_inode.i_lock);
-+ /*
-+ * Matched in _pnfs_update_layout for layoutget
-+ * and by get_layout in _pnfs_return_layout for layoutreturn
-+ */
-+ put_layout_hdr_locked(lo);
-+ spin_unlock(&nfsi->vfs_inode.i_lock);
++ fp = find_file(ino);
++ if (fp)
++ return fp;
++
++ return alloc_init_file(ino, current_fh);
+}
+
-+void
-+pnfs_layoutreturn_release(struct pnfs_layout_hdr *lo,
-+ struct pnfs_layout_range *range)
+ static inline int access_valid(u32 x, u32 minorversion)
+ {
+ if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
+@@ -2592,7 +2684,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
+ if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
+ goto out;
+ status = nfserr_resource;
+- fp = alloc_init_file(ino);
++ fp = alloc_init_file(ino, current_fh);
+ if (fp == NULL)
+ goto out;
+ }
+@@ -2813,7 +2905,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
+ return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
+ }
+
+-static int
++int
+ STALE_STATEID(stateid_t *stateid)
+ {
+ if (stateid->si_boot == boot_time)
+@@ -2823,6 +2915,16 @@ STALE_STATEID(stateid_t *stateid)
+ return 1;
+ }
+
++__be32
++nfs4_check_stateid(stateid_t *stateid)
+{
-+ struct nfs_inode *nfsi = NFS_I(lo->inode);
-+
-+ spin_lock(&nfsi->vfs_inode.i_lock);
-+ if (range)
-+ pnfs_clear_lseg_list(lo, range);
-+ /*
-+ * Matched in _pnfs_update_layout for layoutget
-+ * and by get_layout in _pnfs_return_layout for layoutreturn
-+ */
-+ put_layout_hdr_locked(lo);
-+ spin_unlock(&nfsi->vfs_inode.i_lock);
++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
++ return nfserr_bad_stateid;
++ if (STALE_STATEID(stateid))
++ return nfserr_stale_stateid;
++ return 0;
+}
+
-+void
-+pnfs_destroy_layout(struct nfs_inode *nfsi)
+ static inline int
+ access_permit_read(unsigned long access_bmap)
+ {
+@@ -2934,6 +3036,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
+ if (grace_disallows_io(ino))
+ return nfserr_grace;
+
++#if defined(CONFIG_PNFSD)
++ if (pnfs_fh_is_ds(¤t_fh->fh_handle)) {
++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
++ status = nfserr_bad_stateid;
++ else
++#ifdef CONFIG_GFS2_FS_LOCKING_DLM
++ {
++ dprintk("%s Don't check DS stateid\n", __func__);
++ return 0;
++ }
++#else /* CONFIG_GFS2_FS_LOCKING_DLM */
++ status = nfs4_preprocess_pnfs_ds_stateid(current_fh,
++ stateid);
++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
++ goto out;
++ }
++#endif /* CONFIG_PNFSD */
++
+ if (nfsd4_has_session(cstate))
+ flags |= HAS_SESSION;
+
+@@ -3015,13 +3135,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+ *stpp = NULL;
+ *sopp = NULL;
+
+- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
+- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n");
+- return nfserr_bad_stateid;
+- }
+-
+- if (STALE_STATEID(stateid))
+- return nfserr_stale_stateid;
++ status = nfs4_check_stateid(stateid);
++ if (status)
++ return status;
+
+ if (nfsd4_has_session(cstate))
+ flags |= HAS_SESSION;
+@@ -3295,11 +3411,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ if (nfsd4_has_session(cstate))
+ flags |= HAS_SESSION;
+ nfs4_lock_state();
+- status = nfserr_bad_stateid;
+- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+- goto out;
+- status = nfserr_stale_stateid;
+- if (STALE_STATEID(stateid))
++ status = nfs4_check_stateid(stateid);
++ if (status)
+ goto out;
+ status = nfserr_bad_stateid;
+ if (!is_delegation_stateid(stateid))
+@@ -3328,26 +3441,6 @@ out:
+ #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS)
+ #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1)
+
+-static inline u64
+-end_offset(u64 start, u64 len)
+-{
+- u64 end;
+-
+- end = start + len;
+- return end >= start ? end: NFS4_MAX_UINT64;
+-}
+-
+-/* last octet in a range */
+-static inline u64
+-last_byte_offset(u64 start, u64 len)
+-{
+- u64 end;
+-
+- BUG_ON(!len);
+- end = start + len;
+- return end > start ? end - 1: NFS4_MAX_UINT64;
+-}
+-
+ #define lockownerid_hashval(id) \
+ ((id) & LOCK_HASH_MASK)
+
+@@ -3364,7 +3457,7 @@ static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE];
+ static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
+ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
+
+-static struct nfs4_stateid *
++struct nfs4_stateid *
+ find_stateid(stateid_t *stid, int flags)
+ {
+ struct nfs4_stateid *local;
+@@ -3393,7 +3486,7 @@ find_stateid(stateid_t *stid, int flags)
+ return NULL;
+ }
+
+-static struct nfs4_delegation *
++struct nfs4_delegation *
+ find_delegation_stateid(struct inode *ino, stateid_t *stid)
+ {
+ struct nfs4_file *fp;
+@@ -3524,6 +3617,9 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
+ INIT_LIST_HEAD(&stp->st_perfile);
+ INIT_LIST_HEAD(&stp->st_perstateowner);
+ INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
++#if defined(CONFIG_PNFSD)
++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
++#endif /* CONFIG_PNFSD */
+ list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
+ list_add(&stp->st_perfile, &fp->fi_stateids);
+ list_add(&stp->st_perstateowner, &sop->so_stateids);
+@@ -4100,6 +4196,9 @@ nfs4_state_init(void)
+ INIT_LIST_HEAD(&client_lru);
+ INIT_LIST_HEAD(&del_recall_lru);
+ reclaim_str_hashtbl_size = 0;
++#if defined(CONFIG_PNFSD)
++ nfs4_pnfs_state_init();
++#endif /* CONFIG_PNFSD */
+ return 0;
+ }
+
+@@ -4204,6 +4303,7 @@ __nfs4_state_shutdown(void)
+ }
+
+ nfsd4_shutdown_recdir();
++ nfs4_pnfs_state_shutdown();
+ }
+
+ void
+diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
+index 1a468bb..b93906b 100644
+--- a/fs/nfsd/nfs4xdr.c
++++ b/fs/nfsd/nfs4xdr.c
+@@ -47,9 +47,14 @@
+ #include <linux/nfsd_idmap.h>
+ #include <linux/nfs4_acl.h>
+ #include <linux/sunrpc/svcauth_gss.h>
++#include <linux/exportfs.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
++#include <linux/nfsd4_spnfs.h>
++#include <linux/nfsd4_block.h>
+
+ #include "xdr4.h"
+ #include "vfs.h"
++#include "pnfsd.h"
+
+ #define NFSDDBG_FACILITY NFSDDBG_XDR
+
+@@ -1244,6 +1249,138 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
+ DECODE_TAIL;
+ }
+
++#if defined(CONFIG_PNFSD)
++static __be32
++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp,
++ struct nfsd4_pnfs_getdevlist *gdevl)
+{
-+ struct pnfs_layout_hdr *lo;
-+ struct pnfs_layout_range range = {
-+ .iomode = IOMODE_ANY,
-+ .offset = 0,
-+ .length = NFS4_MAX_UINT64,
-+ };
++ DECODE_HEAD;
+
-+ spin_lock(&nfsi->vfs_inode.i_lock);
-+ lo = nfsi->layout;
-+ if (lo) {
-+ pnfs_clear_lseg_list(lo, &range);
-+ WARN_ON(!list_empty(&nfsi->layout->segs));
-+ WARN_ON(!list_empty(&nfsi->layout->layouts));
-+ WARN_ON(nfsi->layout->refcount != 1);
++ READ_BUF(16 + sizeof(nfs4_verifier));
++ READ32(gdevl->gd_layout_type);
++ READ32(gdevl->gd_maxdevices);
++ READ64(gdevl->gd_cookie);
++ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
+
-+ /* Matched by refcount set to 1 in alloc_init_layout_hdr */
-+ put_layout_hdr_locked(lo);
-+ }
-+ spin_unlock(&nfsi->vfs_inode.i_lock);
++ DECODE_TAIL;
+}
+
-+/*
-+ * Called by the state manger to remove all layouts established under an
-+ * expired lease.
-+ */
-+void
-+pnfs_destroy_all_layouts(struct nfs_client *clp)
++static __be32
++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp,
++ struct nfsd4_pnfs_getdevinfo *gdev)
+{
-+ struct pnfs_layout_hdr *lo;
-+ LIST_HEAD(tmp_list);
-+
-+ spin_lock(&clp->cl_lock);
-+ list_splice_init(&clp->cl_layouts, &tmp_list);
-+ spin_unlock(&clp->cl_lock);
++ u32 num;
++ DECODE_HEAD;
+
-+ while (!list_empty(&tmp_list)) {
-+ lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
-+ layouts);
-+ dprintk("%s freeing layout for inode %lu\n", __func__,
-+ lo->inode->i_ino);
-+ pnfs_destroy_layout(NFS_I(lo->inode));
++ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid));
++ READ64(gdev->gd_devid.sbid);
++ READ64(gdev->gd_devid.devid);
++ READ32(gdev->gd_layout_type);
++ READ32(gdev->gd_maxcount);
++ READ32(num);
++ if (num) {
++ READ_BUF(4);
++ READ32(gdev->gd_notify_types);
++ } else {
++ gdev->gd_notify_types = 0;
+ }
-+}
+
-+static void
-+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
-+ const nfs4_stateid *stateid)
-+{
-+ /* TODO - should enforce that embedded seqid, in the case
-+ * that the two stateid.others are equal, only increases.
-+ * Complicated by wrap-around.
-+ */
-+ write_seqlock(&lo->seqlock);
-+ memcpy(lo->stateid.data, stateid->data, sizeof(lo->stateid.data));
-+ write_sequnlock(&lo->seqlock);
++ DECODE_TAIL;
+}
+
-+static void
-+pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
-+ struct nfs4_state *state)
++static __be32
++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
++ struct nfsd4_pnfs_layoutget *lgp)
+{
-+ int seq;
-+
-+ dprintk("--> %s\n", __func__);
-+ write_seqlock(&lo->seqlock);
-+ do {
-+ seq = read_seqbegin(&state->seqlock);
-+ memcpy(lo->stateid.data, state->stateid.data,
-+ sizeof(state->stateid.data));
-+ } while (read_seqretry(&state->seqlock, seq));
-+ set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
-+ write_sequnlock(&lo->seqlock);
-+ dprintk("<-- %s\n", __func__);
-+}
++ DECODE_HEAD;
+
-+void
-+pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-+ struct nfs4_state *open_state)
-+{
-+ int seq;
++ READ_BUF(36);
++ READ32(lgp->lg_signal);
++ READ32(lgp->lg_seg.layout_type);
++ READ32(lgp->lg_seg.iomode);
++ READ64(lgp->lg_seg.offset);
++ READ64(lgp->lg_seg.length);
++ READ64(lgp->lg_minlength);
++ nfsd4_decode_stateid(argp, &lgp->lg_sid);
++ READ_BUF(4);
++ READ32(lgp->lg_maxcount);
+
-+ dprintk("--> %s\n", __func__);
-+ do {
-+ seq = read_seqbegin(&lo->seqlock);
-+ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
-+ /* This will trigger retry of the read */
-+ pnfs_layout_from_open_stateid(lo, open_state);
-+ } else
-+ memcpy(dst->data, lo->stateid.data,
-+ sizeof(lo->stateid.data));
-+ } while (read_seqretry(&lo->seqlock, seq));
-+ dprintk("<-- %s\n", __func__);
++ DECODE_TAIL;
+}
+
-+/*
-+* Get layout from server.
-+* for now, assume that whole file layouts are requested.
-+* arg->offset: 0
-+* arg->length: all ones
-+*/
-+static struct pnfs_layout_segment *
-+send_layoutget(struct pnfs_layout_hdr *lo,
-+ struct nfs_open_context *ctx,
-+ struct pnfs_layout_range *range)
++static __be32
++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
++ struct nfsd4_pnfs_layoutcommit *lcp)
+{
-+ struct inode *ino = lo->inode;
-+ struct nfs_server *server = NFS_SERVER(ino);
-+ struct nfs4_layoutget *lgp;
-+ struct pnfs_layout_segment *lseg = NULL;
-+
-+ dprintk("--> %s\n", __func__);
-+
-+ BUG_ON(ctx == NULL);
-+ lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
-+ if (lgp == NULL) {
-+ pnfs_layoutget_release(lo);
-+ return NULL;
-+ }
-+ lgp->args.minlength = PAGE_CACHE_SIZE;
-+ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
-+ lgp->args.range = *range;
-+ lgp->args.type = server->pnfs_curr_ld->id;
-+ lgp->args.inode = ino;
-+ lgp->args.ctx = get_nfs_open_context(ctx);
-+ lgp->lsegpp = &lseg;
++ DECODE_HEAD;
++ u32 timechange;
+
-+ /* Synchronously retrieve layout information from server and
-+ * store in lseg.
++ READ_BUF(20);
++ READ64(lcp->args.lc_seg.offset);
++ READ64(lcp->args.lc_seg.length);
++ READ32(lcp->args.lc_reclaim);
++ nfsd4_decode_stateid(argp, &lcp->lc_sid);
++ READ_BUF(4);
++ READ32(lcp->args.lc_newoffset);
++ if (lcp->args.lc_newoffset) {
++ READ_BUF(8);
++ READ64(lcp->args.lc_last_wr);
++ } else
++ lcp->args.lc_last_wr = 0;
++ READ_BUF(4);
++ READ32(timechange);
++ if (timechange) {
++ READ_BUF(12);
++ READ64(lcp->args.lc_mtime.seconds);
++ READ32(lcp->args.lc_mtime.nseconds);
++ } else {
++ lcp->args.lc_mtime.seconds = 0;
++ lcp->args.lc_mtime.nseconds = 0;
++ }
++ READ_BUF(8);
++ READ32(lcp->args.lc_seg.layout_type);
++ /* XXX: saving XDR'ed layout update. Since we don't have the
++ * current_fh yet, and therefore no export_ops, we can't call
++ * the layout specific decode routines. File and pVFS2
++ * do not use the layout update....
+ */
-+ nfs4_proc_layoutget(lgp);
-+ if (!lseg) {
-+ /* remember that LAYOUTGET failed and suspend trying */
-+ set_bit(lo_fail_bit(range->iomode), &lo->state);
++ READ32(lcp->args.lc_up_len);
++ if (lcp->args.lc_up_len > 0) {
++ READ_BUF(lcp->args.lc_up_len);
++ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len);
+ }
-+ return lseg;
++
++ DECODE_TAIL;
+}
+
-+static struct pnfs_layout_segment *
-+has_layout_to_return(struct pnfs_layout_hdr *lo,
-+ struct pnfs_layout_range *range)
++static __be32
++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
++ struct nfsd4_pnfs_layoutreturn *lrp)
+{
-+ struct pnfs_layout_segment *out = NULL, *lseg;
-+ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
-+ __func__, lo, range->offset, range->length, range->iomode);
++ DECODE_HEAD;
+
-+ assert_spin_locked(&lo->inode->i_lock);
-+ list_for_each_entry(lseg, &lo->segs, fi_list)
-+ if (should_free_lseg(lseg, range)) {
-+ out = lseg;
-+ break;
++ READ_BUF(16);
++ READ32(lrp->args.lr_reclaim);
++ READ32(lrp->args.lr_seg.layout_type);
++ READ32(lrp->args.lr_seg.iomode);
++ READ32(lrp->args.lr_return_type);
++ if (lrp->args.lr_return_type == RETURN_FILE) {
++ READ_BUF(16);
++ READ64(lrp->args.lr_seg.offset);
++ READ64(lrp->args.lr_seg.length);
++ nfsd4_decode_stateid(argp, &lrp->lr_sid);
++ READ_BUF(4);
++ READ32(lrp->args.lrf_body_len);
++ if (lrp->args.lrf_body_len > 0) {
++ READ_BUF(lrp->args.lrf_body_len);
++ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len);
+ }
++ }
+
-+ dprintk("%s:Return lseg=%p\n", __func__, out);
-+ return out;
++ DECODE_TAIL;
+}
++#endif /* CONFIG_PNFSD */
+
-+bool
-+pnfs_return_layout_barrier(struct nfs_inode *nfsi,
-+ struct pnfs_layout_range *range)
-+{
-+ struct pnfs_layout_segment *lseg;
-+ bool ret = false;
+ static __be32
+ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
+ {
+@@ -1345,11 +1482,19 @@ static nfsd4_dec nfsd41_dec_ops[] = {
+ [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
+ [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
++#if defined(CONFIG_PNFSD)
++ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo,
++ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist,
++ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
++ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
++ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
++#else /* CONFIG_PNFSD */
+ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
++#endif /* CONFIG_PNFSD */
+ [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
+ [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
+@@ -1805,19 +1950,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
+ goto out_nfserr;
+ }
+ }
+- if ((buflen -= 16) < 0)
+- goto out_resource;
+
+ if (unlikely(bmval2)) {
++ if ((buflen -= 16) < 0)
++ goto out_resource;
+ WRITE32(3);
+ WRITE32(bmval0);
+ WRITE32(bmval1);
+ WRITE32(bmval2);
+ } else if (likely(bmval1)) {
++ if ((buflen -= 12) < 0)
++ goto out_resource;
+ WRITE32(2);
+ WRITE32(bmval0);
+ WRITE32(bmval1);
+ } else {
++ if ((buflen -= 8) < 0)
++ goto out_resource;
+ WRITE32(1);
+ WRITE32(bmval0);
+ }
+@@ -1828,15 +1977,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
+ u32 word1 = nfsd_suppattrs1(minorversion);
+ u32 word2 = nfsd_suppattrs2(minorversion);
+
+- if ((buflen -= 12) < 0)
+- goto out_resource;
+ if (!aclsupport)
+ word0 &= ~FATTR4_WORD0_ACL;
+ if (!word2) {
++ if ((buflen -= 12) < 0)
++ goto out_resource;
+ WRITE32(2);
+ WRITE32(word0);
+ WRITE32(word1);
+ } else {
++ if ((buflen -= 16) < 0)
++ goto out_resource;
+ WRITE32(3);
+ WRITE32(word0);
+ WRITE32(word1);
+@@ -2150,6 +2301,36 @@ out_acl:
+ }
+ WRITE64(stat.ino);
+ }
++#if defined(CONFIG_PNFSD)
++ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
++ struct super_block *sb = dentry->d_inode->i_sb;
++ int type = 0;
+
-+ spin_lock(&nfsi->vfs_inode.i_lock);
-+ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) {
-+ if (!should_free_lseg(lseg, range))
-+ continue;
-+ lseg->valid = false;
-+ if (!_pnfs_can_return_lseg(lseg)) {
-+ dprintk("%s: wait on lseg %p refcount %d\n",
-+ __func__, lseg,
-+ atomic_read(&lseg->kref.refcount));
-+ ret = true;
-+ }
++ /* Query the filesystem for supported pNFS layout types.
++ * Currently, we only support one layout type per file system.
++ * sb->s_pnfs_op->layout_type() returns the pnfs_layouttype4.
++ */
++ buflen -= 4;
++ if (buflen < 0) /* length */
++ goto out_resource;
++
++ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type)
++ type = sb->s_pnfs_op->layout_type(sb);
++ if (type) {
++ if ((buflen -= 4) < 0) /* type */
++ goto out_resource;
++ WRITE32(1); /* length */
++ WRITE32(type); /* type */
++ } else
++ WRITE32(0); /* length */
+ }
-+ spin_unlock(&nfsi->vfs_inode.i_lock);
-+ dprintk("%s:Return %d\n", __func__, ret);
-+ return ret;
-+}
+
-+static int
-+return_layout(struct inode *ino, struct pnfs_layout_range *range,
-+ enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo,
-+ bool wait)
-+{
-+ struct nfs4_layoutreturn *lrp;
-+ struct nfs_server *server = NFS_SERVER(ino);
-+ int status = -ENOMEM;
++ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
++ if ((buflen -= 4) < 0)
++ goto out_resource;
++ WRITE32(stat.blksize);
++ }
++#endif /* CONFIG_PNFSD */
+ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+ WRITE32(3);
+ WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
+@@ -2380,6 +2561,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
+ if (!nfserr) {
+ RESERVE_SPACE(8);
+ WRITEMEM(commit->co_verf.data, 8);
++ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n",
++ ((u32 *)(&commit->co_verf.data))[0],
++ ((u32 *)(&commit->co_verf.data))[1]);
+
-+ dprintk("--> %s\n", __func__);
+ ADJUST_ARGS();
+ }
+ return nfserr;
+@@ -2634,6 +2819,13 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
+ }
+ read->rd_vlen = v;
+
++#if defined(CONFIG_SPNFS)
++ if (spnfs_enabled())
++ nfserr = spnfs_read(read->rd_fhp->fh_dentry->d_inode,
++ read->rd_offset, &maxcount, read->rd_vlen,
++ resp->rqstp);
++ else /* we're not an MDS */
++#endif /* CONFIG_SPNFS */
+ nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
+ read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
+ &maxcount);
+@@ -2940,6 +3132,9 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
+ WRITE32(write->wr_bytes_written);
+ WRITE32(write->wr_how_written);
+ WRITEMEM(write->wr_verifier.data, 8);
++ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n",
++ ((u32 *)(&write->wr_verifier.data))[0],
++ ((u32 *)(&write->wr_verifier.data))[1]);
+ ADJUST_ARGS();
+ }
+ return nfserr;
+@@ -3083,6 +3278,343 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
+ return 0;
+ }
+
++#if defined(CONFIG_PNFSD)
+
-+ BUG_ON(type != RETURN_FILE);
++/* Uses the export interface to iterate through the available devices
++ * and encodes them on the response stream.
++ */
++static __be32
++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp,
++ struct nfsd4_pnfs_getdevlist *gdevl,
++ unsigned int *dev_count)
++{
++ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb;
++ __be32 nfserr;
++ int status;
++ __be32 *p;
++ struct nfsd4_pnfs_dev_iter_res res = {
++ .gd_cookie = gdevl->gd_cookie,
++ .gd_verf = gdevl->gd_verf,
++ .gd_eof = 0
++ };
++ u64 sbid;
+
-+ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
-+ if (lrp == NULL) {
-+ if (lo && (type == RETURN_FILE))
-+ pnfs_layoutreturn_release(lo, NULL);
-+ goto out;
-+ }
-+ lrp->args.reclaim = 0;
-+ lrp->args.layout_type = server->pnfs_curr_ld->id;
-+ lrp->args.return_type = type;
-+ lrp->args.range = *range;
-+ lrp->args.inode = ino;
++ dprintk("%s: Begin\n", __func__);
+
-+ status = nfs4_proc_layoutreturn(lrp, wait);
-+out:
-+ dprintk("<-- %s status: %d\n", __func__, status);
-+ return status;
++ sbid = find_create_sbid(sb);
++ *dev_count = 0;
++ do {
++ status = sb->s_pnfs_op->get_device_iter(sb,
++ gdevl->gd_layout_type,
++ &res);
++ if (status) {
++ if (status == -ENOENT) {
++ res.gd_eof = 1;
++ /* return success */
++ break;
++ }
++ nfserr = nfserrno(status);
++ goto out_err;
++ }
++
++ /* Encode device id and layout type */
++ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid));
++ WRITE64((__be64)sbid);
++ WRITE64(res.gd_devid); /* devid minor */
++ ADJUST_ARGS();
++ (*dev_count)++;
++ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof);
++ gdevl->gd_cookie = res.gd_cookie;
++ gdevl->gd_verf = res.gd_verf;
++ gdevl->gd_eof = res.gd_eof;
++ nfserr = nfs_ok;
++out_err:
++ dprintk("%s: Encoded %u devices\n", __func__, *dev_count);
++ return nfserr;
+}
+
-+int
-+_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range,
-+ const nfs4_stateid *stateid, /* optional */
-+ enum pnfs_layoutreturn_type type,
-+ bool wait)
++/* Encodes the response of get device list.
++*/
++static __be32
++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr,
++ struct nfsd4_pnfs_getdevlist *gdevl)
+{
-+ struct pnfs_layout_hdr *lo = NULL;
-+ struct nfs_inode *nfsi = NFS_I(ino);
-+ struct pnfs_layout_range arg;
-+ int status = 0;
++ unsigned int dev_count = 0, lead_count;
++ u32 *p_in = resp->p;
++ __be32 *p;
+
-+ dprintk("--> %s type %d\n", __func__, type);
++ dprintk("%s: err %d\n", __func__, nfserr);
++ if (nfserr)
++ return nfserr;
+
++ /* Ensure we have room for cookie, verifier, and devlist len,
++ * which we will backfill in after we encode as many devices as possible
++ */
++ lead_count = 8 + sizeof(nfs4_verifier) + 4;
++ RESERVE_SPACE(lead_count);
++ /* skip past these values */
++ p += XDR_QUADLEN(lead_count);
++ ADJUST_ARGS();
+
-+ arg.iomode = range ? range->iomode : IOMODE_ANY;
-+ arg.offset = 0;
-+ arg.length = NFS4_MAX_UINT64;
++ /* Iterate over as many device ids as possible on the xdr stream */
++ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count);
++ if (nfserr)
++ goto out_err;
+
-+ if (type == RETURN_FILE) {
-+ spin_lock(&ino->i_lock);
-+ lo = nfsi->layout;
-+ if (lo && !has_layout_to_return(lo, &arg))
-+ lo = NULL;
-+ if (!lo) {
-+ spin_unlock(&ino->i_lock);
-+ dprintk("%s: no layout segments to return\n", __func__);
-+ goto out;
-+ }
++ /* Backfill in cookie, verf and number of devices encoded */
++ p = p_in;
++ WRITE64(gdevl->gd_cookie);
++ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier));
++ WRITE32(dev_count);
+
-+ /* Reference matched in pnfs_layoutreturn_release */
-+ get_layout_hdr_locked(lo);
++ /* Skip over devices */
++ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid));
++ ADJUST_ARGS();
+
-+ spin_unlock(&ino->i_lock);
++ /* are we at the end of devices? */
++ RESERVE_SPACE(4);
++ WRITE32(gdevl->gd_eof);
++ ADJUST_ARGS();
+
-+ if (layoutcommit_needed(nfsi)) {
-+ if (stateid && !wait) { /* callback */
-+ dprintk("%s: layoutcommit pending\n", __func__);
-+ status = -EAGAIN;
-+ goto out_put;
-+ }
-+ status = pnfs_layoutcommit_inode(ino, wait);
-+ if (status) {
-+ /* Return layout even if layoutcommit fails */
-+ dprintk("%s: layoutcommit failed, status=%d. "
-+ "Returning layout anyway\n",
-+ __func__, status);
-+ }
-+ }
++ dprintk("%s: done.\n", __func__);
+
-+ if (!stateid)
-+ status = return_layout(ino, &arg, type, lo, wait);
-+ else
-+ pnfs_layoutreturn_release(lo, &arg);
-+ }
++ nfserr = nfs_ok;
+out:
-+ dprintk("<-- %s status: %d\n", __func__, status);
-+ return status;
-+out_put:
-+ put_layout_hdr(ino);
++ return nfserr;
++out_err:
++ p = p_in;
++ ADJUST_ARGS();
+ goto out;
+}
+
-+/*
-+ * Compare two layout segments for sorting into layout cache.
-+ * We want to preferentially return RW over RO layouts, so ensure those
-+ * are seen first.
++/* For a given device id, have the file system retrieve and encode the
++ * associated device. For file layout, the encoding function is
++ * passed down to the file system. The file system then has the option
++ * of using this encoding function or one of its own.
++ *
++ * Note: on -ETOOSMALL the gdir_mincount reply is derived from how far the
++ * file system advanced the exp_xdr_stream (xdr.p - p_save), so it is
++ * expected to account for the XDR size of device_addr4 da_addr_body there.
+ */
-+static s64
-+cmp_layout(struct pnfs_layout_range *l1,
-+ struct pnfs_layout_range *l2)
++static __be32
++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
++ struct nfsd4_pnfs_getdevinfo *gdev)
+{
-+ s64 d;
-+
-+ /* higher offset > lower offset */
-+ d = l1->offset - l2->offset;
-+ if (d)
-+ return d;
-+
-+ /* longer length > shorter length */
-+ d = l1->length - l2->length;
-+ if (d)
-+ return d;
-+
-+ /* read > read/write */
-+ return (int)(l2->iomode == IOMODE_READ) -
-+ (int)(l1->iomode == IOMODE_READ);
-+}
++ struct super_block *sb;
++ int maxcount = 0, type_notify_len = 12;
++ __be32 *p, *p_save = NULL, *p_in = resp->p;
++ struct exp_xdr_stream xdr;
+
-+static void
-+pnfs_insert_layout(struct pnfs_layout_hdr *lo,
-+ struct pnfs_layout_segment *lseg)
-+{
-+ struct pnfs_layout_segment *lp;
-+ int found = 0;
++ dprintk("%s: err %d\n", __func__, nfserr);
++ if (nfserr)
++ return nfserr;
+
-+ dprintk("%s:Begin\n", __func__);
++ sb = gdev->gd_sb;
+
-+ assert_spin_locked(&lo->inode->i_lock);
-+ if (list_empty(&lo->segs)) {
-+ struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
++ if (gdev->gd_maxcount != 0) {
++ /* FIXME: this will be bound by the session max response */
++ maxcount = svc_max_payload(resp->rqstp);
++ if (maxcount > gdev->gd_maxcount)
++ maxcount = gdev->gd_maxcount;
+
-+ spin_lock(&clp->cl_lock);
-+ BUG_ON(!list_empty(&lo->layouts));
-+ list_add_tail(&lo->layouts, &clp->cl_layouts);
-+ spin_unlock(&clp->cl_lock);
-+ }
-+ list_for_each_entry(lp, &lo->segs, fi_list) {
-+ if (cmp_layout(&lp->range, &lseg->range) > 0)
-+ continue;
-+ list_add_tail(&lseg->fi_list, &lp->fi_list);
-+ dprintk("%s: inserted lseg %p "
-+ "iomode %d offset %llu length %llu before "
-+ "lp %p iomode %d offset %llu length %llu\n",
-+ __func__, lseg, lseg->range.iomode,
-+ lseg->range.offset, lseg->range.length,
-+ lp, lp->range.iomode, lp->range.offset,
-+ lp->range.length);
-+ found = 1;
-+ break;
-+ }
-+ if (!found) {
-+ list_add_tail(&lseg->fi_list, &lo->segs);
-+ dprintk("%s: inserted lseg %p "
-+ "iomode %d offset %llu length %llu at tail\n",
-+ __func__, lseg, lseg->range.iomode,
-+ lseg->range.offset, lseg->range.length);
++ /* Ensure we have room for the type and notify fields */
++ maxcount -= type_notify_len;
++ if (maxcount < 0) {
++ nfserr = -ETOOSMALL;
++ goto toosmall;
++ }
+ }
-+ get_layout_hdr_locked(lo);
+
-+ dprintk("%s:Return\n", __func__);
-+}
++ RESERVE_SPACE(4);
++ WRITE32(gdev->gd_layout_type);
++ ADJUST_ARGS();
+
-+static struct pnfs_layout_hdr *
-+alloc_init_layout_hdr(struct inode *ino)
-+{
-+ struct pnfs_layout_hdr *lo;
++ /* If maxcount is 0 then just update notifications */
++ if (gdev->gd_maxcount == 0)
++ goto handle_notifications;
+
-+ lo = pnfs_alloc_layout_hdr(ino);
-+ if (!lo)
-+ return NULL;
-+ lo->refcount = 1;
-+ INIT_LIST_HEAD(&lo->layouts);
-+ INIT_LIST_HEAD(&lo->segs);
-+ seqlock_init(&lo->seqlock);
-+ lo->inode = ino;
-+ return lo;
-+}
++ xdr.p = p_save = resp->p;
++ xdr.end = resp->end;
++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3))
++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3);
+
-+static struct pnfs_layout_hdr *
-+pnfs_find_alloc_layout(struct inode *ino)
-+{
-+ struct nfs_inode *nfsi = NFS_I(ino);
-+ struct pnfs_layout_hdr *new = NULL;
++ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type,
++ &gdev->gd_devid);
++ if (nfserr)
++ goto err;
+
-+ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
++ /* The file system should never write 0 bytes without
++ * returning an error
++ */
++ BUG_ON(xdr.p == p_save);
++ BUG_ON(xdr.p > xdr.end);
+
-+ assert_spin_locked(&ino->i_lock);
-+ if (nfsi->layout)
-+ return nfsi->layout;
++ /* Update the xdr stream with the number of bytes encoded
++ * by the file system.
++ */
++ p = xdr.p;
++ ADJUST_ARGS();
+
-+ spin_unlock(&ino->i_lock);
-+ new = alloc_init_layout_hdr(ino);
-+ spin_lock(&ino->i_lock);
++handle_notifications:
++ /* Encode supported device notifications */
++ RESERVE_SPACE(4);
++ if (sb->s_pnfs_op->set_device_notify) {
++ struct pnfs_devnotify_arg dn_args;
+
-+ if (likely(nfsi->layout == NULL)) /* Won the race? */
-+ nfsi->layout = new;
-+ else
-+ pnfs_free_layout_hdr(new);
-+ return nfsi->layout;
++ dn_args.dn_layout_type = gdev->gd_layout_type;
++ dn_args.dn_devid = gdev->gd_devid;
++ dn_args.dn_notify_types = gdev->gd_notify_types;
++ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args);
++ if (nfserr)
++ goto err;
++ WRITE32(dn_args.dn_notify_types);
++ } else {
++ WRITE32(0);
++ }
++ ADJUST_ARGS();
++
++out:
++ return nfserrno(nfserr);
++toosmall:
++ dprintk("%s: maxcount too small\n", __func__);
++ RESERVE_SPACE(4);
++ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len);
++ ADJUST_ARGS();
++ goto out;
++err:
++ /* Rewind to the beginning */
++ p = p_in;
++ ADJUST_ARGS();
++ if (nfserr == -ETOOSMALL)
++ goto toosmall;
++ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr);
++ goto out;
+}
+
-+/*
-+ * iomode matching rules:
-+ * range lseg match
-+ * ----- ----- -----
-+ * ANY READ true
-+ * ANY RW true
-+ * RW READ false
-+ * RW RW true
-+ * READ READ true
-+ * READ RW true
-+ */
-+static int
-+is_matching_lseg(struct pnfs_layout_segment *lseg,
-+ struct pnfs_layout_range *range)
++static __be32
++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp,
++ __be32 nfserr,
++ struct nfsd4_pnfs_layoutget *lgp)
+{
-+ struct pnfs_layout_range range1;
-+
-+ if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) ||
-+ !lo_seg_intersecting(&lseg->range, range))
-+ return 0;
++ int maxcount, leadcount;
++ struct super_block *sb;
++ struct exp_xdr_stream xdr;
++ __be32 *p, *p_save, *p_start = resp->p;
+
-+ /* range1 covers only the first byte in the range */
-+ range1 = *range;
-+ range1.length = 1;
-+ return lo_seg_contained(&lseg->range, &range1);
-+}
++ dprintk("%s: err %d\n", __func__, nfserr);
++ if (nfserr)
++ return nfserr;
+
-+/*
-+ * lookup range in layout
-+ */
-+struct pnfs_layout_segment *
-+pnfs_has_layout(struct pnfs_layout_hdr *lo,
-+ struct pnfs_layout_range *range)
-+{
-+ struct pnfs_layout_segment *lseg, *ret = NULL;
++ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb;
++ maxcount = PAGE_SIZE;
++ if (maxcount > lgp->lg_maxcount)
++ maxcount = lgp->lg_maxcount;
+
-+ dprintk("%s:Begin\n", __func__);
++ /* Check for space on xdr stream */
++ leadcount = 36 + sizeof(stateid_opaque_t);
++ RESERVE_SPACE(leadcount);
++ /* encode layout metadata after file system encodes layout */
++ p += XDR_QUADLEN(leadcount);
++ ADJUST_ARGS();
+
-+ assert_spin_locked(&lo->inode->i_lock);
-+ list_for_each_entry(lseg, &lo->segs, fi_list) {
-+ if (is_matching_lseg(lseg, range)) {
-+ ret = lseg;
-+ get_lseg(ret);
-+ break;
-+ }
-+ if (cmp_layout(range, &lseg->range) > 0)
-+ break;
++ /* Ensure we have room for ret_on_close, off, len, iomode, type */
++ maxcount -= leadcount;
++ if (maxcount < 0) {
++ printk(KERN_ERR "%s: buffer too small\n", __func__);
++ nfserr = nfserr_toosmall;
++ goto err;
+ }
+
-+ dprintk("%s:Return lseg %p ref %d valid %d\n",
-+ __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0,
-+ ret ? ret->valid : 0);
-+ return ret;
-+}
-+EXPORT_SYMBOL_GPL(pnfs_has_layout);
++ /* Set xdr info so file system can encode layout */
++ xdr.p = p_save = resp->p;
++ xdr.end = resp->end;
++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3))
++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3);
+
-+/*
-+ * Layout segment is retreived from the server if not cached.
-+ * The appropriate layout segment is referenced and returned to the caller.
-+ */
-+struct pnfs_layout_segment *
-+pnfs_update_layout(struct inode *ino,
-+ struct nfs_open_context *ctx,
-+ loff_t pos,
-+ u64 count,
-+ enum pnfs_iomode iomode)
-+{
-+ struct pnfs_layout_range arg = {
-+ .iomode = iomode,
-+ .offset = pos,
-+ .length = count,
-+ };
-+ struct nfs_inode *nfsi = NFS_I(ino);
-+ struct pnfs_layout_hdr *lo;
-+ struct pnfs_layout_segment *lseg = NULL;
++ /* Retrieve, encode, and merge layout; process stateid */
++ nfserr = nfs4_pnfs_get_layout(lgp, &xdr);
++ if (nfserr)
++ goto err;
+
-+ if (!pnfs_enabled_sb(NFS_SERVER(ino)))
-+ return NULL;
-+ spin_lock(&ino->i_lock);
-+ lo = pnfs_find_alloc_layout(ino);
-+ if (lo == NULL) {
-+ dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
-+ goto out_unlock;
++ /* Ensure file system returned enough bytes for the client
++ * to access.
++ */
++ if (lgp->lg_seg.length < lgp->lg_minlength) {
++ nfserr = nfserr_badlayout;
++ goto err;
+ }
+
-+ /* Check to see if the layout for the given range already exists */
-+ lseg = pnfs_has_layout(lo, &arg);
-+ if (lseg) {
-+ if (lseg->valid) {
-+ dprintk("%s: Using cached lseg %p for %llu@%llu "
-+ "iomode %d)\n",
-+ __func__,
-+ lseg,
-+ arg.length,
-+ arg.offset,
-+ arg.iomode);
++ /* The file system should never write 0 bytes without
++ * returning an error
++ */
++ BUG_ON(xdr.p == p_save);
+
-+ goto out_unlock;
-+ }
-+ /* someone is cleaning the layout */
-+ put_lseg_locked(lseg);
-+ lseg = NULL;
-+ }
++ /* Rewind to beginning and encode attrs */
++ resp->p = p_start;
++ RESERVE_SPACE(4);
++ WRITE32(lgp->lg_roc); /* return on close */
++ ADJUST_ARGS();
++ nfsd4_encode_stateid(resp, &lgp->lg_sid);
++ RESERVE_SPACE(28);
++ /* Note: response logr_layout array count, always one for now */
++ WRITE32(1);
++ WRITE64(lgp->lg_seg.offset);
++ WRITE64(lgp->lg_seg.length);
++ WRITE32(lgp->lg_seg.iomode);
++ WRITE32(lgp->lg_seg.layout_type);
+
-+ /* if LAYOUTGET already failed once we don't try again */
-+ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
-+ goto out_unlock;
++ /* Update the xdr stream with the number of bytes written
++ * by the file system
++ */
++ p = xdr.p;
++ ADJUST_ARGS();
+
-+ get_layout_hdr_locked(lo); /* Matched in pnfs_layoutget_release */
-+ spin_unlock(&ino->i_lock);
++ return nfs_ok;
++err:
++ resp->p = p_start;
++ return nfserr;
++}
+
-+ lseg = send_layoutget(lo, ctx, &arg);
++static __be32
++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
++ struct nfsd4_pnfs_layoutcommit *lcp)
++{
++ __be32 *p;
++
++ if (nfserr)
++ goto out;
++
++ RESERVE_SPACE(4);
++ WRITE32(lcp->res.lc_size_chg);
++ ADJUST_ARGS();
++ if (lcp->res.lc_size_chg) {
++ RESERVE_SPACE(8);
++ WRITE64(lcp->res.lc_newsize);
++ ADJUST_ARGS();
++ }
+out:
-+ dprintk("%s end, state 0x%lx lseg %p\n", __func__,
-+ nfsi->layout->state, lseg);
-+ return lseg;
-+out_unlock:
-+ spin_unlock(&ino->i_lock);
-+ goto out;
++ return nfserr;
+}
+
-+int
-+pnfs_layout_process(struct nfs4_layoutget *lgp)
++static __be32
++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
++ struct nfsd4_pnfs_layoutreturn *lrp)
+{
-+ struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
-+ struct nfs4_layoutget_res *res = &lgp->res;
-+ struct pnfs_layout_segment *lseg;
-+ struct inode *ino = lo->inode;
-+ int status = 0;
++ __be32 *p;
+
-+ /* Inject layout blob into I/O device driver */
-+ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
-+ if (!lseg || IS_ERR(lseg)) {
-+ if (!lseg)
-+ status = -ENOMEM;
-+ else
-+ status = PTR_ERR(lseg);
-+ dprintk("%s: Could not allocate layout: error %d\n",
-+ __func__, status);
++ if (nfserr)
+ goto out;
-+ }
+
-+ spin_lock(&ino->i_lock);
-+ init_lseg(lo, lseg);
-+ lseg->range = res->range;
-+ get_lseg(lseg);
-+ *lgp->lsegpp = lseg;
-+ pnfs_insert_layout(lo, lseg);
++ RESERVE_SPACE(4);
++ WRITE32(lrp->lrs_present != 0); /* got stateid? */
++ ADJUST_ARGS();
++ if (lrp->lrs_present)
++ nfsd4_encode_stateid(resp, &lrp->lr_sid);
++out:
++ return nfserr;
++}
++#endif /* CONFIG_PNFSD */
+
-+ if (res->return_on_close) {
-+ /* FI: This needs to be re-examined. At lo level,
-+ * all it needs is a bit indicating whether any of
-+ * the lsegs in the list have the flags set.
-+ *
-+ * The IOMODE_ANY line just seems nonsensical.
-+ */
-+ lo->roc_iomode |= res->range.iomode;
-+ if (!lo->roc_iomode)
-+ lo->roc_iomode = IOMODE_ANY;
-+ }
+ static __be32
+ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
+ {
+@@ -3143,11 +3675,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
+ [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
+ [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
++#if defined(CONFIG_PNFSD)
++ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo,
++ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist,
++ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
++ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
++ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
++#else /* CONFIG_PNFSD */
+ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
++#endif /* CONFIG_PNFSD */
+ [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
+ [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
+diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
+index b53b1d0..1bbd9c2 100644
+--- a/fs/nfsd/nfsctl.c
++++ b/fs/nfsd/nfsctl.c
+@@ -13,10 +13,15 @@
+ #include <linux/nfsd/syscall.h>
+ #include <linux/lockd/lockd.h>
+ #include <linux/sunrpc/clnt.h>
++#include <linux/nfsd/nfs4pnfsdlm.h>
+
+ #include "nfsd.h"
+ #include "cache.h"
+
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
++#include <linux/nfsd4_spnfs.h>
++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
+
-+ /* Done processing layoutget. Set the layout stateid */
-+ pnfs_set_layout_stateid(lo, &res->stateid);
-+ spin_unlock(&ino->i_lock);
-+out:
-+ return status;
-+}
+ /*
+ * We have a single directory with 9 nodes in it.
+ */
+@@ -49,6 +54,9 @@ enum {
+ NFSD_Gracetime,
+ NFSD_RecoveryDir,
+ #endif
++#ifdef CONFIG_PNFSD
++ NFSD_pnfs_dlm_device,
++#endif
+ };
+
+ /*
+@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
+ static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
+ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
+ #endif
++#ifdef CONFIG_PNFSD
++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size);
++#endif
+
+ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+ [NFSD_Svc] = write_svc,
+@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+ [NFSD_Gracetime] = write_gracetime,
+ [NFSD_RecoveryDir] = write_recoverydir,
+ #endif
++#ifdef CONFIG_PNFSD
++ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device,
++#endif
+ };
+
+ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
+@@ -1347,6 +1361,68 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
+
+ #endif
+
++#ifdef CONFIG_PNFSD
+
-+void
-+readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset,
-+ size_t *count)
++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf,
++ size_t size)
+{
-+ struct page *first, *last;
-+ loff_t foff, i_size = i_size_read(inode);
-+ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
-+ size_t range;
++ char *mesg = buf;
++ char *pnfs_dlm_device;
++ int max_size = NFSD_PNFS_DLM_DEVICE_MAX;
++ int len, ret = 0;
+
++ if (size > 0) {
++ ret = -EINVAL;
++ if (size > max_size || buf[size-1] != '\n')
++ return ret;
++ buf[size-1] = 0;
+
-+ first = list_entry((pages)->prev, struct page, lru);
-+ last = list_entry((pages)->next, struct page, lru);
++ pnfs_dlm_device = mesg;
++ len = qword_get(&mesg, pnfs_dlm_device, size);
++ if (len <= 0)
++ return ret;
+
-+ foff = (loff_t)first->index << PAGE_CACHE_SHIFT;
++ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len);
++ } else
++ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT);
+
-+ range = (last->index - first->index) * PAGE_CACHE_SIZE;
-+ if (last->index == end_index)
-+ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
-+ else
-+ range += PAGE_CACHE_SIZE;
-+ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff,
-+ range);
-+ *offset = foff;
-+ *count = range;
++ return ret <= 0 ? ret : strlen(buf);
+}
+
-+void
-+pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
++/**
++ * write_pnfs_dlm_device - Set or report the current pNFS data server list
++ *
++ * Input:
++ * buf: ignored
++ * size: zero
++ *
++ * OR
++ *
++ * Input:
++ * buf: C string containing a block device name,
++ * a colon, and then a comma separated
++ * list of pNFS data server IPv4 addresses
++ * size: non-zero length of C string in @buf
++ * Output:
++ * On success: passed-in buffer filled with '\n'-terminated C
++ * string containing a block device name, a colon, and
++ * then a comma separated list of pNFS
++ * data server IPv4 addresses.
++ * return code is the size in bytes of the string
++ * On error: return code is a negative errno value
++ */
++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size)
+{
-+ struct pnfs_layout_hdr *lo;
-+ struct pnfs_layoutdriver_type *ld;
-+
-+ pgio->pg_test = NULL;
-+
-+ lo = NFS_I(inode)->layout;
-+ ld = NFS_SERVER(inode)->pnfs_curr_ld;
-+ if (!ld || !lo)
-+ return;
++ ssize_t rv;
+
-+ pgio->pg_test = ld->pg_test;
++ mutex_lock(&nfsd_mutex);
++ rv = __write_pnfs_dlm_device(file, buf, size);
++ mutex_unlock(&nfsd_mutex);
++ return rv;
+}
+
-+static u32
-+pnfs_getboundary(struct inode *inode)
-+{
-+ u32 stripe_size = 0;
-+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
++#endif /* CONFIG_PNFSD */
+
-+ if (!ld || !ld->get_stripesize)
-+ goto out;
+ /*----------------------------------------------------------------------------*/
+ /*
+ * populating the filesystem.
+@@ -1381,6 +1457,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
+ [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
+ #endif
++#ifdef CONFIG_PNFSD
++ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops,
++ S_IWUSR|S_IRUSR},
++#endif
+ /* last one */ {""}
+ };
+ return simple_fill_super(sb, 0x6e667364, nfsd_files);
+@@ -1419,6 +1499,9 @@ static int create_proc_exports_entry(void)
+ }
+ #endif
+
++#if defined(CONFIG_SPNFS_BLOCK)
++int nfsd_bl_init(void);
++#endif
+ static int __init init_nfsd(void)
+ {
+ int retval;
+@@ -1441,6 +1524,15 @@ static int __init init_nfsd(void)
+ retval = create_proc_exports_entry();
+ if (retval)
+ goto out_free_idmap;
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
++ retval = spnfs_init_proc();
++ if (retval != 0)
++ goto out_free_idmap;
++#if defined(CONFIG_SPNFS_BLOCK)
++ nfsd_bl_init();
++#endif /* CONFIG_SPNFS_BLOCK */
++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
+
-+ /* The default is to not gather across stripes */
-+ if (pnfs_ld_gather_across_stripes(NFS_SERVER(inode)->pnfs_curr_ld))
-+ goto out;
+ retval = register_filesystem(&nfsd_fs_type);
+ if (retval)
+ goto out_free_all;
+@@ -1463,7 +1555,22 @@ out_free_stat:
+
+ static void __exit exit_nfsd(void)
+ {
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
++ remove_proc_entry("fs/nfs/spnfs/recall", NULL);
++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL);
++ remove_proc_entry("fs/nfs/spnfs/getfh", NULL);
++ remove_proc_entry("fs/nfs/spnfs/config", NULL);
++ remove_proc_entry("fs/nfs/spnfs/ctl", NULL);
++ remove_proc_entry("fs/nfs/spnfs", NULL);
++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
+
-+ spin_lock(&inode->i_lock);
-+ if (NFS_I(inode)->layout)
-+ stripe_size = ld->get_stripesize(NFS_I(inode)->layout);
-+ spin_unlock(&inode->i_lock);
-+out:
-+ return stripe_size;
-+}
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL);
++ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL);
++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */
++
+ nfsd_export_shutdown();
++ nfsd4_pnfs_dlm_shutdown();
+ nfsd_reply_cache_shutdown();
+ remove_proc_entry("fs/nfs/exports", NULL);
+ remove_proc_entry("fs/nfs", NULL);
+diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
+index b76ac3a..cef6770 100644
+--- a/fs/nfsd/nfsd.h
++++ b/fs/nfsd/nfsd.h
+@@ -286,11 +286,22 @@ extern time_t nfsd4_grace;
+ #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
+ NFSD4_SUPPORTED_ATTRS_WORD0
+
++#if defined(CONFIG_PNFSD)
++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
++ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES)
++#else /* CONFIG_PNFSD */
+ #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
+ NFSD4_SUPPORTED_ATTRS_WORD1
++#endif /* CONFIG_PNFSD */
+
++#if defined(CONFIG_PNFSD)
++#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
++ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \
++ FATTR4_WORD2_LAYOUT_BLKSIZE)
++#else /* CONFIG_PNFSD */
+ #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
+ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
++#endif /* CONFIG_PNFSD */
+
+ static inline u32 nfsd_suppattrs0(u32 minorversion)
+ {
+diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
+index 55c8e63..544c957 100644
+--- a/fs/nfsd/nfsfh.c
++++ b/fs/nfsd/nfsfh.c
+@@ -10,6 +10,7 @@
+ #include <linux/exportfs.h>
+
+ #include <linux/sunrpc/svcauth_gss.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
+ #include "nfsd.h"
+ #include "vfs.h"
+ #include "auth.h"
+@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
+ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+ {
+ struct knfsd_fh *fh = &fhp->fh_handle;
++ int fsid_type;
+ struct fid *fid = NULL, sfid;
+ struct svc_export *exp;
+ struct dentry *dentry;
+@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+ return error;
+ if (fh->fh_auth_type != 0)
+ return error;
+- len = key_len(fh->fh_fsid_type) / 4;
++ fsid_type = pnfs_fh_fsid_type(fh);
++ len = key_len(fsid_type) / 4;
+ if (len == 0)
+ return error;
+ if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+ data_left -= len;
+ if (data_left < 0)
+ return error;
+- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth);
++ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth);
+ fid = (struct fid *)(fh->fh_auth + len);
+ } else {
+ __u32 tfh[2];
+diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
+index c16f8d8..4263812 100644
+--- a/fs/nfsd/nfsfh.h
++++ b/fs/nfsd/nfsfh.h
+@@ -14,6 +14,7 @@ enum nfsd_fsid {
+ FSID_UUID8,
+ FSID_UUID16,
+ FSID_UUID16_INUM,
++ FSID_MAX
+ };
+
+ enum fsid_source {
+@@ -203,4 +204,42 @@ fh_unlock(struct svc_fh *fhp)
+ }
+ }
+
++#if defined(CONFIG_PNFSD)
+
+/*
-+ * rsize is already set by caller to MDS rsize.
++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied
++ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how
++ * to handle a given stateid.
+ */
-+void
-+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
-+ struct inode *inode,
-+ struct nfs_open_context *ctx,
-+ struct list_head *pages,
-+ size_t *rsize)
++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
+{
-+ struct nfs_server *nfss = NFS_SERVER(inode);
-+ size_t count = 0;
-+ loff_t loff;
-+
-+ pgio->pg_iswrite = 0;
-+ pgio->pg_boundary = 0;
-+ pgio->pg_test = NULL;
-+ pgio->pg_lseg = NULL;
-+
-+ if (!pnfs_enabled_sb(nfss))
-+ return;
-+
-+ /* Calculate the total read-ahead count */
-+ readahead_range(inode, pages, &loff, &count);
-+
-+ if (count > 0) {
-+ pgio->pg_lseg = pnfs_update_layout(inode, ctx, loff, count,
-+ IOMODE_READ);
-+ if (!pgio->pg_lseg)
-+ return;
-+
-+ *rsize = NFS_SERVER(inode)->ds_rsize;
-+ pgio->pg_boundary = pnfs_getboundary(inode);
-+ if (pgio->pg_boundary)
-+ pnfs_set_pg_test(inode, pgio);
-+ }
++ return fh->fh_fsid_type >= FSID_MAX;
+}
+
-+void
-+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
-+ size_t *wsize)
++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh)
+{
-+ struct nfs_server *server = NFS_SERVER(inode);
-+
-+ pgio->pg_iswrite = 1;
-+ if (!pnfs_enabled_sb(server)) {
-+ pgio->pg_boundary = 0;
-+ pgio->pg_test = NULL;
-+ return;
-+ }
-+ pgio->pg_boundary = pnfs_getboundary(inode);
-+ pnfs_set_pg_test(inode, pgio);
-+ *wsize = server->ds_wsize;
++ BUG_ON(fh->fh_version != 1);
++ BUG_ON(pnfs_fh_is_ds(fh));
++ fh->fh_fsid_type += FSID_MAX;
+}
+
-+/* Set buffer size for data servers */
-+void
-+pnfs_set_ds_iosize(struct nfs_server *server)
-+{
-+ unsigned dssize = 0;
-+
-+ if (server->pnfs_curr_ld && server->pnfs_curr_ld->get_blocksize)
-+ dssize = server->pnfs_curr_ld->get_blocksize();
-+ if (dssize)
-+ server->ds_rsize = server->ds_wsize =
-+ nfs_block_size(dssize, NULL);
-+ else {
-+ server->ds_wsize = server->wsize;
-+ server->ds_rsize = server->rsize;
-+ }
-+}
++#else /* CONFIG_PNFSD */
+
-+static int
-+pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data)
++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh)
+{
-+ put_lseg(pdata->lseg);
-+ pdata->lseg = NULL;
-+ pdata->call_ops->rpc_call_done(task, data);
-+ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN)
-+ return -EAGAIN;
-+ if (pdata->pnfsflags & PNFS_NO_RPC) {
-+ pdata->call_ops->rpc_release(data);
-+ } else {
-+ /*
-+ * just restore original rpc call ops
-+ * rpc_release will be called later by the rpc scheduling layer.
-+ */
-+ task->tk_ops = pdata->call_ops;
-+ }
+ return 0;
+}
+
-+/* Post-write completion function
-+ * Invoked by all layout drivers when write_pagelist is done.
-+ *
-+ * NOTE: callers set data->pnfsflags PNFS_NO_RPC
-+ * so that the NFS cleanup routines perform only the page cache
-+ * cleanup.
-+ */
-+static void
-+pnfs_write_retry(struct work_struct *work)
-+{
-+ struct rpc_task *task;
-+ struct nfs_write_data *wdata;
-+ struct pnfs_layout_range range;
-+
-+ dprintk("%s enter\n", __func__);
-+ task = container_of(work, struct rpc_task, u.tk_work);
-+ wdata = container_of(task, struct nfs_write_data, task);
-+ range.iomode = IOMODE_RW;
-+ range.offset = wdata->args.offset;
-+ range.length = wdata->args.count;
-+ _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true);
-+ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode),
-+ wdata->pdata.call_ops, wdata->pdata.how);
-+}
-+
-+void
-+pnfs_writeback_done(struct nfs_write_data *data)
-+{
-+ struct pnfs_call_data *pdata = &data->pdata;
-+
-+ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
-+
-+ /* update last write offset and need layout commit
-+ * for non-files layout types (files layout calls
-+ * pnfs4_write_done for this)
-+ */
-+ if ((pdata->pnfsflags & PNFS_NO_RPC) &&
-+ data->task.tk_status >= 0 && data->res.count > 0) {
-+ struct nfs_inode *nfsi = NFS_I(data->inode);
-+
-+ pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
-+ pnfs_need_layoutcommit(nfsi, data->args.context);
-+ }
-+
-+ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
-+ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry);
-+ queue_work(nfsiod_workqueue, &data->task.u.tk_work);
-+ }
-+}
-+EXPORT_SYMBOL_GPL(pnfs_writeback_done);
++#endif /* CONFIG_PNFSD */
+
-+static void _pnfs_clear_lseg_from_pages(struct list_head *head)
++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). */
++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh)
+{
-+ struct nfs_page *req;
++ int fsid_type = fh->fh_fsid_type;
+
-+ list_for_each_entry(req, head, wb_list) {
-+ put_lseg(req->wb_lseg);
-+ req->wb_lseg = NULL;
-+ }
++ if (pnfs_fh_is_ds(fh))
++ return fsid_type - FSID_MAX;
++ return fsid_type;
+}
+
+ #endif /* _LINUX_NFSD_FH_INT_H */
+diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
+index e2c4346..d1262ec 100644
+--- a/fs/nfsd/nfssvc.c
++++ b/fs/nfsd/nfssvc.c
+@@ -115,7 +115,7 @@ struct svc_program nfsd_program = {
+
+ };
+
+-u32 nfsd_supported_minorversion;
++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION;
+
+ int nfsd_vers(int vers, enum vers_op change)
+ {
+diff --git a/fs/nfsd/pnfsd.h b/fs/nfsd/pnfsd.h
+new file mode 100644
+index 0000000..a181bc3
+--- /dev/null
++++ b/fs/nfsd/pnfsd.h
+@@ -0,0 +1,143 @@
+/*
-+ * Call the appropriate parallel I/O subsystem write function.
-+ * If no I/O device driver exists, or one does match the returned
-+ * fstype, then return a positive status for regular NFS processing.
++ * Copyright (c) 2005 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson <andros at umich.edu>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
-+ * TODO: Is wdata->how and wdata->args.stable always the same value?
-+ * TODO: It seems in NFS, the server may not do a stable write even
-+ * though it was requested (and vice-versa?). To check, it looks
-+ * in data->res.verf->committed. Do we need this ability
-+ * for non-file layout drivers?
+ */
-+enum pnfs_try_status
-+pnfs_try_to_write_data(struct nfs_write_data *wdata,
-+ const struct rpc_call_ops *call_ops, int how)
-+{
-+ struct inode *inode = wdata->inode;
-+ enum pnfs_try_status trypnfs;
-+ struct nfs_server *nfss = NFS_SERVER(inode);
-+ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg;
-+
-+ wdata->pdata.call_ops = call_ops;
-+ wdata->pdata.pnfs_error = 0;
-+ wdata->pdata.how = how;
-+
-+ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
-+ inode->i_ino, wdata->args.count, wdata->args.offset, how);
-+
-+ get_lseg(lseg);
+
-+ if (!pnfs_use_rpc(nfss))
-+ wdata->pdata.pnfsflags |= PNFS_NO_RPC;
-+ wdata->pdata.lseg = lseg;
-+ trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata,
-+ nfs_page_array_len(wdata->args.pgbase, wdata->args.count),
-+ how);
++#ifndef LINUX_NFSD_PNFSD_H
++#define LINUX_NFSD_PNFSD_H
+
-+ if (trypnfs == PNFS_NOT_ATTEMPTED) {
-+ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
-+ wdata->pdata.lseg = NULL;
-+ put_lseg(lseg);
-+ _pnfs_clear_lseg_from_pages(&wdata->pages);
-+ } else {
-+ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
-+ }
-+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
-+ return trypnfs;
-+}
++#include <linux/list.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
+
-+/* Post-read completion function. Invoked by all layout drivers when
-+ * read_pagelist is done
-+ */
-+static void
-+pnfs_read_retry(struct work_struct *work)
-+{
-+ struct rpc_task *task;
-+ struct nfs_read_data *rdata;
-+ struct pnfs_layout_range range;
++#include "state.h"
++#include "xdr4.h"
+
-+ dprintk("%s enter\n", __func__);
-+ task = container_of(work, struct rpc_task, u.tk_work);
-+ rdata = container_of(task, struct nfs_read_data, task);
-+ range.iomode = IOMODE_RW;
-+ range.offset = rdata->args.offset;
-+ range.length = rdata->args.count;
-+ _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true);
-+ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode),
-+ rdata->pdata.call_ops);
-+}
++/* outstanding layout stateid */
++struct nfs4_layout_state {
++ struct list_head ls_perfile;
++ struct list_head ls_layouts; /* list of nfs4_layouts */
++ struct kref ls_ref;
++ struct nfs4_client *ls_client;
++ struct nfs4_file *ls_file;
++ stateid_t ls_stateid;
++};
+
-+void
-+pnfs_read_done(struct nfs_read_data *data)
-+{
-+ struct pnfs_call_data *pdata = &data->pdata;
++/* outstanding layout */
++struct nfs4_layout {
++ struct list_head lo_perfile; /* hash by f_id */
++ struct list_head lo_perclnt; /* hash by clientid */
++ struct list_head lo_perstate;
++ struct nfs4_file *lo_file; /* backpointer */
++ struct nfs4_client *lo_client;
++ struct nfs4_layout_state *lo_state;
++ struct nfsd4_layout_seg lo_seg;
++};
+
-+ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
++struct pnfs_inval_state {
++ struct knfsd_fh mdsfh; /* needed only by invalidate all */
++ stateid_t stid;
++ clientid_t clid;
++ u32 status;
++};
+
-+ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
-+ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry);
-+ queue_work(nfsiod_workqueue, &data->task.u.tk_work);
-+ }
-+}
-+EXPORT_SYMBOL_GPL(pnfs_read_done);
++/* pNFS Data Server state */
++#define DS_STATEID_VALID 0
++#define DS_STATEID_ERROR 1
++#define DS_STATEID_NEW 2
+
-+/*
-+ * Call the appropriate parallel I/O subsystem read function.
-+ * If no I/O device driver exists, or one does match the returned
-+ * fstype, then return a positive status for regular NFS processing.
-+ */
-+enum pnfs_try_status
-+pnfs_try_to_read_data(struct nfs_read_data *rdata,
-+ const struct rpc_call_ops *call_ops)
-+{
-+ struct inode *inode = rdata->inode;
-+ struct nfs_server *nfss = NFS_SERVER(inode);
-+ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg;
-+ enum pnfs_try_status trypnfs;
++struct pnfs_ds_stateid {
++ struct list_head ds_hash; /* ds_stateid hash entry */
++ struct list_head ds_perclid; /* per client hash entry */
++ stateid_t ds_stid;
++ struct knfsd_fh ds_fh;
++ unsigned long ds_access;
++ u32 ds_status; /* from MDS */
++ u32 ds_verifier[2]; /* from MDS */
++ wait_queue_head_t ds_waitq;
++ unsigned long ds_flags;
++ struct kref ds_ref;
++ clientid_t ds_mdsclid;
++};
+
-+ rdata->pdata.call_ops = call_ops;
-+ rdata->pdata.pnfs_error = 0;
++struct pnfs_ds_clientid {
++ struct list_head dc_hash; /* mds_clid_hashtbl entry */
++ struct list_head dc_stateid; /* ds_stateid head */
++ struct list_head dc_permdsid; /* per mdsid hash entry */
++ clientid_t dc_mdsclid;
++ struct kref dc_ref;
++ uint32_t dc_mdsid;
++};
+
-+ dprintk("%s: Reading ino:%lu %u@%llu\n",
-+ __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
++struct pnfs_mds_id {
++ struct list_head di_hash; /* mds_nodeid list entry */
++ struct list_head di_mdsclid; /* mds_clientid head */
++ uint32_t di_mdsid;
++ time_t di_mdsboot; /* mds boot time */
++ struct kref di_ref;
++};
+
-+ get_lseg(lseg);
++/* notify device request (from exported filesystem) */
++struct nfs4_notify_device {
++ struct nfsd4_pnfs_cb_dev_list *nd_list;
++ struct nfs4_client *nd_client;
++ struct list_head nd_perclnt;
+
-+ if (!pnfs_use_rpc(nfss))
-+ rdata->pdata.pnfsflags |= PNFS_NO_RPC;
-+ rdata->pdata.lseg = lseg;
-+ trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata,
-+ nfs_page_array_len(rdata->args.pgbase, rdata->args.count));
-+ if (trypnfs == PNFS_NOT_ATTEMPTED) {
-+ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
-+ rdata->pdata.lseg = NULL;
-+ put_lseg(lseg);
-+ _pnfs_clear_lseg_from_pages(&rdata->pages);
-+ } else {
-+ nfs_inc_stats(inode, NFSIOS_PNFS_READ);
-+ }
-+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
-+ return trypnfs;
-+}
++ void *nd_args; /* nfsd internal */
++};
+
-+/*
-+ * This gives the layout driver an opportunity to read in page "around"
-+ * the data to be written. It returns 0 on success, otherwise an error code
-+ * which will either be passed up to user, or ignored if
-+ * some previous part of write succeeded.
-+ * Note the range [pos, pos+len-1] is entirely within the page.
-+ */
-+int _pnfs_write_begin(struct inode *inode, struct page *page,
-+ loff_t pos, unsigned len,
-+ struct pnfs_layout_segment *lseg,
-+ struct pnfs_fsdata **fsdata)
-+{
-+ struct pnfs_fsdata *data;
-+ int status = 0;
++u64 find_create_sbid(struct super_block *);
++struct super_block *find_sbid_id(u64);
++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *);
++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *,
++ struct nfsd4_pnfs_layoutreturn *);
++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *);
++int nfs4_pnfs_cb_change_state(struct pnfs_get_state *);
++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
++int put_layoutrecall(struct nfs4_layoutrecall *);
++void nomatching_layout(struct nfs4_layoutrecall *);
++void *layoutrecall_done(struct nfs4_layoutrecall *);
++int nfsd4_cb_layout(struct nfs4_layoutrecall *);
++int nfsd_layout_recall_cb(struct super_block *, struct inode *,
++ struct nfsd4_pnfs_cb_layout *);
++int nfsd_device_notify_cb(struct super_block *,
++ struct nfsd4_pnfs_cb_dev_list *);
++int nfsd4_cb_notify_device(struct nfs4_notify_device *);
++void pnfs_set_device_notify(clientid_t *, unsigned int types);
++void pnfs_clear_device_notify(struct nfs4_client *);
+
-+ dprintk("--> %s: pos=%llu len=%u\n",
-+ __func__, (unsigned long long)pos, len);
-+ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
-+ if (!data) {
-+ status = -ENOMEM;
-+ goto out;
-+ }
-+ data->lseg = lseg; /* refcount passed into data to be managed there */
-+ status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin(
-+ lseg, page, pos, len, data);
-+ if (status) {
-+ kfree(data);
-+ data = NULL;
-+ }
-+out:
-+ *fsdata = data;
-+ dprintk("<-- %s: status=%d\n", __func__, status);
-+ return status;
-+}
++#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
++extern struct sockaddr pnfsd_lexp_addr;
++extern size_t pnfs_lexp_addr_len;
+
-+/* pNFS Commit callback function for all layout drivers */
-+void
-+pnfs_commit_done(struct nfs_write_data *data)
-+{
-+ struct pnfs_call_data *pdata = &data->pdata;
++extern void pnfsd_lexp_init(struct inode *);
++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
+
-+ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
++#endif /* LINUX_NFSD_PNFSD_H */
+diff --git a/fs/nfsd/pnfsd_lexp.c b/fs/nfsd/pnfsd_lexp.c
+new file mode 100644
+index 0000000..bf2f403
+--- /dev/null
++++ b/fs/nfsd/pnfsd_lexp.c
+@@ -0,0 +1,225 @@
++/*
++ * linux/fs/nfsd/pnfsd_lexp.c
++ *
++ * pNFS export of local filesystems.
++ *
++ * Export local file systems over the files layout type.
++ * The MDS (metadata server) functions also as a single DS (data server).
++ * This is mostly useful for development and debugging purposes.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * Copyright (C) 2008 Benny Halevy, <bhalevy at panasas.com>
++ *
++ * Initial implementation was based on the pnfs-gfs2 patches done
++ * by David M. Richter <richterd at citi.umich.edu>
++ */
+
-+ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
-+ struct pnfs_layout_range range = {
-+ .iomode = IOMODE_RW,
-+ .offset = data->args.offset,
-+ .length = data->args.count,
-+ };
-+ dprintk("%s: retrying\n", __func__);
-+ _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE,
-+ true);
-+ pnfs_initiate_commit(data, NFS_CLIENT(data->inode),
-+ pdata->call_ops, pdata->how, 1);
-+ }
-+}
-+EXPORT_SYMBOL_GPL(pnfs_commit_done);
++#include <linux/sunrpc/svc_xprt.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
+
-+enum pnfs_try_status
-+pnfs_try_to_commit(struct nfs_write_data *data,
-+ const struct rpc_call_ops *call_ops, int sync)
-+{
-+ struct inode *inode = data->inode;
-+ struct nfs_server *nfss = NFS_SERVER(data->inode);
-+ enum pnfs_try_status trypnfs;
++#include "pnfsd.h"
+
-+ dprintk("%s: Begin\n", __func__);
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
-+ if (!pnfs_use_rpc(nfss))
-+ data->pdata.pnfsflags |= PNFS_NO_RPC;
-+ /* We need to account for possibility that
-+ * each nfs_page can point to a different lseg (or be NULL).
-+ * For the immediate case of whole-file-only layouts, we at
-+ * least know there can be only a single lseg.
-+ * We still have to account for the possibility of some being NULL.
-+ * This will be done by passing the buck to the layout driver.
-+ */
-+ data->pdata.call_ops = call_ops;
-+ data->pdata.pnfs_error = 0;
-+ data->pdata.how = sync;
-+ data->pdata.lseg = NULL;
-+ trypnfs = nfss->pnfs_curr_ld->commit(data, sync);
-+ if (trypnfs == PNFS_NOT_ATTEMPTED) {
-+ data->pdata.pnfsflags &= ~PNFS_NO_RPC;
-+ _pnfs_clear_lseg_from_pages(&data->pages);
-+ } else
-+ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT);
-+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
-+ return trypnfs;
-+}
++struct sockaddr pnfsd_lexp_addr;
++size_t pnfs_lexp_addr_len;
+
-+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
++static int
++pnfsd_lexp_layout_type(struct super_block *sb)
+{
-+ struct nfs_server *nfss = NFS_SERVER(data->args.inode);
-+
-+ /* TODO: Maybe we should avoid this by allowing the layout driver
-+ * to directly xdr its layout on the wire.
-+ */
-+ if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
-+ nfss->pnfs_curr_ld->cleanup_layoutcommit(
-+ NFS_I(data->args.inode)->layout,
-+ &data->args, data->status);
++ int ret = LAYOUT_NFSV4_1_FILES;
++ dprintk("<-- %s: return %d\n", __func__, ret);
++ return ret;
+}
+
-+/*
-+ * Set up the argument/result storage required for the RPC call.
-+ */
+static int
-+pnfs_layoutcommit_setup(struct inode *inode,
-+ struct nfs4_layoutcommit_data *data,
-+ loff_t write_begin_pos, loff_t write_end_pos)
++pnfsd_lexp_get_device_iter(struct super_block *sb,
++ u32 layout_type,
++ struct nfsd4_pnfs_dev_iter_res *res)
+{
-+ struct nfs_server *nfss = NFS_SERVER(inode);
-+ int result = 0;
-+
-+ dprintk("--> %s\n", __func__);
-+
-+ data->args.inode = inode;
-+ data->args.fh = NFS_FH(inode);
-+ data->args.layout_type = nfss->pnfs_curr_ld->id;
-+ data->res.fattr = &data->fattr;
-+ nfs_fattr_init(&data->fattr);
-+
-+ /* TODO: Need to determine the correct values */
-+ data->args.time_modify_changed = 0;
++ dprintk("--> %s: sb=%p\n", __func__, sb);
+
-+ /* Set values from inode so it can be reset
-+ */
-+ data->args.range.iomode = IOMODE_RW;
-+ data->args.range.offset = write_begin_pos;
-+ data->args.range.length = write_end_pos - write_begin_pos + 1;
-+ data->args.lastbytewritten = min(write_end_pos,
-+ i_size_read(inode) - 1);
-+ data->args.bitmask = nfss->attr_bitmask;
-+ data->res.server = nfss;
++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
+
-+ /* Call layout driver to set the arguments */
-+ if (nfss->pnfs_curr_ld->setup_layoutcommit)
-+ result = nfss->pnfs_curr_ld->setup_layoutcommit(
-+ NFS_I(inode)->layout, &data->args);
++ res->gd_eof = 1;
++ if (res->gd_cookie)
++ return -ENOENT;
++ res->gd_cookie = 1;
++ res->gd_verf = 1;
++ res->gd_devid = 1;
+
-+ dprintk("<-- %s Status %d\n", __func__, result);
-+ return result;
++ dprintk("<-- %s: return 0\n", __func__);
++ return 0;
+}
+
-+/* Issue a async layoutcommit for an inode.
-+ */
-+int
-+pnfs_layoutcommit_inode(struct inode *inode, int sync)
++static int
++pnfsd_lexp_get_device_info(struct super_block *sb,
++ struct exp_xdr_stream *xdr,
++ u32 layout_type,
++ const struct nfsd4_pnfs_deviceid *devid)
+{
-+ struct nfs4_layoutcommit_data *data;
-+ struct nfs_inode *nfsi = NFS_I(inode);
-+ loff_t write_begin_pos;
-+ loff_t write_end_pos;
-+
-+ int status = 0;
++ int err;
++ struct pnfs_filelayout_device fdev;
++ struct pnfs_filelayout_multipath fl_devices[1];
++ u32 fl_stripe_indices[1] = { 0 };
++ struct pnfs_filelayout_devaddr daddr;
++ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */
++ char daddr_buf[8*4 + 2*3 + 10];
+
-+ dprintk("%s Begin (sync:%d)\n", __func__, sync);
++ dprintk("--> %s: sb=%p\n", __func__, sb);
+
-+ BUG_ON(!has_layout(nfsi));
++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES);
+
-+ data = kzalloc(sizeof(*data), GFP_NOFS);
-+ if (!data)
-+ return -ENOMEM;
++ memset(&fdev, '\0', sizeof(fdev));
+
-+ spin_lock(&inode->i_lock);
-+ if (!layoutcommit_needed(nfsi)) {
-+ spin_unlock(&inode->i_lock);
-+ goto out_free;
++ if (devid->devid != 1) {
++ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 "
++ "(got: 0x%llx)\n", __func__, devid->devid);
++ err = -EINVAL;
++ goto out;
+ }
+
-+ /* Clear layoutcommit properties in the inode so
-+ * new lc info can be generated
-+ */
-+ write_begin_pos = nfsi->layout->write_begin_pos;
-+ write_end_pos = nfsi->layout->write_end_pos;
-+ data->cred = nfsi->layout->cred;
-+ nfsi->layout->write_begin_pos = 0;
-+ nfsi->layout->write_end_pos = 0;
-+ nfsi->layout->cred = NULL;
-+ __clear_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state);
-+ pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout, NULL);
-+
-+ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */
-+ get_layout_hdr_locked(NFS_I(inode)->layout);
++ /* exactly one device: the MDS also serves as the single DS */
++ fdev.fl_device_length = 1;
++ fdev.fl_device_list = fl_devices;
+
-+ spin_unlock(&inode->i_lock);
++ fdev.fl_stripeindices_length = fdev.fl_device_length;
++ fdev.fl_stripeindices_list = fl_stripe_indices;
+
-+ /* Set up layout commit args */
-+ status = pnfs_layoutcommit_setup(inode, data, write_begin_pos,
-+ write_end_pos);
-+ if (status) {
-+ /* The layout driver failed to setup the layoutcommit */
-+ put_rpccred(data->cred);
-+ put_layout_hdr(inode);
-+ goto out_free;
++ daddr.r_addr.data = daddr_buf;
++ daddr.r_addr.len = sizeof(daddr_buf);
++ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr);
++ if (err < 0)
++ goto out;
++ daddr.r_addr.len = err;
++ switch (pnfsd_lexp_addr.sa_family) {
++ case AF_INET:
++ daddr.r_netid.data = "tcp";
++ daddr.r_netid.len = 3;
++ break;
++ case AF_INET6:
++ daddr.r_netid.data = "tcp6";
++ daddr.r_netid.len = 4;
++ break;
++ default:
++ BUG();
+ }
-+ status = nfs4_proc_layoutcommit(data, sync);
++ fdev.fl_device_list[0].fl_multipath_length = 1;
++ fdev.fl_device_list[0].fl_multipath_list = &daddr;
++
++ /* have nfsd encode the device info */
++ err = filelayout_encode_devinfo(xdr, &fdev);
+out:
-+ dprintk("%s end (err:%d)\n", __func__, status);
-+ return status;
-+out_free:
-+ kfree(data);
-+ goto out;
++ dprintk("<-- %s: return %d\n", __func__, err);
++ return err;
+}
+
-+void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
++static int get_stripe_unit(int blocksize)
+{
-+ /* lseg refcounting handled directly in nfs_write_end */
-+ kfree(fsdata);
++ if (blocksize < NFSSVC_MAXBLKSIZE)
++ blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++ dprintk("%s: return %d\n", __func__, blocksize);
++ return blocksize;
+}
+
-+/*
-+ * Device ID cache. Currently supports one layout type per struct nfs_client.
-+ * Add layout type to the lookup key to expand to support multiple types.
-+ */
-+int
-+pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
-+ void (*free_callback)(struct pnfs_deviceid_node *))
++static enum nfsstat4
++pnfsd_lexp_layout_get(struct inode *inode,
++ struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_layoutget_arg *arg,
++ struct nfsd4_pnfs_layoutget_res *res)
+{
-+ struct pnfs_deviceid_cache *c;
++ enum nfsstat4 rc = NFS4_OK;
++ struct pnfs_filelayout_layout *layout = NULL;
++ struct knfsd_fh *fhp = NULL;
+
-+ c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
-+ if (!c)
-+ return -ENOMEM;
-+ spin_lock(&clp->cl_lock);
-+ if (clp->cl_devid_cache != NULL) {
-+ atomic_inc(&clp->cl_devid_cache->dc_ref);
-+ dprintk("%s [kref [%d]]\n", __func__,
-+ atomic_read(&clp->cl_devid_cache->dc_ref));
-+ kfree(c);
-+ } else {
-+ /* kzalloc initializes hlists */
-+ spin_lock_init(&c->dc_lock);
-+ atomic_set(&c->dc_ref, 1);
-+ c->dc_free_callback = free_callback;
-+ clp->cl_devid_cache = c;
-+ dprintk("%s [new]\n", __func__);
-+ }
-+ spin_unlock(&clp->cl_lock);
-+ return 0;
-+}
-+EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
++ dprintk("--> %s: inode=%p\n", __func__, inode);
+
-+/* Must be called with locked c->dc_lock */
-+static struct pnfs_deviceid_node *
-+pnfs_unhash_deviceid(struct pnfs_deviceid_cache *c,
-+ struct nfs4_deviceid *id)
-+{
-+ struct pnfs_deviceid_node *d;
-+ struct hlist_node *n;
-+ long h = nfs4_deviceid_hash(id);
++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++ res->lg_seg.offset = 0;
++ res->lg_seg.length = NFS4_MAX_UINT64;
+
-+ dprintk("%s hash %ld\n", __func__, h);
-+ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
-+ if (!memcmp(&d->de_id, id, sizeof(*id))) {
-+ hlist_del_rcu(&d->de_node);
-+ synchronize_rcu();
-+ return d;
-+ }
++ layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++ if (layout == NULL) {
++ rc = -ENOMEM;
++ goto error;
++ }
+
-+ return NULL;
-+}
++ /* Set file layout response args */
++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++ layout->lg_stripe_type = STRIPE_SPARSE;
++ layout->lg_commit_through_mds = true;
++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++ layout->lg_fh_length = 1;
++ layout->device_id.sbid = arg->lg_sbid;
++ layout->device_id.devid = 1; /*FSFTEMP*/
++ layout->lg_first_stripe_index = 0; /*FSFTEMP*/
++ layout->lg_pattern_offset = 0;
+
-+/*
-+ * Called from pnfs_layoutdriver_type->free_lseg
-+ * last layout segment reference frees deviceid
-+ */
-+void
-+pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
-+ struct pnfs_deviceid_node *devid)
-+{
-+ dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
-+ if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
-+ return;
++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++ if (fhp == NULL) {
++ rc = -ENOMEM;
++ goto error;
++ }
+
-+ pnfs_unhash_deviceid(c, &devid->de_id);
-+ spin_unlock(&c->dc_lock);
++ memcpy(fhp, arg->lg_fh, sizeof(*fhp));
++ pnfs_fh_mark_ds(fhp);
++ layout->lg_fh_list = fhp;
+
-+ c->dc_free_callback(devid);
++ /* Call nfsd to encode layout */
++ rc = filelayout_encode_layout(xdr, layout);
++exit:
++ kfree(layout);
++ kfree(fhp);
++ dprintk("<-- %s: return %d\n", __func__, rc);
++ return rc;
++
++error:
++ res->lg_seg.length = 0;
++ goto exit;
+}
-+EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
+
-+void
-+pnfs_delete_deviceid(struct pnfs_deviceid_cache *c,
-+ struct nfs4_deviceid *id)
++static int
++pnfsd_lexp_layout_commit(struct inode *inode,
++ const struct nfsd4_pnfs_layoutcommit_arg *args,
++ struct nfsd4_pnfs_layoutcommit_res *res)
+{
-+ struct pnfs_deviceid_node *devid;
-+
-+ spin_lock(&c->dc_lock);
-+ devid = pnfs_unhash_deviceid(c, id);
-+ spin_unlock(&c->dc_lock);
++ dprintk("%s: (unimplemented)\n", __func__);
+
-+ dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
-+ if (atomic_dec_and_test(&devid->de_ref))
-+ c->dc_free_callback(devid);
++ return 0;
+}
-+EXPORT_SYMBOL_GPL(pnfs_delete_deviceid);
+
-+/* Find and reference a deviceid */
-+struct pnfs_deviceid_node *
-+pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
++static int
++pnfsd_lexp_layout_return(struct inode *inode,
++ const struct nfsd4_pnfs_layoutreturn_arg *args)
+{
-+ struct pnfs_deviceid_node *d;
-+ struct hlist_node *n;
-+ long hash = nfs4_deviceid_hash(id);
++ dprintk("%s: (unimplemented)\n", __func__);
+
-+ dprintk("--> %s hash %ld\n", __func__, hash);
-+ rcu_read_lock();
-+ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
-+ if (!memcmp(&d->de_id, id, sizeof(*id))) {
-+ if (!atomic_inc_not_zero(&d->de_ref)) {
-+ goto fail;
-+ } else {
-+ rcu_read_unlock();
-+ return d;
-+ }
-+ }
-+ }
-+fail:
-+ rcu_read_unlock();
-+ return NULL;
++ return 0;
+}
-+EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
+
-+/*
-+ * Add a deviceid to the cache.
-+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
-+ */
-+struct pnfs_deviceid_node *
-+pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh,
++ struct pnfs_get_state *p)
+{
-+ struct pnfs_deviceid_node *d;
-+ long hash = nfs4_deviceid_hash(&new->de_id);
-+
-+ dprintk("--> %s hash %ld\n", __func__, hash);
-+ spin_lock(&c->dc_lock);
-+ d = pnfs_find_get_deviceid(c, &new->de_id);
-+ if (d) {
-+ spin_unlock(&c->dc_lock);
-+ dprintk("%s [discard]\n", __func__);
-+ c->dc_free_callback(new);
-+ return d;
-+ }
-+ INIT_HLIST_NODE(&new->de_node);
-+ atomic_set(&new->de_ref, 1);
-+ hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
-+ spin_unlock(&c->dc_lock);
-+ dprintk("%s [new]\n", __func__);
-+ return new;
++ return 0; /* just use the current stateid */
+}
-+EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
++
++static struct pnfs_export_operations pnfsd_lexp_ops = {
++ .layout_type = pnfsd_lexp_layout_type,
++ .get_device_info = pnfsd_lexp_get_device_info,
++ .get_device_iter = pnfsd_lexp_get_device_iter,
++ .layout_get = pnfsd_lexp_layout_get,
++ .layout_commit = pnfsd_lexp_layout_commit,
++ .layout_return = pnfsd_lexp_layout_return,
++ .get_state = pnfsd_lexp_get_state,
++};
+
+void
-+pnfs_put_deviceid_cache(struct nfs_client *clp)
++pnfsd_lexp_init(struct inode *inode)
+{
-+ struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
-+
-+ dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
-+ if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
-+ int i;
-+ /* Verify cache is empty */
-+ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
-+ BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
-+ clp->cl_devid_cache = NULL;
-+ spin_unlock(&clp->cl_lock);
-+ kfree(local);
-+ }
++ dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops);
++ inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops;
+}
-+EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
-diff -up linux-2.6.35.noarch/fs/nfs/pnfs.h.orig linux-2.6.35.noarch/fs/nfs/pnfs.h
---- linux-2.6.35.noarch/fs/nfs/pnfs.h.orig 2010-09-30 12:25:08.422283000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/pnfs.h 2010-09-30 12:25:08.424282000 -0400
-@@ -0,0 +1,601 @@
+diff --git a/fs/nfsd/spnfs_com.c b/fs/nfsd/spnfs_com.c
+new file mode 100644
+index 0000000..1ce9ee6
+--- /dev/null
++++ b/fs/nfsd/spnfs_com.c
+@@ -0,0 +1,535 @@
+/*
-+ * pNFS client data structures.
++ * fs/nfsd/spnfs_com.c
+ *
-+ * Copyright (c) 2002
-+ * The Regents of the University of Michigan
-+ * All Rights Reserved
++ * Communication layer between spNFS kernel and userspace
++ * Based heavily on idmap.c
+ *
-+ * Dean Hildebrand <dhildebz at umich.edu>
++ */
++
++/*
++ * Copyright (c) 2002 The Regents of the University of Michigan.
++ * All rights reserved.
+ *
-+ * Permission is granted to use, copy, create derivative works, and
-+ * redistribute this software and such derivative works for any purpose,
-+ * so long as the name of the University of Michigan is not used in
-+ * any advertising or publicity pertaining to the use or distribution
-+ * of this software without specific, written prior authorization. If
-+ * the above copyright notice or any other identification of the
-+ * University of Michigan is included in any copy of any portion of
-+ * this software, then the disclaimer below must also be included.
++ * Marius Aamodt Eriksen <marius at umich.edu>
+ *
-+ * This software is provided as is, without representation or warranty
-+ * of any kind either express or implied, including without limitation
-+ * the implied warranties of merchantability, fitness for a particular
-+ * purpose, or noninfringement. The Regents of the University of
-+ * Michigan shall not be liable for any damages, including special,
-+ * indirect, incidental, or consequential damages, with respect to any
-+ * claim arising out of or in connection with the use of the software,
-+ * even if it has been or is hereafter advised of the possibility of
-+ * such damages.
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
++#include <linux/namei.h>
++#include <linux/mount.h>
++#include <linux/path.h>
++#include <linux/sunrpc/clnt.h>
++#include <linux/sunrpc/rpc_pipe_fs.h>
++#include <linux/nfsd/debug.h>
+
-+#ifndef FS_NFS_PNFS_H
-+#define FS_NFS_PNFS_H
++#include <linux/nfsd4_spnfs.h>
+
-+struct pnfs_layout_segment {
-+ struct list_head fi_list;
-+ struct pnfs_layout_range range;
-+ struct kref kref;
-+ bool valid;
-+ struct pnfs_layout_hdr *layout;
++#define NFSDDBG_FACILITY NFSDDBG_PROC
++
++static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *,
++ char __user *, size_t);
++static ssize_t spnfs_pipe_downcall(struct file *, const char __user *,
++ size_t);
++static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *);
++
++/*
++ * Pipe operations handed to rpc_mkpipe() in nfsd_spnfs_new(): the three
++ * callbacks are the spnfs_pipe_* functions declared just above.
++ */
++static struct rpc_pipe_ops spnfs_upcall_ops = {
++ .upcall = spnfs_pipe_upcall,
++ .downcall = spnfs_pipe_downcall,
++ .destroy_msg = spnfs_pipe_destroy_msg,
+};
+
-+enum pnfs_try_status {
-+ PNFS_ATTEMPTED = 0,
-+ PNFS_NOT_ATTEMPTED = 1,
-+};
++/*
++ * evil global variables
++ *
++ * global_spnfs: the single spnfs instance; set by nfsd_spnfs_new() and
++ *   reset to NULL by nfsd_spnfs_delete().
++ * spnfs_config: set by config_write() to point at its static buffer.
++ * spnfs_use_layoutsegments / layoutsegment_size: set by layoutseg_write()
++ *   and layoutsegsize_write() respectively.
++ */
++struct spnfs *global_spnfs;
++struct spnfs_config *spnfs_config;
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++int spnfs_use_layoutsegments;
++uint64_t layoutsegment_size;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++
++/*
++ * Used by spnfs_enabled()
++ * Tracks if the subsystem has been initialized at some point. It doesn't
++ * matter if it's not currently initialized.
++ */
++static int spnfs_enabled_at_some_point;
++
++/*
++ * nfsd_spnfs_new - call this to start the ball rolling.
++ *
++ * Looks up the "/nfs" directory of rpc_pipefs, allocates a struct spnfs,
++ * creates the "spnfs" pipe in that directory and publishes the result in
++ * global_spnfs.  Coded as if the global variable were going away some day.
++ *
++ * On success the rpc_pipefs mount reference taken here is kept; it is
++ * dropped again by nfsd_spnfs_delete().
++ *
++ * Returns 0 on success, -EEXIST if an instance already exists, -ENOMEM if
++ * the allocation fails, -EPIPE if the pipe cannot be created, or the error
++ * reported by rpc_get_mount() / vfs_path_lookup().
++ */
++int
++nfsd_spnfs_new(void)
++{
++	struct spnfs *spnfs;
++	struct vfsmount *mnt;
++	struct nameidata nd;
++	int rc;
++
++	if (global_spnfs != NULL)
++		return -EEXIST;
++
++	mnt = rpc_get_mount();
++	if (IS_ERR(mnt))
++		return PTR_ERR(mnt);
++
++	/* FIXME: do not abuse rpc_pipefs/nfs */
++	rc = vfs_path_lookup(mnt->mnt_root, mnt, "/nfs", 0, &nd);
++	if (rc)
++		goto err_mount;
++
++	spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL);
++	if (spnfs == NULL) {
++		rc = -ENOMEM;
++		goto err_path;
++	}
++
++	/*
++	 * The pipe can be opened from userspace as soon as rpc_mkpipe()
++	 * returns, and the downcall handler takes spnfs_plock, so the locks
++	 * and the wait queue have to be ready before the pipe exists.
++	 */
++	mutex_init(&spnfs->spnfs_lock);
++	mutex_init(&spnfs->spnfs_plock);
++	init_waitqueue_head(&spnfs->spnfs_wq);
++
++	spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs,
++					 &spnfs_upcall_ops, 0);
++	if (IS_ERR(spnfs->spnfs_dentry)) {
++		rc = -EPIPE;
++		goto err_free;
++	}
++
++	/* the lookup reference on the parent directory is no longer needed */
++	path_put(&nd.path);
++
++	global_spnfs = spnfs;
++	spnfs_enabled_at_some_point = 1;
++
++	return 0;
++
++err_free:
++	kfree(spnfs);
++err_path:
++	path_put(&nd.path);
++err_mount:
++	rpc_put_mount();
++	return rc;
++}
++
++/*
++ * nfsd_spnfs_delete - undo nfsd_spnfs_new().
++ *
++ * Unlinks the pipe, drops the rpc_pipefs mount reference, clears
++ * global_spnfs and frees the instance.  Does nothing when no instance is
++ * registered.  Again coded as if the global variable were going away.
++ */
++void
++nfsd_spnfs_delete(void)
++{
++	struct spnfs *old = global_spnfs;
++
++	if (old != NULL) {
++		rpc_unlink(old->spnfs_dentry);
++		rpc_put_mount();
++		global_spnfs = NULL;
++		kfree(old);
++	}
++}
+
-+struct pnfs_fsdata {
-+ struct pnfs_layout_segment *lseg;
-+ int bypass_eof;
-+ void *private;
-+};
++/*
++ * RPC pipefs upcall/downcall routines
++ *
++ * spnfs_pipe_upcall looks like it is invoked by the rpc_pipe code to hand
++ * a message we queued elsewhere (see spnfs_upcall()) to the reader of the
++ * pipe.  See nfs_idmap_id for an example of enqueueing.
++ *
++ * Copies at most buflen bytes of the not-yet-copied part of msg to dst.
++ * Returns the number of bytes copied, or -EFAULT (also stored in
++ * msg->errno) when nothing at all could be copied.
++ */
++static ssize_t
++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
++		  char __user *dst, size_t buflen)
++{
++	char *data = (char *)msg->data + msg->copied;
++	size_t mlen = msg->len - msg->copied;
++	unsigned long left;
+
-+#ifdef CONFIG_NFS_V4_1
++	if (mlen > buflen)
++		mlen = buflen;
+
-+#include <linux/nfs_page.h> /* For struct nfs_pageio_descriptor */
++	/*
++	 * copy_to_user() returns the number of bytes it could NOT copy and
++	 * is never negative, so a total failure shows up as left == mlen.
++	 */
++	left = copy_to_user(dst, data, mlen);
++	if (mlen != 0 && left == mlen) {
++		msg->errno = -EFAULT;
++		return -EFAULT;
++	}
++	mlen -= left;
++	msg->copied += mlen;
++	msg->errno = 0;
++	return mlen;
++}
+
-+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
++/*
++ * Downcall handler for the spnfs pipe: userspace writes back a complete
++ * struct spnfs_msg in answer to the pending upcall.
++ *
++ * The status is stored in the shared spnfs_im message; when it indicates
++ * success and the message type matches the pending upcall, the result is
++ * stored as well.  Sleepers on spnfs_wq are woken in every case that
++ * reaches the locked section.
++ *
++ * Returns mlen on success (and on a reported failure status), -ENOSPC if
++ * mlen is not sizeof(struct spnfs_msg), -ENOMEM, -EFAULT, or -EINVAL when
++ * the message type does not match.
++ */
++static ssize_t
++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
++{
++	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
++	struct spnfs *spnfs = (struct spnfs *)rpci->private;
++	struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im;
++	int ret;
+
-+enum {
-+ NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
-+ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
-+ NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */
-+ NFS_LAYOUT_NEED_LCOMMIT, /* LAYOUTCOMMIT needed */
-+};
++	if (mlen != sizeof(struct spnfs_msg))
++		return -ENOSPC;
+
-+enum layoutdriver_policy_flags {
-+ /* Should the pNFS client commit and return the layout upon a setattr */
-+ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
++	im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im_in == NULL)
++		return -ENOMEM;
+
-+ /* Should the NFS req. gather algorithm cross stripe boundaries? */
-+ PNFS_GATHER_ACROSS_STRIPES = 1 << 1,
++	if (copy_from_user(im_in, src, mlen) != 0) {
++		/* do not leak the bounce buffer on a faulting write */
++		ret = -EFAULT;
++		goto out_free;
++	}
+
-+ /* Should the full nfs rpc cleanup code be used after io */
-+ PNFS_USE_RPC_CODE = 1 << 2,
-+};
++	mutex_lock(&spnfs->spnfs_plock);
+
-+/* Per-layout driver specific registration structure */
-+struct pnfs_layoutdriver_type {
-+ struct list_head pnfs_tblid;
-+ const u32 id;
-+ const char *name;
-+ struct module *owner;
-+ unsigned flags;
-+ int (*initialize_mountpoint) (struct nfs_server *, const struct nfs_fh *);
-+ int (*uninitialize_mountpoint) (struct nfs_server *);
++	ret = mlen;
++	im->im_status = im_in->im_status;
++	/* If we got an error, terminate now, and wake up pending upcalls */
++	if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) {
++		wake_up(&spnfs->spnfs_wq);
++		goto out;
++	}
+
-+ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode);
-+ void (*free_layout_hdr) (struct pnfs_layout_hdr *);
++	ret = -EINVAL;
++	/* Did we match the current upcall? */
++	/* DMXXX: do not understand the comment above, from original code */
++	/* DMXXX: when do we _not_ match the current upcall? */
++	/* DMXXX: anyway, let's do a simplistic check */
++	if (im_in->im_type == im->im_type) {
++		/* copy the response into the spnfs struct */
++		memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res));
++		ret = mlen;
++	} else
++		dprintk("spnfs: downcall type != upcall type\n");
+
-+ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
-+ void (*free_lseg) (struct pnfs_layout_segment *lseg);
+
-+ /* The stripe size of the file system */
-+ ssize_t (*get_stripesize) (struct pnfs_layout_hdr *layoutid);
++	wake_up(&spnfs->spnfs_wq);
++/* DMXXX handle rval processing */
++out:
++	mutex_unlock(&spnfs->spnfs_plock);
++out_free:
++	kfree(im_in);
++	return ret;
++}
+
-+ /* test for nfs page cache coalescing */
-+ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
++/*
++ * destroy_msg callback for the spnfs pipe.
++ *
++ * Nothing to do unless msg->errno is negative.  In that case the shared
++ * message is marked SPNFS_STATUS_FAIL and sleepers on spnfs_wq are woken,
++ * both under spnfs_plock.
++ */
++static void
++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg)
++{
++	struct spnfs_msg *failed = msg->data;
++	struct spnfs *owner = container_of(failed, struct spnfs, spnfs_im);
+
-+ /* Retreive the block size of the file system.
-+ * If gather_across_stripes == 1, then the file system will gather
-+ * requests into the block size.
-+ * TODO: Where will the layout driver get this info? It is hard
-+ * coded in PVFS2.
-+ */
-+ ssize_t (*get_blocksize) (void);
++	if (msg->errno < 0) {
++		mutex_lock(&owner->spnfs_plock);
++		failed->im_status = SPNFS_STATUS_FAIL; /* DMXXX */
++		wake_up(&owner->spnfs_wq);
++		mutex_unlock(&owner->spnfs_plock);
++	}
++}
+
-+/* read and write pagelist should return just 0 (to indicate that
-+ * the layout code has taken control) or 1 (to indicate that the
-+ * layout code wishes to fall back to normal nfs.) If 0 is returned,
-+ * information can be passed back through nfs_data->res and
-+ * nfs_data->task.tk_status, and the appropriate pnfs done function
-+ * MUST be called.
-+ */
-+ enum pnfs_try_status
-+ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages);
-+ enum pnfs_try_status
-+ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how);
-+ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page,
-+ loff_t pos, unsigned count,
-+ struct pnfs_fsdata *fsdata);
-+ int (*write_end)(struct inode *inode, struct page *page, loff_t pos,
-+ unsigned count, unsigned copied,
-+ struct pnfs_layout_segment *lseg);
-+ void (*write_end_cleanup)(struct file *filp,
-+ struct pnfs_fsdata *fsdata);
++/*
++ * Generic upcall; called by functions in spnfs_ops.c.
++ *
++ * Copies *upmsg into the shared spnfs->spnfs_im buffer, queues that buffer
++ * on the spnfs pipe and sleeps uninterruptibly on spnfs_wq until woken.
++ * If im_status then has SPNFS_STATUS_SUCCESS set, im_res is copied to *res.
++ *
++ * spnfs_lock is held for the whole call; spnfs_plock is dropped only
++ * around schedule().
++ *
++ * Returns 0 on success and -EIO otherwise (the upcall could not be queued
++ * or the status did not indicate success).
++ */
++int
++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg,
++ union spnfs_msg_res *res)
++{
++ struct rpc_pipe_msg msg;
++ struct spnfs_msg *im;
++ DECLARE_WAITQUEUE(wq, current);
++ int ret = -EIO;
++ int rval;
+
-+ /* Consistency ops */
-+ /* 2 problems:
-+ * 1) the page list contains nfs_pages, NOT pages
-+ * 2) currently the NFS code doesn't create a page array (as it does with read/write)
-+ */
-+ enum pnfs_try_status
-+ (*commit) (struct nfs_write_data *nfs_data, int how);
++ im = &spnfs->spnfs_im;
+
-+ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
-+ struct nfs4_layoutcommit_args *args);
++ mutex_lock(&spnfs->spnfs_lock);
++ mutex_lock(&spnfs->spnfs_plock);
+
-+ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
-+ struct xdr_stream *xdr,
-+ const struct nfs4_layoutcommit_args *args);
++ /* start from a clean shared buffer, then install the request */
++ memset(im, 0, sizeof(*im));
++ memcpy(im, upmsg, sizeof(*upmsg));
+
-+ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
-+ struct nfs4_layoutcommit_args *args,
-+ int status);
++ memset(&msg, 0, sizeof(msg));
++ msg.data = im;
++ msg.len = sizeof(*im);
+
-+ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
-+ struct xdr_stream *xdr,
-+ const struct nfs4_layoutreturn_args *args);
-+};
++ /* get on the wait queue before the message becomes visible */
++ add_wait_queue(&spnfs->spnfs_wq, &wq);
++ rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg);
++ if (rval < 0) {
++ remove_wait_queue(&spnfs->spnfs_wq, &wq);
++ goto out;
++ }
+
-+struct pnfs_layout_hdr {
-+ unsigned long refcount;
-+ struct list_head layouts; /* other client layouts */
-+ struct list_head segs; /* layout segments list */
-+ int roc_iomode;/* return on close iomode, 0=none */
-+ seqlock_t seqlock; /* Protects the stateid */
-+ nfs4_stateid stateid;
-+ unsigned long state;
-+ struct rpc_cred *cred; /* layoutcommit credential */
-+ /* DH: These vars keep track of the maximum write range
-+ * so the values can be used for layoutcommit.
-+ */
-+ loff_t write_begin_pos;
-+ loff_t write_end_pos;
-+ struct inode *inode;
-+};
++ /*
++ * Mark ourselves sleeping before dropping spnfs_plock: the wake_up()
++ * callers in this file take spnfs_plock first.
++ */
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ mutex_unlock(&spnfs->spnfs_plock);
++ schedule();
++ /* NOTE(review): direct assignment; __set_current_state() is the usual helper - confirm */
++ current->state = TASK_RUNNING;
++ remove_wait_queue(&spnfs->spnfs_wq, &wq);
++ mutex_lock(&spnfs->spnfs_plock);
+
-+struct pnfs_device {
-+ struct nfs4_deviceid dev_id;
-+ unsigned int layout_type;
-+ unsigned int mincount;
-+ struct page **pages;
-+ void *area;
-+ unsigned int pgbase;
-+ unsigned int pglen;
-+};
++ if (im->im_status & SPNFS_STATUS_SUCCESS) {
++ /* copy our result from the upcall */
++ memcpy(res, &im->im_res, sizeof(*res));
++ ret = 0;
++ }
+
-+#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
++out:
++ /* wipe the shared message buffer before releasing the locks */
++ memset(im, 0, sizeof(*im));
++ mutex_unlock(&spnfs->spnfs_plock);
++ mutex_unlock(&spnfs->spnfs_lock);
++ return(ret);
++}
+
-+struct pnfs_devicelist {
-+ unsigned int eof;
-+ unsigned int num_devs;
-+ struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
-+};
++/*
++ * spnfs_enabled - has spNFS been initialised at least once?
++ *
++ * Used to determine whether the spnfsd daemon has been started at least
++ * once since the system came up; the export mechanism uses this to decide
++ * if spnfs is in use.
++ *
++ * Returns non-zero if the communication pipe has been initialised at
++ * least once, even if it is not initialised right now.
++ */
++int
++spnfs_enabled(void)
++{
++	int ever_enabled = spnfs_enabled_at_some_point;
++
++	return ever_enabled;
++}
++
++#ifdef CONFIG_PROC_FS
+
+/*
-+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
++ * procfs virtual files for user/kernel space communication:
++ *
++ * ctl - currently just an on/off switch...can be expanded
++ * getfh - fd to fh conversion
++ * recall - recall a layout from the command line, for example:
++ * echo <path> > /proc/fs/spnfs/recall
++ * config - configuration info, e.g., stripe size, num ds, etc.
+ */
-+#define NFS4_DEVICE_ID_HASH_BITS 5
-+#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
-+#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
+
-+static inline u32
-+nfs4_deviceid_hash(struct nfs4_deviceid *id)
++/*-------------- start ctl -------------------------*/
++/*
++ * Write handler for the ctl file.  The payload is a binary int: non-zero
++ * calls nfsd_spnfs_new(), zero calls nfsd_spnfs_delete().
++ *
++ * Returns count on success, -EINVAL if fewer than sizeof(int) bytes were
++ * written, -EFAULT if the user buffer cannot be read, or the error from
++ * nfsd_spnfs_new().
++ */
++static ssize_t ctl_write(struct file *file, const char __user *buf,
++			 size_t count, loff_t *offset)
+{
-+ unsigned char *cptr = (unsigned char *)id->data;
-+ unsigned int nbytes = NFS4_DEVICEID4_SIZE;
-+ u32 x = 0;
++	int cmd, rc;
+
-+ while (nbytes--) {
-+ x *= 37;
-+ x += *cptr++;
-+ }
-+ return x & NFS4_DEVICE_ID_HASH_MASK;
-+}
++	/* never read more from userspace than the caller actually wrote */
++	if (count < sizeof(cmd))
++		return -EINVAL;
++	if (copy_from_user(&cmd, buf, sizeof(cmd)))
++		return -EFAULT;
++	if (cmd) {
++		rc = nfsd_spnfs_new();
++		if (rc != 0)
++			return rc;
++	} else
++		nfsd_spnfs_delete();
+
-+struct pnfs_deviceid_node {
-+ struct hlist_node de_node;
-+ struct nfs4_deviceid de_id;
-+ atomic_t de_ref;
-+};
++	return count;
++}
+
-+struct pnfs_deviceid_cache {
-+ spinlock_t dc_lock;
-+ atomic_t dc_ref;
-+ void (*dc_free_callback)(struct pnfs_deviceid_node *);
-+ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
-+ struct hlist_head dc_to_free;
++/* Write-only file operations for the fs/spnfs/ctl proc entry. */
++static const struct file_operations ctl_ops = {
++ .write = ctl_write,
+};
++/*-------------- end ctl ---------------------------*/
+
-+extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
-+ void (*free_callback)(struct pnfs_deviceid_node *));
-+extern void pnfs_put_deviceid_cache(struct nfs_client *);
-+extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
-+ struct pnfs_deviceid_cache *,
-+ struct nfs4_deviceid *);
-+extern struct pnfs_deviceid_node *pnfs_add_deviceid(
-+ struct pnfs_deviceid_cache *,
-+ struct pnfs_deviceid_node *);
-+extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
-+ struct pnfs_deviceid_node *devid);
-+extern void pnfs_delete_deviceid(struct pnfs_deviceid_cache *,
-+ struct nfs4_deviceid *);
++/*-------------- start config -------------------------*/
++/*
++ * Write handler for the config file: copies the written bytes into a
++ * static struct spnfs_config and points spnfs_config at it.
++ *
++ * Returns 0 on success, -EINVAL if more than sizeof(struct spnfs_config)
++ * bytes are written, or -EFAULT if the user buffer cannot be read.
++ */
++static ssize_t config_write(struct file *file, const char __user *buf,
++			    size_t count, loff_t *offset)
++{
++	static struct spnfs_config cfg;
+
-+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
-+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
++	/* cfg is a fixed-size buffer: refuse writes that would overflow it */
++	if (count > sizeof(cfg))
++		return -EINVAL;
++	if (copy_from_user(&cfg, buf, count))
++		return -EFAULT;
+
-+/* nfs4proc.c */
-+extern int nfs4_proc_getdevicelist(struct nfs_server *server,
-+ const struct nfs_fh *fh,
-+ struct pnfs_devicelist *devlist);
-+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
-+ struct pnfs_device *dev);
-+extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
-+extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
-+ int issync);
-+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
++	spnfs_config = &cfg;
++	/* NOTE(review): returns 0 rather than count; left as-is for existing writers */
++	return 0;
++}
+
-+/* pnfs.c */
-+void get_lseg(struct pnfs_layout_segment *lseg);
-+void put_lseg(struct pnfs_layout_segment *lseg);
-+void put_lseg_locked(struct pnfs_layout_segment *lseg);
-+struct pnfs_layout_segment *
-+pnfs_has_layout(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range);
-+struct pnfs_layout_segment *
-+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
-+ loff_t pos, u64 count, enum pnfs_iomode access_type);
-+bool pnfs_return_layout_barrier(struct nfs_inode *, struct pnfs_layout_range *);
-+int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *,
-+ const nfs4_stateid *stateid, /* optional */
-+ enum pnfs_layoutreturn_type, bool wait);
-+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id);
-+void unset_pnfs_layoutdriver(struct nfs_server *);
-+enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
-+ const struct rpc_call_ops *, int);
-+enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
-+ const struct rpc_call_ops *);
-+void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
-+int pnfs_layoutcommit_inode(struct inode *inode, int sync);
-+void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent);
-+void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx);
-+void pnfs_set_ds_iosize(struct nfs_server *server);
-+enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *,
-+ const struct rpc_call_ops *, int);
-+void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
-+ struct nfs_open_context *, struct list_head *,
-+ size_t *);
-+void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *,
-+ size_t *);
-+void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
-+int pnfs_layout_process(struct nfs4_layoutget *lgp);
-+void pnfs_layoutget_release(struct pnfs_layout_hdr *);
-+void pnfs_layoutreturn_release(struct pnfs_layout_hdr *,
-+ struct pnfs_layout_range *range);
-+void pnfs_destroy_layout(struct nfs_inode *);
-+void pnfs_destroy_all_layouts(struct nfs_client *);
-+void put_layout_hdr(struct inode *inode);
-+void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-+ struct nfs4_state *open_state);
-+void pnfs_read_done(struct nfs_read_data *);
-+void pnfs_writeback_done(struct nfs_write_data *);
-+void pnfs_commit_done(struct nfs_write_data *);
-+int _pnfs_write_begin(struct inode *inode, struct page *page,
-+ loff_t pos, unsigned len,
-+ struct pnfs_layout_segment *lseg,
-+ struct pnfs_fsdata **fsdata);
++/* Write-only file operations for the fs/spnfs/config proc entry. */
++static const struct file_operations config_ops = {
++ .write = config_write,
++};
++/*-------------- end config ---------------------------*/
+
-+static inline bool
-+has_layout(struct nfs_inode *nfsi)
++/*-------------- start getfh -----------------------*/
++/*
++ * Open handler for the getfh file: allocates a per-open struct nfs_fh in
++ * file->private_data (released by getfh_release()).
++ *
++ * Returns 0 on success or -ENOMEM.
++ */
++static int getfh_open(struct inode *inode, struct file *file)
+{
-+ return nfsi->layout != NULL;
++	/*
++	 * Zeroed, because getfh_read() may copy this buffer to userspace
++	 * before getfh_write() has filled it in.
++	 */
++	file->private_data = kzalloc(sizeof(struct nfs_fh), GFP_KERNEL);
++	if (file->private_data == NULL)
++		return -ENOMEM;
++
++	return 0;
+}
+
-+static inline int lo_fail_bit(u32 iomode)
++/*
++ * Read handler for the getfh file: copies the per-open struct nfs_fh to
++ * the caller.
++ *
++ * Returns sizeof(struct nfs_fh) on success, -EINVAL if the caller's buffer
++ * is smaller than that, or -EFAULT if the copy fails.
++ */
++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count,
++			  loff_t *offset)
+{
-+ return iomode == IOMODE_RW ?
-+ NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
++	/* the whole handle is copied out, so the buffer must be able to hold it */
++	if (count < sizeof(struct nfs_fh))
++		return -EINVAL;
++	if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh)))
++		return -EFAULT;
++
++	return sizeof(struct nfs_fh);
+}
+
-+static inline void pnfs_invalidate_layout_stateid(struct pnfs_layout_hdr *lo)
++/*
++ * Write handler for the getfh file.  The payload is a binary int file
++ * descriptor, which is passed to spnfs_getfh() together with the per-open
++ * struct nfs_fh buffer.
++ *
++ * Returns count on success, -EINVAL if fewer than sizeof(int) bytes were
++ * written, -EFAULT if the user buffer cannot be read, or -EIO if
++ * spnfs_getfh() fails.
++ */
++static ssize_t getfh_write(struct file *file, const char __user *buf,
++			   size_t count, loff_t *offset)
+{
-+ write_seqlock(&lo->seqlock);
-+ clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
-+ write_sequnlock(&lo->seqlock);
++	int fd;
++
++	/* never read more from userspace than the caller actually wrote */
++	if (count < sizeof(fd))
++		return -EINVAL;
++	if (copy_from_user(&fd, buf, sizeof(fd)))
++		return -EFAULT;
++	if (spnfs_getfh(fd, file->private_data) != 0)
++		return -EIO;
++
++	return count;
+}
+
-+/* Return true if a layout driver is being used for this mountpoint */
-+static inline int pnfs_enabled_sb(struct nfs_server *nfss)
++/*
++ * Release handler for the getfh file: frees the buffer that getfh_open()
++ * stored in file->private_data.  Always returns 0.
++ */
++static int getfh_release(struct inode *inode, struct file *file)
+{
-+ return nfss->pnfs_curr_ld != NULL;
++	void *fh_buf = file->private_data;
++
++	kfree(fh_buf);
++	return 0;
+}
+
-+static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
-+ struct pnfs_fsdata *fsdata)
++/*
++ * File operations for the fs/spnfs/getfh proc entry: open allocates the
++ * per-open buffer, write fills it via spnfs_getfh(), read copies it out
++ * and release frees it.
++ */
++static const struct file_operations getfh_ops = {
++ .open = getfh_open,
++ .read = getfh_read,
++ .write = getfh_write,
++ .release = getfh_release,
++};
++/*-------------- end getfh ------------------------*/
++
++
++/*-------------- start recall layout --------------*/
++/*
++ * Write handler for the recall file.
++ *
++ * Expects one newline-terminated line of at most 128 bytes holding a path
++ * and, optionally, a decimal offset and a decimal length separated by
++ * single spaces.  With only two fields they are taken to be path and
++ * offset.  The parsed values are passed to spnfs_test_layoutrecall().
++ *
++ * Returns count on success, -EINVAL for malformed input, -EFAULT if the
++ * user buffer cannot be read, or the error from spnfs_test_layoutrecall().
++ */
++static ssize_t recall_write(struct file *file, const char __user *buf,
++			    size_t count, loff_t *offset)
+{
-+ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) ||
-+ !fsdata->bypass_eof;
++	char input[128];
++	char *cursor, *path, *field, *eol;
++	u64 off = 0, len = 0;
++	int rc;
++
++	if (count > sizeof(input))
++		return -EINVAL;
++	if (copy_from_user(input, buf, count))
++		return -EFAULT;
++
++	/* the request has to be terminated by a newline */
++	eol = memchr(input, '\n', count);
++	if (eol == NULL)
++		return -EINVAL;
++	*eol = '\0';
++
++	/* first field: the path */
++	cursor = input;
++	path = strsep(&cursor, " ");
++	if (path == NULL)
++		return -EINVAL;
++
++	/* optional second field: offset of the segment to recall */
++	field = strsep(&cursor, " ");
++	if (field != NULL && strict_strtoull(field, 10, &off) != 0)
++		return -EINVAL;
++
++	/* optional third field, only looked at when an offset was given */
++	if (field != NULL)
++		field = strsep(&cursor, " ");
++	if (field != NULL && strict_strtoull(field, 10, &len) != 0)
++		return -EINVAL;
++
++	rc = spnfs_test_layoutrecall(path, off, len);
++	if (rc != 0)
++		return rc;
++
++	return count;
+}
+
-+/* Should the pNFS client commit and return the layout upon a setattr */
-+static inline bool
-+pnfs_ld_layoutret_on_setattr(struct inode *inode)
++/* Write-only file operations for the fs/spnfs/recall proc entry. */
++static const struct file_operations recall_ops = {
++ .write = recall_write,
++};
++/*-------------- end recall layout --------------*/
++
++
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++/*-------------- start layoutseg -------------------------*/
++/*
++ * Write handler for the layoutseg file: the first byte written selects
++ * the value of spnfs_use_layoutsegments ('0' clears it, anything else
++ * sets it to 1).
++ *
++ * Returns count on success, -EINVAL for an empty write, or -EFAULT if the
++ * user buffer cannot be read.
++ */
++static ssize_t layoutseg_write(struct file *file, const char __user *buf,
++			       size_t count, loff_t *offset)
+{
-+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
-+ return false;
-+ return NFS_SERVER(inode)->pnfs_curr_ld->flags &
-+ PNFS_LAYOUTRET_ON_SETATTR;
++	char cmd;
++
++	/* an empty write carries no flag byte to read */
++	if (count == 0)
++		return -EINVAL;
++	if (copy_from_user(&cmd, buf, 1))
++		return -EFAULT;
++	spnfs_use_layoutsegments = (cmd != '0');
++
++	return count;
+}
+
-+/* Should the NFS req. gather algorithm cross stripe boundaries? */
-+static inline bool
-+pnfs_ld_gather_across_stripes(struct pnfs_layoutdriver_type *ld)
++/* Write-only file operations for the fs/spnfs/layoutseg proc entry. */
++static const struct file_operations layoutseg_ops = {
++ .write = layoutseg_write,
++};
++/*-------------- end layoutseg ---------------------------*/
++
++/*-------------- start layoutsegsize -------------------------*/
++/*
++ * Write handler for the layoutsegsize file: parses the written text as a
++ * decimal number and stores it in layoutsegment_size.  At most 49 bytes
++ * of the input are looked at.
++ *
++ * Returns count on success or -EFAULT if the user buffer cannot be read.
++ */
++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf,
++				   size_t count, loff_t *offset)
+{
-+ return ld->flags & PNFS_GATHER_ACROSS_STRIPES;
-+}
++	char cmd[50];
++	size_t len = count;
+
-+static inline bool pnfs_use_rpc(struct nfs_server *nfss)
-+{
-+ if (pnfs_enabled_sb(nfss))
-+ return nfss->pnfs_curr_ld->flags & PNFS_USE_RPC_CODE;
++	/* copy only what was written, leaving room for the terminator */
++	if (len > sizeof(cmd) - 1)
++		len = sizeof(cmd) - 1;
++	if (copy_from_user(cmd, buf, len))
++		return -EFAULT;
++	/* simple_strtoull() needs a NUL-terminated string */
++	cmd[len] = '\0';
++	layoutsegment_size = simple_strtoull(cmd, NULL, 10);
+
-+ return true;
++	return count;
++}
+
-+/* Should the pNFS client commit and return the layout on close
-+ */
-+static inline int
-+pnfs_layout_roc_iomode(struct nfs_inode *nfsi)
-+{
-+ return nfsi->layout->roc_iomode;
-+}
++/* Write-only file operations for the fs/spnfs/layoutsegsize proc entry. */
++static const struct file_operations layoutsegsize_ops = {
++ .write = layoutsegsize_write,
++};
++/*-------------- end layoutsegsize ---------------------------*/
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
+
-+static inline int pnfs_write_begin(struct file *filp, struct page *page,
-+ loff_t pos, unsigned len,
-+ struct pnfs_layout_segment *lseg,
-+ void **fsdata)
++/*
++ * spnfs_init_proc - create the fs/spnfs proc directory and its files
++ * (ctl, config, getfh, recall and, with CONFIG_SPNFS_LAYOUTSEGMENTS,
++ * layoutseg and layoutsegsize).
++ *
++ * If any entry cannot be created, the entries created so far are removed
++ * again so that a failed call leaves nothing behind.
++ *
++ * Returns 0 on success or -ENOMEM.
++ */
++int
++spnfs_init_proc(void)
+{
-+ struct inode *inode = filp->f_dentry->d_inode;
-+ struct nfs_server *nfss = NFS_SERVER(inode);
-+ int status = 0;
++	struct proc_dir_entry *entry;
+
-+ *fsdata = lseg;
-+ if (lseg && nfss->pnfs_curr_ld->write_begin)
-+ status = _pnfs_write_begin(inode, page, pos, len, lseg,
-+ (struct pnfs_fsdata **) fsdata);
-+ return status;
-+}
++	entry = proc_mkdir("fs/spnfs", NULL);
++	if (!entry)
++		return -ENOMEM;
+
-+/* CAREFUL - what happens if copied < len??? */
-+static inline int pnfs_write_end(struct file *filp, struct page *page,
-+ loff_t pos, unsigned len, unsigned copied,
-+ struct pnfs_layout_segment *lseg)
-+{
-+ struct inode *inode = filp->f_dentry->d_inode;
-+ struct nfs_server *nfss = NFS_SERVER(inode);
++	entry = create_proc_entry("fs/spnfs/ctl", 0, NULL);
++	if (!entry)
++		goto err_dir;
++	entry->proc_fops = &ctl_ops;
+
-+ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end)
-+ return nfss->pnfs_curr_ld->write_end(inode, page, pos, len,
-+ copied, lseg);
-+ else
-+ return 0;
-+}
++	entry = create_proc_entry("fs/spnfs/config", 0, NULL);
++	if (!entry)
++		goto err_ctl;
++	entry->proc_fops = &config_ops;
+
-+static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
-+{
-+ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
++	entry = create_proc_entry("fs/spnfs/getfh", 0, NULL);
++	if (!entry)
++		goto err_config;
++	entry->proc_fops = &getfh_ops;
+
-+ if (fsdata && nfss->pnfs_curr_ld) {
-+ if (nfss->pnfs_curr_ld->write_end_cleanup)
-+ nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata);
-+ if (nfss->pnfs_curr_ld->write_begin)
-+ pnfs_free_fsdata(fsdata);
-+ }
-+}
++	entry = create_proc_entry("fs/spnfs/recall", 0, NULL);
++	if (!entry)
++		goto err_getfh;
++	entry->proc_fops = &recall_ops;
+
-+static inline int pnfs_return_layout(struct inode *ino,
-+ struct pnfs_layout_range *range,
-+ const nfs4_stateid *stateid, /* optional */
-+ enum pnfs_layoutreturn_type type,
-+ bool wait)
-+{
-+ struct nfs_inode *nfsi = NFS_I(ino);
-+ struct nfs_server *nfss = NFS_SERVER(ino);
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++	entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL);
++	if (!entry)
++		goto err_recall;
++	entry->proc_fops = &layoutseg_ops;
+
-+ if (pnfs_enabled_sb(nfss) &&
-+ (type != RETURN_FILE || has_layout(nfsi)))
-+ return _pnfs_return_layout(ino, range, stateid, type, wait);
++	entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL);
++	if (!entry)
++		goto err_layoutseg;
++	entry->proc_fops = &layoutsegsize_ops;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
+
+ return 0;
++
++	/* unwind in reverse order of creation */
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++err_layoutseg:
++	remove_proc_entry("fs/spnfs/layoutseg", NULL);
++err_recall:
++	remove_proc_entry("fs/spnfs/recall", NULL);
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++err_getfh:
++	remove_proc_entry("fs/spnfs/getfh", NULL);
++err_config:
++	remove_proc_entry("fs/spnfs/config", NULL);
++err_ctl:
++	remove_proc_entry("fs/spnfs/ctl", NULL);
++err_dir:
++	remove_proc_entry("fs/spnfs", NULL);
++	return -ENOMEM;
+}
++#endif /* CONFIG_PROC_FS */
+diff --git a/fs/nfsd/spnfs_ops.c b/fs/nfsd/spnfs_ops.c
+new file mode 100644
+index 0000000..b97a5af
+--- /dev/null
++++ b/fs/nfsd/spnfs_ops.c
+@@ -0,0 +1,878 @@
++/*
++ * fs/nfsd/spnfs_ops.c
++ *
++ * Communication layer between spNFS kernel and userspace
++ *
++ */
++/******************************************************************************
+
-+static inline bool
-+layoutcommit_needed(struct nfs_inode *nfsi)
-+{
-+ return has_layout(nfsi) &&
-+ test_bit(NFS_LAYOUT_NEED_LCOMMIT, &nfsi->layout->state);
-+}
++(c) 2007 Network Appliance, Inc. All Rights Reserved.
+
-+static inline int pnfs_get_write_status(struct nfs_write_data *data)
-+{
-+ return data->pdata.pnfs_error;
-+}
++Network Appliance provides this source code under the GPL v2 License.
++The GPL v2 license is available at
++http://opensource.org/licenses/gpl-license.php.
+
-+static inline int pnfs_get_read_status(struct nfs_read_data *data)
-+{
-+ return data->pdata.pnfs_error;
-+}
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
-+static inline struct pnfs_layout_segment *
-+nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
-+{
-+ if (fsdata) {
-+ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
++******************************************************************************/
+
-+ if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin)
-+ return ((struct pnfs_fsdata *) fsdata)->lseg;
-+ return (struct pnfs_layout_segment *)fsdata;
-+ }
-+ return NULL;
-+}
++#include <linux/sched.h>
++#include <linux/file.h>
++#include <linux/namei.h>
++#include <linux/nfs_fs.h>
++#include <linux/nfsd4_spnfs.h>
++#include <linux/nfsd/debug.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
++#include <linux/nfsd/nfs4layoutxdr.h>
+
-+#else /* CONFIG_NFS_V4_1 */
++#include "pnfsd.h"
+
-+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
-+{
-+}
++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */
++/* #define CONFIG_SPNFS_TEST 1 */
+
-+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
-+{
-+}
++#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
-+static inline void get_lseg(struct pnfs_layout_segment *lseg)
-+{
-+}
++/*
++ * The functions that are called from elsewhere in the kernel
++ * to perform tasks in userspace
++ *
++ */
+
-+static inline void put_lseg(struct pnfs_layout_segment *lseg)
-+{
-+}
++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS
++extern int spnfs_use_layoutsegments;
++extern uint64_t layoutsegment_size;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
++extern struct spnfs *global_spnfs;
+
-+static inline void put_lseg_locked(struct pnfs_layout_segment *lseg)
++int
++spnfs_layout_type(struct super_block *sb)
+{
++ return LAYOUT_NFSV4_1_FILES;
+}
+
-+static inline struct pnfs_layout_segment *
-+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
-+ loff_t pos, u64 count, enum pnfs_iomode access_type)
++enum nfsstat4
++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_layoutget_arg *lg_arg,
++ struct nfsd4_pnfs_layoutget_res *lg_res)
+{
-+ return NULL;
-+}
++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++ struct spnfs_msg *im = NULL;
++ union spnfs_msg_res *res = NULL;
++ struct pnfs_filelayout_layout *flp = NULL;
++ int status, i;
++ enum nfsstat4 nfserr;
+
-+static inline bool
-+has_layout(struct nfs_inode *nfsi)
-+{
-+ return false;
-+}
++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++ if (im == NULL) {
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ goto layoutget_cleanup;
++ }
+
-+static inline bool
-+layoutcommit_needed(struct nfs_inode *nfsi)
-+{
-+ return 0;
-+}
++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++ if (res == NULL) {
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ goto layoutget_cleanup;
++ }
+
-+static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
-+ struct pnfs_fsdata *fsdata)
-+{
-+ return 1;
-+}
++ im->im_type = SPNFS_TYPE_LAYOUTGET;
++ im->im_args.layoutget_args.inode = inode->i_ino;
++ im->im_args.layoutget_args.generation = inode->i_generation;
++
++ /* call function to queue the msg for upcall */
++ if (spnfs_upcall(spnfs, im, res) != 0) {
++ dprintk("failed spnfs upcall: layoutget\n");
++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++ goto layoutget_cleanup;
++ }
++ status = res->layoutget_res.status;
++ if (status != 0) {
++ /* FIXME? until user mode is fixed, translate system error */
++ switch (status) {
++ case -E2BIG:
++ case -ETOOSMALL:
++ nfserr = NFS4ERR_TOOSMALL;
++ break;
++ case -ENOMEM:
++ case -EAGAIN:
++ case -EINTR:
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ break;
++ case -ENOENT:
++ nfserr = NFS4ERR_BADLAYOUT;
++ break;
++ default:
++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
++ }
++ dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n",
++ status, nfserr);
++ goto layoutget_cleanup;
++ }
+
-+static inline enum pnfs_try_status
-+pnfs_try_to_read_data(struct nfs_read_data *data,
-+ const struct rpc_call_ops *call_ops)
-+{
-+ return PNFS_NOT_ATTEMPTED;
-+}
++ lg_res->lg_return_on_close = 0;
++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
++ /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */
++ /* the amount requested by the client. */
++ if (spnfs_use_layoutsegments) {
++ if (layoutsegment_size != 0)
++ lg_res->lg_seg.length = layoutsegment_size;
++ } else
++ lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#else
++ lg_res->lg_seg.length = NFS4_MAX_UINT64;
++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
+
-+static inline enum pnfs_try_status
-+pnfs_try_to_write_data(struct nfs_write_data *data,
-+ const struct rpc_call_ops *call_ops, int how)
-+{
-+ return PNFS_NOT_ATTEMPTED;
-+}
++ flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL);
++ if (flp == NULL) {
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ goto layoutget_cleanup;
++ }
++ flp->device_id.sbid = lg_arg->lg_sbid;
++ flp->device_id.devid = res->layoutget_res.devid;
++ flp->lg_layout_type = 1; /* XXX */
++ flp->lg_stripe_type = res->layoutget_res.stripe_type;
++ flp->lg_commit_through_mds = 0;
++ flp->lg_stripe_unit = res->layoutget_res.stripe_size;
++ flp->lg_first_stripe_index = 0;
++ flp->lg_pattern_offset = 0;
++ flp->lg_fh_length = res->layoutget_res.stripe_count;
+
-+static inline enum pnfs_try_status
-+pnfs_try_to_commit(struct nfs_write_data *data,
-+ const struct rpc_call_ops *call_ops, int how)
-+{
-+ return PNFS_NOT_ATTEMPTED;
-+}
++ flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh),
++ GFP_KERNEL);
++ if (flp->lg_fh_list == NULL) {
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ goto layoutget_cleanup;
++ }
++ /*
++ * FIX: Doing an extra copy here. Should group res.flist's fh_len
++ * and fh_val into a knfsd_fh structure.
++ */
++ for (i = 0; i < flp->lg_fh_length; i++) {
++ flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len;
++ memcpy(&flp->lg_fh_list[i].fh_base,
++ res->layoutget_res.flist[i].fh_val,
++ res->layoutget_res.flist[i].fh_len);
++ }
+
-+static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync)
-+{
-+ return 0;
-+}
++ /* encode the layoutget body */
++ nfserr = filelayout_encode_layout(xdr, flp);
+
-+static inline bool
-+pnfs_ld_layoutret_on_setattr(struct inode *inode)
-+{
-+ return false;
-+}
++layoutget_cleanup:
++ if (flp) {
++ if (flp->lg_fh_list)
++ kfree(flp->lg_fh_list);
++ kfree(flp);
++ }
++ kfree(im);
++ kfree(res);
+
-+static inline bool pnfs_use_rpc(struct nfs_server *nfss)
-+{
-+ return true;
++ return nfserr;
+}
+
-+static inline int
-+pnfs_layout_roc_iomode(struct nfs_inode *nfsi)
++int
++spnfs_layoutcommit(void)
+{
+ return 0;
+}
+
-+static inline int pnfs_return_layout(struct inode *ino,
-+ struct pnfs_layout_range *range,
-+ const nfs4_stateid *stateid, /* optional */
-+ enum pnfs_layoutreturn_type type,
-+ bool wait)
++int
++spnfs_layoutreturn(struct inode *inode,
++ const struct nfsd4_pnfs_layoutreturn_arg *args)
+{
+ return 0;
+}
+
-+static inline void set_pnfs_layoutdriver(struct nfs_server *s, const struct nfs_fh *mntfh, u32 id)
-+{
-+}
-+
-+static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
-+{
-+}
-+
-+static inline void pnfs_set_ds_iosize(struct nfs_server *server)
++int
++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len)
+{
-+ server->ds_wsize = server->ds_rsize = -1;
-+}
++ struct super_block *sb;
++ struct nfsd4_pnfs_cb_layout lr;
+
-+static inline int pnfs_write_begin(struct file *filp, struct page *page,
-+ loff_t pos, unsigned len,
-+ struct pnfs_layout_segment *lseg,
-+ void **fsdata)
-+{
-+ *fsdata = NULL;
-+ return 0;
-+}
++ switch (type) {
++ case RETURN_FILE:
++ sb = inode->i_sb;
++ dprintk("%s: recalling layout for ino = %lu\n",
++ __func__, inode->i_ino);
++ break;
++ case RETURN_FSID:
++ sb = inode->i_sb;
++ dprintk("%s: recalling layout for fsid x (unimplemented)\n",
++ __func__);
++ return 0;
++ case RETURN_ALL:
++ /* XXX figure out how to get a sb since there's no inode ptr */
++ dprintk("%s: recalling all layouts (unimplemented)\n",
++ __func__);
++ return 0;
++ default:
++ return -EINVAL;
++ }
+
-+static inline int pnfs_write_end(struct file *filp, struct page *page,
-+ loff_t pos, unsigned len, unsigned copied,
-+ struct pnfs_layout_segment *lseg)
-+{
-+ return 0;
-+}
++ lr.cbl_recall_type = type;
++ lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++ lr.cbl_seg.clientid = 0;
++ lr.cbl_seg.offset = offset;
++ lr.cbl_seg.length = len;
++ lr.cbl_seg.iomode = IOMODE_ANY;
++ lr.cbl_layoutchanged = 0;
+
-+static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
-+{
-+}
++ nfsd_layout_recall_cb(sb, inode, &lr);
+
-+static inline int pnfs_get_write_status(struct nfs_write_data *data)
-+{
+ return 0;
+}
+
-+static inline int pnfs_get_read_status(struct nfs_read_data *data)
-+{
-+ return 0;
-+}
+
-+static inline struct pnfs_layout_segment *
-+nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
++int
++spnfs_test_layoutrecall(char *path, u64 offset, u64 len)
+{
-+ return NULL;
-+}
++ struct nameidata nd;
++ struct inode *inode;
++ int type, rc;
+
-+#endif /* CONFIG_NFS_V4_1 */
++ dprintk("%s: path=%s, offset=%llu, len=%llu\n",
++ __func__, path, offset, len);
+
-+#endif /* FS_NFS_PNFS_H */
-diff -up linux-2.6.35.noarch/fs/nfs/read.c.orig linux-2.6.35.noarch/fs/nfs/read.c
---- linux-2.6.35.noarch/fs/nfs/read.c.orig 2010-09-30 12:22:45.207044000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/read.c 2010-09-30 12:25:08.432285000 -0400
-@@ -18,13 +18,18 @@
- #include <linux/sunrpc/clnt.h>
- #include <linux/nfs_fs.h>
- #include <linux/nfs_page.h>
-+#include <linux/smp_lock.h>
-+#include <linux/module.h>
-
- #include <asm/system.h>
-+#include <linux/module.h>
-+#include "pnfs.h"
-
- #include "nfs4_fs.h"
- #include "internal.h"
- #include "iostat.h"
- #include "fscache.h"
-+#include "pnfs.h"
-
- #define NFSDBG_FACILITY NFSDBG_PAGECACHE
-
-@@ -117,11 +122,16 @@ int nfs_readpage_async(struct nfs_open_c
- LIST_HEAD(one_request);
- struct nfs_page *new;
- unsigned int len;
-+ loff_t pgoffs;
-+ struct pnfs_layout_segment *lseg;
-
- len = nfs_page_length(page);
- if (len == 0)
- return nfs_return_empty_page(page);
-- new = nfs_create_request(ctx, inode, page, 0, len);
-+ pgoffs = (loff_t)page->index << PAGE_CACHE_SHIFT;
-+ lseg = pnfs_update_layout(inode, ctx, pgoffs, len, IOMODE_READ);
-+ new = nfs_create_request(ctx, inode, page, 0, len, lseg);
-+ put_lseg(lseg);
- if (IS_ERR(new)) {
- unlock_page(page);
- return PTR_ERR(new);
-@@ -155,24 +165,20 @@ static void nfs_readpage_release(struct
- nfs_release_request(req);
- }
-
--/*
-- * Set up the NFS read request struct
-- */
--static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
-- const struct rpc_call_ops *call_ops,
-- unsigned int count, unsigned int offset)
-+int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops)
- {
-- struct inode *inode = req->wb_context->path.dentry->d_inode;
-+ struct inode *inode = data->inode;
- int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_argp = &data->args,
- .rpc_resp = &data->res,
-- .rpc_cred = req->wb_context->cred,
-+ .rpc_cred = data->cred,
- };
- struct rpc_task_setup task_setup_data = {
- .task = &data->task,
-- .rpc_client = NFS_CLIENT(inode),
-+ .rpc_client = clnt,
- .rpc_message = &msg,
- .callback_ops = call_ops,
- .callback_data = data,
-@@ -180,9 +186,46 @@ static int nfs_read_rpcsetup(struct nfs_
- .flags = RPC_TASK_ASYNC | swap_flags,
- };
-
-+ /* Set up the initial task struct. */
-+ NFS_PROTO(inode)->read_setup(data, &msg);
++ if (strcmp(path, "all") == 0) {
++ inode = NULL;
++ type = RETURN_ALL;
++ } else {
++ rc = path_lookup(path, 0, &nd);
++ if (rc != 0)
++ return -ENOENT;
+
-+ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-+ data->task.tk_pid,
-+ inode->i_sb->s_id,
-+ (long long)NFS_FILEID(inode),
-+ data->args.count,
-+ (unsigned long long)data->args.offset);
++ /*
++ * XXX todo: add a RETURN_FSID scenario here...maybe if
++ * inode is a dir...
++ */
+
-+ task = rpc_run_task(&task_setup_data);
-+ if (IS_ERR(task))
-+ return PTR_ERR(task);
-+ rpc_put_task(task);
-+ return 0;
-+}
-+EXPORT_SYMBOL(nfs_initiate_read);
++ inode = nd.path.dentry->d_inode;
++ type = RETURN_FILE;
++ }
+
-+int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops)
-+{
-+ if (data->req->wb_lseg &&
-+ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
-+ return pnfs_get_read_status(data);
++ if (len == 0)
++ len = NFS4_MAX_UINT64;
+
-+ return nfs_initiate_read(data, clnt, call_ops);
++ rc = spnfs_layoutrecall(inode, type, offset, len);
++
++ if (type != RETURN_ALL)
++ path_put(&nd.path);
++ return rc;
+}
+
-+/*
-+ * Set up the NFS read request struct
-+ */
-+static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
-+ const struct rpc_call_ops *call_ops,
-+ unsigned int count, unsigned int offset)
++int
++spnfs_getdeviceiter(struct super_block *sb,
++ u32 layout_type,
++ struct nfsd4_pnfs_dev_iter_res *gd_res)
+{
-+ struct inode *inode = req->wb_context->path.dentry->d_inode;
++ struct spnfs *spnfs = global_spnfs; /* XXX keep up the pretence */
++ struct spnfs_msg *im = NULL;
++ union spnfs_msg_res *res = NULL;
++ int status = 0;
+
- data->req = req;
- data->inode = inode;
-- data->cred = msg.rpc_cred;
-+ data->cred = req->wb_context->cred;
-
- data->args.fh = NFS_FH(inode);
- data->args.offset = req_offset(req) + offset;
-@@ -197,21 +240,7 @@ static int nfs_read_rpcsetup(struct nfs_
- data->res.eof = 0;
- nfs_fattr_init(&data->fattr);
-
-- /* Set up the initial task struct. */
-- NFS_PROTO(inode)->read_setup(data, &msg);
--
-- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-- data->task.tk_pid,
-- inode->i_sb->s_id,
-- (long long)NFS_FILEID(inode),
-- count,
-- (unsigned long long)data->args.offset);
--
-- task = rpc_run_task(&task_setup_data);
-- if (IS_ERR(task))
-- return PTR_ERR(task);
-- rpc_put_task(task);
-- return 0;
-+ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
- }
-
- static void
-@@ -355,7 +384,14 @@ static void nfs_readpage_retry(struct rp
- {
- struct nfs_readargs *argp = &data->args;
- struct nfs_readres *resp = &data->res;
-+ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client;
-
-+#ifdef CONFIG_NFS_V4_1
-+ if (data->fldata.ds_nfs_client) {
-+ dprintk("%s DS read\n", __func__);
-+ clp = data->fldata.ds_nfs_client;
-+ }
-+#endif /* CONFIG_NFS_V4_1 */
- if (resp->eof || resp->count == argp->count)
- return;
-
-@@ -369,7 +405,10 @@ static void nfs_readpage_retry(struct rp
- argp->offset += resp->count;
- argp->pgbase += resp->count;
- argp->count -= resp->count;
-- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client);
-+#ifdef CONFIG_NFS_V4_1
-+ data->pdata.pnfs_error = -EAGAIN;
-+#endif /* CONFIG_NFS_V4_1 */
-+ nfs_restart_rpc(task, clp);
- }
-
- /*
-@@ -410,13 +449,19 @@ static void nfs_readpage_release_partial
- void nfs_read_prepare(struct rpc_task *task, void *calldata)
- {
- struct nfs_read_data *data = calldata;
-+ struct nfs4_session *ds_session = NULL;
-
-- if (nfs4_setup_sequence(NFS_SERVER(data->inode),
-+ if (data->fldata.ds_nfs_client) {
-+ dprintk("%s DS read\n", __func__);
-+ ds_session = data->fldata.ds_nfs_client->cl_session;
++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++ if (im == NULL) {
++ status = -ENOMEM;
++ goto getdeviceiter_out;
+ }
-+ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
- &data->args.seq_args, &data->res.seq_res,
- 0, task))
- return;
- rpc_call_start(task);
- }
-+EXPORT_SYMBOL(nfs_read_prepare);
- #endif /* CONFIG_NFS_V4_1 */
-
- static const struct rpc_call_ops nfs_read_partial_ops = {
-@@ -561,15 +606,28 @@ readpage_async_filler(void *data, struct
- {
- struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
- struct inode *inode = page->mapping->host;
-+ struct pnfs_layout_range *range;
- struct nfs_page *new;
- unsigned int len;
-+ loff_t pgoff;
- int error;
-
- len = nfs_page_length(page);
- if (len == 0)
- return nfs_return_empty_page(page);
-
-- new = nfs_create_request(desc->ctx, inode, page, 0, len);
-+ pgoff = (loff_t)page->index << PAGE_CACHE_SHIFT;
-+ range = desc->pgio->pg_lseg ? &desc->pgio->pg_lseg->range : NULL;
-+ if (!range ||
-+ (range->offset > pgoff + len) ||
-+ (range->offset + range->length < pgoff)) {
-+ put_lseg(desc->pgio->pg_lseg);
-+ desc->pgio->pg_lseg = pnfs_update_layout(inode, desc->ctx,
-+ pgoff, len, IOMODE_READ);
++
++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++ if (res == NULL) {
++ status = -ENOMEM;
++ goto getdeviceiter_out;
+ }
+
-+ new = nfs_create_request(desc->ctx, inode, page, 0, len,
-+ desc->pgio->pg_lseg);
- if (IS_ERR(new))
- goto out_error;
-
-@@ -625,6 +683,9 @@ int nfs_readpages(struct file *filp, str
- if (ret == 0)
- goto read_complete; /* all pages were read */
-
-+#ifdef CONFIG_NFS_V4_1
-+ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize);
-+#endif /* CONFIG_NFS_V4_1 */
- if (rsize < PAGE_CACHE_SIZE)
- nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
- else
-@@ -633,6 +694,7 @@ int nfs_readpages(struct file *filp, str
- ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
-
- nfs_pageio_complete(&pgio);
-+ put_lseg(pgio.pg_lseg);
- npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- nfs_add_stats(inode, NFSIOS_READPAGES, npages);
- read_complete:
-diff -up linux-2.6.35.noarch/fs/nfs/super.c.orig linux-2.6.35.noarch/fs/nfs/super.c
---- linux-2.6.35.noarch/fs/nfs/super.c.orig 2010-09-30 12:22:45.213046000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/super.c 2010-09-30 12:25:08.439284000 -0400
-@@ -64,6 +64,7 @@
- #include "iostat.h"
- #include "internal.h"
- #include "fscache.h"
-+#include "pnfs.h"
-
- #define NFSDBG_FACILITY NFSDBG_VFS
-
-@@ -687,6 +688,28 @@ static int nfs_show_options(struct seq_f
-
- return 0;
- }
-+#ifdef CONFIG_NFS_V4_1
-+void show_sessions(struct seq_file *m, struct nfs_server *server)
-+{
-+ if (nfs4_has_session(server->nfs_client))
-+ seq_printf(m, ",sessions");
-+}
-+#else
-+void show_sessions(struct seq_file *m, struct nfs_server *server) {}
-+#endif
++ im->im_type = SPNFS_TYPE_GETDEVICEITER;
++ im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie;
++ im->im_args.getdeviceiter_args.verf = gd_res->gd_verf;
+
-+#ifdef CONFIG_NFS_V4_1
-+void show_pnfs(struct seq_file *m, struct nfs_server *server)
-+{
-+ seq_printf(m, ",pnfs=");
-+ if (server->pnfs_curr_ld)
-+ seq_printf(m, "%s", server->pnfs_curr_ld->name);
-+ else
-+ seq_printf(m, "not configured");
-+}
-+#else /* CONFIG_NFS_V4_1 */
-+void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
-+#endif /* CONFIG_NFS_V4_1 */
-
- /*
- * Present statistical information for this VFS mountpoint
-@@ -725,6 +748,8 @@ static int nfs_show_stats(struct seq_fil
- seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
- seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
- seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
-+ show_sessions(m, nfss);
-+ show_pnfs(m, nfss);
- }
- #endif
-
-diff -up linux-2.6.35.noarch/fs/nfs/unlink.c.orig linux-2.6.35.noarch/fs/nfs/unlink.c
---- linux-2.6.35.noarch/fs/nfs/unlink.c.orig 2010-09-30 12:22:45.218044000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/unlink.c 2010-09-30 12:25:08.445284000 -0400
-@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task
- struct nfs_unlinkdata *data = calldata;
- struct nfs_server *server = NFS_SERVER(data->dir);
-
-- if (nfs4_setup_sequence(server, &data->args.seq_args,
-+ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args,
- &data->res.seq_res, 1, task))
- return;
- rpc_call_start(task);
-diff -up linux-2.6.35.noarch/fs/nfs/write.c.orig linux-2.6.35.noarch/fs/nfs/write.c
---- linux-2.6.35.noarch/fs/nfs/write.c.orig 2010-09-30 12:22:45.233044000 -0400
-+++ linux-2.6.35.noarch/fs/nfs/write.c 2010-09-30 12:25:08.452282000 -0400
-@@ -28,6 +28,7 @@
- #include "iostat.h"
- #include "nfs4_fs.h"
- #include "fscache.h"
-+#include "pnfs.h"
-
- #define NFSDBG_FACILITY NFSDBG_PAGECACHE
-
-@@ -59,6 +60,7 @@ struct nfs_write_data *nfs_commitdata_al
- }
- return p;
- }
-+EXPORT_SYMBOL(nfs_commitdata_alloc);
-
- void nfs_commit_free(struct nfs_write_data *p)
- {
-@@ -429,6 +431,17 @@ static void nfs_inode_remove_request(str
- nfs_clear_request(req);
- nfs_release_request(req);
- }
-+static void
-+nfs_mark_request_nopnfs(struct nfs_page *req)
++ /* call function to queue the msg for upcall */
++ status = spnfs_upcall(spnfs, im, res);
++ if (status != 0) {
++ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++ status = -EIO;
++ goto getdeviceiter_out;
++ }
++ status = res->getdeviceiter_res.status;
++
++ if (res->getdeviceiter_res.eof)
++ gd_res->gd_eof = 1;
++ else {
++ gd_res->gd_devid = res->getdeviceiter_res.devid;
++ gd_res->gd_cookie = res->getdeviceiter_res.cookie;
++ gd_res->gd_verf = res->getdeviceiter_res.verf;
++ gd_res->gd_eof = 0;
++ }
++
++getdeviceiter_out:
++ kfree(im);
++ kfree(res);
++
++ return status;
++}
++
++#ifdef CONFIG_SPNFS_TEST
++/*
++ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the
++ * 1024 encoded stripe indices.
++ *
++ * Skip the devaddr4 length and encode the indicies count (1024) in the
++ * rq_res.head and set the rq_res.head length.
++ *
++ * Set the rq_res page_len to 4096 (for the 1024 stripe indices).
++ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the
++ * rq_res head to hold the rest of the getdeviceinfo return.
++ *
++ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and
++ * rq_respages[rq_resused] contains the rq_res.pages.
++ */
++static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info,
++ const struct pnfs_filelayout_device *fdev)
+{
-+ struct pnfs_layout_segment *lseg = req->wb_lseg;
++ struct nfsd4_compoundres *resp = info->resp;
++ struct svc_rqst *rqstp = resp->rqstp;
++ struct xdr_buf *xb = &resp->rqstp->rq_res;
++ __be32 *p;
+
-+ if (req->wb_lseg == NULL)
-+ return;
-+ req->wb_lseg = NULL;
-+ put_lseg(lseg);
-+ dprintk(" retry through MDS\n");
++ p = nfsd4_xdr_reserve_space(resp, 8);
++ p++; /* Fill in length later */
++ *p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */
++ resp->p = p;
++
++ xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base;
++ xb->pages = &rqstp->rq_respages[rqstp->rq_resused];
++ xb->page_base = 0;
++ xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */
++ xb->tail[0].iov_base = resp->p;
++ resp->end = xb->head[0].iov_base + PAGE_SIZE;
++ xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p;
++ return 0;
+}
-
- static void
- nfs_mark_request_dirty(struct nfs_page *req)
-@@ -534,7 +547,7 @@ nfs_need_commit(struct nfs_inode *nfsi)
- * The requests are *not* checked to ensure that they form a contiguous set.
- */
- static int
--nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
-+nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs)
- {
- struct nfs_inode *nfsi = NFS_I(inode);
- int ret;
-@@ -542,7 +555,8 @@ nfs_scan_commit(struct inode *inode, str
- if (!nfs_need_commit(nfsi))
- return 0;
-
-- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
-+ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT,
-+ use_pnfs);
- if (ret > 0)
- nfsi->ncommit -= ret;
- if (nfs_need_commit(NFS_I(inode)))
-@@ -571,7 +585,8 @@ static inline int nfs_scan_commit(struct
- static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
- struct page *page,
- unsigned int offset,
-- unsigned int bytes)
-+ unsigned int bytes,
-+ struct pnfs_layout_segment *lseg)
- {
- struct nfs_page *req;
- unsigned int rqend;
-@@ -596,8 +611,8 @@ static struct nfs_page *nfs_try_to_updat
- * Note: nfs_flush_incompatible() will already
- * have flushed out requests having wrong owners.
- */
-- if (offset > rqend
-- || end < req->wb_offset)
-+ if (offset > rqend || end < req->wb_offset ||
-+ req->wb_lseg != lseg)
- goto out_flushme;
-
- if (nfs_set_page_tag_locked(req))
-@@ -645,16 +660,17 @@ out_err:
- * already called nfs_flush_incompatible() if necessary.
- */
- static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
-- struct page *page, unsigned int offset, unsigned int bytes)
-+ struct page *page, unsigned int offset, unsigned int bytes,
-+ struct pnfs_layout_segment *lseg)
- {
- struct inode *inode = page->mapping->host;
- struct nfs_page *req;
- int error;
-
-- req = nfs_try_to_update_request(inode, page, offset, bytes);
-+ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg);
- if (req != NULL)
- goto out;
-- req = nfs_create_request(ctx, inode, page, offset, bytes);
-+ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg);
- if (IS_ERR(req))
- goto out;
- error = nfs_inode_add_request(inode, req);
-@@ -667,23 +683,27 @@ out:
- }
-
- static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
-- unsigned int offset, unsigned int count)
-+ unsigned int offset, unsigned int count,
-+ struct pnfs_layout_segment *lseg,
-+ void *fsdata)
- {
- struct nfs_page *req;
-
-- req = nfs_setup_write_request(ctx, page, offset, count);
-+ req = nfs_setup_write_request(ctx, page, offset, count, lseg);
- if (IS_ERR(req))
- return PTR_ERR(req);
- nfs_mark_request_dirty(req);
- /* Update file length */
-- nfs_grow_file(page, offset, count);
-+ if (pnfs_grow_ok(lseg, fsdata))
-+ nfs_grow_file(page, offset, count);
- nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
- nfs_mark_request_dirty(req);
- nfs_clear_page_tag_locked(req);
- return 0;
- }
-
--int nfs_flush_incompatible(struct file *file, struct page *page)
-+int nfs_flush_incompatible(struct file *file, struct page *page,
-+ struct pnfs_layout_segment *lseg)
- {
- struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct nfs_page *req;
-@@ -702,7 +722,8 @@ int nfs_flush_incompatible(struct file *
- return 0;
- do_flush = req->wb_page != page || req->wb_context != ctx ||
- req->wb_lock_context->lockowner != current->files ||
-- req->wb_lock_context->pid != current->tgid;
-+ req->wb_lock_context->pid != current->tgid ||
-+ req->wb_lseg != lseg;
- nfs_release_request(req);
- if (!do_flush)
- return 0;
-@@ -729,7 +750,8 @@ static int nfs_write_pageuptodate(struct
- * things with a page scheduled for an RPC call (e.g. invalidate it).
- */
- int nfs_updatepage(struct file *file, struct page *page,
-- unsigned int offset, unsigned int count)
-+ unsigned int offset, unsigned int count,
-+ struct pnfs_layout_segment *lseg, void *fsdata)
- {
- struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct inode *inode = page->mapping->host;
-@@ -754,7 +776,7 @@ int nfs_updatepage(struct file *file, st
- offset = 0;
- }
-
-- status = nfs_writepage_setup(ctx, page, offset, count);
-+ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata);
- if (status < 0)
- nfs_set_pageerror(page);
-
-@@ -784,25 +806,21 @@ static int flush_task_priority(int how)
- return RPC_PRIORITY_NORMAL;
- }
-
--/*
-- * Set up the argument/result storage required for the RPC call.
-- */
--static int nfs_write_rpcsetup(struct nfs_page *req,
-- struct nfs_write_data *data,
-- const struct rpc_call_ops *call_ops,
-- unsigned int count, unsigned int offset,
-- int how)
-+int nfs_initiate_write(struct nfs_write_data *data,
-+ struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops,
-+ int how)
- {
-- struct inode *inode = req->wb_context->path.dentry->d_inode;
-+ struct inode *inode = data->inode;
- int priority = flush_task_priority(how);
- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_argp = &data->args,
- .rpc_resp = &data->res,
-- .rpc_cred = req->wb_context->cred,
-+ .rpc_cred = data->cred,
- };
- struct rpc_task_setup task_setup_data = {
-- .rpc_client = NFS_CLIENT(inode),
-+ .rpc_client = clnt,
- .task = &data->task,
- .rpc_message = &msg,
- .callback_ops = call_ops,
-@@ -813,12 +831,62 @@ static int nfs_write_rpcsetup(struct nfs
- };
- int ret = 0;
-
-+ /* Set up the initial task struct. */
-+ NFS_PROTO(inode)->write_setup(data, &msg);
++/*
++ * Return a stripeindices of length 1024 to test
++ * the pNFS client multipage getdeviceinfo implementation.
++ *
++ * Encode a page of stripe indices.
++ */
++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev,
++ struct spnfs_device *dev,
++ struct pnfs_devinfo_arg *info)
++{
++ struct svc_rqst *rqstp = info->xdr.resp->rqstp;
++ __be32 *p;
++ int i, j = 0;
+
-+ dprintk("NFS: %5u initiated write call "
-+ "(req %s/%lld, %u bytes @ offset %llu)\n",
-+ data->task.tk_pid,
-+ inode->i_sb->s_id,
-+ (long long)NFS_FILEID(inode),
-+ data->args.count,
-+ (unsigned long long)data->args.offset);
++ p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]);
++ fldev->fl_stripeindices_length = 1024;
++ /* round-robin the data servers device index into the stripe indicie */
++ for (i = 0; i < 1024; i++) {
++ *p++ = cpu_to_be32(j);
++ if (j < dev->dscount - 1)
++ j++;
++ else
++ j = 0;
++ }
++ fldev->fl_stripeindices_list = NULL;
++}
++#endif /* CONFIG_SPNFS_TEST */
+
-+ task = rpc_run_task(&task_setup_data);
-+ if (IS_ERR(task)) {
-+ ret = PTR_ERR(task);
-+ goto out;
++int
++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
++ u32 layout_type,
++ const struct nfsd4_pnfs_deviceid *devid)
++{
++ struct spnfs *spnfs = global_spnfs;
++ struct spnfs_msg *im = NULL;
++ union spnfs_msg_res *res = NULL;
++ struct spnfs_device *dev;
++ struct pnfs_filelayout_device *fldev = NULL;
++ struct pnfs_filelayout_multipath *mp = NULL;
++ struct pnfs_filelayout_devaddr *fldap = NULL;
++ int status = 0, i, len;
++
++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++ if (im == NULL) {
++ status = -ENOMEM;
++ goto getdeviceinfo_out;
+ }
-+ if (how & FLUSH_SYNC) {
-+ ret = rpc_wait_for_completion_task(task);
-+ if (ret == 0)
-+ ret = task->tk_status;
++
++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++ if (res == NULL) {
++ status = -ENOMEM;
++ goto getdeviceinfo_out;
+ }
-+ rpc_put_task(task);
-+out:
-+ return ret;
-+}
-+EXPORT_SYMBOL(nfs_initiate_write);
+
-+int pnfs_initiate_write(struct nfs_write_data *data,
-+ struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops,
-+ int how)
-+{
-+ if (data->req->wb_lseg &&
-+ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
-+ return pnfs_get_write_status(data);
++ im->im_type = SPNFS_TYPE_GETDEVICEINFO;
++ /* XXX FIX: figure out what to do about fsid */
++ im->im_args.getdeviceinfo_args.devid = devid->devid;
+
-+ return nfs_initiate_write(data, clnt, call_ops, how);
-+}
++ /* call function to queue the msg for upcall */
++ status = spnfs_upcall(spnfs, im, res);
++ if (status != 0) {
++ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++ status = -EIO;
++ goto getdeviceinfo_out;
++ }
++ status = res->getdeviceinfo_res.status;
++ if (status != 0)
++ goto getdeviceinfo_out;
+
-+/*
-+ * Set up the argument/result storage required for the RPC call.
-+ */
-+static int nfs_write_rpcsetup(struct nfs_page *req,
-+ struct nfs_write_data *data,
-+ const struct rpc_call_ops *call_ops,
-+ unsigned int count, unsigned int offset,
-+ int how)
-+{
-+ struct inode *inode = req->wb_context->path.dentry->d_inode;
++ dev = &res->getdeviceinfo_res.devinfo;
+
- /* Set up the RPC argument and reply structs
- * NB: take care not to mess about with data->commit et al. */
-
- data->req = req;
- data->inode = inode = req->wb_context->path.dentry->d_inode;
-- data->cred = msg.rpc_cred;
-+ data->cred = req->wb_context->cred;
-
- data->args.fh = NFS_FH(inode);
- data->args.offset = req_offset(req) + offset;
-@@ -839,30 +907,7 @@ static int nfs_write_rpcsetup(struct nfs
- data->res.verf = &data->verf;
- nfs_fattr_init(&data->fattr);
-
-- /* Set up the initial task struct. */
-- NFS_PROTO(inode)->write_setup(data, &msg);
--
-- dprintk("NFS: %5u initiated write call "
-- "(req %s/%lld, %u bytes @ offset %llu)\n",
-- data->task.tk_pid,
-- inode->i_sb->s_id,
-- (long long)NFS_FILEID(inode),
-- count,
-- (unsigned long long)data->args.offset);
--
-- task = rpc_run_task(&task_setup_data);
-- if (IS_ERR(task)) {
-- ret = PTR_ERR(task);
-- goto out;
-- }
-- if (how & FLUSH_SYNC) {
-- ret = rpc_wait_for_completion_task(task);
-- if (ret == 0)
-- ret = task->tk_status;
-- }
-- rpc_put_task(task);
--out:
-- return ret;
-+ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
- }
-
- /* If a nfs_flush_* function fails, it should remove reqs from @head and
-@@ -873,6 +918,7 @@ static void nfs_redirty_request(struct n
- {
- struct page *page = req->wb_page;
-
-+ nfs_mark_request_nopnfs(req);
- nfs_mark_request_dirty(req);
- nfs_clear_page_tag_locked(req);
- nfs_end_page_writeback(page);
-@@ -985,6 +1031,10 @@ static void nfs_pageio_init_write(struct
- {
- size_t wsize = NFS_SERVER(inode)->wsize;
-
-+#ifdef CONFIG_NFS_V4_1
-+ pnfs_pageio_init_write(pgio, inode, &wsize);
-+#endif /* CONFIG_NFS_V4_1 */
++ /* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */
++ fldev = kzalloc(sizeof(struct pnfs_filelayout_device), GFP_KERNEL);
++ if (fldev == NULL) {
++ status = -ENOMEM;
++ goto getdeviceinfo_out;
++ }
+
- if (wsize < PAGE_CACHE_SIZE)
- nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
- else
-@@ -1050,13 +1100,27 @@ out:
- void nfs_write_prepare(struct rpc_task *task, void *calldata)
- {
- struct nfs_write_data *data = calldata;
-+ struct nfs4_session *ds_session = NULL;
++ /*
++ * Stripe count is the same as data server count for our purposes
++ */
++ fldev->fl_stripeindices_length = dev->dscount;
++ fldev->fl_device_length = dev->dscount;
+
-+ if (data->fldata.ds_nfs_client) {
-+ dprintk("%s DS read\n", __func__);
-+ ds_session = data->fldata.ds_nfs_client->cl_session;
-+ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) {
-+ /* retrying via MDS? */
-+ data->pdata.orig_count = data->args.count;
-+ data->args.count = NFS_SERVER(data->inode)->wsize;
-+ dprintk("%s: trimmed count %u to wsize %u\n", __func__,
-+ data->pdata.orig_count, data->args.count);
-+ } else
-+ data->pdata.orig_count = 0;
-
-- if (nfs4_setup_sequence(NFS_SERVER(data->inode),
-+ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
- &data->args.seq_args,
- &data->res.seq_res, 1, task))
- return;
- rpc_call_start(task);
- }
-+EXPORT_SYMBOL(nfs_write_prepare);
- #endif /* CONFIG_NFS_V4_1 */
-
- static const struct rpc_call_ops nfs_write_partial_ops = {
-@@ -1140,10 +1204,11 @@ int nfs_writeback_done(struct rpc_task *
- struct nfs_writeargs *argp = &data->args;
- struct nfs_writeres *resp = &data->res;
- struct nfs_server *server = NFS_SERVER(data->inode);
-+ struct nfs_client *clp = server->nfs_client;
- int status;
-
-- dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
-- task->tk_pid, task->tk_status);
-+ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n",
-+ task->tk_pid, task->tk_status, resp->count);
-
- /*
- * ->write_done will attempt to use post-op attributes to detect
-@@ -1156,6 +1221,13 @@ int nfs_writeback_done(struct rpc_task *
- if (status != 0)
- return status;
- nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
-+#ifdef CONFIG_NFS_V4_1
-+ /* Is this a DS session */
-+ if (data->fldata.ds_nfs_client) {
-+ dprintk("%s DS write\n", __func__);
-+ clp = data->fldata.ds_nfs_client;
++ /* Set stripe indices */
++#ifdef CONFIG_SPNFS_TEST
++ spnfs_set_test_indices(fldev, dev, info);
++ fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr;
++#else /* CONFIG_SPNFS_TEST */
++ fldev->fl_stripeindices_list =
++ kmalloc(fldev->fl_stripeindices_length * sizeof(u32),
++ GFP_KERNEL);
++ if (fldev->fl_stripeindices_list == NULL) {
++ status = -ENOMEM;
++ goto getdeviceinfo_out;
+ }
-+#endif /* CONFIG_NFS_V4_1 */
-
- #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
- if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
-@@ -1172,7 +1244,7 @@ int nfs_writeback_done(struct rpc_task *
- if (time_before(complain, jiffies)) {
- dprintk("NFS: faulty NFS server %s:"
- " (committed = %d) != (stable = %d)\n",
-- server->nfs_client->cl_hostname,
-+ clp->cl_hostname,
- resp->verf->committed, argp->stable);
- complain = jiffies + 300 * HZ;
- }
-@@ -1198,6 +1270,9 @@ int nfs_writeback_done(struct rpc_task *
- */
- argp->stable = NFS_FILE_SYNC;
- }
-+#ifdef CONFIG_NFS_V4_1
-+ data->pdata.pnfs_error = -EAGAIN;
-+#endif /* CONFIG_NFS_V4_1 */
- nfs_restart_rpc(task, server->nfs_client);
- return -EAGAIN;
- }
-@@ -1242,40 +1317,73 @@ static void nfs_commitdata_release(void
- nfs_commit_free(wdata);
- }
-
--/*
-- * Set up the argument/result storage required for the RPC call.
-- */
--static int nfs_commit_rpcsetup(struct list_head *head,
-- struct nfs_write_data *data,
-- int how)
-+int nfs_initiate_commit(struct nfs_write_data *data,
-+ struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops,
-+ int how)
- {
-- struct nfs_page *first = nfs_list_entry(head->next);
-- struct inode *inode = first->wb_context->path.dentry->d_inode;
-+ struct inode *inode = data->inode;
- int priority = flush_task_priority(how);
- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_argp = &data->args,
- .rpc_resp = &data->res,
-- .rpc_cred = first->wb_context->cred,
-+ .rpc_cred = data->cred,
- };
- struct rpc_task_setup task_setup_data = {
- .task = &data->task,
-- .rpc_client = NFS_CLIENT(inode),
-+ .rpc_client = clnt,
- .rpc_message = &msg,
-- .callback_ops = &nfs_commit_ops,
-+ .callback_ops = call_ops,
- .callback_data = data,
- .workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
- .priority = priority,
- };
-
-+ /* Set up the initial task struct. */
-+ NFS_PROTO(inode)->commit_setup(data, &msg);
++ for (i = 0; i < fldev->fl_stripeindices_length; i++)
++ fldev->fl_stripeindices_list[i] = i;
++#endif /* CONFIG_SPNFS_TEST */
++
++ /*
++ * Set the device's data server addresses No multipath for spnfs,
++ * so mp length is always 1.
++ *
++ */
++ fldev->fl_device_list =
++ kmalloc(fldev->fl_device_length *
++ sizeof(struct pnfs_filelayout_multipath),
++ GFP_KERNEL);
++ if (fldev->fl_device_list == NULL) {
++ status = -ENOMEM;
++ goto getdeviceinfo_out;
++ }
++ for (i = 0; i < fldev->fl_device_length; i++) {
++ mp = &fldev->fl_device_list[i];
++ mp->fl_multipath_length = 1;
++ mp->fl_multipath_list =
++ kmalloc(sizeof(struct pnfs_filelayout_devaddr),
++ GFP_KERNEL);
++ if (mp->fl_multipath_list == NULL) {
++ status = -ENOMEM;
++ goto getdeviceinfo_out;
++ }
++ fldap = mp->fl_multipath_list;
+
-+ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
++ /*
++ * Copy the netid into the device address, for example: "tcp"
++ */
++ len = strlen(dev->dslist[i].netid);
++ fldap->r_netid.data = kmalloc(len, GFP_KERNEL);
++ if (fldap->r_netid.data == NULL) {
++ status = -ENOMEM;
++ goto getdeviceinfo_out;
++ }
++ memcpy(fldap->r_netid.data, dev->dslist[i].netid, len);
++ fldap->r_netid.len = len;
+
-+ task = rpc_run_task(&task_setup_data);
-+ if (IS_ERR(task))
-+ return PTR_ERR(task);
-+ rpc_put_task(task);
-+ return 0;
++ /*
++ * Copy the network address into the device address,
++ * for example: "10.35.9.16.08.01"
++ */
++ len = strlen(dev->dslist[i].addr);
++ fldap->r_addr.data = kmalloc(len, GFP_KERNEL);
++ if (fldap->r_addr.data == NULL) {
++ status = -ENOMEM;
++ goto getdeviceinfo_out;
++ }
++ memcpy(fldap->r_addr.data, dev->dslist[i].addr, len);
++ fldap->r_addr.len = len;
++ }
++
++ /* encode the device data */
++ status = filelayout_encode_devinfo(xdr, fldev);
++
++getdeviceinfo_out:
++ if (fldev) {
++ kfree(fldev->fl_stripeindices_list);
++ if (fldev->fl_device_list) {
++ for (i = 0; i < fldev->fl_device_length; i++) {
++ fldap =
++ fldev->fl_device_list[i].fl_multipath_list;
++ kfree(fldap->r_netid.data);
++ kfree(fldap->r_addr.data);
++ kfree(fldap);
++ }
++ kfree(fldev->fl_device_list);
++ }
++ kfree(fldev);
++ }
++
++ kfree(im);
++ kfree(res);
++
++ return status;
+}
-+EXPORT_SYMBOL(nfs_initiate_commit);
+
++int
++spnfs_setattr(void)
++{
++ return 0;
++}
+
-+int pnfs_initiate_commit(struct nfs_write_data *data,
-+ struct rpc_clnt *clnt,
-+ const struct rpc_call_ops *call_ops,
-+ int how, int pnfs)
++int
++spnfs_open(struct inode *inode, struct nfsd4_open *open)
+{
-+ if (pnfs &&
-+ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED))
-+ return pnfs_get_write_status(data);
++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++ struct spnfs_msg *im = NULL;
++ union spnfs_msg_res *res = NULL;
++ int status = 0;
+
-+ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how);
++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++ if (im == NULL) {
++ status = -ENOMEM;
++ goto open_out;
++ }
++
++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++ if (res == NULL) {
++ status = -ENOMEM;
++ goto open_out;
++ }
++
++ im->im_type = SPNFS_TYPE_OPEN;
++ im->im_args.open_args.inode = inode->i_ino;
++ im->im_args.open_args.generation = inode->i_generation;
++ im->im_args.open_args.create = open->op_create;
++ im->im_args.open_args.createmode = open->op_createmode;
++ im->im_args.open_args.truncate = open->op_truncate;
++
++ /* call function to queue the msg for upcall */
++ status = spnfs_upcall(spnfs, im, res);
++ if (status != 0) {
++ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++ status = -EIO;
++ goto open_out;
++ }
++ status = res->open_res.status;
++
++open_out:
++ kfree(im);
++ kfree(res);
++
++ return status;
++}
++
++int
++spnfs_create(void)
++{
++ return 0;
+}
+
+/*
-+ * Set up the argument/result storage required for the RPC call.
++ * Invokes the spnfsd with the inode number of the object to remove.
++ * The file has already been removed on the MDS, so all the spnsfd
++ * daemon does is remove the stripes.
++ * Returns 0 on success otherwise error code
+ */
-+static int nfs_commit_rpcsetup(struct list_head *head,
-+ struct nfs_write_data *data,
-+ int how, int pnfs)
++int
++spnfs_remove(unsigned long ino, unsigned long generation)
+{
-+ struct nfs_page *first = nfs_list_entry(head->next);
-+ struct inode *inode = first->wb_context->path.dentry->d_inode;
++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */
++ struct spnfs_msg *im = NULL;
++ union spnfs_msg_res *res = NULL;
++ int status = 0;
+
- /* Set up the RPC argument and reply structs
- * NB: take care not to mess about with data->commit et al. */
-
- list_splice_init(head, &data->pages);
-
- data->inode = inode;
-- data->cred = msg.rpc_cred;
-+ data->cred = first->wb_context->cred;
-
- data->args.fh = NFS_FH(data->inode);
- /* Note: we always request a commit of the entire inode */
-@@ -1286,45 +1394,47 @@ static int nfs_commit_rpcsetup(struct li
- data->res.fattr = &data->fattr;
- data->res.verf = &data->verf;
- nfs_fattr_init(&data->fattr);
-+ kref_init(&data->refcount);
-+ data->parent = NULL;
-+ data->args.context = first->wb_context; /* used by commit done */
-
-- /* Set up the initial task struct. */
-- NFS_PROTO(inode)->commit_setup(data, &msg);
-+ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops,
-+ how, pnfs);
-+}
-
-- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
-+/* Handle memory error during commit */
-+void nfs_mark_list_commit(struct list_head *head)
-+{
-+ struct nfs_page *req;
-
-- task = rpc_run_task(&task_setup_data);
-- if (IS_ERR(task))
-- return PTR_ERR(task);
-- rpc_put_task(task);
-- return 0;
-+ while (!list_empty(head)) {
-+ req = nfs_list_entry(head->next);
-+ nfs_list_remove_request(req);
-+ nfs_mark_request_commit(req);
-+ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-+ dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-+ BDI_RECLAIMABLE);
-+ nfs_clear_page_tag_locked(req);
++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++ if (im == NULL) {
++ status = -ENOMEM;
++ goto remove_out;
+ }
- }
-+EXPORT_SYMBOL(nfs_mark_list_commit);
-
- /*
- * Commit dirty pages
- */
- static int
--nfs_commit_list(struct inode *inode, struct list_head *head, int how)
-+nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs)
- {
- struct nfs_write_data *data;
-- struct nfs_page *req;
-
- data = nfs_commitdata_alloc();
--
- if (!data)
- goto out_bad;
-
- /* Set up the argument struct */
-- return nfs_commit_rpcsetup(head, data, how);
-+ return nfs_commit_rpcsetup(head, data, how, pnfs);
- out_bad:
-- while (!list_empty(head)) {
-- req = nfs_list_entry(head->next);
-- nfs_list_remove_request(req);
-- nfs_mark_request_commit(req);
-- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-- dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-- BDI_RECLAIMABLE);
-- nfs_clear_page_tag_locked(req);
-- }
-+ nfs_mark_list_commit(head);
- nfs_commit_clear_lock(NFS_I(inode));
- return -ENOMEM;
- }
-@@ -1344,6 +1454,19 @@ static void nfs_commit_done(struct rpc_t
- return;
- }
-
-+static inline void nfs_commit_cleanup(struct kref *kref)
-+{
-+ struct nfs_write_data *data;
+
-+ data = container_of(kref, struct nfs_write_data, refcount);
-+ /* Clear lock only when all cloned commits are finished */
-+ if (data->parent)
-+ kref_put(&data->parent->refcount, nfs_commit_cleanup);
-+ else
-+ nfs_commit_clear_lock(NFS_I(data->inode));
-+ nfs_commitdata_release(data);
++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
++ if (res == NULL) {
++ status = -ENOMEM;
++ goto remove_out;
++ }
++
++ im->im_type = SPNFS_TYPE_REMOVE;
++ im->im_args.remove_args.inode = ino;
++ im->im_args.remove_args.generation = generation;
++
++ /* call function to queue the msg for upcall */
++ status = spnfs_upcall(spnfs, im, res);
++ if (status != 0) {
++ dprintk("%s spnfs upcall failure: %d\n", __func__, status);
++ status = -EIO;
++ goto remove_out;
++ }
++ status = res->remove_res.status;
++
++remove_out:
++ kfree(im);
++ kfree(res);
++
++ return status;
+}
+
- static void nfs_commit_release(void *calldata)
- {
- struct nfs_write_data *data = calldata;
-@@ -1361,6 +1484,11 @@ static void nfs_commit_release(void *cal
- req->wb_bytes,
- (long long)req_offset(req));
- if (status < 0) {
-+ if (req->wb_lseg) {
-+ nfs_mark_request_nopnfs(req);
-+ nfs_mark_request_dirty(req);
-+ goto next;
-+ }
- nfs_context_set_write_error(req->wb_context, status);
- nfs_inode_remove_request(req);
- dprintk(", error = %d\n", status);
-@@ -1377,12 +1505,12 @@ static void nfs_commit_release(void *cal
- }
- /* We have a mismatch. Write the page again */
- dprintk(" mismatch\n");
-+ nfs_mark_request_nopnfs(req);
- nfs_mark_request_dirty(req);
- next:
- nfs_clear_page_tag_locked(req);
- }
-- nfs_commit_clear_lock(NFS_I(data->inode));
-- nfs_commitdata_release(calldata);
-+ kref_put(&data->refcount, nfs_commit_cleanup);
- }
-
- static const struct rpc_call_ops nfs_commit_ops = {
-@@ -1398,21 +1526,22 @@ int nfs_commit_inode(struct inode *inode
- LIST_HEAD(head);
- int may_wait = how & FLUSH_SYNC;
- int res = 0;
-+ int use_pnfs = 0;
-
- if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
- goto out_mark_dirty;
- spin_lock(&inode->i_lock);
-- res = nfs_scan_commit(inode, &head, 0, 0);
-+ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs);
- spin_unlock(&inode->i_lock);
- if (res) {
-- int error = nfs_commit_list(inode, &head, how);
-+ int error = nfs_commit_list(inode, &head, how, use_pnfs);
- if (error < 0)
- return error;
-- if (may_wait)
-+ if (may_wait) {
- wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
- nfs_wait_bit_killable,
- TASK_KILLABLE);
-- else
-+ } else
- goto out_mark_dirty;
- } else
- nfs_commit_clear_lock(NFS_I(inode));
-@@ -1465,7 +1594,18 @@ static int nfs_commit_unstable_pages(str
-
- int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
- {
-- return nfs_commit_unstable_pages(inode, wbc);
-+ int ret;
-+ ret = nfs_commit_unstable_pages(inode, wbc);
-+ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) {
-+ int err, sync = wbc->sync_mode;
++static int
++read_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++ struct file **filp)
++{
++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
++ size_t iolen;
++ int completed = 0, ds, err;
+
-+ if (wbc->nonblocking || wbc->for_background)
-+ sync = 0;
-+ err = pnfs_layoutcommit_inode(inode, sync);
++ while (len > 0) {
++ tmp = offset;
++ soff = do_div(tmp, spnfs_config->stripe_size);
++ snum = tmp;
++ ds = do_div(tmp, spnfs_config->num_ds);
++ if (spnfs_config->dense_striping == 0)
++ soffset = offset;
++ else {
++ tmp = snum;
++ do_div(tmp, spnfs_config->num_ds);
++ soffset = tmp * spnfs_config->stripe_size + soff;
++ }
++ if (len < spnfs_config->stripe_size - soff)
++ iolen = len;
++ else
++ iolen = spnfs_config->stripe_size - soff;
++
++ pos = soffset;
++ err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos);
+ if (err < 0)
-+ ret = err;
++ return -EIO;
++ if (err == 0)
++ break;
++ filp[ds]->f_pos = pos;
++ iolen = err;
++ completed += iolen;
++ len -= iolen;
++ offset += iolen;
++ bufoffset += iolen;
+ }
-+ return ret;
- }
-
- /*
-diff -up linux-2.6.35.noarch/include/linux/exportfs.h.orig linux-2.6.35.noarch/include/linux/exportfs.h
---- linux-2.6.35.noarch/include/linux/exportfs.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/exportfs.h 2010-09-30 12:25:08.637289000 -0400
-@@ -2,6 +2,7 @@
- #define LINUX_EXPORTFS_H 1
-
- #include <linux/types.h>
-+#include <linux/exp_xdr.h>
-
- struct dentry;
- struct inode;
-@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_pare
- struct fid *fid, int fh_len, int fh_type,
- struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
-
-+#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
-+struct pnfs_filelayout_device;
-+struct pnfs_filelayout_layout;
-+
-+extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr,
-+ const struct pnfs_filelayout_device *fdev);
-+extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr,
-+ const struct pnfs_filelayout_layout *flp);
-+#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
+
-+#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
-+struct list_head;
-+
-+extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr,
-+ const struct list_head *volumes);
++ return completed;
++}
+
-+extern enum nfsstat4 blocklayout_encode_layout(struct exp_xdr_stream *xdr,
-+ const struct list_head *layouts);
-+#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
++static __be32
++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++ struct svc_rqst *rqstp)
++{
++ int i, vnum, err, bytecount = 0;
++ char path[128];
++ struct file *filp[SPNFS_MAX_DATA_SERVERS];
++ size_t iolen;
++ __be32 status = nfs_ok;
+
-+#if defined(CONFIG_PNFSD)
-+#include <linux/module.h>
++ /*
++ * XXX We should just be doing this at open time, but it gets
++ * kind of messy storing this info in nfsd's state structures
++ * and piggybacking its path through the various state handling
++ * functions. Revisit this.
++ */
++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
++ for (i = 0; i < spnfs_config->num_ds; i++) {
++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i],
++ inode->i_ino, inode->i_generation);
++ filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0);
++ if (filp[i] == NULL) {
++ status = nfserr_io;
++ goto read_out;
++ }
++ get_file(filp[i]);
++ }
+
-+struct pnfsd_cb_operations;
++ for (vnum = 0 ; vnum < vlen ; vnum++) {
++ iolen = rqstp->rq_vec[vnum].iov_len;
++ err = read_one(inode, offset + bytecount, iolen,
++ (char *)rqstp->rq_vec[vnum].iov_base, filp);
++ if (err < 0) {
++ status = nfserr_io;
++ goto read_out;
++ }
++ if (err < iolen) {
++ bytecount += err;
++ goto read_out;
++ }
++ bytecount += rqstp->rq_vec[vnum].iov_len;
++ }
+
-+struct pnfsd_cb_ctl {
-+ spinlock_t lock;
-+ struct module *module;
-+ const struct pnfsd_cb_operations *cb_op;
-+};
++read_out:
++ *lenp = bytecount;
++ for (i = 0; i < spnfs_config->num_ds; i++) {
++ if (filp[i]) {
++ filp_close(filp[i], current->files);
++ fput(filp[i]);
++ }
++ }
++ return status;
++}
+
-+/* in expfs.c so that file systems can depend on it */
-+extern struct pnfsd_cb_ctl pnfsd_cb_ctl;
++__be32
++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen,
++ struct svc_rqst *rqstp)
++{
++ if (spnfs_config)
++ return read(inode, offset, lenp, vlen, rqstp);
++ else {
++ printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++ return nfserr_notsupp;
++ }
++}
+
-+static inline int
-+pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl)
++static int
++write_one(struct inode *inode, loff_t offset, size_t len, char *buf,
++ struct file **filp)
+{
-+ int ret = -ENOENT;
++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp;
++ size_t iolen;
++ int completed = 0, ds, err;
+
-+ spin_lock(&pnfsd_cb_ctl.lock);
-+ if (!pnfsd_cb_ctl.cb_op)
-+ goto out;
-+ if (!try_module_get(pnfsd_cb_ctl.module))
-+ goto out;
-+ ctl->cb_op = pnfsd_cb_ctl.cb_op;
-+ ctl->module = pnfsd_cb_ctl.module;
-+ ret = 0;
-+out:
-+ spin_unlock(&pnfsd_cb_ctl.lock);
-+ return ret;
++ while (len > 0) {
++ tmp = offset;
++ soff = do_div(tmp, spnfs_config->stripe_size);
++ snum = tmp;
++ ds = do_div(tmp, spnfs_config->num_ds);
++ if (spnfs_config->dense_striping == 0)
++ soffset = offset;
++ else {
++ tmp = snum;
++ do_div(tmp, spnfs_config->num_ds);
++ soffset = tmp * spnfs_config->stripe_size + soff;
++ }
++ if (len < spnfs_config->stripe_size - soff)
++ iolen = len;
++ else
++ iolen = spnfs_config->stripe_size - soff;
++
++ pos = soffset;
++ err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos);
++ if (err < 0)
++ return -EIO;
++ filp[ds]->f_pos = pos;
++ iolen = err;
++ completed += iolen;
++ len -= iolen;
++ offset += iolen;
++ bufoffset += iolen;
++ }
++
++ return completed;
+}
+
-+static inline void
-+pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl)
++static __be32
++write(struct inode *inode, loff_t offset, size_t len, int vlen,
++ struct svc_rqst *rqstp)
+{
-+ module_put(ctl->module);
-+}
-+#endif /* CONFIG_PNFSD */
- #endif /* LINUX_EXPORTFS_H */
-diff -up linux-2.6.35.noarch/include/linux/exp_xdr.h.orig linux-2.6.35.noarch/include/linux/exp_xdr.h
---- linux-2.6.35.noarch/include/linux/exp_xdr.h.orig 2010-09-30 12:25:08.623289000 -0400
-+++ linux-2.6.35.noarch/include/linux/exp_xdr.h 2010-09-30 12:25:08.625288000 -0400
-@@ -0,0 +1,141 @@
-+#ifndef _LINUX_EXP_XDR_H
-+#define _LINUX_EXP_XDR_H
++ int i, vnum, err, bytecount = 0;
++ char path[128];
++ struct file *filp[SPNFS_MAX_DATA_SERVERS];
++ size_t iolen;
++ __be32 status = nfs_ok;
+
-+#include <asm/byteorder.h>
-+#include <asm/unaligned.h>
-+#include <linux/string.h>
++ /*
++ * XXX We should just be doing this at open time, but it gets
++ * kind of messy storing this info in nfsd's state structures
++ * and piggybacking its path through the various state handling
++ * functions. Revisit this.
++ */
++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *));
++ for (i = 0; i < spnfs_config->num_ds; i++) {
++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i],
++ inode->i_ino, inode->i_generation);
++ filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0);
++ if (filp[i] == NULL) {
++ status = nfserr_io;
++ goto write_out;
++ }
++ get_file(filp[i]);
++ }
+
-+struct exp_xdr_stream {
-+ __be32 *p;
-+ __be32 *end;
-+};
++ for (vnum = 0; vnum < vlen; vnum++) {
++ iolen = rqstp->rq_vec[vnum].iov_len;
++ err = write_one(inode, offset + bytecount, iolen,
++ (char *)rqstp->rq_vec[vnum].iov_base, filp);
++ if (err != iolen) {
++ dprintk("spnfs_write: err=%d expected %Zd\n", err, len);
++ status = nfserr_io;
++ goto write_out;
++ }
++ bytecount += rqstp->rq_vec[vnum].iov_len;
++ }
+
-+/**
-+ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes
-+ * @nbytes: number of bytes to encode
-+ */
-+static inline size_t
-+exp_xdr_qwords(__u32 nbytes)
-+{
-+ return DIV_ROUND_UP(nbytes, 4);
-+}
++write_out:
++ for (i = 0; i < spnfs_config->num_ds; i++) {
++ if (filp[i]) {
++ filp_close(filp[i], current->files);
++ fput(filp[i]);
++ }
++ }
+
-+/**
-+ * exp_xdr_qbytes - Calculate the number of bytes holding qwords
-+ * @qwords: number of quad-words to encode
-+ */
-+static inline size_t
-+exp_xdr_qbytes(size_t qwords)
-+{
-+ return qwords << 2;
++ return status;
+}
+
-+/**
-+ * exp_xdr_reserve_space - Reserve buffer space for sending
-+ * @xdr: pointer to exp_xdr_stream
-+ * @nbytes: number of bytes to reserve
-+ *
-+ * Checks that we have enough buffer space to encode 'nbytes' more
-+ * bytes of data. If so, update the xdr stream.
-+ */
-+static inline __be32 *
-+exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes)
++__be32
++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen,
++ struct svc_rqst *rqstp)
+{
-+ __be32 *p = xdr->p;
-+ __be32 *q;
-+
-+ /* align nbytes on the next 32-bit boundary */
-+ q = p + exp_xdr_qwords(nbytes);
-+ if (unlikely(q > xdr->end || q < p))
-+ return NULL;
-+ xdr->p = q;
-+ return p;
++ if (spnfs_config)
++ return write(inode, offset, len, vlen, rqstp);
++ else {
++ printk(KERN_ERR "Please upgrade to latest spnfsd\n");
++ return nfserr_notsupp;
++ }
+}
+
-+/**
-+ * exp_xdr_reserve_qwords - Reserve buffer space for sending
-+ * @xdr: pointer to exp_xdr_stream
-+ * @nwords: number of quad words (u32's) to reserve
-+ */
-+static inline __be32 *
-+exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords)
++int
++spnfs_commit(void)
+{
-+ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords));
++ return 0;
+}
+
-+/**
-+ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream
-+ * @p: pointer to encoding destination
-+ * @val: value to encode
++/*
++ * Return the state for this object.
++ * At this time simply return 0 to indicate success and use the existing state
+ */
-+static inline __be32 *
-+exp_xdr_encode_u32(__be32 *p, __u32 val)
++int
++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg)
+{
-+ *p = cpu_to_be32(val);
-+ return p + 1;
++ return 0;
+}
+
-+/**
-+ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream
-+ * @p: pointer to encoding destination
-+ * @val: value to encode
++/*
++ * Return the filehandle for the specified file descriptor
+ */
-+static inline __be32 *
-+exp_xdr_encode_u64(__be32 *p, __u64 val)
++int
++spnfs_getfh(int fd, struct nfs_fh *fh)
+{
-+ put_unaligned_be64(val, p);
-+ return p + 2;
-+}
++ struct file *file;
+
-+/**
-+ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream
-+ * @p: pointer to encoding destination
-+ * @ptr: pointer to the array of bytes
-+ * @nbytes: number of bytes to encode
-+ */
-+static inline __be32 *
-+exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes)
-+{
-+ if (likely(nbytes != 0)) {
-+ unsigned int qwords = exp_xdr_qwords(nbytes);
-+ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes;
++ file = fget(fd);
++ if (file == NULL)
++ return -EIO;
+
-+ memcpy(p, ptr, nbytes);
-+ if (padding != 0)
-+ memset((char *)p + nbytes, 0, padding);
-+ p += qwords;
-+ }
-+ return p;
++ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh));
++ fput(file);
++ return 0;
+}
+diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
+index 322518c..2536d06 100644
+--- a/fs/nfsd/state.h
++++ b/fs/nfsd/state.h
+@@ -241,6 +241,12 @@ struct nfs4_client {
+ u32 cl_cb_seq_nr;
+ struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
+ /* wait here for slots */
++#if defined(CONFIG_PNFSD)
++ struct list_head cl_layouts; /* outstanding layouts */
++ struct list_head cl_layoutrecalls; /* outstanding layoutrecall
++ callbacks */
++ atomic_t cl_deviceref; /* Num outstanding devs */
++#endif /* CONFIG_PNFSD */
+ };
+
+ static inline void
+@@ -357,6 +363,14 @@ struct nfs4_file {
+ u32 fi_id; /* used with stateowner->so_id
+ * for stateid_hashtbl hash */
+ bool fi_had_conflict;
++#if defined(CONFIG_PNFSD)
++ struct list_head fi_layouts;
++ struct list_head fi_layout_states;
++ /* used by layoutget / layoutrecall */
++ struct nfs4_fsid fi_fsid;
++ u32 fi_fhlen;
++ u8 fi_fhval[NFS4_FHSIZE];
++#endif /* CONFIG_PNFSD */
+ };
+
+ /* XXX: for first cut may fall back on returning file that doesn't work
+@@ -385,6 +399,15 @@ static inline struct file *find_any_file(struct nfs4_file *f)
+ return f->fi_fds[O_RDONLY];
+ }
+
++#if defined(CONFIG_PNFSD)
++/* pNFS Metadata server state */
+
-+/**
-+ * exp_xdr_encode_opaque - Encode an opaque type onto a xdr stream
-+ * @p: pointer to encoding destination
-+ * @ptr: pointer to the opaque array
-+ * @nbytes: number of bytes to encode
-+ *
-+ * Encodes the 32-bit opaque size in bytes followed by the opaque value.
-+ */
-+static inline __be32 *
-+exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes)
++struct pnfs_ds_dev_entry {
++ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */
++ u32 dd_dsid;
++};
++#endif /* CONFIG_PNFSD */
++
+ /*
+ * nfs4_stateid can either be an open stateid or (eventually) a lock stateid
+ *
+@@ -407,6 +430,9 @@ struct nfs4_stateid {
+ struct list_head st_perfile;
+ struct list_head st_perstateowner;
+ struct list_head st_lockowners;
++#if defined(CONFIG_PNFSD)
++ struct list_head st_pnfs_ds_id;
++#endif /* CONFIG_PNFSD */
+ struct nfs4_stateowner * st_stateowner;
+ struct nfs4_file * st_file;
+ stateid_t st_stateid;
+@@ -457,6 +483,34 @@ extern void nfsd4_recdir_purge_old(void);
+ extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
+ extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
+ extern void release_session_client(struct nfsd4_session *);
++extern void nfsd4_free_slab(struct kmem_cache **);
++extern struct nfs4_file *find_file(struct inode *);
++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *);
++extern void put_nfs4_file(struct nfs4_file *);
++extern void get_nfs4_file(struct nfs4_file *);
++extern struct nfs4_client *find_confirmed_client(clientid_t *);
++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags);
++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *);
++extern __be32 nfs4_check_stateid(stateid_t *);
++extern void expire_client_lock(struct nfs4_client *);
++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *);
++
++#if defined(CONFIG_PNFSD)
++extern int nfsd4_init_pnfs_slabs(void);
++extern void nfsd4_free_pnfs_slabs(void);
++extern void pnfs_expire_client(struct nfs4_client *);
++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *);
++extern void nfs4_pnfs_state_init(void);
++extern void nfs4_pnfs_state_shutdown(void);
++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *);
++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *);
++#else /* CONFIG_PNFSD */
++static inline void nfsd4_free_pnfs_slabs(void) {}
++static inline int nfsd4_init_pnfs_slabs(void) { return 0; }
++static inline void pnfs_expire_client(struct nfs4_client *clp) {}
++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {}
++static inline void nfs4_pnfs_state_shutdown(void) {}
++#endif /* CONFIG_PNFSD */
+
+ static inline void
+ nfs4_put_stateowner(struct nfs4_stateowner *so)
+@@ -470,4 +524,24 @@ nfs4_get_stateowner(struct nfs4_stateowner *so)
+ kref_get(&so->so_ref);
+ }
+
++static inline u64
++end_offset(u64 start, u64 len)
+{
-+ p = exp_xdr_encode_u32(p, nbytes);
-+ return exp_xdr_encode_bytes(p, ptr, nbytes);
++ u64 end;
++
++ end = start + len;
++ return end >= start ? end : NFS4_MAX_UINT64;
+}
+
-+/**
-+ * exp_xdr_encode_opaque_qlen - Encode the opaque length onto a xdr stream
-+ * @lenp: pointer to the opaque length destination
-+ * @endp: pointer to the end of the opaque array
-+ *
-+ * Encodes the 32-bit opaque size in bytes given the start and end pointers
-+ */
-+static inline __be32 *
-+exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp)
++/* last octet in a range */
++static inline u64
++last_byte_offset(u64 start, u64 len)
+{
-+ size_t nbytes = (char *)endp - (char *)(lenp + 1);
++ u64 end;
+
-+ exp_xdr_encode_u32(lenp, nbytes);
-+ return lenp + 1 + exp_xdr_qwords(nbytes);
++ BUG_ON(!len);
++ end = start + len;
++ return end > start ? end - 1 : NFS4_MAX_UINT64;
+}
-+#endif /* _LINUX_EXP_XDR_H */
-diff -up linux-2.6.35.noarch/include/linux/fs.h.orig linux-2.6.35.noarch/include/linux/fs.h
---- linux-2.6.35.noarch/include/linux/fs.h.orig 2010-09-30 12:22:49.210164000 -0400
-+++ linux-2.6.35.noarch/include/linux/fs.h 2010-09-30 12:25:08.650290000 -0400
-@@ -388,6 +388,7 @@ struct inodes_stat_t {
- #include <asm/byteorder.h>
++
+ #endif /* NFSD4_STATE_H */
+diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
+index 661a6cf..ed3a1b7 100644
+--- a/fs/nfsd/vfs.c
++++ b/fs/nfsd/vfs.c
+@@ -37,7 +37,12 @@
+ #ifdef CONFIG_NFSD_V4
+ #include <linux/nfs4_acl.h>
+ #include <linux/nfsd_idmap.h>
++#include <linux/security.h>
++#include <linux/nfsd4_spnfs.h>
+ #endif /* CONFIG_NFSD_V4 */
++#if defined(CONFIG_SPNFS_BLOCK)
++#include <linux/nfsd4_block.h>
++#endif
- struct export_operations;
-+struct pnfs_export_operations;
- struct hd_geometry;
- struct iovec;
- struct nameidata;
-@@ -1327,6 +1328,7 @@ struct super_block {
- const struct dquot_operations *dq_op;
- const struct quotactl_ops *s_qcop;
- const struct export_operations *s_export_op;
-+ const struct pnfs_export_operations *s_pnfs_op;
- unsigned long s_flags;
- unsigned long s_magic;
- struct dentry *s_root;
-diff -up linux-2.6.35.noarch/include/linux/nfs4.h.orig linux-2.6.35.noarch/include/linux/nfs4.h
---- linux-2.6.35.noarch/include/linux/nfs4.h.orig 2010-09-30 12:22:50.222192000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfs4.h 2010-09-30 12:25:08.663289000 -0400
-@@ -17,7 +17,10 @@
+ #include "nfsd.h"
+ #include "vfs.h"
+@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+ NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
+ if (err)
+ goto out;
++#if defined(CONFIG_SPNFS_BLOCK)
++ if (pnfs_block_enabled(inode, 0)) {
++ err = bl_layoutrecall(inode, RETURN_FILE,
++ iap->ia_size, inode->i_size - iap->ia_size);
++ }
++#endif /* CONFIG_SPNFS_BLOCK */
+ }
- #define NFS4_BITMAP_SIZE 2
- #define NFS4_VERIFIER_SIZE 8
--#define NFS4_STATEID_SIZE 16
-+#define NFS4_CLIENTID_SIZE 8
-+#define NFS4_STATEID_SEQID_SIZE 4
-+#define NFS4_STATEID_OTHER_SIZE 12
-+#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE)
- #define NFS4_FHSIZE 128
- #define NFS4_MAXPATHLEN PATH_MAX
- #define NFS4_MAXNAMLEN NAME_MAX
-@@ -119,6 +122,13 @@
- #define EXCHGID4_FLAG_MASK_A 0x40070003
- #define EXCHGID4_FLAG_MASK_R 0x80070003
+ /*
+@@ -1716,6 +1727,11 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
+ struct inode *fdir, *tdir;
+ __be32 err;
+ int host_err;
++#ifdef CONFIG_SPNFS
++ unsigned long ino = 0;
++ unsigned long generation = 0;
++ unsigned int nlink = 0;
++#endif /* CONFIG_SPNFS */
-+static inline bool
-+is_ds_only_session(u32 exchange_flags)
-+{
-+ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS;
-+ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS;
-+}
-+
- #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001
- #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002
- #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004
-@@ -166,8 +176,23 @@ struct nfs4_acl {
- struct nfs4_ace aces[0];
- };
+ err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
+ if (err)
+@@ -1779,7 +1795,26 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
+ if (host_err)
+ goto out_dput_new;
-+struct nfs4_fsid {
-+ u64 major;
-+ u64 minor;
-+};
-+
- typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
--typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid;
-+typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid;
++#ifdef CONFIG_SPNFS
++ /*
++ * if the target is a preexisting regular file, remember the
++ * inode number and generation so we can delete the stripes;
++ * save the link count as well so that the stripes only get
++ * get deleted when the last link is deleted
++ */
++ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) {
++ ino = ndentry->d_inode->i_ino;
++ generation = ndentry->d_inode->i_generation;
++ nlink = ndentry->d_inode->i_nlink;
++ }
++#endif /* CONFIG_SPNFS */
+
-+struct nfs41_stateid {
-+ __be32 seqid;
-+ char other[NFS4_STATEID_OTHER_SIZE];
-+} __attribute__ ((packed));
+ host_err = vfs_rename(fdir, odentry, tdir, ndentry);
++#ifdef CONFIG_SPNFS
++ if (spnfs_enabled() && (!host_err && ino && nlink == 1))
++ spnfs_remove(ino, generation);
++#endif /* CONFIG_SPNFS */
+
-+typedef union {
-+ char data[NFS4_STATEID_SIZE];
-+ struct nfs41_stateid stateid;
-+} nfs4_stateid;
+ if (!host_err) {
+ host_err = commit_metadata(tfhp);
+ if (!host_err)
+@@ -1820,6 +1855,11 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
+ struct inode *dirp;
+ __be32 err;
+ int host_err;
++#if defined(CONFIG_SPNFS)
++ unsigned long ino;
++ unsigned long generation;
++ unsigned int nlink;
++#endif /* defined(CONFIG_SPNFS) */
- enum nfs_opnum4 {
- OP_ACCESS = 3,
-@@ -471,6 +496,8 @@ enum lock_type4 {
- #define FATTR4_WORD1_TIME_MODIFY (1UL << 21)
- #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22)
- #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23)
-+#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30)
-+#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1)
+ err = nfserr_acces;
+ if (!flen || isdotent(fname, flen))
+@@ -1843,6 +1883,17 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
+ goto out;
+ }
- #define NFSPROC4_NULL 0
- #define NFSPROC4_COMPOUND 1
-@@ -532,6 +559,13 @@ enum {
- NFSPROC4_CLNT_SEQUENCE,
- NFSPROC4_CLNT_GET_LEASE_TIME,
- NFSPROC4_CLNT_RECLAIM_COMPLETE,
-+ NFSPROC4_CLNT_LAYOUTGET,
-+ NFSPROC4_CLNT_LAYOUTCOMMIT,
-+ NFSPROC4_CLNT_LAYOUTRETURN,
-+ NFSPROC4_CLNT_GETDEVICELIST,
-+ NFSPROC4_CLNT_GETDEVICEINFO,
-+ NFSPROC4_CLNT_PNFS_WRITE,
-+ NFSPROC4_CLNT_PNFS_COMMIT,
- };
++#if defined(CONFIG_SPNFS)
++ /*
++ * Remember the inode number to communicate to the spnfsd
++ * for removal of stripes; save the link count as well so that
++ * the stripes only get deleted when the last link is deleted
++ */
++ ino = rdentry->d_inode->i_ino;
++ generation = rdentry->d_inode->i_generation;
++ nlink = rdentry->d_inode->i_nlink;
++#endif /* defined(CONFIG_SPNFS) */
++
+ if (!type)
+ type = rdentry->d_inode->i_mode & S_IFMT;
- /* nfs41 types */
-@@ -550,6 +584,51 @@ enum state_protect_how4 {
- SP4_SSV = 2
- };
+@@ -1867,6 +1918,29 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
+ if (!host_err)
+ host_err = commit_metadata(fhp);
-+enum pnfs_layouttype {
-+ LAYOUT_NFSV4_1_FILES = 1,
-+ LAYOUT_OSD2_OBJECTS = 2,
-+ LAYOUT_BLOCK_VOLUME = 3,
++#if defined(CONFIG_SPNFS)
++ /*
++ * spnfs: notify spnfsd of removal to destroy stripes
++ */
++/*
++ sb = current_fh->fh_dentry->d_inode->i_sb;
++ if (sb->s_export_op->spnfs_remove) {
++*/
++ dprintk("%s check if spnfs_enabled\n", __FUNCTION__);
++ if (spnfs_enabled() && nlink == 1) {
++ BUG_ON(ino == 0);
++ dprintk("%s calling spnfs_remove inumber=%ld\n",
++ __FUNCTION__, ino);
++ if (spnfs_remove(ino, generation) == 0) {
++ dprintk("%s spnfs_remove success\n", __FUNCTION__);
++ } else {
++ /* XXX How do we make this atomic? */
++ printk(KERN_WARNING "nfsd: pNFS could not "
++ "remove stripes for inode: %ld\n", ino);
++ }
++ }
++#endif /* defined(CONFIG_SPNFS) */
+
-+ NFS4_PNFS_PRIVATE_LAYOUT = 0x80000000
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
+ out_nfserr:
+ err = nfserrno(host_err);
+diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
+index 4d476ff..4cc7558 100644
+--- a/fs/nfsd/xdr4.h
++++ b/fs/nfsd/xdr4.h
+@@ -37,6 +37,8 @@
+ #ifndef _LINUX_NFSD_XDR4_H
+ #define _LINUX_NFSD_XDR4_H
+
++#include <linux/nfsd/nfsd4_pnfs.h>
++
+ #include "state.h"
+ #include "nfsd.h"
+
+@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete {
+ u32 rca_one_fs;
+ };
+
++struct nfsd4_pnfs_getdevinfo {
++ struct nfsd4_pnfs_deviceid gd_devid; /* request */
++ u32 gd_layout_type; /* request */
++ u32 gd_maxcount; /* request */
++ u32 gd_notify_types;/* request */
++ struct super_block *gd_sb;
+};
+
-+/* used for both layout return and recall */
-+enum pnfs_layoutreturn_type {
-+ RETURN_FILE = 1,
-+ RETURN_FSID = 2,
-+ RETURN_ALL = 3
++struct nfsd4_pnfs_getdevlist {
++ u32 gd_layout_type; /* request */
++ u32 gd_maxdevices; /* request */
++ u64 gd_cookie; /* request - response */
++ u64 gd_verf; /* request - response */
++ struct svc_fh *gd_fhp; /* response */
++ u32 gd_eof; /* response */
+};
+
-+enum pnfs_iomode {
-+ IOMODE_READ = 1,
-+ IOMODE_RW = 2,
-+ IOMODE_ANY = 3,
++struct nfsd4_pnfs_layoutget {
++ u64 lg_minlength; /* request */
++ u32 lg_signal; /* request */
++ u32 lg_maxcount; /* request */
++ struct svc_fh *lg_fhp; /* request */
++ stateid_t lg_sid; /* request/response */
++ struct nfsd4_layout_seg lg_seg; /* request/response */
++ u32 lg_roc; /* response */
+};
+
-+enum pnfs_notify_deviceid_type4 {
-+ NOTIFY_DEVICEID4_CHANGE = 1 << 1,
-+ NOTIFY_DEVICEID4_DELETE = 1 << 2,
++struct nfsd4_pnfs_layoutcommit {
++ struct nfsd4_pnfs_layoutcommit_arg args;
++ stateid_t lc_sid; /* request */
++ struct nfsd4_pnfs_layoutcommit_res res;
+};
+
-+#define NFL4_UFLG_MASK 0x0000003F
-+#define NFL4_UFLG_DENSE 0x00000001
-+#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002
-+#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0
-+
-+/* Encoded in the loh_body field of type layouthint4 */
-+enum filelayout_hint_care4 {
-+ NFLH4_CARE_DENSE = NFL4_UFLG_DENSE,
-+ NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS,
-+ NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040,
-+ NFLH4_CARE_STRIPE_COUNT = 0x00000080
++enum layoutreturn_flags {
++ LR_FLAG_INTERN = 1 << 0, /* internal return */
++ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */
+};
+
-+#define NFS4_DEVICEID4_SIZE 16
-+
-+struct nfs4_deviceid {
-+ char data[NFS4_DEVICEID4_SIZE];
++struct nfsd4_pnfs_layoutreturn {
++ struct nfsd4_pnfs_layoutreturn_arg args;
++ u32 lr_flags;
++ stateid_t lr_sid; /* request/response */
++ u32 lrs_present; /* response */
+};
+
- #endif
- #endif
-
-diff -up linux-2.6.35.noarch/include/linux/nfsd4_block.h.orig linux-2.6.35.noarch/include/linux/nfsd4_block.h
---- linux-2.6.35.noarch/include/linux/nfsd4_block.h.orig 2010-09-30 12:25:08.799296000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfsd4_block.h 2010-09-30 12:25:08.801293000 -0400
-@@ -0,0 +1,101 @@
-+#ifndef NFSD4_BLOCK
-+#define NFSD4_BLOCK
-+
-+#include <linux/sunrpc/svc.h>
-+#include <linux/sunrpc/svcauth.h>
-+#include <linux/nfsd/nfsfh.h>
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+
-+#define PNFS_BLOCK_SUCCESS 1
-+#define PNFS_BLOCK_FAILURE 0
+ struct nfsd4_op {
+ int opnum;
+ __be32 status;
+@@ -426,6 +473,13 @@ struct nfsd4_op {
+ struct nfsd4_destroy_session destroy_session;
+ struct nfsd4_sequence sequence;
+ struct nfsd4_reclaim_complete reclaim_complete;
++#if defined(CONFIG_PNFSD)
++ struct nfsd4_pnfs_getdevlist pnfs_getdevlist;
++ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo;
++ struct nfsd4_pnfs_layoutget pnfs_layoutget;
++ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit;
++ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn;
++#endif /* CONFIG_PNFSD */
+ } u;
+ struct nfs4_replay * replay;
+ };
+diff --git a/include/linux/exp_xdr.h b/include/linux/exp_xdr.h
+new file mode 100644
+index 0000000..b69c309
+--- /dev/null
++++ b/include/linux/exp_xdr.h
+@@ -0,0 +1,141 @@
++#ifndef _LINUX_EXP_XDR_H
++#define _LINUX_EXP_XDR_H
+
-+#define PNFS_BLOCK_CTL_START 1
-+#define PNFS_BLOCK_CTL_STOP 2
-+#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current
-+ * version from kernel via an upcall.
-+ */
++#include <asm/byteorder.h>
++#include <asm/unaligned.h>
++#include <linux/string.h>
+
-+#define PNFS_UPCALL_MSG_STOP 0
-+#define PNFS_UPCALL_MSG_GETSIG 1
-+#define PNFS_UPCALL_MSG_GETSLICE 2
-+#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume
-+#define PNFS_UPCALL_MSG_DMGET 4
-+#define PNFS_UPCALL_MSG_VERS 5
++struct exp_xdr_stream {
++ __be32 *p;
++ __be32 *end;
++};
+
-+#define PNFS_UPCALL_VERS 8
++/**
++ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes
++ * @nbytes: number of bytes to encode
++ */
++static inline size_t
++exp_xdr_qwords(__u32 nbytes)
++{
++ return DIV_ROUND_UP(nbytes, 4);
++}
+
-+typedef struct stripe_dev {
-+ int major,
-+ minor,
-+ offset;
-+} stripe_dev_t;
++/**
++ * exp_xdr_qbytes - Calculate the number of bytes holding qwords
++ * @qwords: number of quad-words to encode
++ */
++static inline size_t
++exp_xdr_qbytes(size_t qwords)
++{
++ return qwords << 2;
++}
+
-+typedef struct bl_comm_res {
-+ int res_status;
-+ union {
-+ struct {
-+ long long start,
-+ length;
-+ } slice;
-+ struct {
-+ int num_stripes,
-+ stripe_size;
-+ stripe_dev_t devs[];
-+ } stripe;
-+ struct {
-+ long long sector;
-+ int offset,
-+ len;
-+ char sig[];
-+ } sig;
-+ int vers,
-+ dm_vol;
-+ } u;
-+} bl_comm_res_t;
++/**
++ * exp_xdr_reserve_space - Reserve buffer space for sending
++ * @xdr: pointer to exp_xdr_stream
++ * @nbytes: number of bytes to reserve
++ *
++ * Checks that we have enough buffer space to encode 'nbytes' more
++ * bytes of data. If so, update the xdr stream.
++ */
++static inline __be32 *
++exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes)
++{
++ __be32 *p = xdr->p;
++ __be32 *q;
+
-+typedef struct bl_comm_msg {
-+ int msg_type,
-+ msg_status;
-+ union {
-+ dev_t msg_dev;
-+ int msg_vers;
-+ } u;
-+ bl_comm_res_t *msg_res;
-+} bl_comm_msg_t;
++ /* align nbytes on the next 32-bit boundary */
++ q = p + exp_xdr_qwords(nbytes);
++ if (unlikely(q > xdr->end || q < p))
++ return NULL;
++ xdr->p = q;
++ return p;
++}
+
-+#ifdef __KERNEL__
++/**
++ * exp_xdr_reserve_qwords - Reserve buffer space for sending
++ * @xdr: pointer to exp_xdr_stream
++ * @qwords: number of quad words (u32's) to reserve
++ */
++static inline __be32 *
++exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords)
++{
++ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords));
++}
+
-+typedef struct bl_comm {
-+ /* ---- protects access to this structure ---- */
-+ struct mutex lock;
-+ /* ---- protects access to rpc pipe ---- */
-+ struct mutex pipe_lock;
-+ struct dentry *pipe_dentry;
-+ wait_queue_head_t pipe_wq;
-+ bl_comm_msg_t msg;
-+} bl_comm_t;
++/**
++ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream
++ * @p: pointer to encoding destination
++ * @val: value to encode
++ */
++static inline __be32 *
++exp_xdr_encode_u32(__be32 *p, __u32 val)
++{
++ *p = cpu_to_be32(val);
++ return p + 1;
++}
+
-+int pnfs_block_enabled(struct inode *, int);
-+int bl_layout_type(struct super_block *sb);
-+int bl_getdeviceiter(struct super_block *, u32 layout_type,
-+ struct nfsd4_pnfs_dev_iter_res *);
-+int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *,
-+ u32 layout_type,
-+ const struct nfsd4_pnfs_deviceid *);
-+enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *,
-+ const struct nfsd4_pnfs_layoutget_arg *,
-+ struct nfsd4_pnfs_layoutget_res *);
-+int bl_layoutcommit(struct inode *,
-+ const struct nfsd4_pnfs_layoutcommit_arg *,
-+ struct nfsd4_pnfs_layoutcommit_res *);
-+int bl_layoutreturn(struct inode *,
-+ const struct nfsd4_pnfs_layoutreturn_arg *);
-+int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len);
-+int bl_init_proc(void);
-+int bl_upcall(bl_comm_t *, bl_comm_msg_t *, bl_comm_res_t **);
++/**
++ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream
++ * @p: pointer to encoding destination
++ * @val: value to encode
++ */
++static inline __be32 *
++exp_xdr_encode_u64(__be32 *p, __u64 val)
++{
++ put_unaligned_be64(val, p);
++ return p + 2;
++}
+
-+extern bl_comm_t *bl_comm_global; // Ugly...
-+#endif /* __KERNEL__ */
++/**
++ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream
++ * @p: pointer to encoding destination
++ * @ptr: pointer to the array of bytes
++ * @nbytes: number of bytes to encode
++ */
++static inline __be32 *
++exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes)
++{
++ if (likely(nbytes != 0)) {
++ unsigned int qwords = exp_xdr_qwords(nbytes);
++ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes;
+
-+#endif /* NFSD4_BLOCK */
++ memcpy(p, ptr, nbytes);
++ if (padding != 0)
++ memset((char *)p + nbytes, 0, padding);
++ p += qwords;
++ }
++ return p;
++}
+
-diff -up linux-2.6.35.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.35.noarch/include/linux/nfsd4_spnfs.h
---- linux-2.6.35.noarch/include/linux/nfsd4_spnfs.h.orig 2010-09-30 12:25:08.811294000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfsd4_spnfs.h 2010-09-30 12:25:08.813294000 -0400
-@@ -0,0 +1,345 @@
-+/*
-+ * include/linux/nfsd4_spnfs.h
-+ *
-+ * spNFS - simple pNFS implementation with userspace daemon
++/**
++ * exp_xdr_encode_opaque - Encode an opaque type onto a xdr stream
++ * @p: pointer to encoding destination
++ * @ptr: pointer to the opaque array
++ * @nbytes: number of bytes to encode
+ *
++ * Encodes the 32-bit opaque size in bytes followed by the opaque value.
+ */
++static inline __be32 *
++exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes)
++{
++ p = exp_xdr_encode_u32(p, nbytes);
++ return exp_xdr_encode_bytes(p, ptr, nbytes);
++}
+
-+/******************************************************************************
-+
-+(c) 2007 Network Appliance, Inc. All Rights Reserved.
++/**
++ * exp_xdr_encode_opaque_len - Encode the opaque length onto a xdr stream
++ * @lenp: pointer to the opaque length destination
++ * @endp: pointer to the end of the opaque array
++ *
++ * Encodes the 32-bit opaque size in bytes given the start and end pointers
++ */
++static inline __be32 *
++exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp)
++{
++ size_t nbytes = (char *)endp - (char *)(lenp + 1);
+
-+Network Appliance provides this source code under the GPL v2 License.
-+The GPL v2 license is available at
-+http://opensource.org/licenses/gpl-license.php.
++ exp_xdr_encode_u32(lenp, nbytes);
++ return lenp + 1 + exp_xdr_qwords(nbytes);
++}
++#endif /* _LINUX_EXP_XDR_H */
+diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
+index a9cd507..225560d 100644
+--- a/include/linux/exportfs.h
++++ b/include/linux/exportfs.h
+@@ -2,6 +2,7 @@
+ #define LINUX_EXPORTFS_H 1
+
+ #include <linux/types.h>
++#include <linux/exp_xdr.h>
+
+ struct dentry;
+ struct inode;
+@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type,
+ struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
+
++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
++struct pnfs_filelayout_device;
++struct pnfs_filelayout_layout;
+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr,
++ const struct pnfs_filelayout_device *fdev);
++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr,
++ const struct pnfs_filelayout_layout *flp);
++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
+
-+******************************************************************************/
++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
++struct list_head;
+
-+#ifndef NFS_SPNFS_H
-+#define NFS_SPNFS_H
++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr,
++ const struct list_head *volumes);
+
++extern enum nfsstat4 blocklayout_encode_layout(struct exp_xdr_stream *xdr,
++ const struct list_head *layouts);
++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
+
-+#ifdef __KERNEL__
-+#include "exportfs.h"
-+#include "sunrpc/svc.h"
-+#include "nfsd/nfsfh.h"
-+#else
-+#include <sys/types.h>
-+#endif /* __KERNEL__ */
++#if defined(CONFIG_PNFSD)
++#include <linux/module.h>
+
-+#define SPNFS_STATUS_INVALIDMSG 0x01
-+#define SPNFS_STATUS_AGAIN 0x02
-+#define SPNFS_STATUS_FAIL 0x04
-+#define SPNFS_STATUS_SUCCESS 0x08
++struct pnfsd_cb_operations;
+
-+#define SPNFS_TYPE_LAYOUTGET 0x01
-+#define SPNFS_TYPE_LAYOUTCOMMIT 0x02
-+#define SPNFS_TYPE_LAYOUTRETURN 0x03
-+#define SPNFS_TYPE_GETDEVICEITER 0x04
-+#define SPNFS_TYPE_GETDEVICEINFO 0x05
-+#define SPNFS_TYPE_SETATTR 0x06
-+#define SPNFS_TYPE_OPEN 0x07
-+#define SPNFS_TYPE_CLOSE 0x08
-+#define SPNFS_TYPE_CREATE 0x09
-+#define SPNFS_TYPE_REMOVE 0x0a
-+#define SPNFS_TYPE_COMMIT 0x0b
-+#define SPNFS_TYPE_READ 0x0c
-+#define SPNFS_TYPE_WRITE 0x0d
++struct pnfsd_cb_ctl {
++ spinlock_t lock;
++ struct module *module;
++ const struct pnfsd_cb_operations *cb_op;
++};
+
-+#define SPNFS_MAX_DEVICES 1
-+#define SPNFS_MAX_DATA_SERVERS 16
-+#define SPNFS_MAX_IO 512
++/* in expfs.c so that file systems can depend on it */
++extern struct pnfsd_cb_ctl pnfsd_cb_ctl;
+
-+/* layout */
-+struct spnfs_msg_layoutget_args {
-+ unsigned long inode;
-+ unsigned long generation;
-+};
++static inline int
++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl)
++{
++ int ret = -ENOENT;
+
-+struct spnfs_filelayout_list {
-+ u_int32_t fh_len;
-+ unsigned char fh_val[128]; /* DMXXX fix this const */
-+};
++ spin_lock(&pnfsd_cb_ctl.lock);
++ if (!pnfsd_cb_ctl.cb_op)
++ goto out;
++ if (!try_module_get(pnfsd_cb_ctl.module))
++ goto out;
++ ctl->cb_op = pnfsd_cb_ctl.cb_op;
++ ctl->module = pnfsd_cb_ctl.module;
++ ret = 0;
++out:
++ spin_unlock(&pnfsd_cb_ctl.lock);
++ return ret;
++}
+
-+struct spnfs_msg_layoutget_res {
-+ int status;
-+ u_int64_t devid;
-+ u_int64_t stripe_size;
-+ u_int32_t stripe_type;
-+ u_int32_t stripe_count;
-+ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS];
-+};
++static inline void
++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl)
++{
++ module_put(ctl->module);
++}
++#endif /* CONFIG_PNFSD */
+ #endif /* LINUX_EXPORTFS_H */
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index 63d069b..3a8601a 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -388,6 +388,7 @@ struct inodes_stat_t {
+ #include <asm/byteorder.h>
+
+ struct export_operations;
++struct pnfs_export_operations;
+ struct hd_geometry;
+ struct iovec;
+ struct nameidata;
+@@ -1327,6 +1328,7 @@ struct super_block {
+ const struct dquot_operations *dq_op;
+ const struct quotactl_ops *s_qcop;
+ const struct export_operations *s_export_op;
++ const struct pnfs_export_operations *s_pnfs_op;
+ unsigned long s_flags;
+ unsigned long s_magic;
+ struct dentry *s_root;
+diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
+index 07e40c6..df29296 100644
+--- a/include/linux/nfs4.h
++++ b/include/linux/nfs4.h
+@@ -17,7 +17,10 @@
+
+ #define NFS4_BITMAP_SIZE 2
+ #define NFS4_VERIFIER_SIZE 8
+-#define NFS4_STATEID_SIZE 16
++#define NFS4_CLIENTID_SIZE 8
++#define NFS4_STATEID_SEQID_SIZE 4
++#define NFS4_STATEID_OTHER_SIZE 12
++#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE)
+ #define NFS4_FHSIZE 128
+ #define NFS4_MAXPATHLEN PATH_MAX
+ #define NFS4_MAXNAMLEN NAME_MAX
+@@ -119,6 +122,13 @@
+ #define EXCHGID4_FLAG_MASK_A 0x40070003
+ #define EXCHGID4_FLAG_MASK_R 0x80070003
+
++static inline bool
++is_ds_only_session(u32 exchange_flags)
++{
++ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS;
++ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS;
++}
+
-+/* layoutcommit */
-+struct spnfs_msg_layoutcommit_args {
-+ unsigned long inode;
-+ unsigned long generation;
-+ u_int64_t file_size;
+ #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001
+ #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002
+ #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004
+@@ -166,8 +176,23 @@ struct nfs4_acl {
+ struct nfs4_ace aces[0];
+ };
+
++struct nfs4_fsid {
++ u64 major;
++ u64 minor;
+};
+
-+struct spnfs_msg_layoutcommit_res {
-+ int status;
-+};
+ typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
+-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid;
++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid;
+
-+/* layoutreturn */
-+/* No op for the daemon */
-+/*
-+struct spnfs_msg_layoutreturn_args {
-+};
++struct nfs41_stateid {
++ __be32 seqid;
++ char other[NFS4_STATEID_OTHER_SIZE];
++} __attribute__ ((packed));
+
-+struct spnfs_msg_layoutreturn_res {
-+};
-+*/
++typedef union {
++ char data[NFS4_STATEID_SIZE];
++ struct nfs41_stateid stateid;
++} nfs4_stateid;
+
+ enum nfs_opnum4 {
+ OP_ACCESS = 3,
+@@ -471,6 +496,8 @@ enum lock_type4 {
+ #define FATTR4_WORD1_TIME_MODIFY (1UL << 21)
+ #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22)
+ #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23)
++#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30)
++#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1)
+
+ #define NFSPROC4_NULL 0
+ #define NFSPROC4_COMPOUND 1
+@@ -532,6 +559,13 @@ enum {
+ NFSPROC4_CLNT_SEQUENCE,
+ NFSPROC4_CLNT_GET_LEASE_TIME,
+ NFSPROC4_CLNT_RECLAIM_COMPLETE,
++ NFSPROC4_CLNT_LAYOUTGET,
++ NFSPROC4_CLNT_LAYOUTCOMMIT,
++ NFSPROC4_CLNT_LAYOUTRETURN,
++ NFSPROC4_CLNT_GETDEVICELIST,
++ NFSPROC4_CLNT_GETDEVICEINFO,
++ NFSPROC4_CLNT_PNFS_WRITE,
++ NFSPROC4_CLNT_PNFS_COMMIT,
+ };
+
+ /* nfs41 types */
+@@ -550,6 +584,51 @@ enum state_protect_how4 {
+ SP4_SSV = 2
+ };
+
++enum pnfs_layouttype {
++ LAYOUT_NFSV4_1_FILES = 1,
++ LAYOUT_OSD2_OBJECTS = 2,
++ LAYOUT_BLOCK_VOLUME = 3,
+
-+/* getdeviceiter */
-+struct spnfs_msg_getdeviceiter_args {
-+ unsigned long inode;
-+ u_int64_t cookie;
-+ u_int64_t verf;
++ NFS4_PNFS_PRIVATE_LAYOUT = 0x80000000
+};
+
-+struct spnfs_msg_getdeviceiter_res {
-+ int status;
-+ u_int64_t devid;
-+ u_int64_t cookie;
-+ u_int64_t verf;
-+ u_int32_t eof;
++/* used for both layout return and recall */
++enum pnfs_layoutreturn_type {
++ RETURN_FILE = 1,
++ RETURN_FSID = 2,
++ RETURN_ALL = 3
+};
+
-+/* getdeviceinfo */
-+struct spnfs_data_server {
-+ u_int32_t dsid;
-+ char netid[5];
-+ char addr[29];
++enum pnfs_iomode {
++ IOMODE_READ = 1,
++ IOMODE_RW = 2,
++ IOMODE_ANY = 3,
+};
+
-+struct spnfs_device {
-+ u_int64_t devid;
-+ int dscount;
-+ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS];
++enum pnfs_notify_deviceid_type4 {
++ NOTIFY_DEVICEID4_CHANGE = 1 << 1,
++ NOTIFY_DEVICEID4_DELETE = 1 << 2,
+};
+
-+struct spnfs_msg_getdeviceinfo_args {
-+ u_int64_t devid;
-+};
++#define NFL4_UFLG_MASK 0x0000003F
++#define NFL4_UFLG_DENSE 0x00000001
++#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002
++#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0
+
-+struct spnfs_msg_getdeviceinfo_res {
-+ int status;
-+ struct spnfs_device devinfo;
++/* Encoded in the loh_body field of type layouthint4 */
++enum filelayout_hint_care4 {
++ NFLH4_CARE_DENSE = NFL4_UFLG_DENSE,
++ NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS,
++ NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040,
++ NFLH4_CARE_STRIPE_COUNT = 0x00000080
+};
+
-+/* setattr */
-+struct spnfs_msg_setattr_args {
-+ unsigned long inode;
-+ unsigned long generation;
-+ int file_size;
-+};
++#define NFS4_DEVICEID4_SIZE 16
+
-+struct spnfs_msg_setattr_res {
-+ int status;
++struct nfs4_deviceid {
++ char data[NFS4_DEVICEID4_SIZE];
+};
+
-+/* open */
-+struct spnfs_msg_open_args {
-+ unsigned long inode;
-+ unsigned long generation;
-+ int create;
-+ int createmode;
-+ int truncate;
-+};
+ #endif
+ #endif
+
+diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
+index 508f8cf..27c45cc 100644
+--- a/include/linux/nfs_fs.h
++++ b/include/linux/nfs_fs.h
+@@ -188,6 +188,10 @@ struct nfs_inode {
+ struct nfs_delegation *delegation;
+ fmode_t delegation_state;
+ struct rw_semaphore rwsem;
+
-+struct spnfs_msg_open_res {
-+ int status;
-+};
++ /* pNFS layout information */
++ struct rpc_wait_queue lo_rpcwaitq;
++ struct pnfs_layout_hdr *layout;
+ #endif /* CONFIG_NFS_V4*/
+ #ifdef CONFIG_NFS_FSCACHE
+ struct fscache_cookie *fscache;
+@@ -490,8 +494,12 @@ extern void nfs_unblock_sillyrename(struct dentry *dentry);
+ extern int nfs_congestion_kb;
+ extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
+ extern int nfs_writepages(struct address_space *, struct writeback_control *);
+-extern int nfs_flush_incompatible(struct file *file, struct page *page);
+-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
++struct pnfs_layout_segment;
++extern int nfs_flush_incompatible(struct file *file, struct page *page,
++ struct pnfs_layout_segment *lseg);
++extern int nfs_updatepage(struct file *, struct page *,
++ unsigned int offset, unsigned int count,
++ struct pnfs_layout_segment *lseg, void *fsdata);
+ extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
+
+ /*
+@@ -613,6 +621,8 @@ extern void * nfs_root_data(void);
+ #define NFSDBG_CLIENT 0x0200
+ #define NFSDBG_MOUNT 0x0400
+ #define NFSDBG_FSCACHE 0x0800
++#define NFSDBG_PNFS 0x1000
++#define NFSDBG_PNFS_LD 0x2000
+ #define NFSDBG_ALL 0xFFFF
+
+ #ifdef __KERNEL__
+diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
+index c82ee7c..81121d0 100644
+--- a/include/linux/nfs_fs_sb.h
++++ b/include/linux/nfs_fs_sb.h
+@@ -82,6 +82,8 @@ struct nfs_client {
+ /* The flags used for obtaining the clientid during EXCHANGE_ID */
+ u32 cl_exchange_flags;
+ struct nfs4_session *cl_session; /* sharred session */
++ struct list_head cl_layouts;
++ struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
+ #endif /* CONFIG_NFS_V4_1 */
+
+ #ifdef CONFIG_NFS_FSCACHE
+@@ -89,6 +91,16 @@ struct nfs_client {
+ #endif
+ };
+
++static inline bool
++is_ds_only_client(struct nfs_client *clp)
++{
++#ifdef CONFIG_NFS_V4_1
++ return is_ds_only_session(clp->cl_exchange_flags);
++#else
++ return false;
++#endif
++}
+
-+/* close */
-+/* No op for daemon */
-+struct spnfs_msg_close_args {
-+ int x;
-+};
+ /*
+ * NFS client parameters stored in the superblock.
+ */
+@@ -133,7 +145,7 @@ struct nfs_server {
+ #endif
+
+ #ifdef CONFIG_NFS_V4
+- u32 attr_bitmask[2];/* V4 bitmask representing the set
++ u32 attr_bitmask[3];/* V4 bitmask representing the set
+ of attributes supported on this
+ filesystem */
+ u32 cache_consistency_bitmask[2];
+@@ -144,6 +156,11 @@ struct nfs_server {
+ u32 acl_bitmask; /* V4 bitmask representing the ACEs
+ that are supported on this
+ filesystem */
++ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
++ void *pnfs_ld_data; /* Per-mount data */
++ unsigned int ds_rsize; /* Data server read size */
++ unsigned int ds_wsize; /* Data server write size */
++ u32 pnfs_blksize; /* layout_blksize attr */
+ #endif
+ void (*destroy)(struct nfs_server *);
+
+diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h
+index 68b10f5..f9b5f44 100644
+--- a/include/linux/nfs_iostat.h
++++ b/include/linux/nfs_iostat.h
+@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters {
+ NFSIOS_SHORTREAD,
+ NFSIOS_SHORTWRITE,
+ NFSIOS_DELAY,
++ NFSIOS_PNFS_READ,
++ NFSIOS_PNFS_WRITE,
++ NFSIOS_PNFS_COMMIT,
+ __NFSIOS_COUNTSMAX,
+ };
+
+diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
+index f8b60e7..6fa43c7 100644
+--- a/include/linux/nfs_page.h
++++ b/include/linux/nfs_page.h
+@@ -48,6 +48,7 @@ struct nfs_page {
+ struct kref wb_kref; /* reference count */
+ unsigned long wb_flags;
+ struct nfs_writeverf wb_verf; /* Commit cookie */
++ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */
+ };
+
+ struct nfs_pageio_descriptor {
+@@ -61,6 +62,11 @@ struct nfs_pageio_descriptor {
+ int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int);
+ int pg_ioflags;
+ int pg_error;
++ struct pnfs_layout_segment *pg_lseg;
++#ifdef CONFIG_NFS_V4_1
++ int pg_iswrite;
++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
++#endif /* CONFIG_NFS_V4_1 */
+ };
+
+ #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags))
+@@ -69,13 +75,15 @@ extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
+ struct inode *inode,
+ struct page *page,
+ unsigned int offset,
+- unsigned int count);
++ unsigned int count,
++ struct pnfs_layout_segment *lseg);
+ extern void nfs_clear_request(struct nfs_page *req);
+ extern void nfs_release_request(struct nfs_page *req);
+
+
+ extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
+- pgoff_t idx_start, unsigned int npages, int tag);
++ pgoff_t idx_start, unsigned int npages, int tag,
++ int *use_pnfs);
+ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
+ struct inode *inode,
+ int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
+diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
+index fc46192..63c0301 100644
+--- a/include/linux/nfs_xdr.h
++++ b/include/linux/nfs_xdr.h
+@@ -3,6 +3,8 @@
+
+ #include <linux/nfsacl.h>
+ #include <linux/nfs3.h>
++#include <linux/nfs4.h>
++#include <linux/sunrpc/sched.h>
+
+ /*
+ * To change the maximum rsize and wsize supported by the NFS client, adjust
+@@ -10,7 +12,7 @@
+ * support a megabyte or more. The default is left at 4096 bytes, which is
+ * reasonable for NFS over UDP.
+ */
+-#define NFS_MAX_FILE_IO_SIZE (1048576U)
++#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U)
+ #define NFS_DEF_FILE_IO_SIZE (4096U)
+ #define NFS_MIN_FILE_IO_SIZE (1024U)
+
+@@ -113,6 +115,8 @@ struct nfs_fsinfo {
+ __u32 dtpref; /* pref. readdir transfer size */
+ __u64 maxfilesize;
+ __u32 lease_time; /* in seconds */
++ __u32 layouttype; /* supported pnfs layout driver */
++ __u32 blksize; /* preferred pnfs io block size */
+ };
+
+ struct nfs_fsstat {
+@@ -185,6 +189,123 @@ struct nfs4_get_lease_time_res {
+ struct nfs4_sequence_res lr_seq_res;
+ };
+
++#define PNFS_LAYOUT_MAXSIZE 4096
+
-+struct spnfs_msg_close_res {
-+ int y;
++struct nfs4_layoutdriver_data {
++ __u32 len;
++ void *buf;
+};
+
-+/* create */
-+/*
-+struct spnfs_msg_create_args {
-+ int x;
++struct pnfs_layout_range {
++ u32 iomode;
++ u64 offset;
++ u64 length;
+};
+
-+struct spnfs_msg_create_res {
-+ int y;
++struct nfs4_layoutget_args {
++ __u32 type;
++ struct pnfs_layout_range range;
++ __u64 minlength;
++ __u32 maxcount;
++ struct inode *inode;
++ struct nfs_open_context *ctx;
++ struct nfs4_sequence_args seq_args;
+};
-+*/
+
-+/* remove */
-+struct spnfs_msg_remove_args {
-+ unsigned long inode;
-+ unsigned long generation;
++struct nfs4_layoutget_res {
++ __u32 return_on_close;
++ struct pnfs_layout_range range;
++ __u32 type;
++ nfs4_stateid stateid;
++ struct nfs4_layoutdriver_data layout;
++ struct nfs4_sequence_res seq_res;
+};
+
-+struct spnfs_msg_remove_res {
++struct nfs4_layoutget {
++ struct nfs4_layoutget_args args;
++ struct nfs4_layoutget_res res;
++ struct pnfs_layout_segment **lsegpp;
+ int status;
+};
+
-+/* commit */
-+/*
-+struct spnfs_msg_commit_args {
-+ int x;
-+};
++struct nfs4_layoutcommit_args {
++ nfs4_stateid stateid;
++ __u64 lastbytewritten;
++ __u32 time_modify_changed;
++ struct timespec time_modify;
++ const u32 *bitmask;
++ struct nfs_fh *fh;
++ struct inode *inode;
+
-+struct spnfs_msg_commit_res {
-+ int y;
++ /* Values set by layout driver */
++ struct pnfs_layout_range range;
++ __u32 layout_type;
++ void *layoutdriver_data;
++ struct nfs4_sequence_args seq_args;
+};
-+*/
+
-+/* read */
-+struct spnfs_msg_read_args {
-+ unsigned long inode;
-+ unsigned long generation;
-+ loff_t offset;
-+ unsigned long len;
++struct nfs4_layoutcommit_res {
++ __u32 sizechanged;
++ __u64 newsize;
++ struct nfs_fattr *fattr;
++ const struct nfs_server *server;
++ struct nfs4_sequence_res seq_res;
+};
+
-+struct spnfs_msg_read_res {
++struct nfs4_layoutcommit_data {
++ struct rpc_task task;
++ struct rpc_cred *cred;
++ struct nfs_fattr fattr;
++ struct nfs4_layoutcommit_args args;
++ struct nfs4_layoutcommit_res res;
+ int status;
-+ char data[SPNFS_MAX_IO];
+};
+
-+/* write */
-+struct spnfs_msg_write_args {
-+ unsigned long inode;
-+ unsigned long generation;
-+ loff_t offset;
-+ unsigned long len;
-+ char data[SPNFS_MAX_IO];
++struct nfs4_layoutreturn_args {
++ __u32 reclaim;
++ __u32 layout_type;
++ __u32 return_type;
++ struct pnfs_layout_range range;
++ struct inode *inode;
++ struct nfs4_sequence_args seq_args;
+};
+
-+struct spnfs_msg_write_res {
-+ int status;
++struct nfs4_layoutreturn_res {
++ struct nfs4_sequence_res seq_res;
++ bool valid; /* internal, true if received reply */
++ u32 lrs_present;
++ nfs4_stateid stateid;
+};
+
-+/* bundle args and responses */
-+union spnfs_msg_args {
-+ struct spnfs_msg_layoutget_args layoutget_args;
-+ struct spnfs_msg_layoutcommit_args layoutcommit_args;
-+/*
-+ struct spnfs_msg_layoutreturn_args layoutreturn_args;
-+*/
-+ struct spnfs_msg_getdeviceiter_args getdeviceiter_args;
-+ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args;
-+ struct spnfs_msg_setattr_args setattr_args;
-+ struct spnfs_msg_open_args open_args;
-+ struct spnfs_msg_close_args close_args;
-+/*
-+ struct spnfs_msg_create_args create_args;
-+*/
-+ struct spnfs_msg_remove_args remove_args;
-+/*
-+ struct spnfs_msg_commit_args commit_args;
-+*/
-+ struct spnfs_msg_read_args read_args;
-+ struct spnfs_msg_write_args write_args;
++struct nfs4_layoutreturn {
++ struct nfs4_layoutreturn_args args;
++ struct nfs4_layoutreturn_res res;
++ struct rpc_cred *cred;
++ const nfs4_stateid *stateid;
++ int rpc_status;
+};
+
-+union spnfs_msg_res {
-+ struct spnfs_msg_layoutget_res layoutget_res;
-+ struct spnfs_msg_layoutcommit_res layoutcommit_res;
-+/*
-+ struct spnfs_msg_layoutreturn_res layoutreturn_res;
-+*/
-+ struct spnfs_msg_getdeviceiter_res getdeviceiter_res;
-+ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res;
-+ struct spnfs_msg_setattr_res setattr_res;
-+ struct spnfs_msg_open_res open_res;
-+ struct spnfs_msg_close_res close_res;
-+/*
-+ struct spnfs_msg_create_res create_res;
-+*/
-+ struct spnfs_msg_remove_res remove_res;
-+/*
-+ struct spnfs_msg_commit_res commit_res;
-+*/
-+ struct spnfs_msg_read_res read_res;
-+ struct spnfs_msg_write_res write_res;
++struct nfs4_getdevicelist_args {
++ const struct nfs_fh *fh;
++ u32 layoutclass;
++ struct nfs4_sequence_args seq_args;
+};
+
-+/* a spnfs message, args and response */
-+struct spnfs_msg {
-+ unsigned char im_type;
-+ unsigned char im_status;
-+ union spnfs_msg_args im_args;
-+ union spnfs_msg_res im_res;
++struct nfs4_getdevicelist_res {
++ struct pnfs_devicelist *devlist;
++ struct nfs4_sequence_res seq_res;
+};
+
-+/* spnfs configuration info */
-+struct spnfs_config {
-+ unsigned char dense_striping;
-+ int stripe_size;
-+ int num_ds;
-+ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */
++struct nfs4_getdeviceinfo_args {
++ struct pnfs_device *pdev;
++ struct nfs4_sequence_args seq_args;
+};
+
-+#if defined(__KERNEL__) && defined(CONFIG_SPNFS)
-+
-+#include <linux/nfsd/nfsd4_pnfs.h>
-+
-+/* pipe mgmt structure. messages flow through here */
-+struct spnfs {
-+ struct dentry *spnfs_dentry; /* dentry for pipe */
-+ wait_queue_head_t spnfs_wq;
-+ struct spnfs_msg spnfs_im; /* spnfs message */
-+ struct mutex spnfs_lock; /* Serializes upcalls */
-+ struct mutex spnfs_plock;
++struct nfs4_getdeviceinfo_res {
++ struct pnfs_device *pdev;
++ struct nfs4_sequence_res seq_res;
+};
+
-+struct nfsd4_open;
-+
-+int spnfs_layout_type(struct super_block *);
-+enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr,
-+ const struct nfsd4_pnfs_layoutget_arg *,
-+ struct nfsd4_pnfs_layoutget_res *);
-+int spnfs_layoutcommit(void);
-+int spnfs_layoutreturn(struct inode *,
-+ const struct nfsd4_pnfs_layoutreturn_arg *);
-+int spnfs_getdeviceiter(struct super_block *,
-+ u32 layout_type,
-+ struct nfsd4_pnfs_dev_iter_res *);
-+int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *,
-+ u32 layout_type,
-+ const struct nfsd4_pnfs_deviceid *);
-+int spnfs_setattr(void);
-+int spnfs_open(struct inode *, struct nfsd4_open *);
-+int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *);
-+int spnfs_remove(unsigned long, unsigned long);
-+__be32 spnfs_read(struct inode *, loff_t, unsigned long *,
-+ int, struct svc_rqst *);
-+__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *);
-+int spnfs_getfh(int, struct nfs_fh *);
-+int spnfs_test_layoutrecall(char *, u64, u64);
-+int spnfs_layoutrecall(struct inode *, int, u64, u64);
+ /*
+ * Arguments to the open call.
+ */
+@@ -854,7 +975,7 @@ struct nfs4_server_caps_arg {
+ };
+
+ struct nfs4_server_caps_res {
+- u32 attr_bitmask[2];
++ u32 attr_bitmask[3];
+ u32 acl_bitmask;
+ u32 has_links;
+ u32 has_symlinks;
+@@ -969,6 +1090,30 @@ struct nfs_page;
+
+ #define NFS_PAGEVEC_SIZE (8U)
+
++#if defined(CONFIG_NFS_V4_1)
+
-+int nfsd_spnfs_new(void);
-+void nfsd_spnfs_delete(void);
-+int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *);
-+int spnfs_enabled(void);
-+int spnfs_init_proc(void);
++/* pnfsflag values */
++enum pnfs_flags {
++ PNFS_NO_RPC = 1 << 0, /* non rpc result callback switch */
++};
+
-+extern struct spnfs_config *spnfs_config;
++/* pnfs-specific data needed for read, write, and commit calls */
++struct pnfs_call_data {
++ struct pnfs_layout_segment *lseg;
++ const struct rpc_call_ops *call_ops;
++ u32 orig_count; /* for retry via MDS */
++ int pnfs_error;
++ u8 pnfsflags;
++ u8 how; /* for FLUSH_STABLE */
++};
+
-+#endif /* __KERNEL__ && CONFIG_SPNFS */
++/* files layout-type specific data for read, write, and commit */
++struct pnfs_fl_call_data {
++ struct nfs_client *ds_nfs_client;
++ __u64 orig_offset;
++};
++#endif /* CONFIG_NFS_V4_1 */
+
-+#endif /* NFS_SPNFS_H */
-diff -up linux-2.6.35.noarch/include/linux/nfsd/const.h.orig linux-2.6.35.noarch/include/linux/nfsd/const.h
---- linux-2.6.35.noarch/include/linux/nfsd/const.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfsd/const.h 2010-09-30 12:25:08.759293000 -0400
+ struct nfs_read_data {
+ int flags;
+ struct rpc_task task;
+@@ -984,10 +1129,16 @@ struct nfs_read_data {
+ #ifdef CONFIG_NFS_V4
+ unsigned long timestamp; /* For lease renewal */
+ #endif
++#if defined(CONFIG_NFS_V4_1)
++ struct pnfs_call_data pdata;
++ struct pnfs_fl_call_data fldata;
++#endif /* CONFIG_NFS_V4_1 */
+ struct page *page_array[NFS_PAGEVEC_SIZE];
+ };
+
+ struct nfs_write_data {
++ struct kref refcount; /* For pnfs commit splitting */
++ struct nfs_write_data *parent; /* For pnfs commit splitting */
+ int flags;
+ struct rpc_task task;
+ struct inode *inode;
+@@ -1003,6 +1154,10 @@ struct nfs_write_data {
+ #ifdef CONFIG_NFS_V4
+ unsigned long timestamp; /* For lease renewal */
+ #endif
++#if defined(CONFIG_NFS_V4_1)
++ struct pnfs_call_data pdata;
++ struct pnfs_fl_call_data fldata;
++#endif /* CONFIG_NFS_V4_1 */
+ struct page *page_array[NFS_PAGEVEC_SIZE];
+ };
+
+diff --git a/include/linux/nfsd/const.h b/include/linux/nfsd/const.h
+index 323f8cf..520fcfb 100644
+--- a/include/linux/nfsd/const.h
++++ b/include/linux/nfsd/const.h
@@ -29,6 +29,7 @@
#ifdef __KERNEL__
@@ -27101,9 +27077,10 @@ diff -up linux-2.6.35.noarch/include/linux/nfsd/const.h.orig linux-2.6.35.noarch
/*
* Largest number of bytes we need to allocate for an NFS
-diff -up linux-2.6.35.noarch/include/linux/nfsd/debug.h.orig linux-2.6.35.noarch/include/linux/nfsd/debug.h
---- linux-2.6.35.noarch/include/linux/nfsd/debug.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfsd/debug.h 2010-09-30 12:25:08.764292000 -0400
+diff --git a/include/linux/nfsd/debug.h b/include/linux/nfsd/debug.h
+index ee4aa91..aad7013 100644
+--- a/include/linux/nfsd/debug.h
++++ b/include/linux/nfsd/debug.h
@@ -32,6 +32,8 @@
#define NFSDDBG_REPCACHE 0x0080
#define NFSDDBG_XDR 0x0100
@@ -27113,9 +27090,10 @@ diff -up linux-2.6.35.noarch/include/linux/nfsd/debug.h.orig linux-2.6.35.noarch
#define NFSDDBG_ALL 0x7FFF
#define NFSDDBG_NOCHANGE 0xFFFF
-diff -up linux-2.6.35.noarch/include/linux/nfsd/export.h.orig linux-2.6.35.noarch/include/linux/nfsd/export.h
---- linux-2.6.35.noarch/include/linux/nfsd/export.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfsd/export.h 2010-09-30 12:25:08.769292000 -0400
+diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
+index 8ae78a6..933ab78 100644
+--- a/include/linux/nfsd/export.h
++++ b/include/linux/nfsd/export.h
@@ -100,6 +100,7 @@ struct svc_export {
uid_t ex_anon_uid;
gid_t ex_anon_gid;
@@ -27124,9 +27102,11 @@ diff -up linux-2.6.35.noarch/include/linux/nfsd/export.h.orig linux-2.6.35.noarc
unsigned char * ex_uuid; /* 16 byte fsid */
struct nfsd4_fs_locations ex_fslocs;
int ex_nflavors;
-diff -up linux-2.6.35.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.35.noarch/include/linux/nfsd/nfs4layoutxdr.h
---- linux-2.6.35.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-09-30 12:25:08.772295000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-09-30 12:25:08.774292000 -0400
+diff --git a/include/linux/nfsd/nfs4layoutxdr.h b/include/linux/nfsd/nfs4layoutxdr.h
+new file mode 100644
+index 0000000..b02d96a
+--- /dev/null
++++ b/include/linux/nfsd/nfs4layoutxdr.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2006 The Regents of the University of Michigan.
@@ -27260,9 +27240,11 @@ diff -up linux-2.6.35.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.3
+} pnfs_blocklayout_devinfo_t;
+
+#endif /* NFSD_NFS4LAYOUTXDR_H */
-diff -up linux-2.6.35.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.35.noarch/include/linux/nfsd/nfs4pnfsdlm.h
---- linux-2.6.35.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-09-30 12:25:08.777291000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-09-30 12:25:08.778297000 -0400
+diff --git a/include/linux/nfsd/nfs4pnfsdlm.h b/include/linux/nfsd/nfs4pnfsdlm.h
+new file mode 100644
+index 0000000..eb31123
+--- /dev/null
++++ b/include/linux/nfsd/nfs4pnfsdlm.h
@@ -0,0 +1,54 @@
+/******************************************************************************
+ *
@@ -27304,700 +27286,800 @@ diff -up linux-2.6.35.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.35.
+/* For use by DLM cluster file systems exported by pNFSD */
+extern const struct pnfs_export_operations pnfs_dlm_export_ops;
+
-+int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len);
++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len);
++
++void nfsd4_pnfs_dlm_shutdown(void);
++
++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen);
++
++#else /* CONFIG_PNFSD */
++
++static inline void nfsd4_pnfs_dlm_shutdown(void)
++{
++ return;
++}
++
++#endif /* CONFIG_PNFSD */
+diff --git a/include/linux/nfsd/nfsd4_pnfs.h b/include/linux/nfsd/nfsd4_pnfs.h
+new file mode 100644
+index 0000000..2e66837
+--- /dev/null
++++ b/include/linux/nfsd/nfsd4_pnfs.h
+@@ -0,0 +1,271 @@
++/*
++ * Copyright (c) 2006 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson <andros at umich.edu>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
++
++#ifndef _LINUX_NFSD_NFSD4_PNFS_H
++#define _LINUX_NFSD_NFSD4_PNFS_H
++
++#include <linux/exportfs.h>
++#include <linux/exp_xdr.h>
++#include <linux/nfs_xdr.h>
++
++struct nfsd4_pnfs_deviceid {
++ u64 sbid; /* per-superblock unique ID */
++ u64 devid; /* filesystem-wide unique device ID */
++};
++
++struct nfsd4_pnfs_dev_iter_res {
++ u64 gd_cookie; /* request/response */
++ u64 gd_verf; /* request/response */
++ u64 gd_devid; /* response */
++ u32 gd_eof; /* response */
++};
++
++/* Arguments for set_device_notify */
++struct pnfs_devnotify_arg {
++ struct nfsd4_pnfs_deviceid dn_devid; /* request */
++ u32 dn_layout_type; /* request */
++ u32 dn_notify_types; /* request/response */
++};
++
++struct nfsd4_layout_seg {
++ u64 clientid;
++ u32 layout_type;
++ u32 iomode;
++ u64 offset;
++ u64 length;
++};
++
++/* Used by layout_get to encode layout (loc_body var in spec)
++ * Args:
++ * minlength - min number of accessible bytes given by layout
++ * fsid - Major part of struct pnfs_deviceid. File system uses this
++ * to build the deviceid returned in the layout.
++ * fh - fs can modify the file handle for use on data servers
++ * seg - layout info requested and layout info returned
++ * xdr - xdr info
++ * return_on_close - true if layout to be returned on file close
++ */
++
++struct nfsd4_pnfs_layoutget_arg {
++ u64 lg_minlength;
++ u64 lg_sbid;
++ const struct knfsd_fh *lg_fh;
++};
++
++struct nfsd4_pnfs_layoutget_res {
++ struct nfsd4_layout_seg lg_seg; /* request/response */
++ u32 lg_return_on_close;
++};
++
++struct nfsd4_pnfs_layoutcommit_arg {
++ struct nfsd4_layout_seg lc_seg; /* request */
++ u32 lc_reclaim; /* request */
++ u32 lc_newoffset; /* request */
++ u64 lc_last_wr; /* request */
++ struct nfstime4 lc_mtime; /* request */
++ u32 lc_up_len; /* layout length */
++ void *lc_up_layout; /* decoded by callback */
++};
++
++struct nfsd4_pnfs_layoutcommit_res {
++ u32 lc_size_chg; /* boolean for response */
++ u64 lc_newsize; /* response */
++};
++
++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */
++
++struct nfsd4_pnfs_layoutreturn_arg {
++ u32 lr_return_type; /* request */
++ struct nfsd4_layout_seg lr_seg; /* request */
++ u32 lr_reclaim; /* request */
++ u32 lrf_body_len; /* request */
++ void *lrf_body; /* request */
++ void *lr_cookie; /* fs private */
++};
++
++/* pNFS Metadata to Data server state communication */
++struct pnfs_get_state {
++ u32 dsid; /* request */
++ u64 ino; /* request */
++ nfs4_stateid stid; /* request;response */
++ nfs4_clientid clid; /* response */
++ u32 access; /* response */
++ u32 stid_gen; /* response */
++ u32 verifier[2]; /* response */
++};
++
++/*
++ * pNFS export operations vector.
++ *
++ * The filesystem must implement the following methods:
++ * layout_type
++ * get_device_info
++ * layout_get
++ *
++ * All other methods are optional and can be set to NULL if not implemented.
++ */
++struct pnfs_export_operations {
++ /* Returns the supported pnfs_layouttype4. */
++ int (*layout_type) (struct super_block *);
++
++ /* Encode device info onto the xdr stream. */
++ int (*get_device_info) (struct super_block *,
++ struct exp_xdr_stream *,
++ u32 layout_type,
++ const struct nfsd4_pnfs_deviceid *);
++
++ /* Retrieve all available devices via an iterator.
++ * arg->cookie == 0 indicates the beginning of the list,
++ * otherwise arg->verf is used to verify that the list hasn't changed
++ * while retrieved.
++ *
++ * On output, the filesystem sets the devid based on the current cookie
++ * and sets res->cookie and res->verf corresponding to the next entry.
++ * When the last entry in the list is retrieved, res->eof is set to 1.
++ */
++ int (*get_device_iter) (struct super_block *,
++ u32 layout_type,
++ struct nfsd4_pnfs_dev_iter_res *);
++
++ int (*set_device_notify) (struct super_block *,
++ struct pnfs_devnotify_arg *);
++
++ /* Retrieve and encode a layout for inode onto the xdr stream.
++ * arg->minlength is the minimum number of accessible bytes required
++ * by the client.
++ * The maximum number of bytes to encode the layout is given by
++ * the xdr stream end pointer.
++ * arg->fsid contains the major part of struct pnfs_deviceid.
++ * The file system uses this to build the deviceid returned
++ * in the layout.
++ * res->seg - layout segment requested and layout info returned.
++ * res->fh - file handle, can be modified for use on data servers
++ * res->return_on_close - true if layout to be returned on file close
++ *
++ * return one of the following nfs errors:
++ * NFS_OK Success
++ * NFS4ERR_ACCESS Permission error
++ * NFS4ERR_BADIOMODE Server does not support requested iomode
++ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules
++ * NFS4ERR_INVAL Parameter other than layout is invalid
++ * NFS4ERR_IO I/O error
++ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later
++ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file
++ * NFS4ERR_LOCKED Lock conflict
++ * NFS4ERR_NOSPC Out-of-space error occurred
++ * NFS4ERR_RECALLCONFLICT Layout currently unavailable due to
++ * a conflicting CB_LAYOUTRECALL
++ * NFS4ERR_SERVERFAULT Server went berserk
++ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout
++ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file)
++ */
++ enum nfsstat4 (*layout_get) (struct inode *,
++ struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_layoutget_arg *,
++ struct nfsd4_pnfs_layoutget_res *);
++
++ /* Commit changes to layout */
++ int (*layout_commit) (struct inode *,
++ const struct nfsd4_pnfs_layoutcommit_arg *,
++ struct nfsd4_pnfs_layoutcommit_res *);
++
++ /* Returns the layout */
++ int (*layout_return) (struct inode *,
++ const struct nfsd4_pnfs_layoutreturn_arg *);
++
++ /* Can layout segments be merged for this layout type? */
++ int (*can_merge_layouts) (u32 layout_type);
++
++ /* pNFS Files layout specific operations */
++
++ /* Get the write verifier for DS (called on MDS only) */
++ void (*get_verifier) (struct super_block *, u32 *p);
++ /* Call fs on DS only */
++ int (*get_state) (struct inode *, struct knfsd_fh *,
++ struct pnfs_get_state *);
++};
++
++struct nfsd4_pnfs_cb_layout {
++ u32 cbl_recall_type; /* request */
++ struct nfsd4_layout_seg cbl_seg; /* request */
++ u32 cbl_layoutchanged; /* request */
++ nfs4_stateid cbl_sid; /* request */
++ struct nfs4_fsid cbl_fsid;
++ void *cbl_cookie; /* fs private */
++};
+
-+void nfsd4_pnfs_dlm_shutdown(void);
++/* layoutrecall request (from exported filesystem) */
++struct nfs4_layoutrecall {
++ struct kref clr_ref;
++ struct nfsd4_pnfs_cb_layout cb; /* request */
++ struct list_head clr_perclnt; /* on cl_layoutrecalls */
++ struct nfs4_client *clr_client;
++ struct nfs4_file *clr_file;
++ struct timespec clr_time; /* last activity */
++ struct super_block *clr_sb; /* We might not have a file */
++ struct nfs4_layoutrecall *parent; /* The initiating recall */
+
-+ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen);
++ void *clr_args; /* nfsd internal */
++};
+
-+#else /* CONFIG_PNFSD */
++struct nfsd4_pnfs_cb_dev_item {
++ u32 cbd_notify_type; /* request */
++ u32 cbd_layout_type; /* request */
++ struct nfsd4_pnfs_deviceid cbd_devid; /* request */
++ u32 cbd_immediate; /* request */
++};
+
-+static inline void nfsd4_pnfs_dlm_shutdown(void)
-+{
-+ return;
-+}
++struct nfsd4_pnfs_cb_dev_list {
++ u32 cbd_len; /* request */
++ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */
++};
+
-+#endif /* CONFIG_PNFSD */
-diff -up linux-2.6.35.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.35.noarch/include/linux/nfsd/nfsd4_pnfs.h
---- linux-2.6.35.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-09-30 12:25:08.782294000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-09-30 12:25:08.784293000 -0400
-@@ -0,0 +1,271 @@
+/*
-+ * Copyright (c) 2006 The Regents of the University of Michigan.
-+ * All rights reserved.
-+ *
-+ * Andy Adamson <andros at umich.edu>
-+ *
-+ * Redistribution and use in source and binary forms, with or without
-+ * modification, are permitted provided that the following conditions
-+ * are met:
-+ *
-+ * 1. Redistributions of source code must retain the above copyright
-+ * notice, this list of conditions and the following disclaimer.
-+ * 2. Redistributions in binary form must reproduce the above copyright
-+ * notice, this list of conditions and the following disclaimer in the
-+ * documentation and/or other materials provided with the distribution.
-+ * 3. Neither the name of the University nor the names of its
-+ * contributors may be used to endorse or promote products derived
-+ * from this software without specific prior written permission.
-+ *
-+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
-+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
++ * callbacks provided by the nfsd
+ */
++struct pnfsd_cb_operations {
++ /* Generic callbacks */
++ int (*cb_layout_recall) (struct super_block *, struct inode *,
++ struct nfsd4_pnfs_cb_layout *);
++ int (*cb_device_notify) (struct super_block *,
++ struct nfsd4_pnfs_cb_dev_list *);
+
-+#ifndef _LINUX_NFSD_NFSD4_PNFS_H
-+#define _LINUX_NFSD_NFSD4_PNFS_H
-+
-+#include <linux/exportfs.h>
-+#include <linux/exp_xdr.h>
-+#include <linux/nfs_xdr.h>
++ /* pNFS Files layout specific callbacks */
+
-+struct nfsd4_pnfs_deviceid {
-+ u64 sbid; /* per-superblock unique ID */
-+ u64 devid; /* filesystem-wide unique device ID */
++ /* Callback from fs on MDS only */
++ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *);
++ /* Callback from fs on DS only */
++ int (*cb_change_state) (struct pnfs_get_state *);
+};
+
-+struct nfsd4_pnfs_dev_iter_res {
-+ u64 gd_cookie; /* request/repsonse */
-+ u64 gd_verf; /* request/repsonse */
-+ u64 gd_devid; /* response */
-+ u32 gd_eof; /* response */
++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */
+diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h
+index 812bc1e..df667d0 100644
+--- a/include/linux/nfsd/syscall.h
++++ b/include/linux/nfsd/syscall.h
+@@ -29,6 +29,7 @@
+ /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */
+ #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */
+ #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */
++#define NFSCTL_FD2FH 9 /* get a fh from a fd */
+
+ /* SVC */
+ struct nfsctl_svc {
+@@ -71,6 +72,11 @@ struct nfsctl_fsparm {
+ int gd_maxlen;
+ };
+
++/* FD2FH */
++struct nfsctl_fd2fh {
++ int fd;
+};
+
-+/* Arguments for set_device_notify */
-+struct pnfs_devnotify_arg {
-+ struct nfsd4_pnfs_deviceid dn_devid; /* request */
-+ u32 dn_layout_type; /* request */
-+ u32 dn_notify_types; /* request/response */
-+};
+ /*
+ * This is the argument union.
+ */
+@@ -82,6 +88,7 @@ struct nfsctl_arg {
+ struct nfsctl_export u_export;
+ struct nfsctl_fdparm u_getfd;
+ struct nfsctl_fsparm u_getfs;
++ struct nfsctl_fd2fh u_fd2fh;
+ /*
+ * The following dummy member is needed to preserve binary compatibility
+ * on platforms where alignof(void*)>alignof(int). It's needed because
+@@ -95,6 +102,7 @@ struct nfsctl_arg {
+ #define ca_export u.u_export
+ #define ca_getfd u.u_getfd
+ #define ca_getfs u.u_getfs
++#define ca_fd2fh u.u_fd2fh
+ };
+
+ union nfsctl_res {
+diff --git a/include/linux/nfsd4_block.h b/include/linux/nfsd4_block.h
+new file mode 100644
+index 0000000..b0d5177
+--- /dev/null
++++ b/include/linux/nfsd4_block.h
+@@ -0,0 +1,101 @@
++#ifndef NFSD4_BLOCK
++#define NFSD4_BLOCK
+
-+struct nfsd4_layout_seg {
-+ u64 clientid;
-+ u32 layout_type;
-+ u32 iomode;
-+ u64 offset;
-+ u64 length;
-+};
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/svcauth.h>
++#include <linux/nfsd/nfsfh.h>
++#include <linux/nfsd/nfsd4_pnfs.h>
+
-+/* Used by layout_get to encode layout (loc_body var in spec)
-+ * Args:
-+ * minlength - min number of accessible bytes given by layout
-+ * fsid - Major part of struct pnfs_deviceid. File system uses this
-+ * to build the deviceid returned in the layout.
-+ * fh - fs can modify the file handle for use on data servers
-+ * seg - layout info requested and layout info returned
-+ * xdr - xdr info
-+ * return_on_close - true if layout to be returned on file close
-+ */
++#define PNFS_BLOCK_SUCCESS 1
++#define PNFS_BLOCK_FAILURE 0
+
-+struct nfsd4_pnfs_layoutget_arg {
-+ u64 lg_minlength;
-+ u64 lg_sbid;
-+ const struct knfsd_fh *lg_fh;
-+};
++#define PNFS_BLOCK_CTL_START 1
++#define PNFS_BLOCK_CTL_STOP 2
++#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current
++ * version from kernel via an upcall.
++ */
+
-+struct nfsd4_pnfs_layoutget_res {
-+ struct nfsd4_layout_seg lg_seg; /* request/resopnse */
-+ u32 lg_return_on_close;
-+};
++#define PNFS_UPCALL_MSG_STOP 0
++#define PNFS_UPCALL_MSG_GETSIG 1
++#define PNFS_UPCALL_MSG_GETSLICE 2
++#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume
++#define PNFS_UPCALL_MSG_DMGET 4
++#define PNFS_UPCALL_MSG_VERS 5
+
-+struct nfsd4_pnfs_layoutcommit_arg {
-+ struct nfsd4_layout_seg lc_seg; /* request */
-+ u32 lc_reclaim; /* request */
-+ u32 lc_newoffset; /* request */
-+ u64 lc_last_wr; /* request */
-+ struct nfstime4 lc_mtime; /* request */
-+ u32 lc_up_len; /* layout length */
-+ void *lc_up_layout; /* decoded by callback */
-+};
++#define PNFS_UPCALL_VERS 8
+
-+struct nfsd4_pnfs_layoutcommit_res {
-+ u32 lc_size_chg; /* boolean for response */
-+ u64 lc_newsize; /* response */
-+};
++typedef struct stripe_dev {
++ int major,
++ minor,
++ offset;
++} stripe_dev_t;
+
-+#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */
++typedef struct bl_comm_res {
++ int res_status;
++ union {
++ struct {
++ long long start,
++ length;
++ } slice;
++ struct {
++ int num_stripes,
++ stripe_size;
++ stripe_dev_t devs[];
++ } stripe;
++ struct {
++ long long sector;
++ int offset,
++ len;
++ char sig[];
++ } sig;
++ int vers,
++ dm_vol;
++ } u;
++} bl_comm_res_t;
++
++typedef struct bl_comm_msg {
++ int msg_type,
++ msg_status;
++ union {
++ dev_t msg_dev;
++ int msg_vers;
++ } u;
++ bl_comm_res_t *msg_res;
++} bl_comm_msg_t;
++
++#ifdef __KERNEL__
++
++typedef struct bl_comm {
++ /* ---- protects access to this structure ---- */
++ struct mutex lock;
++ /* ---- protects access to rpc pipe ---- */
++ struct mutex pipe_lock;
++ struct dentry *pipe_dentry;
++ wait_queue_head_t pipe_wq;
++ bl_comm_msg_t msg;
++} bl_comm_t;
++
++int pnfs_block_enabled(struct inode *, int);
++int bl_layout_type(struct super_block *sb);
++int bl_getdeviceiter(struct super_block *, u32 layout_type,
++ struct nfsd4_pnfs_dev_iter_res *);
++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *,
++ u32 layout_type,
++ const struct nfsd4_pnfs_deviceid *);
++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *,
++ const struct nfsd4_pnfs_layoutget_arg *,
++ struct nfsd4_pnfs_layoutget_res *);
++int bl_layoutcommit(struct inode *,
++ const struct nfsd4_pnfs_layoutcommit_arg *,
++ struct nfsd4_pnfs_layoutcommit_res *);
++int bl_layoutreturn(struct inode *,
++ const struct nfsd4_pnfs_layoutreturn_arg *);
++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len);
++int bl_init_proc(void);
++int bl_upcall(bl_comm_t *, bl_comm_msg_t *, bl_comm_res_t **);
+
-+struct nfsd4_pnfs_layoutreturn_arg {
-+ u32 lr_return_type; /* request */
-+ struct nfsd4_layout_seg lr_seg; /* request */
-+ u32 lr_reclaim; /* request */
-+ u32 lrf_body_len; /* request */
-+ void *lrf_body; /* request */
-+ void *lr_cookie; /* fs private */
-+};
++extern bl_comm_t *bl_comm_global; // Ugly...
++#endif /* __KERNEL__ */
+
-+/* pNFS Metadata to Data server state communication */
-+struct pnfs_get_state {
-+ u32 dsid; /* request */
-+ u64 ino; /* request */
-+ nfs4_stateid stid; /* request;response */
-+ nfs4_clientid clid; /* response */
-+ u32 access; /* response */
-+ u32 stid_gen; /* response */
-+ u32 verifier[2]; /* response */
-+};
++#endif /* NFSD4_BLOCK */
+
+diff --git a/include/linux/nfsd4_spnfs.h b/include/linux/nfsd4_spnfs.h
+new file mode 100644
+index 0000000..ea828e4
+--- /dev/null
++++ b/include/linux/nfsd4_spnfs.h
+@@ -0,0 +1,345 @@
+/*
-+ * pNFS export operations vector.
++ * include/linux/nfsd4_spnfs.h
+ *
-+ * The filesystem must implement the following methods:
-+ * layout_type
-+ * get_device_info
-+ * layout_get
++ * spNFS - simple pNFS implementation with userspace daemon
+ *
-+ * All other methods are optional and can be set to NULL if not implemented.
+ */
-+struct pnfs_export_operations {
-+ /* Returns the supported pnfs_layouttype4. */
-+ int (*layout_type) (struct super_block *);
+
-+ /* Encode device info onto the xdr stream. */
-+ int (*get_device_info) (struct super_block *,
-+ struct exp_xdr_stream *,
-+ u32 layout_type,
-+ const struct nfsd4_pnfs_deviceid *);
++/******************************************************************************
+
-+ /* Retrieve all available devices via an iterator.
-+ * arg->cookie == 0 indicates the beginning of the list,
-+ * otherwise arg->verf is used to verify that the list hasn't changed
-+ * while retrieved.
-+ *
-+ * On output, the filesystem sets the devid based on the current cookie
-+ * and sets res->cookie and res->verf corresponding to the next entry.
-+ * When the last entry in the list is retrieved, res->eof is set to 1.
-+ */
-+ int (*get_device_iter) (struct super_block *,
-+ u32 layout_type,
-+ struct nfsd4_pnfs_dev_iter_res *);
++(c) 2007 Network Appliance, Inc. All Rights Reserved.
+
-+ int (*set_device_notify) (struct super_block *,
-+ struct pnfs_devnotify_arg *);
++Network Appliance provides this source code under the GPL v2 License.
++The GPL v2 license is available at
++http://opensource.org/licenses/gpl-license.php.
+
-+ /* Retrieve and encode a layout for inode onto the xdr stream.
-+ * arg->minlength is the minimum number of accessible bytes required
-+ * by the client.
-+ * The maximum number of bytes to encode the layout is given by
-+ * the xdr stream end pointer.
-+ * arg->fsid contains the major part of struct pnfs_deviceid.
-+ * The file system uses this to build the deviceid returned
-+ * in the layout.
-+ * res->seg - layout segment requested and layout info returned.
-+ * res->fh can be modified the file handle for use on data servers
-+ * res->return_on_close - true if layout to be returned on file close
-+ *
-+ * return one of the following nfs errors:
-+ * NFS_OK Success
-+ * NFS4ERR_ACCESS Permission error
-+ * NFS4ERR_BADIOMODE Server does not support requested iomode
-+ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules
-+ * NFS4ERR_INVAL Parameter other than layout is invalid
-+ * NFS4ERR_IO I/O error
-+ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later
-+ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file
-+ * NFS4ERR_LOCKED Lock conflict
-+ * NFS4ERR_NOSPC Out-of-space error occured
-+ * NFS4ERR_RECALLCONFLICT Layout currently unavialable due to
-+ * a conflicting CB_LAYOUTRECALL
-+ * NFS4ERR_SERVERFAULT Server went bezerk
-+ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout
-+ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file)
-+ */
-+ enum nfsstat4 (*layout_get) (struct inode *,
-+ struct exp_xdr_stream *xdr,
-+ const struct nfsd4_pnfs_layoutget_arg *,
-+ struct nfsd4_pnfs_layoutget_res *);
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
-+ /* Commit changes to layout */
-+ int (*layout_commit) (struct inode *,
-+ const struct nfsd4_pnfs_layoutcommit_arg *,
-+ struct nfsd4_pnfs_layoutcommit_res *);
++******************************************************************************/
+
-+ /* Returns the layout */
-+ int (*layout_return) (struct inode *,
-+ const struct nfsd4_pnfs_layoutreturn_arg *);
++#ifndef NFS_SPNFS_H
++#define NFS_SPNFS_H
+
-+ /* Can layout segments be merged for this layout type? */
-+ int (*can_merge_layouts) (u32 layout_type);
+
-+ /* pNFS Files layout specific operations */
++#ifdef __KERNEL__
++#include "exportfs.h"
++#include "sunrpc/svc.h"
++#include "nfsd/nfsfh.h"
++#else
++#include <sys/types.h>
++#endif /* __KERNEL__ */
+
-+ /* Get the write verifier for DS (called on MDS only) */
-+ void (*get_verifier) (struct super_block *, u32 *p);
-+ /* Call fs on DS only */
-+ int (*get_state) (struct inode *, struct knfsd_fh *,
-+ struct pnfs_get_state *);
-+};
++#define SPNFS_STATUS_INVALIDMSG 0x01
++#define SPNFS_STATUS_AGAIN 0x02
++#define SPNFS_STATUS_FAIL 0x04
++#define SPNFS_STATUS_SUCCESS 0x08
+
-+struct nfsd4_pnfs_cb_layout {
-+ u32 cbl_recall_type; /* request */
-+ struct nfsd4_layout_seg cbl_seg; /* request */
-+ u32 cbl_layoutchanged; /* request */
-+ nfs4_stateid cbl_sid; /* request */
-+ struct nfs4_fsid cbl_fsid;
-+ void *cbl_cookie; /* fs private */
++#define SPNFS_TYPE_LAYOUTGET 0x01
++#define SPNFS_TYPE_LAYOUTCOMMIT 0x02
++#define SPNFS_TYPE_LAYOUTRETURN 0x03
++#define SPNFS_TYPE_GETDEVICEITER 0x04
++#define SPNFS_TYPE_GETDEVICEINFO 0x05
++#define SPNFS_TYPE_SETATTR 0x06
++#define SPNFS_TYPE_OPEN 0x07
++#define SPNFS_TYPE_CLOSE 0x08
++#define SPNFS_TYPE_CREATE 0x09
++#define SPNFS_TYPE_REMOVE 0x0a
++#define SPNFS_TYPE_COMMIT 0x0b
++#define SPNFS_TYPE_READ 0x0c
++#define SPNFS_TYPE_WRITE 0x0d
++
++#define SPNFS_MAX_DEVICES 1
++#define SPNFS_MAX_DATA_SERVERS 16
++#define SPNFS_MAX_IO 512
++
++/* layout */
++struct spnfs_msg_layoutget_args {
++ unsigned long inode;
++ unsigned long generation;
+};
+
-+/* layoutrecall request (from exported filesystem) */
-+struct nfs4_layoutrecall {
-+ struct kref clr_ref;
-+ struct nfsd4_pnfs_cb_layout cb; /* request */
-+ struct list_head clr_perclnt; /* on cl_layoutrecalls */
-+ struct nfs4_client *clr_client;
-+ struct nfs4_file *clr_file;
-+ struct timespec clr_time; /* last activity */
-+ struct super_block *clr_sb; /* We might not have a file */
-+ struct nfs4_layoutrecall *parent; /* The initiating recall */
++struct spnfs_filelayout_list {
++ u_int32_t fh_len;
++ unsigned char fh_val[128]; /* DMXXX fix this const */
++};
+
-+ void *clr_args; /* nfsd internal */
++struct spnfs_msg_layoutget_res {
++ int status;
++ u_int64_t devid;
++ u_int64_t stripe_size;
++ u_int32_t stripe_type;
++ u_int32_t stripe_count;
++ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS];
+};
+
-+struct nfsd4_pnfs_cb_dev_item {
-+ u32 cbd_notify_type; /* request */
-+ u32 cbd_layout_type; /* request */
-+ struct nfsd4_pnfs_deviceid cbd_devid; /* request */
-+ u32 cbd_immediate; /* request */
++/* layoutcommit */
++struct spnfs_msg_layoutcommit_args {
++ unsigned long inode;
++ unsigned long generation;
++ u_int64_t file_size;
+};
+
-+struct nfsd4_pnfs_cb_dev_list {
-+ u32 cbd_len; /* request */
-+ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */
++struct spnfs_msg_layoutcommit_res {
++ int status;
+};
+
++/* layoutreturn */
++/* No op for the daemon */
+/*
-+ * callbacks provided by the nfsd
-+ */
-+struct pnfsd_cb_operations {
-+ /* Generic callbacks */
-+ int (*cb_layout_recall) (struct super_block *, struct inode *,
-+ struct nfsd4_pnfs_cb_layout *);
-+ int (*cb_device_notify) (struct super_block *,
-+ struct nfsd4_pnfs_cb_dev_list *);
++struct spnfs_msg_layoutreturn_args {
++};
++
++struct spnfs_msg_layoutreturn_res {
++};
++*/
++
++/* getdeviceiter */
++struct spnfs_msg_getdeviceiter_args {
++ unsigned long inode;
++ u_int64_t cookie;
++ u_int64_t verf;
++};
++
++struct spnfs_msg_getdeviceiter_res {
++ int status;
++ u_int64_t devid;
++ u_int64_t cookie;
++ u_int64_t verf;
++ u_int32_t eof;
++};
++
++/* getdeviceinfo */
++struct spnfs_data_server {
++ u_int32_t dsid;
++ char netid[5];
++ char addr[29];
++};
++
++struct spnfs_device {
++ u_int64_t devid;
++ int dscount;
++ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS];
++};
++
++struct spnfs_msg_getdeviceinfo_args {
++ u_int64_t devid;
++};
+
-+ /* pNFS Files layout specific callbacks */
++struct spnfs_msg_getdeviceinfo_res {
++ int status;
++ struct spnfs_device devinfo;
++};
+
-+ /* Callback from fs on MDS only */
-+ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *);
-+ /* Callback from fs on DS only */
-+ int (*cb_change_state) (struct pnfs_get_state *);
++/* setattr */
++struct spnfs_msg_setattr_args {
++ unsigned long inode;
++ unsigned long generation;
++ int file_size;
+};
+
-+#endif /* _LINUX_NFSD_NFSD4_PNFS_H */
-diff -up linux-2.6.35.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.35.noarch/include/linux/nfsd/syscall.h
---- linux-2.6.35.noarch/include/linux/nfsd/syscall.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfsd/syscall.h 2010-09-30 12:25:08.788295000 -0400
-@@ -29,6 +29,7 @@
- /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */
- #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */
- #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */
-+#define NFSCTL_FD2FH 9 /* get a fh from a fd */
-
- /* SVC */
- struct nfsctl_svc {
-@@ -71,6 +72,11 @@ struct nfsctl_fsparm {
- int gd_maxlen;
- };
-
-+/* FD2FH */
-+struct nfsctl_fd2fh {
-+ int fd;
++struct spnfs_msg_setattr_res {
++ int status;
+};
+
- /*
- * This is the argument union.
- */
-@@ -82,6 +88,7 @@ struct nfsctl_arg {
- struct nfsctl_export u_export;
- struct nfsctl_fdparm u_getfd;
- struct nfsctl_fsparm u_getfs;
-+ struct nfsctl_fd2fh u_fd2fh;
- /*
- * The following dummy member is needed to preserve binary compatibility
- * on platforms where alignof(void*)>alignof(int). It's needed because
-@@ -95,6 +102,7 @@ struct nfsctl_arg {
- #define ca_export u.u_export
- #define ca_getfd u.u_getfd
- #define ca_getfs u.u_getfs
-+#define ca_fd2fh u.u_fd2fh
- };
-
- union nfsctl_res {
-diff -up linux-2.6.35.noarch/include/linux/nfs_fs.h.orig linux-2.6.35.noarch/include/linux/nfs_fs.h
---- linux-2.6.35.noarch/include/linux/nfs_fs.h.orig 2010-09-30 12:22:50.226195000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfs_fs.h 2010-09-30 12:25:08.691290000 -0400
-@@ -188,6 +188,10 @@ struct nfs_inode {
- struct nfs_delegation *delegation;
- fmode_t delegation_state;
- struct rw_semaphore rwsem;
++/* open */
++struct spnfs_msg_open_args {
++ unsigned long inode;
++ unsigned long generation;
++ int create;
++ int createmode;
++ int truncate;
++};
+
-+ /* pNFS layout information */
-+ struct rpc_wait_queue lo_rpcwaitq;
-+ struct pnfs_layout_hdr *layout;
- #endif /* CONFIG_NFS_V4*/
- #ifdef CONFIG_NFS_FSCACHE
- struct fscache_cookie *fscache;
-@@ -490,8 +494,12 @@ extern void nfs_unblock_sillyrename(stru
- extern int nfs_congestion_kb;
- extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
- extern int nfs_writepages(struct address_space *, struct writeback_control *);
--extern int nfs_flush_incompatible(struct file *file, struct page *page);
--extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
-+struct pnfs_layout_segment;
-+extern int nfs_flush_incompatible(struct file *file, struct page *page,
-+ struct pnfs_layout_segment *lseg);
-+extern int nfs_updatepage(struct file *, struct page *,
-+ unsigned int offset, unsigned int count,
-+ struct pnfs_layout_segment *lseg, void *fsdata);
- extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
-
- /*
-@@ -613,6 +621,8 @@ extern void * nfs_root_data(void);
- #define NFSDBG_CLIENT 0x0200
- #define NFSDBG_MOUNT 0x0400
- #define NFSDBG_FSCACHE 0x0800
-+#define NFSDBG_PNFS 0x1000
-+#define NFSDBG_PNFS_LD 0x2000
- #define NFSDBG_ALL 0xFFFF
-
- #ifdef __KERNEL__
-diff -up linux-2.6.35.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.35.noarch/include/linux/nfs_fs_sb.h
---- linux-2.6.35.noarch/include/linux/nfs_fs_sb.h.orig 2010-09-30 12:22:50.231192000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfs_fs_sb.h 2010-09-30 12:25:08.703293000 -0400
-@@ -82,6 +82,8 @@ struct nfs_client {
- /* The flags used for obtaining the clientid during EXCHANGE_ID */
- u32 cl_exchange_flags;
- struct nfs4_session *cl_session; /* sharred session */
-+ struct list_head cl_layouts;
-+ struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
- #endif /* CONFIG_NFS_V4_1 */
-
- #ifdef CONFIG_NFS_FSCACHE
-@@ -89,6 +91,16 @@ struct nfs_client {
- #endif
- };
-
-+static inline bool
-+is_ds_only_client(struct nfs_client *clp)
-+{
-+#ifdef CONFIG_NFS_V4_1
-+ return is_ds_only_session(clp->cl_exchange_flags);
-+#else
-+ return false;
-+#endif
-+}
++struct spnfs_msg_open_res {
++ int status;
++};
+
- /*
- * NFS client parameters stored in the superblock.
- */
-@@ -133,7 +145,7 @@ struct nfs_server {
- #endif
-
- #ifdef CONFIG_NFS_V4
-- u32 attr_bitmask[2];/* V4 bitmask representing the set
-+ u32 attr_bitmask[3];/* V4 bitmask representing the set
- of attributes supported on this
- filesystem */
- u32 cache_consistency_bitmask[2];
-@@ -144,6 +156,11 @@ struct nfs_server {
- u32 acl_bitmask; /* V4 bitmask representing the ACEs
- that are supported on this
- filesystem */
-+ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
-+ void *pnfs_ld_data; /* Per-mount data */
-+ unsigned int ds_rsize; /* Data server read size */
-+ unsigned int ds_wsize; /* Data server write size */
-+ u32 pnfs_blksize; /* layout_blksize attr */
- #endif
- void (*destroy)(struct nfs_server *);
-
-diff -up linux-2.6.35.noarch/include/linux/nfs_iostat.h.orig linux-2.6.35.noarch/include/linux/nfs_iostat.h
---- linux-2.6.35.noarch/include/linux/nfs_iostat.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfs_iostat.h 2010-09-30 12:25:08.715296000 -0400
-@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters {
- NFSIOS_SHORTREAD,
- NFSIOS_SHORTWRITE,
- NFSIOS_DELAY,
-+ NFSIOS_PNFS_READ,
-+ NFSIOS_PNFS_WRITE,
-+ NFSIOS_PNFS_COMMIT,
- __NFSIOS_COUNTSMAX,
- };
-
-diff -up linux-2.6.35.noarch/include/linux/nfs_page.h.orig linux-2.6.35.noarch/include/linux/nfs_page.h
---- linux-2.6.35.noarch/include/linux/nfs_page.h.orig 2010-09-30 12:22:50.240192000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfs_page.h 2010-09-30 12:25:08.728291000 -0400
-@@ -48,6 +48,7 @@ struct nfs_page {
- struct kref wb_kref; /* reference count */
- unsigned long wb_flags;
- struct nfs_writeverf wb_verf; /* Commit cookie */
-+ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */
- };
-
- struct nfs_pageio_descriptor {
-@@ -61,6 +62,12 @@ struct nfs_pageio_descriptor {
- int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int);
- int pg_ioflags;
- int pg_error;
-+ struct pnfs_layout_segment *pg_lseg;
-+#ifdef CONFIG_NFS_V4_1
-+ int pg_iswrite;
-+ int pg_boundary;
-+ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
-+#endif /* CONFIG_NFS_V4_1 */
- };
-
- #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags))
-@@ -69,13 +76,15 @@ extern struct nfs_page *nfs_create_reque
- struct inode *inode,
- struct page *page,
- unsigned int offset,
-- unsigned int count);
-+ unsigned int count,
-+ struct pnfs_layout_segment *lseg);
- extern void nfs_clear_request(struct nfs_page *req);
- extern void nfs_release_request(struct nfs_page *req);
-
-
- extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
-- pgoff_t idx_start, unsigned int npages, int tag);
-+ pgoff_t idx_start, unsigned int npages, int tag,
-+ int *use_pnfs);
- extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
- struct inode *inode,
- int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
-diff -up linux-2.6.35.noarch/include/linux/nfs_xdr.h.orig linux-2.6.35.noarch/include/linux/nfs_xdr.h
---- linux-2.6.35.noarch/include/linux/nfs_xdr.h.orig 2010-09-30 12:22:50.245193000 -0400
-+++ linux-2.6.35.noarch/include/linux/nfs_xdr.h 2010-09-30 12:25:08.740294000 -0400
-@@ -3,6 +3,8 @@
-
- #include <linux/nfsacl.h>
- #include <linux/nfs3.h>
-+#include <linux/nfs4.h>
-+#include <linux/sunrpc/sched.h>
-
- /*
- * To change the maximum rsize and wsize supported by the NFS client, adjust
-@@ -10,7 +12,7 @@
- * support a megabyte or more. The default is left at 4096 bytes, which is
- * reasonable for NFS over UDP.
- */
--#define NFS_MAX_FILE_IO_SIZE (1048576U)
-+#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U)
- #define NFS_DEF_FILE_IO_SIZE (4096U)
- #define NFS_MIN_FILE_IO_SIZE (1024U)
-
-@@ -113,6 +115,8 @@ struct nfs_fsinfo {
- __u32 dtpref; /* pref. readdir transfer size */
- __u64 maxfilesize;
- __u32 lease_time; /* in seconds */
-+ __u32 layouttype; /* supported pnfs layout driver */
-+ __u32 blksize; /* preferred pnfs io block size */
- };
-
- struct nfs_fsstat {
-@@ -185,6 +189,121 @@ struct nfs4_get_lease_time_res {
- struct nfs4_sequence_res lr_seq_res;
- };
-
-+#define PNFS_LAYOUT_MAXSIZE 4096
++/* close */
++/* No op for daemon */
++struct spnfs_msg_close_args {
++ int x;
++};
+
-+struct nfs4_layoutdriver_data {
-+ __u32 len;
-+ void *buf;
++struct spnfs_msg_close_res {
++ int y;
+};
+
-+struct pnfs_layout_range {
-+ u32 iomode;
-+ u64 offset;
-+ u64 length;
++/* create */
++/*
++struct spnfs_msg_create_args {
++ int x;
+};
+
-+struct nfs4_layoutget_args {
-+ __u32 type;
-+ struct pnfs_layout_range range;
-+ __u64 minlength;
-+ __u32 maxcount;
-+ struct inode *inode;
-+ struct nfs_open_context *ctx;
-+ struct nfs4_sequence_args seq_args;
++struct spnfs_msg_create_res {
++ int y;
+};
++*/
+
-+struct nfs4_layoutget_res {
-+ __u32 return_on_close;
-+ struct pnfs_layout_range range;
-+ __u32 type;
-+ nfs4_stateid stateid;
-+ struct nfs4_layoutdriver_data layout;
-+ struct nfs4_sequence_res seq_res;
++/* remove */
++struct spnfs_msg_remove_args {
++ unsigned long inode;
++ unsigned long generation;
+};
+
-+struct nfs4_layoutget {
-+ struct nfs4_layoutget_args args;
-+ struct nfs4_layoutget_res res;
-+ struct pnfs_layout_segment **lsegpp;
++struct spnfs_msg_remove_res {
+ int status;
+};
+
-+struct nfs4_layoutcommit_args {
-+ nfs4_stateid stateid;
-+ __u64 lastbytewritten;
-+ __u32 time_modify_changed;
-+ struct timespec time_modify;
-+ const u32 *bitmask;
-+ struct nfs_fh *fh;
-+ struct inode *inode;
++/* commit */
++/*
++struct spnfs_msg_commit_args {
++ int x;
++};
+
-+ /* Values set by layout driver */
-+ struct pnfs_layout_range range;
-+ __u32 layout_type;
-+ void *layoutdriver_data;
-+ struct nfs4_sequence_args seq_args;
++struct spnfs_msg_commit_res {
++ int y;
+};
++*/
+
-+struct nfs4_layoutcommit_res {
-+ __u32 sizechanged;
-+ __u64 newsize;
-+ struct nfs_fattr *fattr;
-+ const struct nfs_server *server;
-+ struct nfs4_sequence_res seq_res;
++/* read */
++struct spnfs_msg_read_args {
++ unsigned long inode;
++ unsigned long generation;
++ loff_t offset;
++ unsigned long len;
+};
+
-+struct nfs4_layoutcommit_data {
-+ struct rpc_task task;
-+ struct rpc_cred *cred;
-+ struct nfs_fattr fattr;
-+ struct nfs4_layoutcommit_args args;
-+ struct nfs4_layoutcommit_res res;
++struct spnfs_msg_read_res {
+ int status;
++ char data[SPNFS_MAX_IO];
+};
+
-+struct nfs4_layoutreturn_args {
-+ __u32 reclaim;
-+ __u32 layout_type;
-+ __u32 return_type;
-+ struct pnfs_layout_range range;
-+ struct inode *inode;
-+ struct nfs4_sequence_args seq_args;
++/* write */
++struct spnfs_msg_write_args {
++ unsigned long inode;
++ unsigned long generation;
++ loff_t offset;
++ unsigned long len;
++ char data[SPNFS_MAX_IO];
+};
+
-+struct nfs4_layoutreturn_res {
-+ struct nfs4_sequence_res seq_res;
-+ u32 lrs_present;
-+ nfs4_stateid stateid;
++struct spnfs_msg_write_res {
++ int status;
+};
+
-+struct nfs4_layoutreturn {
-+ struct nfs4_layoutreturn_args args;
-+ struct nfs4_layoutreturn_res res;
-+ struct rpc_cred *cred;
-+ int rpc_status;
++/* bundle args and responses */
++union spnfs_msg_args {
++ struct spnfs_msg_layoutget_args layoutget_args;
++ struct spnfs_msg_layoutcommit_args layoutcommit_args;
++/*
++ struct spnfs_msg_layoutreturn_args layoutreturn_args;
++*/
++ struct spnfs_msg_getdeviceiter_args getdeviceiter_args;
++ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args;
++ struct spnfs_msg_setattr_args setattr_args;
++ struct spnfs_msg_open_args open_args;
++ struct spnfs_msg_close_args close_args;
++/*
++ struct spnfs_msg_create_args create_args;
++*/
++ struct spnfs_msg_remove_args remove_args;
++/*
++ struct spnfs_msg_commit_args commit_args;
++*/
++ struct spnfs_msg_read_args read_args;
++ struct spnfs_msg_write_args write_args;
+};
+
-+struct nfs4_getdevicelist_args {
-+ const struct nfs_fh *fh;
-+ u32 layoutclass;
-+ struct nfs4_sequence_args seq_args;
++union spnfs_msg_res {
++ struct spnfs_msg_layoutget_res layoutget_res;
++ struct spnfs_msg_layoutcommit_res layoutcommit_res;
++/*
++ struct spnfs_msg_layoutreturn_res layoutreturn_res;
++*/
++ struct spnfs_msg_getdeviceiter_res getdeviceiter_res;
++ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res;
++ struct spnfs_msg_setattr_res setattr_res;
++ struct spnfs_msg_open_res open_res;
++ struct spnfs_msg_close_res close_res;
++/*
++ struct spnfs_msg_create_res create_res;
++*/
++ struct spnfs_msg_remove_res remove_res;
++/*
++ struct spnfs_msg_commit_res commit_res;
++*/
++ struct spnfs_msg_read_res read_res;
++ struct spnfs_msg_write_res write_res;
+};
+
-+struct nfs4_getdevicelist_res {
-+ struct pnfs_devicelist *devlist;
-+ struct nfs4_sequence_res seq_res;
++/* a spnfs message, args and response */
++struct spnfs_msg {
++ unsigned char im_type;
++ unsigned char im_status;
++ union spnfs_msg_args im_args;
++ union spnfs_msg_res im_res;
+};
+
-+struct nfs4_getdeviceinfo_args {
-+ struct pnfs_device *pdev;
-+ struct nfs4_sequence_args seq_args;
++/* spnfs configuration info */
++struct spnfs_config {
++ unsigned char dense_striping;
++ int stripe_size;
++ int num_ds;
++ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */
+};
+
-+struct nfs4_getdeviceinfo_res {
-+ struct pnfs_device *pdev;
-+ struct nfs4_sequence_res seq_res;
-+};
++#if defined(__KERNEL__) && defined(CONFIG_SPNFS)
+
- /*
- * Arguments to the open call.
- */
-@@ -854,7 +973,7 @@ struct nfs4_server_caps_arg {
- };
-
- struct nfs4_server_caps_res {
-- u32 attr_bitmask[2];
-+ u32 attr_bitmask[3];
- u32 acl_bitmask;
- u32 has_links;
- u32 has_symlinks;
-@@ -969,6 +1088,30 @@ struct nfs_page;
-
- #define NFS_PAGEVEC_SIZE (8U)
-
-+#if defined(CONFIG_NFS_V4_1)
++#include <linux/nfsd/nfsd4_pnfs.h>
+
-+/* pnfsflag values */
-+enum pnfs_flags {
-+ PNFS_NO_RPC = 1 << 0, /* non rpc result callback switch */
++/* pipe mgmt structure. messages flow through here */
++struct spnfs {
++ struct dentry *spnfs_dentry; /* dentry for pipe */
++ wait_queue_head_t spnfs_wq;
++ struct spnfs_msg spnfs_im; /* spnfs message */
++ struct mutex spnfs_lock; /* Serializes upcalls */
++ struct mutex spnfs_plock;
+};
+
-+/* pnfs-specific data needed for read, write, and commit calls */
-+struct pnfs_call_data {
-+ struct pnfs_layout_segment *lseg;
-+ const struct rpc_call_ops *call_ops;
-+ u32 orig_count; /* for retry via MDS */
-+ int pnfs_error;
-+ u8 pnfsflags;
-+ u8 how; /* for FLUSH_STABLE */
-+};
++struct nfsd4_open;
+
-+/* files layout-type specific data for read, write, and commit */
-+struct pnfs_fl_call_data {
-+ struct nfs_client *ds_nfs_client;
-+ __u64 orig_offset;
-+};
-+#endif /* CONFIG_NFS_V4_1 */
++int spnfs_layout_type(struct super_block *);
++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_layoutget_arg *,
++ struct nfsd4_pnfs_layoutget_res *);
++int spnfs_layoutcommit(void);
++int spnfs_layoutreturn(struct inode *,
++ const struct nfsd4_pnfs_layoutreturn_arg *);
++int spnfs_getdeviceiter(struct super_block *,
++ u32 layout_type,
++ struct nfsd4_pnfs_dev_iter_res *);
++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *,
++ u32 layout_type,
++ const struct nfsd4_pnfs_deviceid *);
++int spnfs_setattr(void);
++int spnfs_open(struct inode *, struct nfsd4_open *);
++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *);
++int spnfs_remove(unsigned long, unsigned long);
++__be32 spnfs_read(struct inode *, loff_t, unsigned long *,
++ int, struct svc_rqst *);
++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *);
++int spnfs_getfh(int, struct nfs_fh *);
++int spnfs_test_layoutrecall(char *, u64, u64);
++int spnfs_layoutrecall(struct inode *, int, u64, u64);
+
- struct nfs_read_data {
- int flags;
- struct rpc_task task;
-@@ -984,10 +1127,16 @@ struct nfs_read_data {
- #ifdef CONFIG_NFS_V4
- unsigned long timestamp; /* For lease renewal */
- #endif
-+#if defined(CONFIG_NFS_V4_1)
-+ struct pnfs_call_data pdata;
-+ struct pnfs_fl_call_data fldata;
-+#endif /* CONFIG_NFS_V4_1 */
- struct page *page_array[NFS_PAGEVEC_SIZE];
- };
-
- struct nfs_write_data {
-+ struct kref refcount; /* For pnfs commit splitting */
-+ struct nfs_write_data *parent; /* For pnfs commit splitting */
- int flags;
- struct rpc_task task;
- struct inode *inode;
-@@ -1003,6 +1152,10 @@ struct nfs_write_data {
- #ifdef CONFIG_NFS_V4
- unsigned long timestamp; /* For lease renewal */
- #endif
-+#if defined(CONFIG_NFS_V4_1)
-+ struct pnfs_call_data pdata;
-+ struct pnfs_fl_call_data fldata;
-+#endif /* CONFIG_NFS_V4_1 */
- struct page *page_array[NFS_PAGEVEC_SIZE];
- };
-
-diff -up linux-2.6.35.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.35.noarch/include/linux/panfs_shim_api.h
---- linux-2.6.35.noarch/include/linux/panfs_shim_api.h.orig 2010-09-30 12:25:08.823293000 -0400
-+++ linux-2.6.35.noarch/include/linux/panfs_shim_api.h 2010-09-30 12:25:08.824299000 -0400
++int nfsd_spnfs_new(void);
++void nfsd_spnfs_delete(void);
++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *);
++int spnfs_enabled(void);
++int spnfs_init_proc(void);
++
++extern struct spnfs_config *spnfs_config;
++
++#endif /* __KERNEL__ && CONFIG_SPNFS */
++
++#endif /* NFS_SPNFS_H */
+diff --git a/include/linux/panfs_shim_api.h b/include/linux/panfs_shim_api.h
+new file mode 100644
+index 0000000..3b44e19
+--- /dev/null
++++ b/include/linux/panfs_shim_api.h
@@ -0,0 +1,57 @@
+#ifndef _PANFS_SHIM_API_H
+#define _PANFS_SHIM_API_H
@@ -28056,9 +28138,11 @@ diff -up linux-2.6.35.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.35.no
+panfs_shim_unregister(void);
+
+#endif /* _PANFS_SHIM_API_H */
-diff -up linux-2.6.35.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.35.noarch/include/linux/pnfs_osd_xdr.h
---- linux-2.6.35.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-09-30 12:25:08.834296000 -0400
-+++ linux-2.6.35.noarch/include/linux/pnfs_osd_xdr.h 2010-09-30 12:25:08.836294000 -0400
+diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h
+new file mode 100644
+index 0000000..b404f33
+--- /dev/null
++++ b/include/linux/pnfs_osd_xdr.h
@@ -0,0 +1,439 @@
+/*
+ * pnfs_osd_xdr.h
@@ -28499,9 +28583,10 @@ diff -up linux-2.6.35.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.35.noar
+pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p);
+
+#endif /* __PNFS_OSD_XDR_H__ */
-diff -up linux-2.6.35.noarch/include/linux/posix_acl.h.orig linux-2.6.35.noarch/include/linux/posix_acl.h
---- linux-2.6.35.noarch/include/linux/posix_acl.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/posix_acl.h 2010-09-30 12:25:08.848294000 -0400
+diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
+index 6760816..fc3d2fc 100644
+--- a/include/linux/posix_acl.h
++++ b/include/linux/posix_acl.h
@@ -8,6 +8,7 @@
#ifndef __LINUX_POSIX_ACL_H
#define __LINUX_POSIX_ACL_H
@@ -28510,9 +28595,10 @@ diff -up linux-2.6.35.noarch/include/linux/posix_acl.h.orig linux-2.6.35.noarch/
#include <linux/slab.h>
#define ACL_UNDEFINED_ID (-1)
-diff -up linux-2.6.35.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.35.noarch/include/linux/sunrpc/msg_prot.h
---- linux-2.6.35.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/sunrpc/msg_prot.h 2010-09-30 12:25:08.860295000 -0400
+diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
+index 77e6248..1b26fff 100644
+--- a/include/linux/sunrpc/msg_prot.h
++++ b/include/linux/sunrpc/msg_prot.h
@@ -14,6 +14,8 @@
/* size of an XDR encoding unit in bytes, i.e. 32bit */
#define XDR_UNIT (4)
@@ -28522,9 +28608,10 @@ diff -up linux-2.6.35.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.35.n
/* spec defines authentication flavor as an unsigned 32 bit integer */
typedef u32 rpc_authflavor_t;
-diff -up linux-2.6.35.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.35.noarch/include/linux/sunrpc/rpc_pipe_fs.h
---- linux-2.6.35.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-09-30 12:25:08.865298000 -0400
+diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
+index cf14db9..2177d50 100644
+--- a/include/linux/sunrpc/rpc_pipe_fs.h
++++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -3,6 +3,7 @@
#ifdef __KERNEL__
@@ -28544,10 +28631,12 @@ diff -up linux-2.6.35.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.3
};
struct rpc_pipe_ops {
-diff -up linux-2.6.35.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.35.noarch/include/linux/sunrpc/simple_rpc_pipefs.h
---- linux-2.6.35.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-09-30 12:25:08.869294000 -0400
-+++ linux-2.6.35.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-09-30 12:25:08.870300000 -0400
-@@ -0,0 +1,111 @@
+diff --git a/include/linux/sunrpc/simple_rpc_pipefs.h b/include/linux/sunrpc/simple_rpc_pipefs.h
+new file mode 100644
+index 0000000..f6a1227
+--- /dev/null
++++ b/include/linux/sunrpc/simple_rpc_pipefs.h
+@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2008 The Regents of the University of Michigan.
+ * All rights reserved.
@@ -28589,14 +28678,8 @@ diff -up linux-2.6.35.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux
+#ifndef _SIMPLE_RPC_PIPEFS_H_
+#define _SIMPLE_RPC_PIPEFS_H_
+
-+#include <linux/fs.h>
-+#include <linux/list.h>
-+#include <linux/mount.h>
-+#include <linux/sched.h>
-+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
-+
+#define payload_of(headerp) ((void *)(headerp + 1))
+
+/*
@@ -28659,10 +28742,11 @@ diff -up linux-2.6.35.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux
+extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg);
+
+#endif /* _SIMPLE_RPC_PIPEFS_H_ */
-diff -up linux-2.6.35.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.35.noarch/include/linux/sunrpc/svc_xprt.h
---- linux-2.6.35.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/sunrpc/svc_xprt.h 2010-09-30 12:25:08.875298000 -0400
-@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con
+diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
+index 5f4e18b..f7a0358 100644
+--- a/include/linux/sunrpc/svc_xprt.h
++++ b/include/linux/sunrpc/svc_xprt.h
+@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(const struct sockaddr *addr,
return buf;
}
@@ -28704,9 +28788,10 @@ diff -up linux-2.6.35.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.35.n
+ return len;
+}
#endif /* SUNRPC_SVC_XPRT_H */
-diff -up linux-2.6.35.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.35.noarch/include/linux/sunrpc/xdr.h
---- linux-2.6.35.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/include/linux/sunrpc/xdr.h 2010-09-30 12:25:08.880298000 -0400
+diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
+index 35cf2e8..bb5f3fd 100644
+--- a/include/linux/sunrpc/xdr.h
++++ b/include/linux/sunrpc/xdr.h
@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp)
return p + 2;
}
@@ -28729,10 +28814,18 @@ diff -up linux-2.6.35.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.35.noarch
extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
unsigned int base, unsigned int len);
extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
-diff -up linux-2.6.35.noarch/net/sunrpc/Makefile.orig linux-2.6.35.noarch/net/sunrpc/Makefile
---- linux-2.6.35.noarch/net/sunrpc/Makefile.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/net/sunrpc/Makefile 2010-09-30 12:25:08.898305000 -0400
-@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt
+diff --git a/localversion-pnfs b/localversion-pnfs
+new file mode 100644
+index 0000000..7523328
+--- /dev/null
++++ b/localversion-pnfs
+@@ -0,0 +1 @@
++-pnfs
+diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
+index 9d2fca5..e102040 100644
+--- a/net/sunrpc/Makefile
++++ b/net/sunrpc/Makefile
+@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
svc.o svcsock.o svcauth.o svcauth_unix.o \
addr.o rpcb_clnt.o timer.o xdr.o \
sunrpc_syms.o cache.o rpc_pipe.o \
@@ -28741,10 +28834,12 @@ diff -up linux-2.6.35.noarch/net/sunrpc/Makefile.orig linux-2.6.35.noarch/net/su
sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o
sunrpc-$(CONFIG_PROC_FS) += stats.o
sunrpc-$(CONFIG_SYSCTL) += sysctl.o
-diff -up linux-2.6.35.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.35.noarch/net/sunrpc/simple_rpc_pipefs.c
---- linux-2.6.35.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-09-30 12:25:08.902299000 -0400
-+++ linux-2.6.35.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-09-30 12:25:08.904296000 -0400
-@@ -0,0 +1,424 @@
+diff --git a/net/sunrpc/simple_rpc_pipefs.c b/net/sunrpc/simple_rpc_pipefs.c
+new file mode 100644
+index 0000000..24af0a1
+--- /dev/null
++++ b/net/sunrpc/simple_rpc_pipefs.c
+@@ -0,0 +1,423 @@
+/*
+ * net/sunrpc/simple_rpc_pipefs.c
+ *
@@ -28785,9 +28880,8 @@ diff -up linux-2.6.35.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.35.no
+ * With thanks to CITI's project sponsor and partner, IBM.
+ */
+
-+#include <linux/completion.h>
-+#include <linux/uaccess.h>
-+#include <linux/module.h>
++#include <linux/mount.h>
++#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/simple_rpc_pipefs.h>
+
+
@@ -29169,10 +29263,11 @@ diff -up linux-2.6.35.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.35.no
+ kfree(rpcmsg);
+}
+EXPORT_SYMBOL(pipefs_generic_destroy_msg);
-diff -up linux-2.6.35.noarch/net/sunrpc/xdr.c.orig linux-2.6.35.noarch/net/sunrpc/xdr.c
---- linux-2.6.35.noarch/net/sunrpc/xdr.c.orig 2010-08-01 18:11:14.000000000 -0400
-+++ linux-2.6.35.noarch/net/sunrpc/xdr.c 2010-09-30 12:25:08.909298000 -0400
-@@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf,
+diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
+index a1f82a8..98a59f6 100644
+--- a/net/sunrpc/xdr.c
++++ b/net/sunrpc/xdr.c
+@@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
{
struct kvec *tail;
size_t copy;
@@ -29211,7 +29306,7 @@ diff -up linux-2.6.35.noarch/net/sunrpc/xdr.c.orig linux-2.6.35.noarch/net/sunrp
_copy_from_pages((char *)tail->iov_base,
buf->pages, buf->page_base + pglen - len,
copy);
-@@ -496,6 +501,27 @@ __be32 * xdr_reserve_space(struct xdr_st
+@@ -496,6 +501,27 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
EXPORT_SYMBOL_GPL(xdr_reserve_space);
/**
More information about the scm-commits mailing list