Commit d5bb349d authored by Linus Torvalds

Merge tag 'drm-vmwgfx-coherent-2019-11-29' of git://anongit.freedesktop.org/drm/drm

Pull drm coherent memory support for vmwgfx from Dave Airlie:
 "This is a separate pull for the mm pagewalking + drm/vmwgfx work
  Thomas did and you were involved in, I've left it separate in case you
  don't feel as comfortable with it as the other stuff.

  It has mm acks/r-b in the right places from what I can see"

* tag 'drm-vmwgfx-coherent-2019-11-29' of git://anongit.freedesktop.org/drm/drm:
  drm/vmwgfx: Add surface dirty-tracking callbacks
  drm/vmwgfx: Implement an infrastructure for read-coherent resources
  drm/vmwgfx: Use an RBtree instead of linked list for MOB resources
  drm/vmwgfx: Implement an infrastructure for write-coherent resources
  mm: Add write-protect and clean utilities for address space ranges
  mm: Add a walk_page_mapping() function to the pagewalk code
  mm: pagewalk: Take the pagetable lock in walk_pte_range()
  mm: Remove BUG_ON mmap_sem not held from xxx_trans_huge_lock()
  drm/ttm: Convert vm callbacks to helpers
  drm/ttm: Remove explicit typecasts of vm_private_data
parents 81b6b964 0a6cad5d
@@ -42,8 +42,6 @@
#include <linux/uaccess.h>
#include <linux/mem_encrypt.h>
#define TTM_BO_VM_NUM_PREFAULT 16
static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
struct vm_fault *vmf)
{
@@ -106,25 +104,30 @@ static unsigned long ttm_bo_io_mem_pfn(struct ttm_buffer_object *bo,
+ page_offset;
}
static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
/**
* ttm_bo_vm_reserve - Reserve a buffer object in a retryable vm callback
* @bo: The buffer object
* @vmf: The fault structure handed to the callback
*
* vm callbacks like fault() and *_mkwrite() allow for the mm_sem to be dropped
* during long waits, and after the wait the callback will be restarted. This
* is to allow other threads using the same virtual memory space concurrent
* access to map() and unmap() completely unrelated buffer objects. TTM buffer
* object reservations sometimes wait for GPU and should therefore be
* considered long waits. This function reserves the buffer object interruptibly
* taking this into account. Starvation is avoided by the vm system not
* allowing too many repeated restarts.
* This function is intended to be used in customized fault() and _mkwrite()
* handlers.
*
* Return:
* 0 on success and the bo was reserved.
* VM_FAULT_RETRY if blocking wait.
* VM_FAULT_NOPAGE if blocking wait and retrying was not allowed.
*/
vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
vma->vm_private_data;
struct ttm_bo_device *bdev = bo->bdev;
unsigned long page_offset;
unsigned long page_last;
unsigned long pfn;
struct ttm_tt *ttm = NULL;
struct page *page;
int err;
int i;
vm_fault_t ret = VM_FAULT_NOPAGE;
unsigned long address = vmf->address;
struct ttm_mem_type_manager *man =
&bdev->man[bo->mem.mem_type];
struct vm_area_struct cvma;
/*
* Work around locking order reversal in fault / nopfn
* between mmap_sem and bo_reserve: Perform a trylock operation
@@ -151,14 +154,54 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
return VM_FAULT_NOPAGE;
}
return 0;
}
EXPORT_SYMBOL(ttm_bo_vm_reserve);
/**
* ttm_bo_vm_fault_reserved - TTM fault helper
* @vmf: The struct vm_fault given as argument to the fault callback
* @prot: The page protection to be used for this memory area.
* @num_prefault: Maximum number of prefault pages. The caller may want to
* specify this based on madvise settings and the size of the GPU object
* backed by the memory.
*
* This function inserts one or more page table entries pointing to the
* memory backing the buffer object, and then returns a return code
* instructing the caller to retry the page access.
*
* Return:
* VM_FAULT_NOPAGE on success or pending signal
* VM_FAULT_SIGBUS on unspecified error
* VM_FAULT_OOM on out-of-memory
* VM_FAULT_RETRY if retryable wait
*/
vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
pgprot_t prot,
pgoff_t num_prefault)
{
struct vm_area_struct *vma = vmf->vma;
struct vm_area_struct cvma = *vma;
struct ttm_buffer_object *bo = vma->vm_private_data;
struct ttm_bo_device *bdev = bo->bdev;
unsigned long page_offset;
unsigned long page_last;
unsigned long pfn;
struct ttm_tt *ttm = NULL;
struct page *page;
int err;
pgoff_t i;
vm_fault_t ret = VM_FAULT_NOPAGE;
unsigned long address = vmf->address;
struct ttm_mem_type_manager *man =
&bdev->man[bo->mem.mem_type];
/*
* Refuse to fault imported pages. This should be handled
* (if at all) by redirecting mmap to the exporter.
*/
if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG)) {
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG))
return VM_FAULT_SIGBUS;
if (bdev->driver->fault_reserve_notify) {
struct dma_fence *moving = dma_fence_get(bo->moving);
@@ -169,11 +212,9 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
break;
case -EBUSY:
case -ERESTARTSYS:
ret = VM_FAULT_NOPAGE;
goto out_unlock;
return VM_FAULT_NOPAGE;
default:
ret = VM_FAULT_SIGBUS;
goto out_unlock;
return VM_FAULT_SIGBUS;
}
if (bo->moving != moving) {
@@ -189,21 +230,12 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
* move.
*/
ret = ttm_bo_vm_fault_idle(bo, vmf);
if (unlikely(ret != 0)) {
if (ret == VM_FAULT_RETRY &&
!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
/* The BO has already been unreserved. */
return ret;
}
goto out_unlock;
}
if (unlikely(ret != 0))
return ret;
err = ttm_mem_io_lock(man, true);
if (unlikely(err != 0)) {
ret = VM_FAULT_NOPAGE;
goto out_unlock;
}
if (unlikely(err != 0))
return VM_FAULT_NOPAGE;
err = ttm_mem_io_reserve_vm(bo);
if (unlikely(err != 0)) {
ret = VM_FAULT_SIGBUS;
@@ -220,18 +252,8 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
goto out_io_unlock;
}
/*
* Make a local vma copy to modify the page_prot member
* and vm_flags if necessary. The vma parameter is protected
* by mmap_sem in write mode.
*/
cvma = *vma;
cvma.vm_page_prot = vm_get_page_prot(cvma.vm_flags);
if (bo->mem.bus.is_iomem) {
cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
cvma.vm_page_prot);
} else {
cvma.vm_page_prot = ttm_io_prot(bo->mem.placement, prot);
if (!bo->mem.bus.is_iomem) {
struct ttm_operation_ctx ctx = {
.interruptible = false,
.no_wait_gpu = false,
@@ -240,24 +262,21 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
};
ttm = bo->ttm;
cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
cvma.vm_page_prot);
/* Allocate all pages at once, most common usage */
if (ttm_tt_populate(ttm, &ctx)) {
if (ttm_tt_populate(bo->ttm, &ctx)) {
ret = VM_FAULT_OOM;
goto out_io_unlock;
}
} else {
/* Iomem should not be marked encrypted */
cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
}
/*
* Speculatively prefault a number of pages. Only error on
* first page.
*/
for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
for (i = 0; i < num_prefault; ++i) {
if (bo->mem.bus.is_iomem) {
/* Iomem should not be marked encrypted */
cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
pfn = ttm_bo_io_mem_pfn(bo, page_offset);
} else {
page = ttm->pages[page_offset];
@@ -293,28 +312,49 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
out_io_unlock:
ttm_mem_io_unlock(man);
out_unlock:
return ret;
}
EXPORT_SYMBOL(ttm_bo_vm_fault_reserved);
static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
pgprot_t prot;
struct ttm_buffer_object *bo = vma->vm_private_data;
vm_fault_t ret;
ret = ttm_bo_vm_reserve(bo, vmf);
if (ret)
return ret;
prot = vm_get_page_prot(vma->vm_flags);
ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
return ret;
dma_resv_unlock(bo->base.resv);
return ret;
}
static void ttm_bo_vm_open(struct vm_area_struct *vma)
void ttm_bo_vm_open(struct vm_area_struct *vma)
{
struct ttm_buffer_object *bo =
(struct ttm_buffer_object *)vma->vm_private_data;
struct ttm_buffer_object *bo = vma->vm_private_data;
WARN_ON(bo->bdev->dev_mapping != vma->vm_file->f_mapping);
ttm_bo_get(bo);
}
EXPORT_SYMBOL(ttm_bo_vm_open);
static void ttm_bo_vm_close(struct vm_area_struct *vma)
void ttm_bo_vm_close(struct vm_area_struct *vma)
{
struct ttm_buffer_object *bo = (struct ttm_buffer_object *)vma->vm_private_data;
struct ttm_buffer_object *bo = vma->vm_private_data;
ttm_bo_put(bo);
vma->vm_private_data = NULL;
}
EXPORT_SYMBOL(ttm_bo_vm_close);
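Taken together, ttm_bo_vm_reserve() and ttm_bo_vm_fault_reserved() split the old monolithic fault handler into a reservation step and a fault-servicing step, so a driver can run its own logic in between with the bo reserved. A minimal sketch of such a customized handler, modeled on the generic ttm_bo_vm_fault() above (my_driver_vm_fault and the driver-work placeholder are illustrative, not part of this commit):

static vm_fault_t my_driver_vm_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct ttm_buffer_object *bo = vma->vm_private_data;
	vm_fault_t ret;

	/* Trylock/retry dance; may return VM_FAULT_RETRY with the bo unreserved. */
	ret = ttm_bo_vm_reserve(bo, vmf);
	if (ret)
		return ret;

	/* Driver-specific work goes here, with the bo reserved. */

	ret = ttm_bo_vm_fault_reserved(vmf, vm_get_page_prot(vma->vm_flags),
				       TTM_BO_VM_NUM_PREFAULT);
	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
		return ret;	/* Reservation was already dropped on the wait path. */

	dma_resv_unlock(bo->base.resv);
	return ret;
}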
static int ttm_bo_vm_access_kmap(struct ttm_buffer_object *bo,
unsigned long offset,
@@ -8,6 +8,7 @@ config DRM_VMWGFX
select FB_CFB_IMAGEBLIT
select DRM_TTM
select FB
select MAPPING_DIRTY_HELPERS
# Only needed for the transitional use of drm_crtc_init - can be removed
# again once vmwgfx sets up the primary plane itself.
select DRM_KMS_HELPER
@@ -8,7 +8,7 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o vmwgfx_kms.o vmwgfx_drv.o \
vmwgfx_cmdbuf_res.o vmwgfx_cmdbuf.o vmwgfx_stdu.o \
vmwgfx_cotable.o vmwgfx_so.o vmwgfx_binding.o vmwgfx_msg.o \
vmwgfx_simple_resource.o vmwgfx_va.o vmwgfx_blit.o \
vmwgfx_validation.o \
vmwgfx_validation.o vmwgfx_page_dirty.o \
ttm_object.o ttm_lock.o
obj-$(CONFIG_DRM_VMWGFX) := vmwgfx.o
@@ -1280,7 +1280,6 @@ svga3dsurface_get_pixel_offset(SVGA3dSurfaceFormat format,
return offset;
}
static inline u32
svga3dsurface_get_image_offset(SVGA3dSurfaceFormat format,
surf_size_struct baseLevelSize,
@@ -1375,4 +1374,236 @@ svga3dsurface_is_screen_target_format(SVGA3dSurfaceFormat format)
return svga3dsurface_is_dx_screen_target_format(format);
}
/**
* struct svga3dsurface_mip - Mipmap level information
* @bytes: Bytes required in the backing store of this mipmap level.
* @img_stride: Byte stride per image.
* @row_stride: Byte stride per block row.
* @size: The size of the mipmap.
*/
struct svga3dsurface_mip {
size_t bytes;
size_t img_stride;
size_t row_stride;
struct drm_vmw_size size;
};
/**
* struct svga3dsurface_cache - Cached surface information
* @desc: Pointer to the surface descriptor
* @mip: Array of mipmap level information. Valid size is @num_mip_levels.
* @mip_chain_bytes: Bytes required in the backing store for the whole chain
* of mip levels.
* @sheet_bytes: Bytes required in the backing store for a sheet
* representing a single sample.
* @num_mip_levels: Valid size of the @mip array. Number of mipmap levels in
* a chain.
* @num_layers: Number of slices in an array texture or number of faces in
* a cubemap texture.
*/
struct svga3dsurface_cache {
const struct svga3d_surface_desc *desc;
struct svga3dsurface_mip mip[DRM_VMW_MAX_MIP_LEVELS];
size_t mip_chain_bytes;
size_t sheet_bytes;
u32 num_mip_levels;
u32 num_layers;
};
/**
* struct svga3dsurface_loc - Surface location
* @sub_resource: Surface subresource. Defined as layer * num_mip_levels +
* mip_level.
* @x: X coordinate.
* @y: Y coordinate.
* @z: Z coordinate.
*/
struct svga3dsurface_loc {
u32 sub_resource;
u32 x, y, z;
};
/**
* svga3dsurface_subres - Compute the subresource from layer and mipmap.
* @cache: Surface layout data.
* @mip_level: The mipmap level.
* @layer: The surface layer (face or array slice).
*
* Return: The subresource.
*/
static inline u32 svga3dsurface_subres(const struct svga3dsurface_cache *cache,
u32 mip_level, u32 layer)
{
return cache->num_mip_levels * layer + mip_level;
}
/**
* svga3dsurface_setup_cache - Build a surface cache entry
* @size: The surface base level dimensions.
* @format: The surface format.
* @num_mip_levels: Number of mipmap levels.
* @num_layers: Number of layers.
* @cache: Pointer to a struct svga3dsurface_cache object to be filled in.
*
* Return: Zero on success, -EINVAL on invalid surface layout.
*/
static inline int svga3dsurface_setup_cache(const struct drm_vmw_size *size,
SVGA3dSurfaceFormat format,
u32 num_mip_levels,
u32 num_layers,
u32 num_samples,
struct svga3dsurface_cache *cache)
{
const struct svga3d_surface_desc *desc;
u32 i;
memset(cache, 0, sizeof(*cache));
cache->desc = desc = svga3dsurface_get_desc(format);
cache->num_mip_levels = num_mip_levels;
cache->num_layers = num_layers;
for (i = 0; i < cache->num_mip_levels; i++) {
struct svga3dsurface_mip *mip = &cache->mip[i];
mip->size = svga3dsurface_get_mip_size(*size, i);
mip->bytes = svga3dsurface_get_image_buffer_size
(desc, &mip->size, 0);
mip->row_stride =
__KERNEL_DIV_ROUND_UP(mip->size.width,
desc->block_size.width) *
desc->bytes_per_block * num_samples;
if (!mip->row_stride)
goto invalid_dim;
mip->img_stride =
__KERNEL_DIV_ROUND_UP(mip->size.height,
desc->block_size.height) *
mip->row_stride;
if (!mip->img_stride)
goto invalid_dim;
cache->mip_chain_bytes += mip->bytes;
}
cache->sheet_bytes = cache->mip_chain_bytes * num_layers;
if (!cache->sheet_bytes)
goto invalid_dim;
return 0;
invalid_dim:
VMW_DEBUG_USER("Invalid surface layout for dirty tracking.\n");
return -EINVAL;
}
/**
* svga3dsurface_get_loc - Get a surface location from an offset into the
* backing store
* @cache: Surface layout data.
* @loc: Pointer to a struct svga3dsurface_loc to be filled in.
* @offset: Offset into the surface backing store.
*/
static inline void
svga3dsurface_get_loc(const struct svga3dsurface_cache *cache,
struct svga3dsurface_loc *loc,
size_t offset)
{
const struct svga3dsurface_mip *mip = &cache->mip[0];
const struct svga3d_surface_desc *desc = cache->desc;
u32 layer;
int i;
if (offset >= cache->sheet_bytes)
offset %= cache->sheet_bytes;
layer = offset / cache->mip_chain_bytes;
offset -= layer * cache->mip_chain_bytes;
for (i = 0; i < cache->num_mip_levels; ++i, ++mip) {
if (mip->bytes > offset)
break;
offset -= mip->bytes;
}
loc->sub_resource = svga3dsurface_subres(cache, i, layer);
loc->z = offset / mip->img_stride;
offset -= loc->z * mip->img_stride;
loc->z *= desc->block_size.depth;
loc->y = offset / mip->row_stride;
offset -= loc->y * mip->row_stride;
loc->y *= desc->block_size.height;
loc->x = offset / desc->bytes_per_block;
loc->x *= desc->block_size.width;
}
/**
* svga3dsurface_inc_loc - Clamp increment a surface location with one block
* size in each dimension.
* @cache: Surface layout data.
* @loc: Pointer to a struct svga3dsurface_loc to be incremented.
*
* When computing the size of a range as size = end - start, the range does not
* include the end element. However a location representing the last byte
* of a touched region in the backing store *is* included in the range.
* This function modifies such a location to match the end definition
* given as start + size which is the one used in a SVGA3dBox.
*/
static inline void
svga3dsurface_inc_loc(const struct svga3dsurface_cache *cache,
struct svga3dsurface_loc *loc)
{
const struct svga3d_surface_desc *desc = cache->desc;
u32 mip = loc->sub_resource % cache->num_mip_levels;
const struct drm_vmw_size *size = &cache->mip[mip].size;
loc->sub_resource++;
loc->x += desc->block_size.width;
if (loc->x > size->width)
loc->x = size->width;
loc->y += desc->block_size.height;
if (loc->y > size->height)
loc->y = size->height;
loc->z += desc->block_size.depth;
if (loc->z > size->depth)
loc->z = size->depth;
}
/**
* svga3dsurface_min_loc - The start location in a subresource
* @cache: Surface layout data.
* @sub_resource: The subresource.
* @loc: Pointer to a struct svga3dsurface_loc to be filled in.
*/
static inline void
svga3dsurface_min_loc(const struct svga3dsurface_cache *cache,
u32 sub_resource,
struct svga3dsurface_loc *loc)
{
loc->sub_resource = sub_resource;
loc->x = loc->y = loc->z = 0;
}
/**
* svga3dsurface_max_loc - The end location in a subresource
* @cache: Surface layout data.
* @sub_resource: The subresource.
* @loc: Pointer to a struct svga3dsurface_loc to be filled in.
*
* Following the end definition given in svga3dsurface_inc_loc(), compute the
* end location of a surface subresource.
*/
static inline void
svga3dsurface_max_loc(const struct svga3dsurface_cache *cache,
u32 sub_resource,
struct svga3dsurface_loc *loc)
{
const struct drm_vmw_size *size;
u32 mip;
loc->sub_resource = sub_resource + 1;
mip = sub_resource % cache->num_mip_levels;
size = &cache->mip[mip].size;
loc->x = size->width;
loc->y = size->height;
loc->z = size->depth;
}
#endif /* _SVGA3D_SURFACEDEFS_H_ */
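The helpers above are meant to compose: build the svga3dsurface_cache once per surface, then map any touched byte range in the backing store to a pair of clamped locations with the SVGA3dBox start + size end semantics described at svga3dsurface_inc_loc(). A hedged sketch of that flow (example_range_to_box and the single mip/layer/sample layout are illustrative assumptions, not code from this commit):

static int example_range_to_box(const struct drm_vmw_size *base_size,
				SVGA3dSurfaceFormat format,
				size_t offset_start, size_t offset_end)
{
	struct svga3dsurface_cache cache;
	struct svga3dsurface_loc loc1, loc2;
	int ret;

	/* One mip level, one layer, one sample, for simplicity. */
	ret = svga3dsurface_setup_cache(base_size, format, 1, 1, 1, &cache);
	if (ret)
		return ret;

	/* Locations of the first and the last touched byte... */
	svga3dsurface_get_loc(&cache, &loc1, offset_start);
	svga3dsurface_get_loc(&cache, &loc2, offset_end - 1);
	/* ...with loc2 converted to the start + size end definition. */
	svga3dsurface_inc_loc(&cache, &loc2);

	/* loc1/loc2 now bound the dirty region per subresource. */
	return 0;
}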
@@ -462,6 +462,8 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
{
struct vmw_buffer_object *vmw_bo = vmw_buffer_object(bo);
WARN_ON(vmw_bo->dirty);
WARN_ON(!RB_EMPTY_ROOT(&vmw_bo->res_tree));
vmw_bo_unmap(vmw_bo);
kfree(vmw_bo);
}
@@ -475,8 +477,11 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
static void vmw_user_bo_destroy(struct ttm_buffer_object *bo)
{
struct vmw_user_buffer_object *vmw_user_bo = vmw_user_buffer_object(bo);
struct vmw_buffer_object *vbo = &vmw_user_bo->vbo;
vmw_bo_unmap(&vmw_user_bo->vbo);
WARN_ON(vbo->dirty);
WARN_ON(!RB_EMPTY_ROOT(&vbo->res_tree));
vmw_bo_unmap(vbo);
ttm_prime_object_kfree(vmw_user_bo, prime);
}
@@ -511,8 +516,7 @@ int vmw_bo_init(struct vmw_private *dev_priv,
memset(vmw_bo, 0, sizeof(*vmw_bo));
BUILD_BUG_ON(TTM_MAX_BO_PRIORITY <= 3);
vmw_bo->base.priority = 3;
INIT_LIST_HEAD(&vmw_bo->res_list);
vmw_bo->res_tree = RB_ROOT;
ret = ttm_bo_init(bdev, &vmw_bo->base, size,
ttm_bo_type_device, placement,
@@ -56,9 +56,9 @@
#define VMWGFX_DRIVER_NAME "vmwgfx"
#define VMWGFX_DRIVER_DATE "20180704"
#define VMWGFX_DRIVER_DATE "20190328"
#define VMWGFX_DRIVER_MAJOR 2
#define VMWGFX_DRIVER_MINOR 15
#define VMWGFX_DRIVER_MINOR 16
#define VMWGFX_DRIVER_PATCHLEVEL 0
#define VMWGFX_FIFO_STATIC_SIZE (1024*1024)
#define VMWGFX_MAX_RELOCATIONS 2048
@@ -100,17 +100,18 @@ struct vmw_fpriv {
/**
* struct vmw_buffer_object - TTM buffer object with vmwgfx additions
* @base: The TTM buffer object
* @res_list: List of resources using this buffer object as a backing MOB
* @res_tree: RB tree of resources using this buffer object as a backing MOB
* @pin_count: pin depth
* @cpu_writers: Number of synccpu write grabs. Protected by reservation when
* increased. May be decreased without reservation.
* @dx_query_ctx: DX context if this buffer object is used as a DX query MOB
* @map: Kmap object for semi-persistent mappings
* @res_prios: Eviction priority counts for attached resources
* @dirty: structure for user-space dirty-tracking
*/
struct vmw_buffer_object {
struct ttm_buffer_object base;
struct list_head res_list;
struct rb_root res_tree;
s32 pin_count;
atomic_t cpu_writers;
/* Not ref-counted. Protected by binding_mutex */
@@ -118,6 +119,7 @@ struct vmw_buffer_object {
/* Protected by reservation */
struct ttm_bo_kmap_obj map;
u32 res_prios[TTM_MAX_BO_PRIORITY];
struct vmw_bo_dirty *dirty;
};
/**
@@ -148,7 +150,8 @@ struct vmw_res_func;
* @res_dirty: Resource contains data not yet in the backup buffer. Protected
* by resource reserved.
* @backup_dirty: Backup buffer contains data not yet in the HW resource.
* Protecte by resource reserved.
* Protected by resource reserved.
* @coherent: Emulate coherency by tracking vm accesses.
* @backup: The backup buffer if any. Protected by resource reserved.
* @backup_offset: Offset into the backup buffer if any. Protected by resource
* reserved. Note that only a few resource types can have a @backup_offset
@@ -157,29 +160,32 @@ struct vmw_res_func;
* pin-count greater than zero. It is not on the resource LRU lists and its
* backup buffer is pinned. Hence it can't be evicted.
* @func: Method vtable for this resource. Immutable.
* @mob_node: Node for the MOB backup rbtree. Protected by @backup reserved.
* @lru_head: List head for the LRU list. Protected by @dev_priv::resource_lock.
* @mob_head: List head for the MOB backup list. Protected by @backup reserved.
* @binding_head: List head for the context binding list. Protected by
* the @dev_priv::binding_mutex
* @res_free: The resource destructor.
* @hw_destroy: Callback to destroy the resource on the device, as part of
* resource destruction.
*/
struct vmw_resource_dirty;
struct vmw_resource {
struct kref kref;
struct vmw_private *dev_priv;
int id;
u32 used_prio;
unsigned long backup_size;
bool res_dirty;
bool backup_dirty;
u32 res_dirty : 1;
u32 backup_dirty : 1;
u32 coherent : 1;
struct vmw_buffer_object *backup;
unsigned long backup_offset;
unsigned long pin_count;
const struct vmw_res_func *func;
struct rb_node mob_node;
struct list_head lru_head;
struct list_head mob_head;
struct list_head binding_head;
struct vmw_resource_dirty *dirty;
void (*res_free) (struct vmw_resource *res);
void (*hw_destroy) (struct vmw_resource *res);
};
@@ -678,7 +684,8 @@ extern void vmw_resource_unreference(struct vmw_resource **p_res);
extern struct vmw_resource *vmw_resource_reference(struct vmw_resource *res);
extern struct vmw_resource *
vmw_resource_reference_unless_doomed(struct vmw_resource *res);
extern int vmw_resource_validate(struct vmw_resource *res, bool intr);
extern int vmw_resource_validate(struct vmw_resource *res, bool intr,
bool dirtying);
extern int vmw_resource_reserve(struct vmw_resource *res, bool interruptible,
bool no_backup);
extern bool vmw_resource_needs_backup(const struct vmw_resource *res);
@@ -720,6 +727,10 @@ extern void vmw_resource_evict_all(struct vmw_private *dev_priv);
extern void vmw_resource_unbind_list(struct vmw_buffer_object *vbo);
void vmw_resource_mob_attach(struct vmw_resource *res);
void vmw_resource_mob_detach(struct vmw_resource *res);
void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
pgoff_t end);
int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t start,
pgoff_t end, pgoff_t *num_prefault);
/**
* vmw_resource_mob_attached - Whether a resource currently has a mob attached
@@ -729,7 +740,7 @@ void vmw_resource_mob_detach(struct vmw_resource *res);
*/
static inline bool vmw_resource_mob_attached(const struct vmw_resource *res)
{
return !list_empty(&res->mob_head);
return !RB_EMPTY_NODE(&res->mob_node);
}
/**
@@ -1407,6 +1418,17 @@ int vmw_host_log(const char *log);
#define VMW_DEBUG_USER(fmt, ...) \
DRM_DEBUG_DRIVER(fmt, ##__VA_ARGS__)
/* Resource dirtying - vmwgfx_page_dirty.c */
void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo);
int vmw_bo_dirty_add(struct vmw_buffer_object *vbo);
void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res);
void vmw_bo_dirty_clear_res(struct vmw_resource *res);
void vmw_bo_dirty_release(struct vmw_buffer_object *vbo);
void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
pgoff_t start, pgoff_t end);
vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
/**
* VMW_DEBUG_KMS - Debug output for kernel mode-setting
*
@@ -2560,7 +2560,6 @@ static int vmw_cmd_dx_check_subresource(struct vmw_private *dev_priv,
offsetof(typeof(*cmd), sid));
cmd = container_of(header, typeof(*cmd), header);
return vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
VMW_RES_DIRTY_NONE, user_surface_converter,
&cmd->sid, NULL);
@@ -40,11 +40,24 @@
void vmw_resource_mob_attach(struct vmw_resource *res)
{
struct vmw_buffer_object *backup = res->backup;
struct rb_node **new = &backup->res_tree.rb_node, *parent = NULL;
dma_resv_assert_held(res->backup->base.base.resv);
res->used_prio = (res->res_dirty) ? res->func->dirty_prio :
res->func->prio;
list_add_tail(&res->mob_head, &backup->res_list);
while (*new) {
struct vmw_resource *this =
container_of(*new, struct vmw_resource, mob_node);
parent = *new;
new = (res->backup_offset < this->backup_offset) ?
&((*new)->rb_left) : &((*new)->rb_right);
}
rb_link_node(&res->mob_node, parent, new);
rb_insert_color(&res->mob_node, &backup->res_tree);
vmw_bo_prio_add(backup, res->used_prio);
}
@@ -58,7 +71,8 @@ void vmw_resource_mob_detach(struct vmw_resource *res)
dma_resv_assert_held(backup->base.base.resv);
if (vmw_resource_mob_attached(res)) {
list_del_init(&res->mob_head);
rb_erase(&res->mob_node, &backup->res_tree);
RB_CLEAR_NODE(&res->mob_node);
vmw_bo_prio_del(backup, res->used_prio);
}
}
@@ -119,6 +133,10 @@ static void vmw_resource_release(struct kref *kref)
}
res->backup_dirty = false;
vmw_resource_mob_detach(res);
if (res->dirty)
res->func->dirty_free(res);
if (res->coherent)
vmw_bo_dirty_release(res->backup);
ttm_bo_unreserve(bo);
vmw_bo_unreference(&res->backup);
}
@@ -200,15 +218,17 @@ int vmw_resource_init(struct vmw_private *dev_priv, struct vmw_resource *res,
res->res_free = res_free;
res->dev_priv = dev_priv;
res->func = func;
RB_CLEAR_NODE(&res->mob_node);
INIT_LIST_HEAD(&res->lru_head);
INIT_LIST_HEAD(&res->mob_head);
INIT_LIST_HEAD(&res->binding_head);
res->id = -1;
res->backup = NULL;
res->backup_offset = 0;
res->backup_dirty = false;
res->res_dirty = false;
res->coherent = false;
res->used_prio = 3;
res->dirty = NULL;
if (delay_id)
return 0;
else
@@ -373,7 +393,8 @@ static int vmw_resource_buf_alloc(struct vmw_resource *res,
* should be retried once resources have been freed up.
*/
static int vmw_resource_do_validate(struct vmw_resource *res,
struct ttm_validate_buffer *val_buf)
struct ttm_validate_buffer *val_buf,
bool dirtying)
{
int ret = 0;
const struct vmw_res_func *func = res->func;
@@ -395,6 +416,39 @@ static int vmw_resource_do_validate(struct vmw_resource *res,
vmw_resource_mob_attach(res);
}
/*
* Handle the case where the backup mob is marked coherent but
* the resource isn't.
*/
if (func->dirty_alloc && vmw_resource_mob_attached(res) &&
!res->coherent) {
if (res->backup->dirty && !res->dirty) {
ret = func->dirty_alloc(res);
if (ret)
return ret;
} else if (!res->backup->dirty && res->dirty) {
func->dirty_free(res);
}
}
/*
* Transfer the dirty regions to the resource and update
* the resource.
*/
if (res->dirty) {
if (dirtying && !res->res_dirty) {
pgoff_t start = res->backup_offset >> PAGE_SHIFT;
pgoff_t end = __KERNEL_DIV_ROUND_UP
(res->backup_offset + res->backup_size,
PAGE_SIZE);
vmw_bo_dirty_unmap(res->backup, start, end);
}
vmw_bo_dirty_transfer_to_res(res);
return func->dirty_sync(res);
}
return 0;
out_bind_failed:
@@ -433,16 +487,28 @@ void vmw_resource_unreserve(struct vmw_resource *res,
if (switch_backup && new_backup != res->backup) {
if (res->backup) {
vmw_resource_mob_detach(res);
if (res->coherent)
vmw_bo_dirty_release(res->backup);
vmw_bo_unreference(&res->backup);
}
if (new_backup) {
res->backup = vmw_bo_reference(new_backup);
/*
* The validation code should already have added a
* dirty tracker here.
*/
WARN_ON(res->coherent && !new_backup->dirty);
vmw_resource_mob_attach(res);
} else {
res->backup = NULL;
}
} else if (switch_backup && res->coherent) {
vmw_bo_dirty_release(res->backup);
}
if (switch_backup)
res->backup_offset = new_backup_offset;
@@ -622,6 +688,7 @@ static int vmw_resource_do_evict(struct ww_acquire_ctx *ticket,
* to the device.
* @res: The resource to make visible to the device.
* @intr: Perform waits interruptible if possible.
* @dirtying: Pending GPU operation will dirty the resource
*
* On successful return, any backup DMA buffer pointed to by @res->backup will
* be reserved and validated.
@@ -631,7 +698,8 @@ static int vmw_resource_do_evict(struct ww_acquire_ctx *ticket,
* Return: Zero on success, -ERESTARTSYS if interrupted, negative error code
* on failure.
*/
int vmw_resource_validate(struct vmw_resource *res, bool intr)
int vmw_resource_validate(struct vmw_resource *res, bool intr,
bool dirtying)
{
int ret;
struct vmw_resource *evict_res;
@@ -648,7 +716,7 @@ int vmw_resource_validate(struct vmw_resource *res, bool intr)
if (res->backup)
val_buf.bo = &res->backup->base;
do {
ret = vmw_resource_do_validate(res, &val_buf);
ret = vmw_resource_do_validate(res, &val_buf, dirtying);
if (likely(ret != -EBUSY))
break;
@@ -711,19 +779,20 @@ int vmw_resource_validate(struct vmw_resource *res, bool intr)
*/
void vmw_resource_unbind_list(struct vmw_buffer_object *vbo)
{
struct vmw_resource *res, *next;
struct ttm_validate_buffer val_buf = {
.bo = &vbo->base,
.num_shared = 0
};
dma_resv_assert_held(vbo->base.base.resv);
list_for_each_entry_safe(res, next, &vbo->res_list, mob_head) {
if (!res->func->unbind)
continue;
while (!RB_EMPTY_ROOT(&vbo->res_tree)) {
struct rb_node *node = vbo->res_tree.rb_node;
struct vmw_resource *res =
container_of(node, struct vmw_resource, mob_node);
if (!WARN_ON_ONCE(!res->func->unbind))
(void) res->func->unbind(res, res->res_dirty, &val_buf);
(void) res->func->unbind(res, res->res_dirty, &val_buf);
res->backup_dirty = true;
res->res_dirty = false;
vmw_resource_mob_detach(res);
@@ -947,7 +1016,7 @@ int vmw_resource_pin(struct vmw_resource *res, bool interruptible)
/* Do we really need to pin the MOB as well? */
vmw_bo_pin_reserved(vbo, true);
}
ret = vmw_resource_validate(res, interruptible);
ret = vmw_resource_validate(res, interruptible, true);
if (vbo)
ttm_bo_unreserve(&vbo->base);
if (ret)
@@ -1007,3 +1076,101 @@ enum vmw_res_type vmw_res_type(const struct vmw_resource *res)
{
return res->func->res_type;
}
/**
* vmw_resource_dirty_update - Update a resource's dirty tracker with a
* sequential range of touched backing store memory.
* @res: The resource.
* @start: The first page touched.
* @end: The last page touched + 1.
*/
void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
pgoff_t end)
{
if (res->dirty)
res->func->dirty_range_add(res, start << PAGE_SHIFT,
end << PAGE_SHIFT);
}
/**
* vmw_resources_clean - Clean resources intersecting a mob range
* @vbo: The mob buffer object
* @start: The mob page offset starting the range
* @end: The mob page offset ending the range
* @num_prefault: Returns how many pages including the first have been
* cleaned and are ok to prefault
*/
int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t start,
pgoff_t end, pgoff_t *num_prefault)
{
struct rb_node *cur = vbo->res_tree.rb_node;
struct vmw_resource *found = NULL;
unsigned long res_start = start << PAGE_SHIFT;
unsigned long res_end = end << PAGE_SHIFT;
unsigned long last_cleaned = 0;
/*
* Find the resource with lowest backup_offset that intersects the
* range.
*/
while (cur) {
struct vmw_resource *cur_res =
container_of(cur, struct vmw_resource, mob_node);
if (cur_res->backup_offset >= res_end) {
cur = cur->rb_left;
} else if (cur_res->backup_offset + cur_res->backup_size <=
res_start) {
cur = cur->rb_right;
} else {
found = cur_res;
cur = cur->rb_left;
/* Continue to look for resources with lower offsets */
}
}
/*
* In order of increasing backup_offset, clean dirty resources
* intersecting the range.
*/
while (found) {
if (found->res_dirty) {
int ret;
if (!found->func->clean)
return -EINVAL;
ret = found->func->clean(found);
if (ret)
return ret;
found->res_dirty = false;
}
last_cleaned = found->backup_offset + found->backup_size;
cur = rb_next(&found->mob_node);
if (!cur)
break;
found = container_of(cur, struct vmw_resource, mob_node);
if (found->backup_offset >= res_end)
break;
}
/*
* Set the number of pages allowed to be prefaulted, and fence the buffer object
*/
*num_prefault = 1;
if (last_cleaned > res_start) {
struct ttm_buffer_object *bo = &vbo->base;
*num_prefault = __KERNEL_DIV_ROUND_UP(last_cleaned - res_start,
PAGE_SIZE);
vmw_bo_fence_single(bo, NULL);
if (bo->moving)
dma_fence_put(bo->moving);
bo->moving = dma_fence_get
(dma_resv_get_excl(bo->base.resv));
}
return 0;
}
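The intended consumer of vmw_resources_clean() is the buffer-object fault handler in vmwgfx_page_dirty.c (not shown in full here): read-coherent resources overlapping the faulting page must be cleaned before the access, and the pages covered by the cleaning may then be prefaulted. A sketch of how the pieces plausibly fit together, with illustrative names and simplified error handling:

static vm_fault_t example_bo_vm_fault(struct vm_fault *vmf)
{
	struct ttm_buffer_object *bo = vmf->vma->vm_private_data;
	struct vmw_buffer_object *vbo =
		container_of(bo, struct vmw_buffer_object, base);
	pgoff_t page_offset = vmf->pgoff -
		drm_vma_node_start(&bo->base.vma_node);
	pgoff_t num_prefault = TTM_BO_VM_NUM_PREFAULT;
	vm_fault_t ret;

	ret = ttm_bo_vm_reserve(bo, vmf);
	if (ret)
		return ret;

	/* Clean read-coherent resources overlapping the faulting page. */
	if (vmw_resources_clean(vbo, page_offset, page_offset + 1,
				&num_prefault)) {
		ret = VM_FAULT_SIGBUS;
		goto out_unlock;
	}

	ret = ttm_bo_vm_fault_reserved(vmf, vm_get_page_prot(vmf->vma->vm_flags),
				       num_prefault);
	if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
		return ret;

out_unlock:
	dma_resv_unlock(bo->base.resv);
	return ret;
}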
@@ -71,6 +71,13 @@ struct vmw_user_resource_conv {
* @commit_notify: If the resource is a command buffer managed resource,
* callback to notify that a define or remove command
* has been committed to the device.
* @dirty_alloc: Allocate a dirty tracker. NULL if dirty-tracking is not
* supported.
* @dirty_free: Free the dirty tracker.
* @dirty_sync: Upload the dirty mob contents to the resource.
* @dirty_add_range: Add a sequential dirty range to the resource
* dirty tracker.
* @clean: Clean the resource.
*/
struct vmw_res_func {
enum vmw_res_type res_type;
@@ -90,6 +97,12 @@ struct vmw_res_func {
struct ttm_validate_buffer *val_buf);
void (*commit_notify)(struct vmw_resource *res,
enum vmw_cmdbuf_res_state state);
int (*dirty_alloc)(struct vmw_resource *res);
void (*dirty_free)(struct vmw_resource *res);
int (*dirty_sync)(struct vmw_resource *res);
void (*dirty_range_add)(struct vmw_resource *res, size_t start,
size_t end);
int (*clean)(struct vmw_resource *res);
};
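A hedged sketch of how a resource type might wire up the new callbacks. The real trackers keep per-subresource state (surface dirty tracking is added elsewhere in this series); every example_* name below is illustrative, and this tracker records just one byte span:

struct example_dirty {
	size_t start;
	size_t end;
};

static int example_dirty_alloc(struct vmw_resource *res)
{
	struct example_dirty *dirty = kzalloc(sizeof(*dirty), GFP_KERNEL);

	if (!dirty)
		return -ENOMEM;
	dirty->start = SIZE_MAX;	/* Empty span. */
	dirty->end = 0;
	res->dirty = (struct vmw_resource_dirty *)dirty;
	return 0;
}

static void example_dirty_free(struct vmw_resource *res)
{
	kfree(res->dirty);
	res->dirty = NULL;
}

static void example_dirty_range_add(struct vmw_resource *res, size_t start,
				    size_t end)
{
	struct example_dirty *dirty = (struct example_dirty *)res->dirty;

	dirty->start = min(dirty->start, start);
	dirty->end = max(dirty->end, end);
}

static int example_dirty_sync(struct vmw_resource *res)
{
	/* Emit a device command uploading [start, end) here, then reset. */
	return 0;
}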
/**
@@ -29,10 +29,23 @@
int vmw_mmap(struct file *filp, struct vm_area_struct *vma)
{
static const struct vm_operations_struct vmw_vm_ops = {
.pfn_mkwrite = vmw_bo_vm_mkwrite,
.page_mkwrite = vmw_bo_vm_mkwrite,
.fault = vmw_bo_vm_fault,
.open = ttm_bo_vm_open,
.close = ttm_bo_vm_close
};
struct drm_file *file_priv = filp->private_data;
struct vmw_private *dev_priv = vmw_priv(file_priv->minor->dev);
int ret = ttm_bo_mmap(filp, vma, &dev_priv->bdev);
return ttm_bo_mmap(filp, vma, &dev_priv->bdev);
if (ret)
return ret;
vma->vm_ops = &vmw_vm_ops;
return 0;
}
/* struct vmw_validation_mem callback */
@@ -33,6 +33,8 @@
* struct vmw_validation_bo_node - Buffer object validation metadata.
* @base: Metadata used for TTM reservation- and validation.
* @hash: A hash entry used for the duplicate detection hash table.
* @coherent_count: If switching backup buffers, number of new coherent
* resources that will have this buffer as a backup buffer.
* @as_mob: Validate as mob.
* @cpu_blit: Validate for cpu blit access.
*
@@ -42,6 +44,7 @@
struct vmw_validation_bo_node {
struct ttm_validate_buffer base;
struct drm_hash_item hash;
unsigned int coherent_count;
u32 as_mob : 1;
u32 cpu_blit : 1;
};
@@ -459,6 +462,19 @@ int vmw_validation_res_reserve(struct vmw_validation_context *ctx,
if (ret)
goto out_unreserve;
}
if (val->switching_backup && val->new_backup &&
res->coherent) {
struct vmw_validation_bo_node *bo_node =
vmw_validation_find_bo_dup(ctx,
val->new_backup);
if (WARN_ON(!bo_node)) {
ret = -EINVAL;
goto out_unreserve;
}
bo_node->coherent_count++;
}
}
return 0;
@@ -565,6 +581,9 @@ int vmw_validation_bo_validate(struct vmw_validation_context *ctx, bool intr)
int ret;
list_for_each_entry(entry, &ctx->bo_list, base.head) {
struct vmw_buffer_object *vbo =
container_of(entry->base.bo, typeof(*vbo), base);
if (entry->cpu_blit) {
struct ttm_operation_ctx ctx = {
.interruptible = intr,
@@ -579,6 +598,27 @@ int vmw_validation_bo_validate(struct vmw_validation_context *ctx, bool intr)
}
if (ret)
return ret;
/*
* Rather than having the resource code allocate the bo
* dirty tracker in resource_unreserve() where we can't fail,
* do it here when validating the buffer object.
*/
if (entry->coherent_count) {
unsigned int coherent_count = entry->coherent_count;
while (coherent_count) {
ret = vmw_bo_dirty_add(vbo);
if (ret)
return ret;
coherent_count--;
}
entry->coherent_count -= coherent_count;
}
if (vbo->dirty)
vmw_bo_dirty_scan(vbo);
}
return 0;
}
@@ -604,7 +644,8 @@ int vmw_validation_res_validate(struct vmw_validation_context *ctx, bool intr)
struct vmw_resource *res = val->res;
struct vmw_buffer_object *backup = res->backup;
ret = vmw_resource_validate(res, intr);
ret = vmw_resource_validate(res, intr, val->dirty_set &&
val->dirty);
if (ret) {
if (ret != -ERESTARTSYS)
DRM_ERROR("Failed to validate resource.\n");
@@ -831,3 +872,34 @@ int vmw_validation_preload_res(struct vmw_validation_context *ctx,
ctx->mem_size_left += size;
return 0;
}
/**
* vmw_validation_bo_backoff - Unreserve buffer objects registered with a
* validation context
* @ctx: The validation context
*
* This function unreserves the buffer objects previously reserved using
* vmw_validation_bo_reserve. It's typically used as part of an error path.
*/
void vmw_validation_bo_backoff(struct vmw_validation_context *ctx)
{
struct vmw_validation_bo_node *entry;
/*
* Switching coherent resource backup buffers failed.
* Release corresponding buffer object dirty trackers.
*/
list_for_each_entry(entry, &ctx->bo_list, base.head) {
if (entry->coherent_count) {
unsigned int coherent_count = entry->coherent_count;
struct vmw_buffer_object *vbo =
container_of(entry->base.bo, typeof(*vbo),
base);
while (coherent_count--)
vmw_bo_dirty_release(vbo);
}
}
ttm_eu_backoff_reservation(&ctx->ticket, &ctx->bo_list);
}
@@ -173,20 +173,6 @@ vmw_validation_bo_reserve(struct vmw_validation_context *ctx,
NULL);
}
/**
* vmw_validation_bo_backoff - Unreserve buffer objects registered with a
* validation context
* @ctx: The validation context
*
* This function unreserves the buffer objects previously reserved using
* vmw_validation_bo_reserve. It's typically used as part of an error path
*/
static inline void
vmw_validation_bo_backoff(struct vmw_validation_context *ctx)
{
ttm_eu_backoff_reservation(&ctx->ticket, &ctx->bo_list);
}
/**
* vmw_validation_bo_fence - Unreserve and fence buffer objects registered
* with a validation context
@@ -269,4 +255,6 @@ int vmw_validation_preload_res(struct vmw_validation_context *ctx,
unsigned int size);
void vmw_validation_res_set_dirty(struct vmw_validation_context *ctx,
void *val_private, u32 dirty);
void vmw_validation_bo_backoff(struct vmw_validation_context *ctx);
#endif
@@ -727,4 +727,18 @@ static inline bool ttm_bo_uses_embedded_gem_object(struct ttm_buffer_object *bo)
{
return bo->base.dev != NULL;
}
/* Default number of pre-faulted pages in the TTM fault handler */
#define TTM_BO_VM_NUM_PREFAULT 16
vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
struct vm_fault *vmf);
vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
pgprot_t prot,
pgoff_t num_prefault);
void ttm_bo_vm_open(struct vm_area_struct *vma);
void ttm_bo_vm_close(struct vm_area_struct *vma);
#endif
@@ -216,7 +216,6 @@ static inline int is_swap_pmd(pmd_t pmd)
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
return __pmd_trans_huge_lock(pmd, vma);
else
@@ -225,7 +224,6 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
if (pud_trans_huge(*pud) || pud_devmap(*pud))
return __pud_trans_huge_lock(pud, vma);
else
@@ -2632,7 +2632,6 @@ typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);
#ifdef CONFIG_PAGE_POISONING
extern bool page_poisoning_enabled(void);
extern void kernel_poison_pages(struct page *page, int numpages, int enable);
@@ -2873,5 +2872,17 @@ static inline int pages_identical(struct page *page1, struct page *page2)
return !memcmp_pages(page1, page2);
}
#ifdef CONFIG_MAPPING_DIRTY_HELPERS
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
pgoff_t first_index, pgoff_t nr,
pgoff_t bitmap_pgoff,
unsigned long *bitmap,
pgoff_t *start,
pgoff_t *end);
unsigned long wp_shared_mapping_range(struct address_space *mapping,
pgoff_t first_index, pgoff_t nr);
#endif
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
@@ -24,6 +24,9 @@ struct mm_walk;
* "do page table walk over the current vma", returning
* a negative value means "abort current page table walk
* right now" and returning 1 means "skip the current vma"
* @pre_vma: if set, called before starting walk on a non-null vma.
* @post_vma: if set, called after a walk on a non-null vma, provided
* that @pre_vma and the vma walk succeeded.
*/
struct mm_walk_ops {
int (*pud_entry)(pud_t *pud, unsigned long addr,
@@ -39,6 +42,9 @@ struct mm_walk_ops {
struct mm_walk *walk);
int (*test_walk)(unsigned long addr, unsigned long next,
struct mm_walk *walk);
int (*pre_vma)(unsigned long start, unsigned long end,
struct mm_walk *walk);
void (*post_vma)(struct mm_walk *walk);
};
/**
@@ -62,5 +68,8 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
void *private);
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private);
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
pgoff_t nr, const struct mm_walk_ops *ops,
void *private);
#endif /* _LINUX_PAGEWALK_H */
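The new pre_vma()/post_vma() hooks bracket each per-vma walk so that state can be staged before the walk and committed after it, and walk_page_mapping() drives the walk over every vma mapping a given address_space range. A toy example, not from this commit, showing the shape of the extended API:

static int count_pte(pte_t *pte, unsigned long addr, unsigned long end,
		     struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_write(*pte))
		(*count)++;
	return 0;
}

static int count_pre_vma(unsigned long start, unsigned long end,
			 struct mm_walk *walk)
{
	/* Stage per-vma state here; a nonzero return aborts the walk. */
	return 0;
}

static void count_post_vma(struct mm_walk *walk)
{
	/* Commit per-vma state here; this hook cannot fail. */
}

static const struct mm_walk_ops count_ops = {
	.pte_entry = count_pte,
	.pre_vma = count_pre_vma,
	.post_vma = count_post_vma,
};

/*
 * With mapping->i_mmap_rwsem held:
 *	unsigned long n = 0;
 *	walk_page_mapping(mapping, 0, nr_pages, &count_ops, &n);
 */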
@@ -891,11 +891,13 @@ struct drm_vmw_shader_arg {
* surface.
* @drm_vmw_surface_flag_create_buffer: Create a backup buffer if none is
* given.
* @drm_vmw_surface_flag_coherent: Back surface with coherent memory.
*/
enum drm_vmw_surface_flags {
drm_vmw_surface_flag_shareable = (1 << 0),
drm_vmw_surface_flag_scanout = (1 << 1),
drm_vmw_surface_flag_create_buffer = (1 << 2)
drm_vmw_surface_flag_create_buffer = (1 << 2),
drm_vmw_surface_flag_coherent = (1 << 3),
};
/**
@@ -736,4 +736,7 @@ config ARCH_HAS_PTE_SPECIAL
config ARCH_HAS_HUGEPD
bool
config MAPPING_DIRTY_HELPERS
bool
endmenu
@@ -107,3 +107,4 @@ obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/bitops.h>
#include <linux/mmu_notifier.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
/**
* struct wp_walk - Private struct for pagetable walk callbacks
* @range: Range for mmu notifiers
* @tlbflush_start: Address of first modified pte
* @tlbflush_end: Address of last modified pte + 1
* @total: Total number of modified ptes
*/
struct wp_walk {
struct mmu_notifier_range range;
unsigned long tlbflush_start;
unsigned long tlbflush_end;
unsigned long total;
};
/**
* wp_pte - Write-protect a pte
* @pte: Pointer to the pte
* @addr: The virtual page address
* @walk: pagetable walk callback argument
*
* The function write-protects a pte and records the range in
* virtual address space of touched ptes for efficient range TLB flushes.
*/
static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct wp_walk *wpwalk = walk->private;
pte_t ptent = *pte;
if (pte_write(ptent)) {
pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
ptent = pte_wrprotect(old_pte);
ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
wpwalk->total++;
wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
addr + PAGE_SIZE);
}
return 0;
}
/**
* struct clean_walk - Private struct for the clean_record_pte function.
* @base: struct wp_walk we derive from
* @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap
* @bitmap: Bitmap with one bit for each page offset in the address_space range
* covered.
* @start: Address_space page offset of first modified pte relative
* to @bitmap_pgoff
* @end: Address_space page offset of last modified pte relative
* to @bitmap_pgoff
*/
struct clean_walk {
struct wp_walk base;
pgoff_t bitmap_pgoff;
unsigned long *bitmap;
pgoff_t start;
pgoff_t end;
};
#define to_clean_walk(_wpwalk) container_of(_wpwalk, struct clean_walk, base)
/**
* clean_record_pte - Clean a pte and record its address space offset in a
* bitmap
* @pte: Pointer to the pte
* @addr: The virtual page address
* @walk: pagetable walk callback argument
*
* The function cleans a pte and records the range in
* virtual address space of touched ptes for efficient TLB flushes.
* It also records dirty ptes in a bitmap representing page offsets
* in the address_space, as well as the first and last of the bits
* touched.
*/
static int clean_record_pte(pte_t *pte, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct wp_walk *wpwalk = walk->private;
struct clean_walk *cwalk = to_clean_walk(wpwalk);
pte_t ptent = *pte;
if (pte_dirty(ptent)) {
pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
walk->vma->vm_pgoff - cwalk->bitmap_pgoff;
pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
ptent = pte_mkclean(old_pte);
ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
wpwalk->total++;
wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
addr + PAGE_SIZE);
__set_bit(pgoff, cwalk->bitmap);
cwalk->start = min(cwalk->start, pgoff);
cwalk->end = max(cwalk->end, pgoff + 1);
}
return 0;
}
/* wp_clean_pmd_entry - The pagewalk pmd callback. */
static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
/* Dirty-tracking should be handled on the pte level */
pmd_t pmdval = pmd_read_atomic(pmd);
if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
return 0;
}
/* wp_clean_pud_entry - The pagewalk pud callback. */
static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
/* Dirty-tracking should be handled on the pte level */
pud_t pudval = READ_ONCE(*pud);
if (pud_trans_huge(pudval) || pud_devmap(pudval))
WARN_ON(pud_write(pudval) || pud_dirty(pudval));
return 0;
}
/*
* wp_clean_pre_vma - The pagewalk pre_vma callback.
*
* The pre_vma callback performs the cache flush, stages the tlb flush
* and calls the necessary mmu notifiers.
*/
static int wp_clean_pre_vma(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct wp_walk *wpwalk = walk->private;
wpwalk->tlbflush_start = end;
wpwalk->tlbflush_end = start;
mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0,
walk->vma, walk->mm, start, end);
mmu_notifier_invalidate_range_start(&wpwalk->range);
flush_cache_range(walk->vma, start, end);
/*
* We're not using tlb_gather_mmu() since typically
* only a small subrange of PTEs are affected, whereas
* tlb_gather_mmu() records the full range.
*/
inc_tlb_flush_pending(walk->mm);
return 0;
}
/*
* wp_clean_post_vma - The pagewalk post_vma callback.
*
* The post_vma callback performs the tlb flush and calls necessary mmu
* notifiers.
*/
static void wp_clean_post_vma(struct mm_walk *walk)
{
struct wp_walk *wpwalk = walk->private;
if (mm_tlb_flush_nested(walk->mm))
flush_tlb_range(walk->vma, wpwalk->range.start,
wpwalk->range.end);
else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start)
flush_tlb_range(walk->vma, wpwalk->tlbflush_start,
wpwalk->tlbflush_end);
mmu_notifier_invalidate_range_end(&wpwalk->range);
dec_tlb_flush_pending(walk->mm);
}
/*
* wp_clean_test_walk - The pagewalk test_walk callback.
*
* Won't perform dirty-tracking on COW, read-only or HUGETLB vmas.
*/
static int wp_clean_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);
/* Skip non-applicable VMAs */
if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
(VM_SHARED | VM_MAYWRITE))
return 1;
return 0;
}
static const struct mm_walk_ops clean_walk_ops = {
.pte_entry = clean_record_pte,
.pmd_entry = wp_clean_pmd_entry,
.pud_entry = wp_clean_pud_entry,
.test_walk = wp_clean_test_walk,
.pre_vma = wp_clean_pre_vma,
.post_vma = wp_clean_post_vma
};
static const struct mm_walk_ops wp_walk_ops = {
.pte_entry = wp_pte,
.pmd_entry = wp_clean_pmd_entry,
.pud_entry = wp_clean_pud_entry,
.test_walk = wp_clean_test_walk,
.pre_vma = wp_clean_pre_vma,
.post_vma = wp_clean_post_vma
};
/**
* wp_shared_mapping_range - Write-protect all ptes in an address space range
* @mapping: The address_space we want to write protect
* @first_index: The first page offset in the range
* @nr: Number of incremental page offsets to cover
*
* Note: This function currently skips transhuge page-table entries, since
* it's intended for dirty-tracking on the PTE level. It will warn on
* encountering transhuge write-enabled entries, though, and can easily be
* extended to handle them as well.
*
* Return: The number of ptes actually write-protected. Note that
* already write-protected ptes are not counted.
*/
unsigned long wp_shared_mapping_range(struct address_space *mapping,
pgoff_t first_index, pgoff_t nr)
{
struct wp_walk wpwalk = { .total = 0 };
i_mmap_lock_read(mapping);
WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops,
&wpwalk));
i_mmap_unlock_read(mapping);
return wpwalk.total;
}
EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
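A short usage sketch with illustrative names: a driver starting dirty tracking on a buffer write-protects all shared mappings of its pages, forcing the next write through page_mkwrite()/pfn_mkwrite(). The i_mmap lock is taken internally, so the caller needs no mm locking of its own:

static void example_start_tracking(struct address_space *mapping, pgoff_t nr)
{
	unsigned long wp_count;

	wp_count = wp_shared_mapping_range(mapping, 0, nr);
	/* wp_count: ptes actually downgraded from writable. */
}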
/**
* clean_record_shared_mapping_range - Clean and record all ptes in an
* address space range
* @mapping: The address_space we want to clean
* @first_index: The first page offset in the range
* @nr: Number of incremental page offsets to cover
* @bitmap_pgoff: The page offset of the first bit in @bitmap
* @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
* cover the whole range @first_index..@first_index + @nr.
* @start: Pointer to the number of the first set bit in @bitmap.
* The value is modified as new bits are set by the function.
* @end: Pointer to the number of the last set bit in @bitmap + 1. If
* @start >= @end on entry, the range is considered to have no bits set.
* The value is modified as new bits are set by the function.
*
* Note: When this function returns there is no guarantee that a CPU has
* not already dirtied new ptes. However it will not clean any ptes not
* reported in the bitmap. The guarantees are as follows:
* a) All ptes dirty when the function starts executing will end up recorded
* in the bitmap.
* b) All ptes dirtied after that will either remain dirty, be recorded in the
* bitmap or both.
*
* If a caller needs to make sure all dirty ptes are picked up and none
* additional are added, it first needs to write-protect the address-space
* range and make sure new writers are blocked in page_mkwrite() or
* pfn_mkwrite(). And then after a TLB flush following the write-protection
* pick up all dirty bits.
*
* Note: This function currently skips transhuge page-table entries, since
* it's intended for dirty-tracking on the PTE level. It will warn on
* encountering transhuge dirty entries, though, and can easily be extended
* to handle them as well.
*
* Return: The number of dirty ptes actually cleaned.
*/
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
pgoff_t first_index, pgoff_t nr,
pgoff_t bitmap_pgoff,
unsigned long *bitmap,
pgoff_t *start,
pgoff_t *end)
{
bool none_set = (*start >= *end);
struct clean_walk cwalk = {
.base = { .total = 0 },
.bitmap_pgoff = bitmap_pgoff,
.bitmap = bitmap,
.start = none_set ? nr : *start,
.end = none_set ? 0 : *end,
};
i_mmap_lock_read(mapping);
WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops,
&cwalk.base));
i_mmap_unlock_read(mapping);
*start = cwalk.start;
*end = cwalk.end;
return cwalk.base.total;
}
EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range);
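And the matching harvest step, again as an illustrative sketch: collect dirty page offsets into a caller-provided bitmap. Passing *start >= *end signals that no bits are set on entry, matching the none_set convention in the implementation above:

static void example_harvest_dirty(struct address_space *mapping, pgoff_t nr)
{
	unsigned long *bitmap = bitmap_zalloc(nr, GFP_KERNEL);
	pgoff_t start = nr, end = 0;	/* Empty: start >= end. */
	unsigned long cleaned;

	if (!bitmap)
		return;

	cleaned = clean_record_shared_mapping_range(mapping, 0, nr, 0,
						    bitmap, &start, &end);
	/* Bits [start, end) of bitmap now cover the cleaned ptes. */
	bitmap_free(bitmap);
}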
@@ -10,8 +10,9 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
int err = 0;
const struct mm_walk_ops *ops = walk->ops;
spinlock_t *ptl;
pte = pte_offset_map(pmd, addr);
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (;;) {
err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
@@ -22,7 +23,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte++;
}
pte_unmap(pte);
pte_unmap_unlock(pte, ptl);
return err;
}
@@ -253,13 +254,23 @@ static int __walk_page_range(unsigned long start, unsigned long end,
{
int err = 0;
struct vm_area_struct *vma = walk->vma;
const struct mm_walk_ops *ops = walk->ops;
if (vma && ops->pre_vma) {
err = ops->pre_vma(start, end, walk);
if (err)
return err;
}
if (vma && is_vm_hugetlb_page(vma)) {
if (walk->ops->hugetlb_entry)
if (ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
if (vma && ops->post_vma)
ops->post_vma(walk);
return err;
}
@@ -290,6 +301,11 @@ static int __walk_page_range(unsigned long start, unsigned long end,
* its vm_flags. walk_page_test() and @ops->test_walk() are used for this
* purpose.
*
* If operations need to be staged before and committed after a vma is walked,
* there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
* since it is intended to handle commit-type operations, can't return any
* errors.
*
* struct mm_walk keeps current values of some common data like vma and pmd,
* which are useful for the access from callbacks. If you want to pass some
* caller-specific data to callbacks, @private should be helpful.
@@ -376,3 +392,80 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
return err;
return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
/**
* walk_page_mapping - walk all memory areas mapped into a struct address_space.
* @mapping: Pointer to the struct address_space
* @first_index: First page offset in the address_space
* @nr: Number of incremental page offsets to cover
* @ops: operation to call during the walk
* @private: private data for callbacks' usage
*
* This function walks all memory areas mapped into a struct address_space.
* The walk is limited to only the given page-size index range, but if
* the index boundaries cross a huge page-table entry, that entry will be
* included.
*
* Also see walk_page_range() for additional information.
*
* Locking:
* This function can't require that the struct mm_struct::mmap_sem is held,
* since @mapping may be mapped by multiple processes. Instead
* @mapping->i_mmap_rwsem must be held. This might have implications in the
* callbacks, and it's up to the caller to ensure that the
* struct mm_struct::mmap_sem is not needed.
*
* Also this means that a caller can't rely on the struct
* vm_area_struct::vm_flags to be constant across a call,
* except for immutable flags. Callers requiring this shouldn't use
* this function.
*
* Return: 0 on success, negative error code on failure, positive number on
* caller defined premature termination.
*/
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
pgoff_t nr, const struct mm_walk_ops *ops,
void *private)
{
struct mm_walk walk = {
.ops = ops,
.private = private,
};
struct vm_area_struct *vma;
pgoff_t vba, vea, cba, cea;
unsigned long start_addr, end_addr;
int err = 0;
lockdep_assert_held(&mapping->i_mmap_rwsem);
vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
first_index + nr - 1) {
/* Clip to the vma */
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma);
cba = first_index;
cba = max(cba, vba);
cea = first_index + nr;
cea = min(cea, vea);
start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
if (start_addr >= end_addr)
continue;
walk.vma = vma;
walk.mm = vma->vm_mm;
err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
if (err > 0) {
err = 0;
break;
} else if (err < 0)
break;
err = __walk_page_range(start_addr, end_addr, &walk);
if (err)
break;
}
return err;
}